I. Problem

Keras does not show the learning rate at each epoch in real time, which would make debugging much easier. In fact, I wanted this precisely to help debug the problem described in: Deep Learning 31: why different versions of Keras give different results for the same code.

II. Solution

1. Add the following code to Keras's callbacks.py file:

class DisplayLearningRate(Callback):
    '''Display the learning rate at the beginning of every epoch.'''
    def __init__(self):
        super(DisplayLearningRate, self).__init__()

    def on_epoch_begin(self, epoch, logs={}):
        assert hasattr(self.model.optimizer, 'lr'), \
            'Optimizer must have a "lr" attribute.'
        # read the value currently stored in the optimizer's lr variable
        lr_now = K.get_value(self.model.optimizer.lr)
        print('Epoch %05d: Learning rate is %s' % (epoch, lr_now))

2. Use it as follows:

history = model.fit(X_train,
    Y_train,
    batch_size=batch_size,
    nb_epoch=nb_epoch,
    show_accuracy=False,
    verbose=2,
    validation_data=(X_test, Y_test),
    callbacks=[
        keras.callbacks.DisplayLearningRate(),
        keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True, mode='auto'),  # saves the model to filepath after each epoch (here only when val_loss improves, since save_best_only=True)
        # keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')  # stops training when the monitored value stops improving; once early stopping is triggered (e.g. the loss did not drop compared with the previous epoch), training halts after `patience` more epochs
    ])

III. Summary

After trying the method above, I found that the learning rate printed at every epoch is identical. It turns out that what gets printed is the learning rate set at initialization: when the effective learning rate changes during training, the new value is never written back into the optimizer's lr variable, so the printout never changes. How, then, can the actual learning rate of each epoch be shown in real time? I think what needs to be displayed is the value computed inside the optimizer's updates.
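
To see why, here is a minimal sketch (the toy model, data shapes and epoch count are made up purely for illustration, written against the Keras 1.x-style API used in this post): reading optimizer.lr before and after training returns the same value, because SGD.get_updates builds the decayed rate symbolically from lr, decay and iterations, and never writes it back into the lr variable.

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from keras import backend as K

# toy model and random data, only to run a few parameter updates
model = Sequential()
model.add(Dense(1, input_dim=4))
sgd = SGD(lr=0.1, decay=1e-1, momentum=0.9, nesterov=True)
model.compile(loss='mse', optimizer=sgd)

print('lr before fit:', K.get_value(sgd.lr))          # 0.1
model.fit(np.random.rand(32, 4), np.random.rand(32, 1),
          nb_epoch=3, batch_size=8, verbose=0)
print('lr after fit :', K.get_value(sgd.lr))          # still 0.1
print('iterations   :', K.get_value(sgd.iterations))  # grew by one per batch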

IV. Final approach

from keras.optimizers import SGD
from keras.callbacks import Callback
from keras import backend as K

# set decay to 1e-1 so that the learning-rate change between epochs is easy to see
sgd = SGD(lr=0.1, decay=1e-1, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])

class LossHistory(Callback):
    def on_epoch_begin(self, epoch, logs={}):
        optimizer = self.model.optimizer
        # recompute the decayed rate exactly as SGD.get_updates does:
        # lr * (1 / (1 + decay * iterations)), where iterations counts parameter updates (batches)
        lr = K.get_value(optimizer.lr) * \
            (1. / (1. + K.get_value(optimizer.decay) * K.get_value(optimizer.iterations)))
        print('lr:', lr)

history = LossHistory()
model.fit(X_train, Y_train,
          batch_size=batch_size,
          nb_epoch=nb_epoch,
          callbacks=[history])
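
Note that decay is applied per parameter update (i.e. per batch), not per epoch, so what the callback prints at the start of each epoch depends on how many batches have already been processed. A rough worked example of the expected printout, assuming a purely hypothetical 100 updates per epoch:

# lr = 0.1, decay = 1e-1, and (hypothetically) 100 parameter updates per epoch
lr0, decay, updates_per_epoch = 0.1, 1e-1, 100
for epoch in range(4):
    iterations = epoch * updates_per_epoch
    print('epoch %d: lr = %.5f' % (epoch, lr0 / (1. + decay * iterations)))
# epoch 0: lr = 0.10000
# epoch 1: lr = 0.00909
# epoch 2: lr = 0.00476
# epoch 3: lr = 0.00323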

 

Reference: http://stackoverflow.com/questions/40144805/print-learning-rate-evary-epoch-in-sgd


Below I paste the optimizers.py of keras==0.3.3 and keras==1.2.0, respectively:

The optimizers.py in keras==0.3.3 is as follows:

from __future__ import absolute_import
from . import backend as K
import numpy as np
from .utils.generic_utils import get_from_module
from six.moves import zip


def clip_norm(g, c, n):
    if c > 0:
        g = K.switch(n >= c, g * c / n, g)
    return g


def kl_divergence(p, p_hat):
    return p_hat - p + p * K.log(p / p_hat)


class Optimizer(object):
    '''Abstract optimizer base class.

    Note: this is the parent class of all optimizers, not an actual optimizer
    that can be used for training models.

    All Keras optimizers support the following keyword arguments:

        clipnorm: float >= 0. Gradients will be clipped
            when their L2 norm exceeds this value.
        clipvalue: float >= 0. Gradients will be clipped
            when their absolute value exceeds this value.
    '''
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)
        self.updates = []

    def get_state(self):
        return [K.get_value(u[0]) for u in self.updates]

    def set_state(self, value_list):
        assert len(self.updates) == len(value_list)
        for u, v in zip(self.updates, value_list):
            K.set_value(u[0], v)

    def get_updates(self, params, constraints, loss):
        raise NotImplementedError

    def get_gradients(self, loss, params):
        grads = K.gradients(loss, params)
        if hasattr(self, 'clipnorm') and self.clipnorm > 0:
            norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads]))
            grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
        if hasattr(self, 'clipvalue') and self.clipvalue > 0:
            grads = [K.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
        return grads

    def get_config(self):
        return {"name": self.__class__.__name__}


class SGD(Optimizer):
    '''Stochastic gradient descent, with support for momentum,
    decay, and Nesterov momentum.

    # Arguments
        lr: float >= 0. Learning rate.
        momentum: float >= 0. Parameter updates momentum.
        decay: float >= 0. Learning rate decay over each update.
        nesterov: boolean. Whether to apply Nesterov momentum.
    '''
    def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False,
                 *args, **kwargs):
        super(SGD, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0.)
        self.lr = K.variable(lr)
        self.momentum = K.variable(momentum)
        self.decay = K.variable(decay)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations))
        self.updates = [(self.iterations, self.iterations + 1.)]

        for p, g, c in zip(params, grads, constraints):
            m = K.variable(np.zeros(K.get_value(p).shape))  # momentum
            v = self.momentum * m - lr * g  # velocity
            self.updates.append((m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            self.updates.append((p, c(new_p)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "momentum": float(K.get_value(self.momentum)),
                "decay": float(K.get_value(self.decay)),
                "nesterov": self.nesterov}


class RMSprop(Optimizer):
    '''RMSProp optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    This optimizer is usually a good choice for recurrent
    neural networks.

    # Arguments
        lr: float >= 0. Learning rate.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.
    '''
    def __init__(self, lr=0.001, rho=0.9, epsilon=1e-6, *args, **kwargs):
        super(RMSprop, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)
        self.rho = K.variable(rho)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        self.updates = []

        for p, g, a, c in zip(params, grads, accumulators, constraints):
            # update accumulator
            new_a = self.rho * a + (1 - self.rho) * K.square(g)
            self.updates.append((a, new_a))

            new_p = p - self.lr * g / K.sqrt(new_a + self.epsilon)
            self.updates.append((p, c(new_p)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "rho": float(K.get_value(self.rho)),
                "epsilon": self.epsilon}


class Adagrad(Optimizer):
    '''Adagrad optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate.
        epsilon: float >= 0.
    '''
    def __init__(self, lr=0.01, epsilon=1e-6, *args, **kwargs):
        super(Adagrad, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        self.updates = []

        for p, g, a, c in zip(params, grads, accumulators, constraints):
            new_a = a + K.square(g)  # update accumulator
            self.updates.append((a, new_a))
            new_p = p - self.lr * g / K.sqrt(new_a + self.epsilon)
            self.updates.append((p, c(new_p)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "epsilon": self.epsilon}


class Adadelta(Optimizer):
    '''Adadelta optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate. It is recommended to leave it at the default value.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
    '''
    def __init__(self, lr=1.0, rho=0.95, epsilon=1e-6, *args, **kwargs):
        super(Adadelta, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        delta_accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        self.updates = []

        for p, g, a, d_a, c in zip(params, grads, accumulators,
                                   delta_accumulators, constraints):
            # update accumulator
            new_a = self.rho * a + (1 - self.rho) * K.square(g)
            self.updates.append((a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)

            new_p = p - self.lr * update
            self.updates.append((p, c(new_p)))  # apply constraints

            # update delta_accumulator
            new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
            self.updates.append((d_a, new_d_a))
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "rho": self.rho,
                "epsilon": self.epsilon}


class Adam(Optimizer):
    '''Adam optimizer.

    Default parameters follow those provided in the original paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8,
                 *args, **kwargs):
        super(Adam, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [(self.iterations, self.iterations+1.)]

        t = self.iterations + 1
        lr_t = self.lr * K.sqrt(1 - K.pow(self.beta_2, t)) / (1 - K.pow(self.beta_1, t))

        for p, g, c in zip(params, grads, constraints):
            # zero init of moment
            m = K.variable(np.zeros(K.get_value(p).shape))
            # zero init of velocity
            v = K.variable(np.zeros(K.get_value(p).shape))

            m_t = (self.beta_1 * m) + (1 - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1 - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append((m, m_t))
            self.updates.append((v, v_t))
            self.updates.append((p, c(p_t)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "beta_1": float(K.get_value(self.beta_1)),
                "beta_2": float(K.get_value(self.beta_2)),
                "epsilon": self.epsilon}


class Adamax(Optimizer):
    '''Adamax optimizer from Adam paper's Section 7. It is a variant
     of Adam based on the infinity norm.

    Default parameters follow those provided in the paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-8,
                 *args, **kwargs):
        super(Adamax, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [(self.iterations, self.iterations+1.)]

        t = self.iterations + 1
        lr_t = self.lr / (1 - K.pow(self.beta_1, t))

        for p, g, c in zip(params, grads, constraints):
            # zero init of 1st moment
            m = K.variable(np.zeros(K.get_value(p).shape))
            # zero init of exponentially weighted infinity norm
            u = K.variable(np.zeros(K.get_value(p).shape))

            m_t = (self.beta_1 * m) + (1 - self.beta_1) * g
            u_t = K.maximum(self.beta_2 * u, K.abs(g))
            p_t = p - lr_t * m_t / (u_t + self.epsilon)

            self.updates.append((m, m_t))
            self.updates.append((u, u_t))
            self.updates.append((p, c(p_t)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "beta_1": float(K.get_value(self.beta_1)),
                "beta_2": float(K.get_value(self.beta_2)),
                "epsilon": self.epsilon}


# aliases
sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax


def get(identifier, kwargs=None):
    return get_from_module(identifier, globals(), 'optimizer',
                           instantiate=True, kwargs=kwargs)

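One caveat, grounded in the source above: the recomputation in the callback is specific to SGD. Other optimizers derive their effective step size differently; for example, Adam.get_updates uses the bias-corrected rate lr_t = lr * sqrt(1 - beta_2^t) / (1 - beta_1^t). If you wanted to log that instead, a small sketch of the same formula in plain Python (the step counts below are arbitrary, for illustration only):

import math

def adam_lr_t(lr, beta_1, beta_2, t):
    # bias-corrected step size from Adam.get_updates above, with t = iterations + 1
    return lr * math.sqrt(1. - beta_2 ** t) / (1. - beta_1 ** t)

for t in (1, 10, 100):
    print('t = %3d: lr_t = %.6f' % (t, adam_lr_t(0.001, 0.9, 0.999, t)))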