I. Problem:
Keras cannot display the learning rate in real time at each epoch, which would make tuning much easier. In fact, I also wanted this in order to debug the problem described in Deep Learning 31: a summary of why different versions of Keras give different results for the same code.
II. Solution
1. Add the following code to Keras's callbacks.py:
class DisplayLearningRate(Callback):
    '''Display learning rate.'''
    def __init__(self):
        super(DisplayLearningRate, self).__init__()

    def on_epoch_begin(self, epoch, logs={}):
        assert hasattr(self.model.optimizer, 'lr'), \
            'Optimizer must have a "lr" attribute.'
        lr_now = K.get_value(self.model.optimizer.lr)

        print('Epoch %05d: Learning rate is %s' % (epoch, lr_now))
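Patching callbacks.py is not strictly necessary: a Callback subclass can live in your own training script and be passed to model.fit directly. A minimal self-contained sketch of that variant (assuming the usual keras import paths):

from keras import backend as K
from keras.callbacks import Callback

class DisplayLearningRate(Callback):
    '''Print the optimizer's base learning rate at the start of each epoch.'''
    def on_epoch_begin(self, epoch, logs={}):
        assert hasattr(self.model.optimizer, 'lr'), \
            'Optimizer must have a "lr" attribute.'
        print('Epoch %05d: learning rate is %s'
              % (epoch, K.get_value(self.model.optimizer.lr)))

With this version you pass DisplayLearningRate() in the callbacks list instead of keras.callbacks.DisplayLearningRate().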
2. Use it as follows:
history = model.fit(X_train,
                    Y_train,
                    batch_size=batch_size,
                    nb_epoch=nb_epoch,
                    show_accuracy=False,
                    verbose=2,
                    validation_data=(X_test, Y_test),
                    callbacks=[
                        keras.callbacks.DisplayLearningRate(),
                        # This callback saves the model to filepath after each epoch
                        # (only when val_loss improves, since save_best_only=True).
                        keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True, mode='auto'),
                        # This callback stops training when the monitored value stops improving:
                        # once early stopping is triggered (e.g. the loss did not decrease compared
                        # with the previous epoch), training stops after `patience` more epochs.
                        # keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
                    ])
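Note that this call uses the keras 0.3.x API (show_accuracy). In keras 1.x, which the final example below targets, fit no longer accepts show_accuracy and accuracy is requested through compile's metrics argument instead. A minimal sketch of an equivalent call under that assumption, using the script-level DisplayLearningRate from the sketch above:

model.compile(loss='categorical_crossentropy', optimizer=sgd,
              metrics=['accuracy'])   # accuracy is requested here, not via show_accuracy
history = model.fit(X_train, Y_train,
                    batch_size=batch_size,
                    nb_epoch=nb_epoch,
                    verbose=2,
                    validation_data=(X_test, Y_test),
                    callbacks=[DisplayLearningRate(),
                               keras.callbacks.ModelCheckpoint(
                                   filepath, monitor='val_loss',
                                   save_best_only=True, mode='auto')])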
III. Summary
After trying the method above, I found that the learning rate printed for every epoch was exactly the same. It turns out that what gets printed is the learning rate set at initialization: when the learning rate is updated each epoch, the new value is never assigned back to that initial learning-rate variable, which is why the printout never changes. So how can we display the actual learning rate of each epoch? I think we need to display what the optimizer computes in its updates.
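The reason is visible in the SGD code pasted at the end of this post: optimizer.lr always holds the base learning rate, and the time-based decay is applied on the fly inside get_updates without ever being written back. A tiny illustrative sketch of that formula (the helper name effective_sgd_lr is mine, not part of Keras):

def effective_sgd_lr(base_lr, decay, iterations):
    # Mirrors the expression in SGD.get_updates:
    #     lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations))
    return base_lr * (1.0 / (1.0 + decay * iterations))

# With lr=0.1 and decay=1e-1, after 100 parameter updates the effective rate
# is already ~0.009, while optimizer.lr still reads 0.1.
print(effective_sgd_lr(0.1, 1e-1, 0))    # 0.1
print(effective_sgd_lr(0.1, 1e-1, 100))  # ~0.00909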
IV. Final approach
from keras import backend as K
from keras.callbacks import Callback
from keras.optimizers import SGD

# Set the decay to 1e-1 so the lr change between epochs is easy to see.
sgd = SGD(lr=0.1, decay=1e-1, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy'])

class LossHistory(Callback):
    def on_epoch_begin(self, epoch, logs={}):
        # Recompute the decayed learning rate exactly as SGD.get_updates does:
        # lr * (1. / (1. + decay * iterations)), reading the optimizer's variables.
        optimizer = self.model.optimizer
        lr = K.get_value(optimizer.lr) * \
            (1. / (1. + K.get_value(optimizer.decay) * K.get_value(optimizer.iterations)))
        print('lr:', lr)

history = LossHistory()
model.fit(X_train, Y_train,
          batch_size=batch_size,
          nb_epoch=nb_epoch,
          callbacks=[history])
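The same pattern works for other optimizers, as long as the formula matches what their get_updates actually computes. For example, Adam in keras 0.3.3 (see the listing below) uses lr_t = lr * sqrt(1 - beta_2^t) / (1 - beta_1^t) with t = iterations + 1. A sketch of a tracker built on that assumption (the class name is mine, and keras 1.2.0's Adam additionally applies a decay factor that this sketch ignores):

import numpy as np
from keras import backend as K
from keras.callbacks import Callback

class AdamLearningRateTracker(Callback):
    '''Print Adam's bias-corrected step size lr_t at the start of each epoch
    (assumes the keras 0.3.3 Adam, which applies no extra time-based decay).'''
    def on_epoch_begin(self, epoch, logs={}):
        opt = self.model.optimizer
        t = K.get_value(opt.iterations) + 1
        lr_t = (K.get_value(opt.lr)
                * np.sqrt(1. - K.get_value(opt.beta_2) ** t)
                / (1. - K.get_value(opt.beta_1) ** t))
        print('Epoch %05d: Adam lr_t = %s' % (epoch, lr_t))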
Reference: http://stackoverflow.com/questions/40144805/print-learning-rate-evary-epoch-in-sgd
Below I paste optimizer.py as it appears in keras==0.3.3 and in keras==1.2.0.
optimizer.py in keras==0.3.3:
from __future__ import absolute_import
from . import backend as K
import numpy as np
from .utils.generic_utils import get_from_module
from six.moves import zip


def clip_norm(g, c, n):
    if c > 0:
        g = K.switch(n >= c, g * c / n, g)
    return g


def kl_divergence(p, p_hat):
    return p_hat - p + p * K.log(p / p_hat)


class Optimizer(object):
    '''Abstract optimizer base class.

    Note: this is the parent class of all optimizers, not an actual optimizer
    that can be used for training models.

    All Keras optimizers support the following keyword arguments:

        clipnorm: float >= 0. Gradients will be clipped
            when their L2 norm exceeds this value.
        clipvalue: float >= 0. Gradients will be clipped
            when their absolute value exceeds this value.
    '''
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)
        self.updates = []

    def get_state(self):
        return [K.get_value(u[0]) for u in self.updates]

    def set_state(self, value_list):
        assert len(self.updates) == len(value_list)
        for u, v in zip(self.updates, value_list):
            K.set_value(u[0], v)

    def get_updates(self, params, constraints, loss):
        raise NotImplementedError

    def get_gradients(self, loss, params):
        grads = K.gradients(loss, params)
        if hasattr(self, 'clipnorm') and self.clipnorm > 0:
            norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads]))
            grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
        if hasattr(self, 'clipvalue') and self.clipvalue > 0:
            grads = [K.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
        return grads

    def get_config(self):
        return {"name": self.__class__.__name__}


class SGD(Optimizer):
    '''Stochastic gradient descent, with support for momentum,
    decay, and Nesterov momentum.

    # Arguments
        lr: float >= 0. Learning rate.
        momentum: float >= 0. Parameter updates momentum.
        decay: float >= 0. Learning rate decay over each update.
        nesterov: boolean. Whether to apply Nesterov momentum.
    '''
    def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False,
                 *args, **kwargs):
        super(SGD, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0.)
        self.lr = K.variable(lr)
        self.momentum = K.variable(momentum)
        self.decay = K.variable(decay)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        lr = self.lr * (1.0 / (1.0 + self.decay * self.iterations))
        self.updates = [(self.iterations, self.iterations + 1.)]

        for p, g, c in zip(params, grads, constraints):
            m = K.variable(np.zeros(K.get_value(p).shape))  # momentum
            v = self.momentum * m - lr * g  # velocity
            self.updates.append((m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            self.updates.append((p, c(new_p)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "momentum": float(K.get_value(self.momentum)),
                "decay": float(K.get_value(self.decay)),
                "nesterov": self.nesterov}


class RMSprop(Optimizer):
    '''RMSProp optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    This optimizer is usually a good choice for recurrent
    neural networks.

    # Arguments
        lr: float >= 0. Learning rate.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.
    '''
    def __init__(self, lr=0.001, rho=0.9, epsilon=1e-6, *args, **kwargs):
        super(RMSprop, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)
        self.rho = K.variable(rho)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        self.updates = []

        for p, g, a, c in zip(params, grads, accumulators, constraints):
            # update accumulator
            new_a = self.rho * a + (1 - self.rho) * K.square(g)
            self.updates.append((a, new_a))

            new_p = p - self.lr * g / K.sqrt(new_a + self.epsilon)
            self.updates.append((p, c(new_p)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "rho": float(K.get_value(self.rho)),
                "epsilon": self.epsilon}


class Adagrad(Optimizer):
    '''Adagrad optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate.
        epsilon: float >= 0.
    '''
    def __init__(self, lr=0.01, epsilon=1e-6, *args, **kwargs):
        super(Adagrad, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        self.updates = []

        for p, g, a, c in zip(params, grads, accumulators, constraints):
            new_a = a + K.square(g)  # update accumulator
            self.updates.append((a, new_a))
            new_p = p - self.lr * g / K.sqrt(new_a + self.epsilon)
            self.updates.append((p, c(new_p)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "epsilon": self.epsilon}


class Adadelta(Optimizer):
    '''Adadelta optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    # Arguments
        lr: float >= 0. Learning rate. It is recommended to leave it at the default value.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
    '''
    def __init__(self, lr=1.0, rho=0.95, epsilon=1e-6, *args, **kwargs):
        super(Adadelta, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        delta_accumulators = [K.variable(np.zeros(K.get_value(p).shape)) for p in params]
        self.updates = []

        for p, g, a, d_a, c in zip(params, grads, accumulators,
                                   delta_accumulators, constraints):
            # update accumulator
            new_a = self.rho * a + (1 - self.rho) * K.square(g)
            self.updates.append((a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)

            new_p = p - self.lr * update
            self.updates.append((p, c(new_p)))  # apply constraints

            # update delta_accumulator
            new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
            self.updates.append((d_a, new_d_a))
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "rho": self.rho,
                "epsilon": self.epsilon}


class Adam(Optimizer):
    '''Adam optimizer.

    Default parameters follow those provided in the original paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8,
                 *args, **kwargs):
        super(Adam, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [(self.iterations, self.iterations+1.)]

        t = self.iterations + 1
        lr_t = self.lr * K.sqrt(1 - K.pow(self.beta_2, t)) / (1 - K.pow(self.beta_1, t))

        for p, g, c in zip(params, grads, constraints):
            # zero init of moment
            m = K.variable(np.zeros(K.get_value(p).shape))
            # zero init of velocity
            v = K.variable(np.zeros(K.get_value(p).shape))

            m_t = (self.beta_1 * m) + (1 - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1 - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append((m, m_t))
            self.updates.append((v, v_t))
            self.updates.append((p, c(p_t)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "beta_1": float(K.get_value(self.beta_1)),
                "beta_2": float(K.get_value(self.beta_2)),
                "epsilon": self.epsilon}


class Adamax(Optimizer):
    '''Adamax optimizer from Adam paper's Section 7. It is a variant
    of Adam based on the infinity norm.

    Default parameters follow those provided in the paper.

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.

    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-8,
                 *args, **kwargs):
        super(Adamax, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [(self.iterations, self.iterations+1.)]

        t = self.iterations + 1
        lr_t = self.lr / (1 - K.pow(self.beta_1, t))

        for p, g, c in zip(params, grads, constraints):
            # zero init of 1st moment
            m = K.variable(np.zeros(K.get_value(p).shape))
            # zero init of exponentially weighted infinity norm
            u = K.variable(np.zeros(K.get_value(p).shape))

            m_t = (self.beta_1 * m) + (1 - self.beta_1) * g
            u_t = K.maximum(self.beta_2 * u, K.abs(g))
            p_t = p - lr_t * m_t / (u_t + self.epsilon)

            self.updates.append((m, m_t))
            self.updates.append((u, u_t))
            self.updates.append((p, c(p_t)))  # apply constraints
        return self.updates

    def get_config(self):
        return {"name": self.__class__.__name__,
                "lr": float(K.get_value(self.lr)),
                "beta_1": float(K.get_value(self.beta_1)),
                "beta_2": float(K.get_value(self.beta_2)),
                "epsilon": self.epsilon}


# aliases
sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax


def get(identifier, kwargs=None):
    return get_from_module(identifier, globals(), 'optimizer',
                           instantiate=True, kwargs=kwargs)