Reference link: https://blog.csdn.net/u013733326/article/details/79907419
Code:
# coding=utf-8
# This is a sample Python script.
# Press ⌃R to execute it or replace it with your code.
# Press Double ⇧ to search everywhere for classes, files, tool windows, actions, and settings.
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
import math
import sklearn
import sklearn.datasets
import opt_utils   # see the assignment data package, or copy it from the bottom of the original article
import testCase    # see the assignment data package, or copy it from the bottom of the original article
# %matplotlib inline  # uncomment this line if you are using a Jupyter Notebook

# Press the green button in the gutter to run the script.


def update_parameters_with_gd(parameters, grads, learning_rate):
    """One step of plain (batch) gradient descent."""
    L = len(parameters) // 2
    for l in range(L):
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * grads["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * grads["db" + str(l + 1)]
    return parameters


def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    """Shuffle (X, Y) and partition it into mini-batches of size mini_batch_size."""
    np.random.seed(seed)
    m = X.shape[1]
    mini_batches = []

    # Shuffle the columns of X and Y with the same permutation
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape(1, m)

    # Partition the shuffled data into complete mini-batches
    num_complete_minibatches = math.floor(m / mini_batch_size)
    for k in range(0, num_complete_minibatches):
        mini_batches_X = shuffled_X[:, k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batches_Y = shuffled_Y[:, k * mini_batch_size: (k + 1) * mini_batch_size]
        mini_batch = (mini_batches_X, mini_batches_Y)
        mini_batches.append(mini_batch)

    # Handle the last, possibly smaller, mini-batch
    if m % mini_batch_size != 0:
        mini_batches_X = shuffled_X[:, mini_batch_size * num_complete_minibatches:]
        mini_batches_Y = shuffled_Y[:, mini_batch_size * num_complete_minibatches:]
        mini_batch = (mini_batches_X, mini_batches_Y)
        mini_batches.append(mini_batch)

    return mini_batches


def init_velocity(parameters):
    """Initialize the momentum velocities to zero, with the same shapes as the parameters."""
    L = len(parameters) // 2
    v = {}
    for l in range(L):
        v["dW" + str(l + 1)] = np.zeros_like(parameters["W" + str(l + 1)])
        v["db" + str(l + 1)] = np.zeros_like(parameters["b" + str(l + 1)])
    return v


def update_parameters_with_momentum(parameters, grads, v, beta, learning_rate):
    """One step of gradient descent with momentum."""
    L = len(parameters) // 2
    for l in range(L):
        # Exponentially weighted average of the gradients
        v["dW" + str(l + 1)] = beta * v["dW" + str(l + 1)] + (1 - beta) * grads["dW" + str(l + 1)]
        v["db" + str(l + 1)] = beta * v["db" + str(l + 1)] + (1 - beta) * grads["db" + str(l + 1)]
        # Update parameters using the velocities
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * v["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * v["db" + str(l + 1)]
    return parameters, v


def init_adam(parameters):
    """Initialize the Adam first- and second-moment estimates to zero."""
    L = len(parameters) // 2
    v = {}
    s = {}
    for l in range(L):
        v["dW" + str(l + 1)] = np.zeros_like(parameters["W" + str(l + 1)])
        v["db" + str(l + 1)] = np.zeros_like(parameters["b" + str(l + 1)])
        s["dW" + str(l + 1)] = np.zeros_like(parameters["W" + str(l + 1)])
        s["db" + str(l + 1)] = np.zeros_like(parameters["b" + str(l + 1)])
    return v, s


def update_parameters_with_adam(parameters, grads, v, s, t,
                                learning_rate=0.01, beta1=0.9, beta2=0.999, eps=1e-8):
    """One step of the Adam update."""
    L = len(parameters) // 2
    v_corrected = {}  # bias-corrected first-moment estimates
    s_corrected = {}  # bias-corrected second-moment estimates
    for l in range(L):
        # Momentum part: exponentially weighted average of the gradients
        v["dW" + str(l + 1)] = beta1 * v["dW" + str(l + 1)] + (1 - beta1) * grads["dW" + str(l + 1)]
        v["db" + str(l + 1)] = beta1 * v["db" + str(l + 1)] + (1 - beta1) * grads["db" + str(l + 1)]
        # Bias correction
        v_corrected["dW" + str(l + 1)] = v["dW" + str(l + 1)] / (1 - np.power(beta1, t))
        v_corrected["db" + str(l + 1)] = v["db" + str(l + 1)] / (1 - np.power(beta1, t))
        # RMSprop part: exponentially weighted average of the squared gradients
        s["dW" + str(l + 1)] = beta2 * s["dW" + str(l + 1)] + (1 - beta2) * np.square(grads["dW" + str(l + 1)])
        s["db" + str(l + 1)] = beta2 * s["db" + str(l + 1)] + (1 - beta2) * np.square(grads["db" + str(l + 1)])
        # Bias correction
        s_corrected["dW" + str(l + 1)] = s["dW" + str(l + 1)] / (1 - np.power(beta2, t))
        s_corrected["db" + str(l + 1)] = s["db" + str(l + 1)] / (1 - np.power(beta2, t))
        # Update parameters
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * (
            v_corrected["dW" + str(l + 1)] / np.sqrt(s_corrected["dW" + str(l + 1)] + eps))
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * (
            v_corrected["db" + str(l + 1)] / np.sqrt(s_corrected["db" + str(l + 1)] + eps))
    return parameters, v, s


def model(X, Y, layers_dims, optimizer, learning_rate=0.0007, mini_batch_size=64,
          beta=0.9, beta1=0.9, beta2=0.999, epsilon=1e-8, num_epochs=10000,
          print_cost=True, is_plot=True):
    """
    A 3-layer neural network model that can be run with different optimizers.

    Arguments:
    X - input data, of shape (2, number of examples)
    Y - labels corresponding to X
    layers_dims - list containing the number of layers and the size of each layer
    optimizer - string selecting the optimization method: "gd" | "momentum" | "adam"
    learning_rate - the learning rate
    mini_batch_size - the size of each mini-batch
    beta - momentum hyperparameter
    beta1 - exponential decay hyperparameter for the first-moment (gradient) estimates
    beta2 - exponential decay hyperparameter for the second-moment (squared-gradient) estimates
    epsilon - hyperparameter preventing division by zero in the Adam update; usually left unchanged
    num_epochs - number of passes over the whole training set (called "epochs" in video 2.9 on
                 learning-rate decay, around 1:55), analogous to the earlier num_iterations
    print_cost - whether to print the cost; it is printed every 1000 epochs, and a cost value is
                 recorded every 100 epochs
    is_plot - whether to plot the learning curve

    Returns:
    parameters - the learned parameters
    """
    L = len(layers_dims)
    costs = []
    t = 0        # Adam counter, incremented after every mini-batch update
    seed = 10    # random seed

    # Initialize parameters
    parameters = opt_utils.initialize_parameters(layers_dims)

    # Initialize the chosen optimizer
    if optimizer == "gd":
        pass  # plain gradient descent needs no extra state
    elif optimizer == "momentum":
        v = init_velocity(parameters)
    elif optimizer == "adam":
        v, s = init_adam(parameters)
    else:
        print("Invalid optimizer argument, exiting.")
        exit(1)

    # Optimization loop
    for i in range(num_epochs):
        # Define the random mini-batches. Increment the seed after each pass over the data set
        # so that it is reshuffled differently every epoch.
        seed = seed + 1
        minibatches = random_mini_batches(X, Y, mini_batch_size, seed)

        for minibatch in minibatches:
            # Select a mini-batch
            (minibatch_X, minibatch_Y) = minibatch
            # Forward propagation
            A3, cache = opt_utils.forward_propagation(minibatch_X, parameters)
            # Compute cost
            cost = opt_utils.compute_cost(A3, minibatch_Y)
            # Backward propagation
            grads = opt_utils.backward_propagation(minibatch_X, minibatch_Y, cache)
            # Update parameters
            if optimizer == "gd":
                parameters = update_parameters_with_gd(parameters, grads, learning_rate)
            elif optimizer == "momentum":
                parameters, v = update_parameters_with_momentum(parameters, grads, v, beta, learning_rate)
            elif optimizer == "adam":
                t = t + 1
                parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t,
                                                               learning_rate, beta1, beta2, epsilon)

        # Record the cost
        if i % 100 == 0:
            costs.append(cost)
            # Print the cost
            if print_cost and i % 1000 == 0:
                print("Cost after epoch " + str(i) + ": " + str(cost))

    # Plot the learning curve
    if is_plot:
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('epochs (per 100)')
        plt.title("Learning rate = " + str(learning_rate))
        plt.show()

    return parameters


if __name__ == '__main__':
    # Train the 3-layer model with the Adam optimizer
    train_X, train_Y = opt_utils.load_dataset(is_plot=True)
    layers_dims = [train_X.shape[0], 5, 2, 1]
    parameters = model(train_X, train_Y, layers_dims, optimizer="adam", is_plot=True)

    predictions = opt_utils.predict(train_X, train_Y, parameters)

    # Plot the decision boundary
    plt.title("Model with Adam optimization")
    axes = plt.gca()
    axes.set_xlim([-1.5, 2.5])
    axes.set_ylim([-1, 1.5])
    opt_utils.plot_decision_boundary(lambda x: opt_utils.predict_dec(parameters, x.T), train_X, train_Y)
    plt.show()

    # plt.rcParams['figure.figsize'] = (7.0, 4.0)  # set default size of plots
    # plt.rcParams['image.interpolation'] = 'nearest'
    # plt.rcParams['image.cmap'] = 'gray'
    # plt.show()

# See PyCharm help at https://www.jetbrains.com/help/pycharm/
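Usage note: the __main__ block above only trains with Adam. A minimal sketch for comparing the three optimizers that model() accepts ("gd", "momentum" and "adam") on the same dataset is shown below; it assumes the code above has been saved as a module (the name optimization_homework is hypothetical) and that opt_utils.py from the assignment package is importable. In the assignment's opt_utils, predict() prints the training accuracy, so the loop gives a quick side-by-side comparison.

# Comparison sketch (assumes the functions above are importable; the module
# name "optimization_homework" is hypothetical).
import opt_utils
from optimization_homework import model

train_X, train_Y = opt_utils.load_dataset(is_plot=True)
layers_dims = [train_X.shape[0], 5, 2, 1]

for opt in ["gd", "momentum", "adam"]:
    print("Training with optimizer: " + opt)
    parameters = model(train_X, train_Y, layers_dims, optimizer=opt, is_plot=True)
    opt_utils.predict(train_X, train_Y, parameters)  # prints training accuracy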