Assignment download: [link]

Problem statement: recognize the digits in images; train the model and solve for the parameters θ.

One problem came up: although the trained model achieves good prediction accuracy, the minimize call never reports success, no matter how large the iteration limit is set, as shown below:

[Figure: output of the minimize call, showing success = False]
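Before changing the training call, it helps to read the individual fields of the OptimizeResult instead of printing the whole object. The sketch below is purely diagnostic and assumes the same data and the nnCostFunction/nnGradient pair from the listing that follows; one common reason for success = False with TNC is that the solver stops on its function-evaluation budget, which it can hit even after the cost has already dropped far enough for good predictions.

res = op.minimize(fun=nnCostFunction,
                  x0=nn_params,
                  args=(X, Y, lamb, sizeList),
                  method='TNC',
                  jac=nnGradient,
                  options={'maxiter': 100})
print(res.success)  # False in the failing runs described above
print(res.status)   # solver-specific code for why it stopped
print(res.message)  # human-readable stop reason
print(res.fun)      # final cost; a low value means training still worked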

 

import numpy as np
import scipy.io as scio
import matplotlib.pyplot as plt
import scipy.optimize as op

# Array shapes used throughout:
# X: 5000*400
# Y: 5000*10
# a1: 5000*401 (last 400 columns are X)
# z2: 5000*25
# a2: 5000*26 (last 25 columns are sigmoid(z2))
# z3: 5000*10
# a3: 5000*10
# Theta1: 25*401
# Theta2: 10*26
# delta3: 5000*10
# delta2: 5000*25
# bigDelta1: 25*401
# bigDelta2: 10*26
# Theta1_grad: 25*401
# Theta2_grad: 10*26


# Display the image data
def displayData(X):
    m = np.size(X, 0)  # number of examples (rows of X)
    n = np.size(X, 1)  # size of a single example (columns of X)
    example_width = int(np.round(np.sqrt(n)))  # width of a single image
    example_height = int(np.floor(n / example_width))  # height of a single image
    display_rows = int(np.floor(np.sqrt(m)))  # rows of images in the grid
    display_cols = int(np.ceil(m / display_rows))  # columns of images in the grid
    pad = 1  # padding between images
    display_array = - np.ones((pad + display_rows * (example_height + pad),
                               pad + display_cols * (example_width + pad)))  # initialize the display matrix
    curr_ex = 0  # index of the current image
    # Copy each small image into the display array
    for j in range(0, display_rows):
        for i in range(0, display_cols):
            if curr_ex >= m:
                break
            max_val = np.max(abs(X[curr_ex, :]))  # normalize each image by its largest absolute value
            jstart = pad + j * (example_height + pad)
            istart = pad + i * (example_width + pad)
            display_array[jstart: (jstart + example_height), istart: (istart + example_width)] = \
                np.array(X[curr_ex, :]).reshape(example_height, example_width) / max_val
            curr_ex = curr_ex + 1
        if curr_ex >= m:
            break
    display_array = display_array.T  # the .mat data is stored column-major, so transpose for display
    plt.imshow(display_array, cmap=plt.cm.gray)
    plt.axis('off')
    plt.show()


# Sigmoid function g(z)
def sigmoid(z):
    g = 1.0 / (1.0 + np.exp(-z))
    return g


# Randomly initialize Θ, keeping every entry within [-ε, ε]
def randInitializeWeights(sizeList):
    epsilon_init = 0.12
    theta1_lx = sizeList['theta1_lx']
    theta1_ly = sizeList['theta1_ly']
    theta2_lx = sizeList['theta2_lx']
    theta2_ly = sizeList['theta2_ly']
    theta_size = theta1_lx * theta1_ly + theta2_lx * theta2_ly
    W = np.random.uniform(-epsilon_init, epsilon_init, theta_size)
    return W


# Reshape the flat parameter vector back into the two weight matrices
def changeForm(theta_vector, theta1_lx, theta1_ly, theta2_lx, theta2_ly):
    theta1 = np.array(theta_vector[0: theta1_lx * theta1_ly]).reshape(theta1_lx, theta1_ly)
    theta2 = np.array(theta_vector[theta1_lx * theta1_ly: theta1_lx * theta1_ly + theta2_lx * theta2_ly])\
        .reshape(theta2_lx, theta2_ly)
    theta = {'Theta1': theta1, 'Theta2': theta2}
    return theta


# Forward propagation: compute the activations a
def computeA(nn_params, X):
    theta1 = nn_params['Theta1']
    theta2 = nn_params['Theta2']
    m = np.size(X, 0)

    # Layer 2
    one = np.ones(m)
    a1 = np.insert(X, 0, values=one, axis=1)  # prepend the bias column
    a2 = sigmoid(np.dot(a1, theta1.T))
    # Layer 3
    one = np.ones(np.size(a2, 0))
    a2 = np.insert(a2, 0, values=one, axis=1)  # prepend the bias column
    a3 = sigmoid(np.dot(a2, theta2.T))
    a_res = {'a1': a1, 'a2': a2, 'a3': a3}
    return a_res


# Sigmoid gradient g'(z) = g(z)(1 - g(z))
def sigmoidGradient(z):
    g = np.multiply(sigmoid(z), 1 - sigmoid(z))
    return g


# Compute the cost J
def nnCostFunction(nn_params, X, Y, lamb, sizeList):
    theta = changeForm(nn_params,
                       sizeList['theta1_lx'], sizeList['theta1_ly'],
                       sizeList['theta2_lx'], sizeList['theta2_ly'])
    theta1 = theta['Theta1']
    theta2 = theta['Theta2']
    m = np.size(X, 0)
    a_res = computeA(theta, X)
    a3 = a_res['a3']
    # Cross-entropy cost
    J = 1 / m * np.sum(-np.multiply(Y, np.log(a3)) - np.multiply((1 - Y), np.log(1 - a3)))
    # Regularization (bias columns excluded)
    theta1_copy = theta1[:, 1:]
    theta2_copy = theta2[:, 1:]
    J = J + lamb / (2 * m) * (np.sum(theta1_copy ** 2) + np.sum(theta2_copy ** 2))
    print(J)  # print the cost at every evaluation to monitor progress
    return J


# Backpropagation: compute the gradient D
def nnGradient(nn_params, X, Y, lamb, sizeList):
    theta = changeForm(nn_params,
                       sizeList['theta1_lx'], sizeList['theta1_ly'],
                       sizeList['theta2_lx'], sizeList['theta2_ly'])
    theta1 = theta['Theta1']
    theta2 = theta['Theta2']
    m = np.size(X, 0)
    a_res = computeA(theta, X)
    a1 = a_res['a1']
    a2 = a_res['a2']
    a3 = a_res['a3']
    theta1_copy = theta1[:, 1:]
    theta2_copy = theta2[:, 1:]
    # Compute δ (note that np.dot(a1, theta1.T) is z2)
    delta3 = a3 - Y
    delta2 = np.multiply(np.dot(delta3, theta2_copy), sigmoidGradient(np.dot(a1, theta1.T)))
    # Accumulate Δ
    bigDelta1 = np.dot(delta2.T, a1)
    bigDelta2 = np.dot(delta3.T, a2)
    # Compute D, regularizing every column except the bias column
    theta1_grad = bigDelta1 / m + lamb / m * theta1
    theta2_grad = bigDelta2 / m + lamb / m * theta2
    theta1_grad[:, 0] = bigDelta1[:, 0] / m
    theta2_grad[:, 0] = bigDelta2[:, 0] / m
    # Advanced optimizers expect a single unrolled vector, so flatten the
    # gradient matrices before returning them
    grad = np.r_[theta1_grad.flatten(), theta2_grad.flatten()]
    # print(np.size(grad))
    return grad


# Deterministic weight initialization for gradient checking
def debugInitializeWeights(L_out, L_in):
    W = np.arange(1, L_out * (L_in + 1) + 1)
    W = np.sin(W)
    W = np.array(W).reshape(L_out, (L_in + 1)) / 10
    return W


# Compute the gradient numerically (two-sided finite differences)
def computeNumericalGradient(theta, X, Y, lamb, sizeList):
    numgrad = np.zeros(np.size(theta))
    perturb = np.zeros(np.size(theta))
    e = 1e-4
    for p in range(0, np.size(theta)):
        perturb[p] = e
        theta_minus = theta - perturb
        theta_plus = theta + perturb
        loss1 = nnCostFunction(theta_minus, X, Y, lamb, sizeList)
        loss2 = nnCostFunction(theta_plus, X, Y, lamb, sizeList)
        numgrad[p] = (loss2 - loss1) / (2 * e)
        perturb[p] = 0
    return numgrad


# Gradient checking
def checkNNGradients(lamb):
    # Set up a small test network
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    sizeList = {'theta1_lx': hidden_layer_size,
                'theta1_ly': input_layer_size + 1,
                'theta2_lx': num_labels,
                'theta2_ly': hidden_layer_size + 1}  # record the shapes of θ
    theta1 = debugInitializeWeights(hidden_layer_size, input_layer_size)
    theta2 = debugInitializeWeights(num_labels, hidden_layer_size)
    theta = np.r_[theta1.flatten(), theta2.flatten()]
    X = debugInitializeWeights(m, input_layer_size - 1)
    y = np.random.randint(0, num_labels, (m, 1))
    # Rewrite y as an m*num_labels one-hot matrix
    Y = np.zeros((m, num_labels))
    for i in range(0, m):
        Y[i, y[i, 0]] = 1
    grad = nnGradient(theta, X, Y, lamb, sizeList)
    numGrad = computeNumericalGradient(theta, X, Y, lamb, sizeList)
    diff = np.linalg.norm(numGrad - grad) / np.linalg.norm(numGrad + grad)
    print('check NN Gradient: diff = ', diff)


# Predict labels with the trained model
def predict(theta1, theta2, X):
    m = np.size(X, 0)
    # Layer 2
    one = np.ones(m)
    X = np.insert(X, 0, values=one, axis=1)
    a2 = sigmoid(np.dot(X, theta1.T))
    # Layer 3
    one = np.ones(np.size(a2, 0))
    a2 = np.insert(a2, 0, values=one, axis=1)
    a3 = sigmoid(np.dot(a2, theta2.T))
    p = a3.argmax(axis=1) + 1  # labels run from 1 to 10, so shift the 0-9 argmax index by 1
    return p.flatten()


# ------------------------------ Main ------------------------------
# Network dimensions
input_layer_size = 400
hidden_layer_size = 25
num_labels = 10
sizeList = {'theta1_lx': hidden_layer_size,
            'theta1_ly': input_layer_size + 1,
            'theta2_lx': num_labels,
            'theta2_ly': hidden_layer_size + 1}  # record the shapes of θ
lamb = 1

# Load the data file
data = scio.loadmat('ex4data1.mat')
X = data['X']
m = np.size(X, 0)
y = data['y']
# Rewrite y as a 5000*10 one-hot matrix; positions 0-9 stand for the digits 1, 2, ..., 9, 0
Y = np.zeros((m, num_labels))
for i in range(0, m):
    Y[i, y[i, 0] - 1] = 1
rand_indices = np.random.randint(0, m, 100)  # pick 100 random examples to display
sel = X[rand_indices, :]
displayData(sel)

# Pre-trained θ for testing
theta = scio.loadmat('ex4weights.mat')
theta1 = theta['Theta1']
theta2 = theta['Theta2']
nn_theta = np.r_[theta1.flatten(), theta2.flatten()]

# Test nnCostFunction
# J = nnCostFunction(nn_theta, X, Y, 3, sizeList)
# print(J)

# Test nnGradient
print(nnGradient(nn_theta, X, Y, lamb, sizeList))

# Randomly initialize the parameters
nn_params = randInitializeWeights(sizeList)

# Gradient checking
# checkNNGradients(lamb)

# Train the model
res = op.minimize(fun=nnCostFunction,
                  x0=nn_params,
                  args=(X, Y, lamb, sizeList),
                  method='TNC',
                  jac=nnGradient,
                  options={'maxiter': 100})
print(res)

# Compute the training-set accuracy
all_theta = changeForm(res.x, sizeList['theta1_lx'], sizeList['theta1_ly'],
                       sizeList['theta2_lx'], sizeList['theta2_ly'])
res_theta1 = all_theta['Theta1']
res_theta2 = all_theta['Theta2']
pred = predict(res_theta1, res_theta2, X)
acc = np.mean(pred == y.flatten()) * 100
print('Training Set Accuracy:', acc, '%')

# Visualize the hidden layer
displayData(res_theta1[:, 1:])
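Because nnCostFunction and nnGradient follow scipy's standard fun/jac interface, one possible workaround for the failing TNC run (a sketch, not part of the original assignment) is to hand the same pair to a different solver and compare the reported status:

res_cg = op.minimize(fun=nnCostFunction,
                     x0=nn_params,
                     args=(X, Y, lamb, sizeList),
                     method='CG',  # nonlinear conjugate gradient; 'L-BFGS-B' is another drop-in choice
                     jac=nnGradient,
                     options={'maxiter': 100})
print(res_cg.success, res_cg.fun)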

Hidden layer visualization:

[Figure: visualization of the hidden layer weights]