前边得到的anchor只区分了背景和圈中物体,并没有判别物体属于哪一类
目前看该代码,anchor后边并没有接softmax来判断是不是一个物体,而是用sigmoid激活的1×1卷积(rpn_out_class)直接输出每个anchor框住物体的概率;哪些anchor是物体、哪些是背景,前边的代码已经确定了
def rpn(base_layers, num_anchors):
    """Attach the region proposal network (RPN) head to a shared feature map.

    A 3x3 convolution with 512 filters is applied to ``base_layers``; two
    parallel 1x1 convolutions then produce the per-anchor outputs.

    Args:
        base_layers: Output tensor of the shared convolutional backbone.
        num_anchors: Number of anchors evaluated at each feature-map location.

    Returns:
        A list ``[x_class, x_regr, base_layers]`` where ``x_class`` has
        ``num_anchors`` channels (sigmoid activation) and ``x_regr`` has
        ``num_anchors * 4`` channels (linear activation).
    """
    # Shared 3x3 convolution feeding both output branches.
    shared_conv = Convolution2D(
        512,
        (3, 3),
        padding='same',
        activation='relu',
        kernel_initializer='normal',
        name='rpn_conv1',
    )
    features = shared_conv(base_layers)

    # One sigmoid score per anchor.
    class_conv = Convolution2D(
        num_anchors,
        (1, 1),
        activation='sigmoid',
        kernel_initializer='uniform',
        name='rpn_out_class',
    )
    anchor_scores = class_conv(features)

    # Four regression values per anchor.
    regr_conv = Convolution2D(
        num_anchors * 4,
        (1, 1),
        activation='linear',
        kernel_initializer='zero',
        name='rpn_out_regress',
    )
    anchor_deltas = regr_conv(features)

    return [anchor_scores, anchor_deltas, base_layers]
很简单:先在特征图上做一次3×3卷积,再用两个1×1卷积分别输出与特征图空间大小相同、通道数为num_anchors的x_class,以及通道数为num_anchors*4(每个anchor对应中心点坐标和宽高共四个回归值)的x_regr
# Compile the RPN with one loss per output, in output order:
# classification loss first, box-regression loss second.
model_rpn.compile(
    optimizer=Adam(lr=1e-4),
    loss=[
        losses.rpn_loss_cls(num_anchors),
        losses.rpn_loss_regr(num_anchors),
    ],
)
def rpn_loss_regr(num_anchors):
    """Create the RPN box-regression loss for ``num_anchors`` anchors.

    The returned function computes a smooth-L1 style loss: ``0.5 * x**2``
    where ``|x| <= 1`` and ``|x| - 0.5`` elsewhere, with ``x`` being the
    difference between target and prediction.

    ``y_true`` is sliced along the channel axis into two halves: the first
    ``4 * num_anchors`` channels weight each term (and normalise the sum),
    the remaining channels hold the regression targets.

    Args:
        num_anchors: Number of anchors per feature-map location.

    Returns:
        A ``loss(y_true, y_pred)`` callable suitable for ``model.compile``.
    """
    def rpn_loss_regr_fixed_num(y_true, y_pred):
        if K.image_dim_ordering() == 'th':
            # Channels-first: channel axis is 1.
            x = y_true[:, 4 * num_anchors:, :, :] - y_pred
            x_abs = K.abs(x)
            # Cast the comparison result to a float tensor so that
            # `1 - x_bool` and the multiplications below are valid on
            # backends where comparisons return boolean tensors (the 'tf'
            # branch already does this).
            x_bool = K.cast(K.less_equal(x_abs, 1.0), K.floatx())
            return lambda_rpn_regr * K.sum(
                y_true[:, :4 * num_anchors, :, :]
                * (x_bool * (0.5 * x * x) + (1 - x_bool) * (x_abs - 0.5))
            ) / K.sum(epsilon + y_true[:, :4 * num_anchors, :, :])
        else:
            # Channels-last: channel axis is 3.
            x = y_true[:, :, :, 4 * num_anchors:] - y_pred
            x_abs = K.abs(x)
            x_bool = K.cast(K.less_equal(x_abs, 1.0), tf.float32)
            return lambda_rpn_regr * K.sum(
                y_true[:, :, :, :4 * num_anchors]
                * (x_bool * (0.5 * x * x) + (1 - x_bool) * (x_abs - 0.5))
            ) / K.sum(epsilon + y_true[:, :, :, :4 * num_anchors])

    return rpn_loss_regr_fixed_num


def rpn_loss_cls(num_anchors):
    """Create the RPN classification loss for ``num_anchors`` anchors.

    The returned function computes a weighted binary cross-entropy.
    ``y_true`` is sliced along the channel axis: the first ``num_anchors``
    channels weight each anchor's term (and normalise the sum), the
    remaining channels hold the binary labels compared against ``y_pred``.

    Args:
        num_anchors: Number of anchors per feature-map location.

    Returns:
        A ``loss(y_true, y_pred)`` callable suitable for ``model.compile``.
    """
    def rpn_loss_cls_fixed_num(y_true, y_pred):
        if K.image_dim_ordering() == 'tf':
            # Channels-last: channel axis is 3.
            return lambda_rpn_class * K.sum(
                y_true[:, :, :, :num_anchors]
                * K.binary_crossentropy(y_pred[:, :, :, :], y_true[:, :, :, num_anchors:])
            ) / K.sum(epsilon + y_true[:, :, :, :num_anchors])
        else:
            # Channels-first: channel axis is 1.
            return lambda_rpn_class * K.sum(
                y_true[:, :num_anchors, :, :]
                * K.binary_crossentropy(y_pred[:, :, :, :], y_true[:, num_anchors:, :, :])
            ) / K.sum(epsilon + y_true[:, :num_anchors, :, :])

    return rpn_loss_cls_fixed_num
求两个loss:rpn_loss_cls是分类损失(带权重的二元交叉熵),rpn_loss_regr是边框回归损失(smooth L1:|x|<=1时取0.5x²,否则取|x|-0.5)
由于在实际过程中,Ncls和Nreg差距过大,用参数λ平衡二者(如Ncls=256,Nreg=2400时设置λ=10),使总的网络Loss计算过程中能够均匀考虑2种Loss
# Fetch the next training sample from the data generator.
X, Y, img_data = next(data_gen_train)

# Run one optimisation step of the RPN on this batch.
loss_rpn = model_rpn.train_on_batch(X, Y)

# Predict with the freshly updated RPN and turn its two outputs
# (P_rpn[0] and P_rpn[1]) into region proposals.
P_rpn = model_rpn.predict_on_batch(X)
R = roi_helpers.rpn_to_roi(
    P_rpn[0],
    P_rpn[1],
    C,
    K.image_dim_ordering(),
    use_regr=True,
    overlap_thresh=0.7,
    max_boxes=300,
)
这里我理解是先用一个batch训练了一步(train_on_batch),然后用训练后的参数做预测,将预测得到的每个anchor框住物体的概率和框的回归量(中心点坐标、宽高的修正值)拿到,交给rpn_to_roi生成候选框,再进入roi层
def rpn_to_roi(rpn_layer, regr_layer, C, dim_ordering, use_regr=True, max_boxes=300, overlap_thresh=0.9):
    """Convert RPN outputs for one image into region proposals.

    For every anchor size/ratio pair an anchor box is placed at each
    feature-map cell, optionally refined with the predicted regression
    values, clipped to the feature map, and finally filtered with
    non-maximum suppression.

    Args:
        rpn_layer: 4-D array of per-anchor scores with batch size 1.
        regr_layer: 4-D array of regression values, 4 channels per anchor.
        C: Config object; ``std_scaling``, ``anchor_box_scales``,
            ``anchor_box_ratios`` and ``rpn_stride`` are read.
        dim_ordering: ``'th'`` (channels first) or ``'tf'`` (channels last).
        use_regr: Whether to apply the regression values to the anchors.
        max_boxes: Passed through to ``non_max_suppression_fast``.
        overlap_thresh: Passed through to ``non_max_suppression_fast``.

    Returns:
        The first element of the ``non_max_suppression_fast`` result
        (presumably the kept boxes as (x1, y1, x2, y2) in feature-map
        coordinates -- confirm against that helper).

    Raises:
        ValueError: If ``dim_ordering`` is neither ``'th'`` nor ``'tf'``.
    """
    # Fail early with a clear message; previously an unsupported value only
    # surfaced later as an unrelated unbound-name error.
    if dim_ordering not in ('th', 'tf'):
        raise ValueError("dim_ordering must be 'th' or 'tf', got {!r}".format(dim_ordering))

    regr_layer = regr_layer / C.std_scaling

    anchor_sizes = C.anchor_box_scales
    anchor_ratios = C.anchor_box_ratios

    assert rpn_layer.shape[0] == 1

    # A holds, per anchor, four planes of shape (rows, cols).
    if dim_ordering == 'th':
        (rows, cols) = rpn_layer.shape[2:]
        A = np.zeros((4, rpn_layer.shape[2], rpn_layer.shape[3], rpn_layer.shape[1]))
    else:
        (rows, cols) = rpn_layer.shape[1:3]
        A = np.zeros((4, rpn_layer.shape[1], rpn_layer.shape[2], rpn_layer.shape[3]))

    # The cell-coordinate grid does not depend on the anchor, so build it once.
    X, Y = np.meshgrid(np.arange(cols), np.arange(rows))

    curr_layer = 0
    for anchor_size in anchor_sizes:
        for anchor_ratio in anchor_ratios:
            # Anchor width/height expressed in feature-map cells.
            anchor_x = (anchor_size * anchor_ratio[0]) / C.rpn_stride
            anchor_y = (anchor_size * anchor_ratio[1]) / C.rpn_stride

            # Bring this anchor's 4 regression channels to shape (4, rows, cols).
            if dim_ordering == 'th':
                regr = regr_layer[0, 4 * curr_layer:4 * curr_layer + 4, :, :]
            else:
                regr = regr_layer[0, :, :, 4 * curr_layer:4 * curr_layer + 4]
                regr = np.transpose(regr, (2, 0, 1))

            # Anchor as (x, y, w, h) with (x, y) the top-left corner.
            A[0, :, :, curr_layer] = X - anchor_x / 2
            A[1, :, :, curr_layer] = Y - anchor_y / 2
            A[2, :, :, curr_layer] = anchor_x
            A[3, :, :, curr_layer] = anchor_y

            if use_regr:
                A[:, :, :, curr_layer] = apply_regr_np(A[:, :, :, curr_layer], regr)

            # Enforce a minimum width/height of 1 cell.
            A[2, :, :, curr_layer] = np.maximum(1, A[2, :, :, curr_layer])
            A[3, :, :, curr_layer] = np.maximum(1, A[3, :, :, curr_layer])
            # Convert (x, y, w, h) to (x1, y1, x2, y2).
            A[2, :, :, curr_layer] += A[0, :, :, curr_layer]
            A[3, :, :, curr_layer] += A[1, :, :, curr_layer]

            # Clip boxes to the feature map.
            A[0, :, :, curr_layer] = np.maximum(0, A[0, :, :, curr_layer])
            A[1, :, :, curr_layer] = np.maximum(0, A[1, :, :, curr_layer])
            A[2, :, :, curr_layer] = np.minimum(cols - 1, A[2, :, :, curr_layer])
            A[3, :, :, curr_layer] = np.minimum(rows - 1, A[3, :, :, curr_layer])

            curr_layer += 1

    # Flatten boxes in (anchor, row, col) order -> shape (N, 4).
    all_boxes = np.reshape(A.transpose((0, 3, 1, 2)), (4, -1)).transpose((1, 0))

    # Flatten scores in the same (anchor, row, col) order. Channels-last input
    # needs the anchor axis moved forward; channels-first input is already
    # (1, anchor, row, col) and must NOT be transposed, otherwise scores no
    # longer line up with all_boxes.
    if dim_ordering == 'tf':
        all_probs = rpn_layer.transpose((0, 3, 1, 2)).reshape((-1))
    else:
        all_probs = rpn_layer.reshape((-1))

    x1 = all_boxes[:, 0]
    y1 = all_boxes[:, 1]
    x2 = all_boxes[:, 2]
    y2 = all_boxes[:, 3]

    # Drop degenerate boxes whose width or height is not positive.
    idxs = np.where((x1 - x2 >= 0) | (y1 - y2 >= 0))

    all_boxes = np.delete(all_boxes, idxs, 0)
    all_probs = np.delete(all_probs, idxs, 0)

    result = non_max_suppression_fast(all_boxes, all_probs, overlap_thresh=overlap_thresh, max_boxes=max_boxes)[0]

    return result
前面一大段没有仔细看,大致是对每种anchor尺寸和比例在特征图的每个位置生成框,应用回归修正后裁剪到特征图范围内;最后把所有框及其对应的框住物体的概率中满足x1-x2>=0或y1-y2>=0的去掉,因为这样的框(宽或高不为正)明显不符合我们的逻辑
下边进行非最大值抑制,就是将所有框按照框住物体的概率大小排列,挑出来概率最大的依次和剩下来的做比较,将重叠IOU(就是交集比并集)超过设置的阈值的框去除(overlap_thresh在函数定义里默认为0.9,上面调用时传入的是0.7)
# note: calc_iou converts from (x1,y1,x2,y2) to (x,y,w,h) format
X2, Y1, Y2 = roi_helpers.calc_iou(R, img_data, C, class_mapping)
def calc_iou(R, img_data, C, class_mapping):
    """Build classifier training targets for a set of region proposals.

    Each proposal is matched to the ground-truth box with the highest IoU
    and, depending on that IoU, is dropped, labelled as background, or
    labelled with the matched class together with box-regression targets.

    Args:
        R: 2-D array with one proposal per row as (x1, y1, x2, y2).
        img_data: Dict with keys 'bboxes', 'width' and 'height'; every entry
            of 'bboxes' is a dict with 'x1', 'x2', 'y1', 'y2' and 'class'.
        C: Config object; ``im_size``, ``rpn_stride``,
            ``classifier_min_overlap``, ``classifier_max_overlap`` and
            ``classifier_regr_std`` are read.
        class_mapping: Maps a class name (including 'bg') to its index.

    Returns:
        ``(None, None, None)`` if no proposal is kept. Otherwise a tuple
        ``(X, Y1, Y2)``, each with a leading axis of size 1:
        ``X`` holds kept ROIs as [x1, y1, w, h], ``Y1`` the one-hot class
        labels, and ``Y2`` the regression label mask concatenated with the
        scaled regression targets along axis 1.

    Raises:
        RuntimeError: If a kept proposal's IoU matches neither overlap range.
    """
    bboxes = img_data['bboxes']
    width = img_data['width']
    height = img_data['height']

    # get image dimensions for resizing
    resized_width, resized_height, _ = data_generators.get_new_img_size(width, height, C.im_size)
    scale_x = resized_width / float(width)
    scale_y = resized_height / float(height)

    # As in the earlier steps, ground-truth boxes are converted to the
    # resized image and divided by the RPN stride. Note the column order of
    # gta is (x1, x2, y1, y2).
    gta = np.zeros((len(bboxes), 4))
    for row, gt_box in enumerate(bboxes):
        gta[row, 0] = int(round(gt_box['x1'] * scale_x / C.rpn_stride))
        gta[row, 1] = int(round(gt_box['x2'] * scale_x / C.rpn_stride))
        gta[row, 2] = int(round(gt_box['y1'] * scale_y / C.rpn_stride))
        gta[row, 3] = int(round(gt_box['y2'] * scale_y / C.rpn_stride))

    x_roi = []
    y_class_num = []
    y_class_regr_coords = []
    y_class_regr_label = []

    # Number of regression slots: four per class, with one class excluded
    # (presumably 'bg' -- this relies on 'bg' having the last index).
    num_regr_slots = 4 * (len(class_mapping) - 1)

    for proposal in R:
        x1, y1, x2, y2 = (int(round(value)) for value in proposal)

        # Compare the proposal (already filtered by non-maximum suppression)
        # with every ground-truth box and remember the best IoU.
        best_iou = 0.0
        best_bbox = -1
        for gt_index, (gx1, gx2, gy1, gy2) in enumerate(gta):
            curr_iou = data_generators.iou([gx1, gy1, gx2, gy2], [x1, y1, x2, y2])
            if curr_iou > best_iou:
                best_iou = curr_iou
                best_bbox = gt_index

        # Proposals below the minimum overlap are ignored entirely.
        if best_iou < C.classifier_min_overlap:
            continue

        w = x2 - x1
        h = y2 - y1
        x_roi.append([x1, y1, w, h])

        if C.classifier_min_overlap <= best_iou < C.classifier_max_overlap:
            # hard negative example
            cls_name = 'bg'
        elif C.classifier_max_overlap <= best_iou:
            # At or above classifier_max_overlap (the accompanying article
            # says the config default is 0.5 -- not visible here): take the
            # matched class and compute how far the proposal must be shifted
            # and scaled to reach the ground-truth box.
            cls_name = bboxes[best_bbox]['class']
            gx1, gx2, gy1, gy2 = gta[best_bbox]
            cxg = (gx1 + gx2) / 2.0
            cyg = (gy1 + gy2) / 2.0

            cx = x1 + w / 2.0
            cy = y1 + h / 2.0

            tx = (cxg - cx) / float(w)
            ty = (cyg - cy) / float(h)
            tw = np.log((gx2 - gx1) / float(w))
            th = np.log((gy2 - gy1) / float(h))
        else:
            print('roi = {}'.format(best_iou))
            raise RuntimeError

        # One-hot class label.
        class_num = class_mapping[cls_name]
        class_label = [0] * len(class_mapping)
        class_label[class_num] = 1
        y_class_num.append(class_label)

        # Regression targets and their mask; only the matched class's four
        # slots are filled, and only for non-background proposals.
        coords = [0] * num_regr_slots
        labels = [0] * num_regr_slots
        if cls_name != 'bg':
            label_pos = 4 * class_num
            sx, sy, sw, sh = C.classifier_regr_std
            coords[label_pos:4 + label_pos] = [sx * tx, sy * ty, sw * tw, sh * th]
            labels[label_pos:4 + label_pos] = [1, 1, 1, 1]
        y_class_regr_coords.append(coords)
        y_class_regr_label.append(labels)

    if not x_roi:
        return None, None, None

    X = np.array(x_roi)
    Y1 = np.array(y_class_num)
    Y2 = np.concatenate([np.array(y_class_regr_label), np.array(y_class_regr_coords)], axis=1)

    return np.expand_dims(X, axis=0), np.expand_dims(Y1, axis=0), np.expand_dims(Y2, axis=0)
先将对应的class_label置为1(class_mapping这里还没有完全明白,从代码看它是类别名到类别序号的映射),最后把数据叠在一起,返回标记和移动缩放坐标(这里乘了取自C.classifier_regr_std的sx,sy,sw,sh,我也不是多理解,看起来是对回归目标做缩放)
参考文章链接:
https://zhuanlan.zhihu.com/p/28585873
https://zhuanlan.zhihu.com/p/24916624
本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:七扭八歪解faster rcnn(keras版)(三) - Python技术站