前边得到的anchor只区分了背景和圈中物体,并没有判别物体属于哪一类


七扭八歪解faster rcnn(keras版)(三)

目前看该代码,没有找到anchor后边接的softmax来判断是不是一个物体,前边的代码已经确定了

def rpn(base_layers, num_anchors):
    """Build the region proposal network head on top of a shared feature map.

    A 3x3 convolution with 512 filters is applied to ``base_layers``. Two 1x1
    convolutions on its output then produce:

    * ``x_class`` -- ``num_anchors`` channels with a sigmoid activation,
    * ``x_regr``  -- ``num_anchors * 4`` channels with a linear activation.

    Args:
        base_layers: output tensor of the shared convolutional base.
        num_anchors: number of anchors evaluated at each feature-map location.

    Returns:
        The list ``[x_class, x_regr, base_layers]``.
    """
    shared_conv = Convolution2D(
        512,
        (3, 3),
        padding='same',
        activation='relu',
        kernel_initializer='normal',
        name='rpn_conv1',
    )
    shared = shared_conv(base_layers)

    score_conv = Convolution2D(
        num_anchors,
        (1, 1),
        activation='sigmoid',
        kernel_initializer='uniform',
        name='rpn_out_class',
    )
    x_class = score_conv(shared)

    regr_conv = Convolution2D(
        num_anchors * 4,
        (1, 1),
        activation='linear',
        kernel_initializer='zero',
        name='rpn_out_regress',
    )
    x_regr = regr_conv(shared)

    return [x_class, x_regr, base_layers]

很简单,特征图先经过一个3x3卷积,再分别用两个1x1卷积得到空间大小与特征图相同的两个输出:num_anchors个通道的x_class(每个anchor框住物体的概率),以及num_anchors*4个通道的x_regr(每个anchor对应中心点坐标和宽高四个回归值)

# Compile the RPN model with Adam (lr=1e-4) and a list of two losses:
# rpn_loss_cls first, rpn_loss_regr second, both built for num_anchors.
model_rpn.compile(optimizer=Adam(lr=1e-4), loss=[losses.rpn_loss_cls(num_anchors), losses.rpn_loss_regr(num_anchors)])

def rpn_loss_regr(num_anchors):
    """Create the RPN box-regression loss for ``num_anchors`` anchors.

    The returned closure expects ``y_true`` to hold two stacked parts along
    the channel axis: the first ``4 * num_anchors`` channels are used as a
    per-element weight, the remaining channels are the regression targets.
    The loss is a weighted smooth-L1 (``0.5 * x^2`` where ``|x| <= 1``,
    ``|x| - 0.5`` elsewhere), normalised by the sum of the weights plus
    ``epsilon`` and scaled by ``lambda_rpn_regr``.
    """
    split = 4 * num_anchors

    def rpn_loss_regr_fixed_num(y_true, y_pred):
        channels_first = K.image_dim_ordering() == 'th'
        if channels_first:
            weights = y_true[:, :split, :, :]
            diff = y_true[:, split:, :, :] - y_pred
        else:
            weights = y_true[:, :, :, :split]
            diff = y_true[:, :, :, split:] - y_pred

        abs_diff = K.abs(diff)
        is_small = K.less_equal(abs_diff, 1.0)
        if not channels_first:
            # Only the non-'th' path casts the comparison result to float.
            is_small = K.cast(is_small, tf.float32)

        smooth_l1 = is_small * (0.5 * diff * diff) + (1 - is_small) * (abs_diff - 0.5)
        return lambda_rpn_regr * K.sum(weights * smooth_l1) / K.sum(epsilon + weights)

    return rpn_loss_regr_fixed_num


def rpn_loss_cls(num_anchors):
    """Create the RPN objectness loss for ``num_anchors`` anchors.

    The returned closure expects ``y_true`` to hold two stacked parts along
    the channel axis: the first ``num_anchors`` channels are used as a
    per-element weight, the remaining channels are the binary targets. The
    weighted binary cross-entropy is normalised by the sum of the weights
    plus ``epsilon`` and scaled by ``lambda_rpn_class``.
    """

    def rpn_loss_cls_fixed_num(y_true, y_pred):
        if K.image_dim_ordering() == 'tf':
            weights = y_true[:, :, :, :num_anchors]
            targets = y_true[:, :, :, num_anchors:]
        else:
            weights = y_true[:, :num_anchors, :, :]
            targets = y_true[:, num_anchors:, :, :]

        # NOTE(review): the prediction is passed as the first positional
        # argument; the expected (target, output) order of the backend
        # function depends on the installed Keras version -- confirm.
        cross_entropy = K.binary_crossentropy(y_pred[:, :, :, :], targets)
        return lambda_rpn_class * K.sum(weights * cross_entropy) / K.sum(epsilon + weights)

    return rpn_loss_cls_fixed_num

求两个loss

由于在实际过程中,Ncls和Nreg差距过大,用参数λ平衡二者(如Ncls=256,Nreg=2400时设置λ=10),使总的网络Loss计算过程中能够均匀考虑2种Loss

七扭八歪解faster rcnn(keras版)(三)

# Pull the next sample from the training generator and run one RPN
# training step on it.
X, Y, img_data = next(data_gen_train)
loss_rpn = model_rpn.train_on_batch(X, Y)

# Predict with the RPN on the same input, then convert its first two outputs
# into proposal boxes (at most 300, overlap threshold 0.7).
P_rpn = model_rpn.predict_on_batch(X)
R = roi_helpers.rpn_to_roi(P_rpn[0], P_rpn[1], C, K.image_dim_ordering(), use_regr=True, overlap_thresh=0.7, max_boxes=300)

这里我理解是先训练了一下,然后用训练后的参数做预测,将预测得到的框住物体的概率和框的中心点坐标宽高拿到,进入roi层

def rpn_to_roi(rpn_layer, regr_layer, C, dim_ordering, use_regr=True, max_boxes=300,overlap_thresh=0.9):
    """Turn the RPN score and regression maps into proposal boxes.

    For every anchor (each scale in ``C.anchor_box_scales`` combined with
    each ratio in ``C.anchor_box_ratios``) and every feature-map cell, a box
    is built in feature-map units (anchor sizes are divided by
    ``C.rpn_stride``), optionally refined with the regression output,
    clipped to the feature map, and finally filtered by
    ``non_max_suppression_fast``.

    Args:
        rpn_layer: 4-D score array with a batch dimension of 1, laid out as
            (1, rows, cols, anchors) for ``'tf'`` and
            (1, anchors, rows, cols) for ``'th'``.
        regr_layer: regression array in the same ordering, four channels per
            anchor. It is divided by ``C.std_scaling`` before use.
        C: configuration object; ``std_scaling``, ``anchor_box_scales``,
            ``anchor_box_ratios`` and ``rpn_stride`` are read from it.
        dim_ordering: ``'th'`` or ``'tf'``.
        use_regr: when true, each anchor box is passed through
            ``apply_regr_np`` together with its regression values.
        max_boxes: forwarded to ``non_max_suppression_fast``.
        overlap_thresh: forwarded to ``non_max_suppression_fast``.

    Returns:
        The first element of the value returned by
        ``non_max_suppression_fast``.

    Raises:
        ValueError: if ``dim_ordering`` is neither ``'th'`` nor ``'tf'``.
    """
    regr_layer = regr_layer / C.std_scaling

    anchor_sizes = C.anchor_box_scales
    anchor_ratios = C.anchor_box_ratios

    assert rpn_layer.shape[0] == 1

    if dim_ordering == 'th':
        num_anchors, rows, cols = rpn_layer.shape[1:]
    elif dim_ordering == 'tf':
        rows, cols, num_anchors = rpn_layer.shape[1:]
    else:
        # Previously this fell through and failed later on unbound rows/cols.
        raise ValueError(
            "dim_ordering must be 'th' or 'tf', got {!r}".format(dim_ordering))

    # A[k, row, col, anchor]: k indexes the four box values of each anchor box.
    A = np.zeros((4, rows, cols, num_anchors))

    # Cell coordinates do not depend on the anchor, so build the grid once.
    grid_x, grid_y = np.meshgrid(np.arange(cols), np.arange(rows))

    curr_layer = 0
    for anchor_size in anchor_sizes:
        for anchor_ratio in anchor_ratios:
            # Anchor width/height expressed in feature-map cells.
            anchor_x = (anchor_size * anchor_ratio[0]) / C.rpn_stride
            anchor_y = (anchor_size * anchor_ratio[1]) / C.rpn_stride

            # Bring this anchor's regression values to shape (4, rows, cols).
            if dim_ordering == 'th':
                regr = regr_layer[0, 4 * curr_layer:4 * curr_layer + 4, :, :]
            else:
                regr = regr_layer[0, :, :, 4 * curr_layer:4 * curr_layer + 4]
                regr = np.transpose(regr, (2, 0, 1))

            # View into A; writes below land directly in A.
            box = A[:, :, :, curr_layer]

            # Start as (x, y, w, h) with (x, y) the top-left corner of an
            # anchor centred on the cell.
            box[0] = grid_x - anchor_x / 2
            box[1] = grid_y - anchor_y / 2
            box[2] = anchor_x
            box[3] = anchor_y

            if use_regr:
                box[:] = apply_regr_np(box, regr)

            # Enforce a minimum size of one cell, then convert (w, h) into
            # the bottom-right corner (x2, y2).
            box[2] = np.maximum(1, box[2])
            box[3] = np.maximum(1, box[3])
            box[2] += box[0]
            box[3] += box[1]

            # Clip the corners to the feature map.
            box[0] = np.maximum(0, box[0])
            box[1] = np.maximum(0, box[1])
            box[2] = np.minimum(cols - 1, box[2])
            box[3] = np.minimum(rows - 1, box[3])

            curr_layer += 1

    # Flatten in (anchor, row, col) order: one (x1, y1, x2, y2) row per box.
    all_boxes = np.reshape(A.transpose((0, 3, 1, 2)), (4, -1)).transpose((1, 0))

    # The scores must be flattened in the same (anchor, row, col) order.
    # A 'th' array is already (1, anchors, rows, cols); transposing it the
    # way the 'tf' layout requires would misalign scores and boxes.
    if dim_ordering == 'th':
        all_probs = rpn_layer.reshape((-1))
    else:
        all_probs = rpn_layer.transpose((0, 3, 1, 2)).reshape((-1))

    x1 = all_boxes[:, 0]
    y1 = all_boxes[:, 1]
    x2 = all_boxes[:, 2]
    y2 = all_boxes[:, 3]

    # Drop boxes whose width or height is not strictly positive.
    degenerate = (x1 - x2 >= 0) | (y1 - y2 >= 0)
    all_boxes = all_boxes[~degenerate]
    all_probs = all_probs[~degenerate]

    result = non_max_suppression_fast(all_boxes, all_probs, overlap_thresh=overlap_thresh, max_boxes=max_boxes)[0]

    return result

前面一大段没有仔细看,大致是根据anchor和回归值算出所有候选框以及各自框住物体的概率,然后去掉其中满足x1-x2>=0或y1-y2>=0的框(也就是宽或高不为正的框),因为这样的框明显不合理

下边进行非极大值抑制(NMS),就是将所有框按照框住物体的概率大小排列,挑出概率最大的框依次和剩下的框做比较,当重叠IOU(就是交集比并集)超过设置的阈值时,把对应的框去除(函数默认overlap_thresh为0.9,上面调用时传入的是0.7)

# note: calc_iou converts from (x1,y1,x2,y2) to (x,y,w,h) format
X2, Y1, Y2 = roi_helpers.calc_iou(R, img_data, C, class_mapping)
def calc_iou(R, img_data, C, class_mapping):
    """Match proposal boxes against ground-truth boxes to build classifier targets.

    This first part reads the ground-truth boxes from ``img_data`` and
    rescales them into ``gta``: one row per box, columns ordered
    (x1, x2, y1, y2), expressed at the resized-image scale divided by
    ``C.rpn_stride``.
    """
    bboxes = img_data['bboxes']
    width = img_data['width']
    height = img_data['height']

    # get image dimensions for resizing
    resized_width, resized_height, _ = data_generators.get_new_img_size(width, height, C.im_size)

    def _rescale(coord, resized, original):
        # Original-image coordinate -> resized image -> divided by the RPN
        # stride, rounded to the nearest integer.
        return int(round(coord * (resized / float(original)) / C.rpn_stride))

    gta = np.zeros((len(bboxes), 4))
    for bbox_num, bbox in enumerate(bboxes):
        # Column order is x1, x2, y1, y2 (not x1, y1, x2, y2).
        gta[bbox_num, 0] = _rescale(bbox['x1'], resized_width, width)
        gta[bbox_num, 1] = _rescale(bbox['x2'], resized_width, width)
        gta[bbox_num, 2] = _rescale(bbox['y1'], resized_height, height)
        gta[bbox_num, 3] = _rescale(bbox['y2'], resized_height, height)

跟之前一样,先转换到resized后的尺寸,再除以C.rpn_stride换算到特征图尺度;gta数组每一行依次保存转换后bounding box的x1、x2、y1、y2(注意顺序不是x1、y1、x2、y2)

# Accumulators, one entry per RoI that is kept:
#   x_roi               -> [x1, y1, w, h]
#   y_class_num         -> class target
#   y_class_regr_coords -> regression target values
#   y_class_regr_label  -> regression target mask
x_roi = []
y_class_num = []
y_class_regr_coords = []
y_class_regr_label = []

for ix in range(R.shape[0]):
    # Round the proposal corners to integers.
    (x1, y1, x2, y2) = R[ix, :]
    x1 = int(round(x1))
    y1 = int(round(y1))
    x2 = int(round(x2))
    y2 = int(round(y2))

    # Find the ground-truth box with the highest IoU against this RoI.
    best_iou = 0.0
    best_bbox = -1
    for bbox_num in range(len(bboxes)):
        # gta columns are (x1, x2, y1, y2); reorder to (x1, y1, x2, y2) for iou().
        curr_iou = data_generators.iou([gta[bbox_num, 0], gta[bbox_num, 2], gta[bbox_num, 1], gta[bbox_num, 3]], [x1, y1, x2, y2])
        if curr_iou > best_iou:
            best_iou = curr_iou
            best_bbox = bbox_num

    # RoIs below the minimum overlap are skipped entirely.
    if best_iou < C.classifier_min_overlap:
            continue
    else:
        # Store the RoI as (x, y, w, h).
        w = x2 - x1
        h = y2 - y1
        x_roi.append([x1, y1, w, h])

        if C.classifier_min_overlap <= best_iou < C.classifier_max_overlap:
            # hard negative example
            cls_name = 'bg'
        elif C.classifier_max_overlap <= best_iou:
            # Positive example: take the class of the best-matching GT box.
            cls_name = bboxes[best_bbox]['class']
            # Centre of the matched ground-truth box.
            cxg = (gta[best_bbox, 0] + gta[best_bbox, 1]) / 2.0
            cyg = (gta[best_bbox, 2] + gta[best_bbox, 3]) / 2.0

            # Centre of the RoI.
            cx = x1 + w / 2.0
            cy = y1 + h / 2.0

            # Regression targets: centre offsets normalised by the RoI size,
            # and log ratios of ground-truth size to RoI size.
            tx = (cxg - cx) / float(w)
            ty = (cyg - cy) / float(h)
            tw = np.log((gta[best_bbox, 1] - gta[best_bbox, 0]) / float(w))
            th = np.log((gta[best_bbox, 3] - gta[best_bbox, 2]) / float(h))
        else:
            # Defensive branch: not expected to be reached given the checks above.
            print('roi = {}'.format(best_iou))
            raise RuntimeError
拿到经过非极大值抑制的bounding box四个值,分别和所有的ground truth框计算交并比(IoU),取其中最大的作为该框的best_iou;如果best_iou小于C.classifier_min_overlap

则忽略该框,如果该框在classifier_min和max_overlap之间,那么该框的cls_name为bg背景,当大于classifier_max_overlap(该值默认config为0.5)时,拿到该框对应的class类型,然后算出来预测值需要移动和缩放的值

    # One-hot class target for this RoI.
    cls_idx = class_mapping[cls_name]
    one_hot = [0] * len(class_mapping)
    one_hot[cls_idx] = 1
    y_class_num.append(copy.deepcopy(one_hot))

    # Regression values and their mask: four slots for each of
    # len(class_mapping) - 1 classes, all zero unless the RoI is not 'bg'.
    slot_count = 4 * (len(class_mapping) - 1)
    coords = [0] * slot_count
    labels = [0] * slot_count
    if cls_name != 'bg':
        start = 4 * cls_idx
        sx, sy, sw, sh = C.classifier_regr_std
        # Each target is multiplied by its entry of C.classifier_regr_std.
        coords[start:start + 4] = [sx*tx, sy*ty, sw*tw, sh*th]
        labels[start:start + 4] = [1, 1, 1, 1]

    # Every kept RoI appends its (possibly all-zero) targets and mask.
    y_class_regr_coords.append(copy.deepcopy(coords))
    y_class_regr_label.append(copy.deepcopy(labels))

if len(x_roi) == 0:
    return None, None, None

X = np.array(x_roi)
Y1 = np.array(y_class_num)
Y2 = np.concatenate([np.array(y_class_regr_label),np.array(y_class_regr_coords)],axis=1)

return np.expand_dims(X, axis=0), np.expand_dims(Y1, axis=0), np.expand_dims(Y2, axis=0)

先将对应的class_label置为1(class_mapping这里还没有完全明白,从代码看它把类别名映射为类别下标),最后把数据叠在一起,返回标记和移动缩放坐标(这里tx、ty、tw、th分别乘了C.classifier_regr_std里的sx、sy、sw、sh,也就是对回归目标做了一次缩放,具体原因我也不是很理解)

参考文章链接:

https://zhuanlan.zhihu.com/p/28585873

https://zhuanlan.zhihu.com/p/24916624