目标检测算法-YOLO-V3训练代码详解

在YoLo-V3中使用Darknet53这个网络结构。下图为Darknet-53的网络结构，加入了残差块的结构。

目标检测算法-YOLO-V3训练代码详解

Yolo-V3中的改进：

（1）多尺度计算：Yolo-V3有3个不同特征尺度的输出（使用的是COCO数据集），分别是13×13×255、26×26×255、52×52×255（255=3×(80+5)，即每个尺度3个anchor，每个anchor预测4个坐标、1个置信度和80个类别）。这里借鉴了FPN的思想，不仅在每个特征图上分别独立做预测，同时将小特征图上采样到与大特征图相同的大小，然后与大特征图拼接做进一步预测。

（2）Yolo-V3代价函数修改：Yolo-V3对类别预测的代价函数进行了修改，没有使用softmax函数。因为原来的分类网络中使用softmax层都是假设一张图片或一个object只属于一个类别，但是在一些复杂的场景下，一个object可能属于多个类，这时使用softmax就可能漏掉一些类别。所以在Yolo-V3中使用逻辑回归层对每个类别做二分类：当一张图像经过特征提取后，某一类的输出如果大于0.5，那么就属于这个类。这样一个框就可以预测多个类别。

在Yolo-V3中的维度聚类:

Yolo-V3中使用了k-means聚类计算anchor，聚类的目的是让anchor和邻近的ground truth有更大的IOU，这和anchor的尺寸没有直接的关系。

（1）聚类使用的原始数据是只含标注框的检测数据集。Yolo-V3会生成一个包含标注框位置和类别的.txt文件，其中每行都包含(xi,yi,wi,hi)，即ground truth相对于原图的坐标。

（2）首先给定k个聚类中心点(wi,hi)，这里wi，hi是anchor的宽和高，由于anchor位置不固定，所以没有(x,y)坐标，只有宽和高。

（3）计算每个标注框和每个聚类中心的距离，d=1-IOU(标注框，聚类中心)，这里在计算时将每个标注框的中心点都与聚类中心重合，然后计算IOU，将标注框分配给"距离"最近的聚类中心

（4）所有标注框分配完毕后，对每个簇重新计算聚类中心：wi' = (1/Ni)∑wi，hi' = (1/Ni)∑hi，其中Ni是第i个簇的标注框个数，其实就是求该簇中所有标注框宽和高的平均值。然后重复第（3）、（4）步，直到聚类中心变化很小。

网络结构（返回3个尺度的输出）

from keras.layers import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import Conv2D,ZeroPadding2D,Add,UpSampling2D,Concatenate
from keras.regularizers import l2
def conv(x, *args, **kwargs):
    """Apply a ``Conv2D`` layer with this network's default settings.

    Defaults: L2 kernel regularisation (5e-4), no bias, and ``"valid"``
    padding when ``strides == (2, 2)`` is passed, ``"same"`` otherwise.
    Any keyword supplied by the caller overrides the matching default.

    Args:
        x: input tensor.
        *args: positional arguments forwarded to ``Conv2D``.
        **kwargs: keyword arguments forwarded to ``Conv2D``.

    Returns:
        The output tensor of the convolution.
    """
    # Only the exact tuple (2, 2) is treated as a down-sampling convolution.
    is_downsampling = kwargs.get("strides") == (2, 2)
    options = dict(
        kernel_regularizer=l2(5e-4),
        use_bias=False,
        padding="valid" if is_downsampling else "same",
    )
    options.update(kwargs)
    return Conv2D(*args, **options)(x)


def CBL(x, *args, **kwargs):
    """Convolution, then BatchNormalization, then LeakyReLU (alpha=0.1).

    All positional and keyword arguments are forwarded to ``conv``.
    """
    convolved = conv(x, *args, **kwargs)
    normalised = BatchNormalization()(convolved)
    return LeakyReLU(alpha=0.1)(normalised)

def PCBL(x, num_filters):
    """Down-sample ``x`` by 2 with a zero-padded 3x3 stride-2 ``CBL``.

    One row is padded on top and one column on the left before the
    stride-2 convolution (which ``conv`` runs with "valid" padding).
    """
    padded = ZeroPadding2D(((1, 0), (1, 0)))(x)
    return CBL(padded, num_filters, (3, 3), strides=(2, 2))

def CBLR(x, num_filters):
    """Residual unit: 1x1 ``CBL`` then 3x3 ``CBL``, added back onto ``x``.

    The 1x1 convolution uses ``num_filters`` filters and the 3x3 uses
    ``num_filters * 2``; the sum requires ``x`` to have that many channels.
    """
    branch = CBL(x, num_filters, (1, 1))
    branch = CBL(branch, num_filters * 2, (3, 3))
    return Add()([x, branch])

def CBL5(x, num_filters):
    """Stack five ``CBL`` layers alternating 1x1 and 3x3 kernels.

    The 1x1 layers use ``num_filters`` filters, the 3x3 layers
    ``num_filters * 2``; the output therefore has ``num_filters`` channels.
    """
    layer_specs = (
        (num_filters, (1, 1)),
        (num_filters * 2, (3, 3)),
        (num_filters, (1, 1)),
        (num_filters * 2, (3, 3)),
        (num_filters, (1, 1)),
    )
    for filters, kernel in layer_specs:
        x = CBL(x, filters, kernel)
    return x

def CBLC(x, num_filters, out_filters):
    """Detection head: a 3x3 ``CBL`` followed by a plain 1x1 convolution.

    The final 1x1 convolution has ``out_filters`` filters and is not
    followed by batch normalisation or an activation.
    """
    features = CBL(x, num_filters * 2, (3, 3))
    return conv(features, out_filters, (1, 1))


def CBLU(x, num_filters):
    """Apply a 1x1 ``CBL`` and then up-sample the result by a factor of 2."""
    reduced = CBL(x, num_filters, (1, 1))
    return UpSampling2D(2)(reduced)


def body(inputs, num_anchors, num_classes):
    """Build the backbone and the three detection heads of the network.

    The backbone is one stride-1 ``CBL`` followed by five down-sampling
    stages (``PCBL`` plus 1, 2, 8, 8 and 4 residual ``CBLR`` units). The
    outputs of the last three stages (strides 8, 16 and 32) feed the heads;
    coarser features are up-sampled and concatenated with the finer ones.

    Args:
        inputs: input tensor fed to the first convolution.
        num_anchors: number of anchors predicted per grid cell at each scale.
        num_classes: number of object classes.

    Returns:
        ``[y3, y2, y1]``: the head outputs at strides 8, 16 and 32, i.e.
        finest grid first. Each has ``num_anchors * (num_classes + 5)``
        channels.
        NOTE(review): ``generator`` in this article yields its targets
        coarsest grid first -- confirm the model wiring matches.
    """
    out_filters = num_anchors * (num_classes + 5)
    residual_counts = [1, 2, 8, 8, 4]

    skips = []
    x = CBL(inputs, 32, (3, 3))
    for stage, repeats in enumerate(residual_counts):
        x = PCBL(x, 2 ** (6 + stage))
        for _ in range(repeats):
            x = CBLR(x, 2 ** (5 + stage))
        # Keep the stride-8, stride-16 and stride-32 feature maps.
        if stage >= 2:
            skips.append(x)

    # Stride-32 head. The head must consume the CBL5 output (x1), exactly
    # like the two finer heads below consume x2 and x3.
    x1 = CBL5(skips[2], 512)
    y1 = CBLC(x1, 512, out_filters)

    # Stride-16 head: up-sample and merge with the stride-16 backbone map.
    x = CBLU(x1, 256)
    x = Concatenate()([x, skips[1]])
    x2 = CBL5(x, 256)
    y2 = CBLC(x2, 256, out_filters)

    # Stride-8 head: up-sample and merge with the stride-8 backbone map.
    x = CBLU(x2, 128)
    x = Concatenate()([x, skips[0]])
    x3 = CBL5(x, 128)
    y3 = CBLC(x3, 128, out_filters)

    return [y3, y2, y1]

从数据集中的xml文件中获取x,y,w,h，label的信息.

import numpy as np
from xml.etree.ElementTree import parse

class PascalVocXmlParser(object):
    """Read image metadata and object annotations from a VOC-style XML file.

    The parser holds no state: every public method takes the path of an
    annotation file and parses it again.
    """

    def __init__(self):
        pass

    def get_fname(self, annotation_file):
        """Return the text of the root's ``<filename>`` child."""
        root = self._root_tag(annotation_file)
        return root.find("filename").text

    def get_width(self, annotation_file):
        """Return the image width as a float, or None when no tag matches.

        The first element (in document order) whose tag contains "width"
        is used.
        """
        return self._first_float(annotation_file, "width")

    def get_height(self, annotation_file):
        """Return the image height as a float, or None when no tag matches.

        The first element (in document order) whose tag contains "height"
        is used.
        """
        return self._first_float(annotation_file, "height")

    def get_labels(self, annotation_file):
        """Return the ``<name>`` text of every ``<object>`` under the root."""
        root = self._root_tag(annotation_file)
        return [obj.find("name").text for obj in root.findall("object")]

    def get_boxes(self, annotation_file):
        """Return the bounding boxes as a float array of shape (n, 4).

        Each row is ``[xmin, xmax, ymin, ymax]`` (note the order: both x
        values first). A file without objects yields an array of shape
        (0, 4).
        """
        root = self._root_tag(annotation_file)
        corner_tags = ("xmin", "xmax", "ymin", "ymax")
        boxes = []
        for obj in root.findall("object"):
            box_tag = obj.find("bndbox")
            boxes.append([float(box_tag.find(tag).text) for tag in corner_tags])
        # reshape keeps the (n, 4) layout even when there are no objects.
        return np.array(boxes, dtype=float).reshape(-1, 4)

    def _first_float(self, fname, tag_fragment):
        """Return float(text) of the first element whose tag contains
        ``tag_fragment``; None when there is no such element."""
        for elem in self._tree(fname).iter():
            if tag_fragment in elem.tag:
                return float(elem.text)
        return None

    # Return the root element of the parsed file.
    def _root_tag(self, fname):
        tree = parse(fname)
        root = tree.getroot()
        return root

    def _tree(self, fname):
        """Return the parsed ``ElementTree`` of ``fname``."""
        tree = parse(fname)
        return tree

根据从xml文件中获得的信息求ytrue

import numpy as np
import os
from PIL import Image
from nets.YoLo_v3_get_xml import PascalVocXmlParser

#根据xml文件获取文件名，图片大小,label,box的信息
def get_parse(ann_fname, input_size):
    """Parse one annotation file and rescale its boxes to the network input.

    Args:
        ann_fname: path of the XML annotation file.
        input_size: side length of the square network input, in pixels.

    Returns:
        ``(fname, labels, boxes)``: the image file name, the list of object
        labels, and the boxes as rows of ``[xmin, xmax, ymin, ymax]`` scaled
        from the original image size to ``input_size``.
    """
    xml_parser = PascalVocXmlParser()
    fname = xml_parser.get_fname(ann_fname)
    img_width = xml_parser.get_width(ann_fname)
    img_height = xml_parser.get_height(ann_fname)
    labels = xml_parser.get_labels(ann_fname)
    boxes = xml_parser.get_boxes(ann_fname)

    # Each row is a view into ``boxes``, so the rescaling happens in place.
    for box in boxes:
        box[0:2] = box[0:2] / img_width * input_size    # xmin, xmax
        box[2:4] = box[2:4] / img_height * input_size   # ymin, ymax
    return fname, labels, boxes

#计算IOU
def get_IOU(box1, box2):
    """Return the intersection-over-union of two axis-aligned boxes.

    Both boxes are indexable as ``[x_min, x_max, y_min, y_max]``.

    The overlap is measured from the larger of the two minima to the
    smaller of the two maxima on each axis, so the result is correct for
    arbitrary box positions, not only for boxes that share a top-left
    corner. Boxes that do not overlap (or only touch) give 0.

    Returns:
        The IOU in ``[0, 1]``.
    """
    inter_w = min(box1[1], box2[1]) - max(box1[0], box2[0])
    inter_h = min(box1[3], box2[3]) - max(box1[2], box2[2])
    # No positive overlap on either axis means an empty intersection; this
    # also covers degenerate zero-area boxes and avoids dividing by zero.
    if inter_w <= 0 or inter_h <= 0:
        return 0.0

    intersect = inter_w * inter_h
    area1 = (box1[1] - box1[0]) * (box1[3] - box1[2])
    area2 = (box2[1] - box2[0]) * (box2[3] - box2[2])
    return intersect / (area1 + area2 - intersect)



#把box和anchor一个点对齐计算IOU
#计算anchor和ground truth的最大IOU的位置。
def get_anchor(anchors, box):
    """Return the index of the anchor that best matches ``box``.

    Every anchor ``(width, height)`` is turned into a box whose top-left
    corner coincides with that of ``box``; the anchor with the highest
    ``get_IOU`` against ``box`` wins (the first one on ties).

    Args:
        anchors: sequence of ``(width, height)`` pairs.
        box: ``[xmin, xmax, ymin, ymax]``.

    Returns:
        The integer index into ``anchors``.
    """
    count = len(anchors)
    aligned = np.zeros((count, 4), dtype="float32")
    scores = []
    for idx in range(count):
        candidate = aligned[idx]
        candidate[0] = box[0]
        candidate[1] = candidate[0] + anchors[idx][0]
        candidate[2] = box[2]
        candidate[3] = candidate[2] + anchors[idx][1]
        scores.append(get_IOU(box, candidate))
    return max(range(count), key=scores.__getitem__)

def get_img(img_dir, fname, input_size):
    """Load an image and return it as a normalised float32 array.

    Args:
        img_dir: directory that contains the image.
        fname: image file name inside ``img_dir``.
        input_size: side length of the square output, in pixels.

    Returns:
        Array of shape ``(input_size, input_size, 3)`` with values in
        ``[0, 1]``.
    """
    img_fname = os.path.join(img_dir, fname)
    # Image.open is lazy and keeps the file open; the context manager
    # releases the handle once the pixel data has been read.
    with Image.open(img_fname) as source:
        # Force three channels so grayscale, palette and RGBA files give
        # arrays that can be stacked into one batch with RGB images.
        resized = source.convert("RGB").resize((input_size, input_size))
    pixels = np.array(resized, dtype="float32")
    pixels /= 255.

    return pixels
#anchor共有9个，每个尺度3个
def get_ytrue(boxes, anchors, anchor_shape, b, pattern_shape, input_size, classes, labels, ytrues):
    """Write the training targets of one image into ``ytrues``.

    For every box the best matching anchor is chosen with ``get_anchor``;
    the anchor index decides which scale and which anchor slot of that
    scale is responsible for the box.

    Args:
        boxes: rows of ``[xmin, xmax, ymin, ymax]`` in network-input pixels.
        anchors: flat sequence of ``(width, height)`` anchors, grouped by
            scale in the same order as ``pattern_shape``.
        anchor_shape: ``anchor_shape[1]`` is the number of anchors per scale.
        b: index of the image inside the batch.
        pattern_shape: grid size of each scale.
        input_size: side length of the square network input.
        classes: list of class names; the position is the class id.
        labels: class name of each box, aligned with ``boxes``.
        ytrues: one array per scale, indexed as
            ``[batch, row, col, anchor, 5 + num_classes]``; modified in place.

    Returns:
        ``ytrues`` (the same list that was passed in).

    Raises:
        ValueError: if a label is not in ``classes``.
    """
    anchors_per_scale = anchor_shape[1]
    for i, box in enumerate(boxes):
        # Index of the anchor with the highest IOU against this box.
        anchor = get_anchor(anchors, box)
        # Which scale the anchor belongs to, and which slot inside it.
        layer_anchor = anchor // anchors_per_scale
        box_anchor = anchor % anchors_per_scale

        grid_size = pattern_shape[layer_anchor]
        rate = grid_size / input_size

        # Box centre in grid units.
        cent_x = (box[0] + box[1]) / 2 * rate
        cent_y = (box[2] + box[3]) / 2 * rate
        # Responsible cell; clamped so a centre lying on the image border
        # (or slightly outside it) still maps to a valid cell.
        col = min(max(int(np.floor(cent_x)), 0), grid_size - 1)
        row = min(max(int(np.floor(cent_y)), 0), grid_size - 1)

        w = box[1] - box[0]
        h = box[3] - box[2]
        # Class id.
        c = classes.index(labels[i])

        # Size targets are t = log(size / anchor_size), so that
        # anchor_size * exp(t) recovers the box size.
        t_w = np.log(max(w, 1) / anchors[anchor][0])
        t_h = np.log(max(h, 1) / anchors[anchor][1])

        # Feature maps are laid out as [batch, row (y), col (x), ...], so the
        # y cell index comes first.
        target = ytrues[layer_anchor]
        target[b, row, col, box_anchor, 0:4] = [cent_x, cent_y, t_w, t_h]
        target[b, row, col, box_anchor, 4] = 1
        target[b, row, col, box_anchor, 5 + c] = 1
    return ytrues


#数据生成器
def generator(batch_size, classes, ann_fnames, img_dir, input_size, anchors):
    """Yield training batches forever.

    Args:
        batch_size: number of images per batch.
        classes: list of class names.
        ann_fnames: annotation file paths; the caller's sequence is not
            modified.
        img_dir: directory that contains the images.
        input_size: side length of the square network input.
        anchors: flat sequence of ``(width, height)`` anchors, smallest
            scale group first.

    Yields:
        ``(inputs, targets)`` where ``inputs`` is the stacked image batch
        and ``targets`` is a list of three arrays of shape
        ``(batch_size, grid, grid, 3, 5 + len(classes))`` ordered from the
        coarsest grid (``input_size // 32``) to the finest
        (``input_size // 8``).
        NOTE(review): ``body`` in this article returns its outputs finest
        grid first -- confirm the model wiring matches this order.

    Raises:
        ValueError: on first iteration, if ``ann_fnames`` is empty.
    """
    # Work on a private copy so shuffling does not reorder the caller's list.
    ann_fnames = list(ann_fnames)
    n = len(ann_fnames)
    if n == 0:
        raise ValueError("ann_fnames must contain at least one annotation file")

    # The network predicts at strides 8, 16 and 32 (52/26/13 cells for 416).
    pattern_shape = [input_size // 8, input_size // 16, input_size // 32]
    anchor_shape = [3, 3]
    i = 0
    while True:
        inputs = []
        ytrues = [
            np.zeros((batch_size, grid, grid, anchor_shape[1], 5 + len(classes)))
            for grid in pattern_shape
        ]
        # Assemble one batch.
        for b in range(batch_size):
            # Reshuffle at the start of every pass over the data.
            if i == 0:
                np.random.shuffle(ann_fnames)
            fname, labels, boxes = get_parse(ann_fnames[i], input_size)
            ytrues = get_ytrue(boxes, anchors, anchor_shape, b, pattern_shape,
                               input_size, classes, labels, ytrues)
            inputs.append(get_img(img_dir, fname, input_size))
            i = (i + 1) % n
        inputs = np.array(inputs)
        # Emit the batch, coarsest grid first.
        yield inputs, [ytrues[2], ytrues[1], ytrues[0]]

计算loss

Yolo-V3采用直接位置预测，就是预测边界框中心点相对于对应cell左上角的相对位置偏移。为了将边界框中心点约束在当前cell中，使用sigmoid函数处理偏移值，这样预测的偏移值在（0，1）范围内。而在Faster-RCNN中不加任何限制，这就会导致不管初始的bbox在图像的什么位置，通过预测偏移量都可以将bbox移动到图像的任何位置。

目标检测算法-YOLO-V3训练代码详解

loss组成

#计算回归loss
def get_loss_box(ytrue, ypre, box_scale, object_mask):
    """Return the box regression loss of every sample.

    The loss is the masked, scaled squared error of the first two box
    components plus that of the square roots of components 2:4, summed over
    axes 1 to 4.

    Args:
        ytrue: target boxes (last axis: 4 components).
        ypre: predicted boxes, same layout as ``ytrue``.
        box_scale: weight applied to the error terms before squaring.
        object_mask: mask multiplied into the error terms.

    Returns:
        A tensor with the summed loss per entry of axis 0.
    """
    reduce_axes = list(range(1, 5))
    weight = box_scale * object_mask

    xy_error = weight * (ypre[..., :2] - ytrue[..., :2])
    # Square roots damp the influence of large boxes on the size error.
    wh_error = weight * (tf.sqrt(ypre[..., 2:4]) - tf.sqrt(ytrue[..., 2:4]))

    xy_term = K.sum(K.square(xy_error), reduce_axes)
    wh_term = K.sum(K.square(wh_error), reduce_axes)
    return xy_term + wh_term
#计算置信度loss
def get_loss_con(ytrue, ypre, noobj_scale, object_mask, IOU):
    """Return the confidence loss of every sample.

    The error ``ypre * IOU - ytrue`` is taken with weight 1 where the mask
    is set and with weight ``noobj_scale`` elsewhere, squared, and summed
    over axes 1 to 3.

    Args:
        ytrue: target confidence.
        ypre: predicted confidence.
        noobj_scale: weight of the positions where the mask is 0.
        object_mask: mask with a trailing axis of size 1 (squeezed here).
        IOU: factor multiplied into the predicted confidence.

    Returns:
        A tensor with the summed loss per entry of axis 0.
    """
    mask = K.squeeze(object_mask, axis=-1)
    error = ypre * IOU - ytrue
    weighted = mask * error + noobj_scale * (1 - mask) * error
    return K.sum(K.square(weighted), list(range(1, 4)))
#计算类别loss
def get_loss_c(ytrue, ypre, object_mask):
    """Return the classification loss of every sample.

    A softmax cross-entropy is computed over the last (class) axis, masked
    with ``object_mask`` and summed over axes 1 to 4, so the result has one
    value per entry of axis 0 -- the same shape as the results of
    ``get_loss_box`` and ``get_loss_con`` that it is added to.

    NOTE(review): the accompanying article states that YOLO-V3 uses
    independent logistic classifiers instead of softmax; confirm whether a
    softmax cross-entropy is intended here.

    Args:
        ytrue: class targets (last axis: one entry per class).
        ypre: class logits, same layout as ``ytrue``.
        object_mask: mask with a trailing axis of size 1.

    Returns:
        A tensor with the summed loss per entry of axis 0.
    """
    ytrue = tf.cast(ytrue, tf.int64)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=ytrue, logits=ypre)
    # Restore the trailing axis removed by the cross-entropy so the mask
    # broadcasts element-wise.
    class_delta = object_mask * tf.expand_dims(cross_entropy, 4)
    # Reduce every non-batch axis; without this the 5-D result would be
    # broadcast against the 1-D box and confidence losses.
    return tf.reduce_sum(class_delta, axis=list(range(1, 5)))


def lossCalculator(ytrue, ypre, anchors, batch_size, input_size, box_scale, noobj_scale, ignore_thresh):
    """Return the mean total loss (box + confidence + class) of one scale.

    ``ypre`` arrives with the anchors folded into the channel axis, e.g.
    ``(batch, 13, 13, 3 * (num_classes + 5))``, and is reshaped to
    ``(batch, 13, 13, 3, num_classes + 5)``; ``ytrue`` is reshaped to match.

    ``get_ytrue_ypre`` and the three-argument ``get_IOU`` used here are not
    defined in this excerpt. ``ignore_thresh`` is accepted but not used.

    Args:
        ytrue: target tensor of this scale.
        ypre: network output of this scale.
        anchors: array whose first axis enumerates the anchors of the scale.
        batch_size: forwarded to ``get_ytrue_ypre``.
        input_size: forwarded to ``get_IOU``.
        box_scale: weight of the box loss.
        noobj_scale: weight of the no-object confidence loss.
        ignore_thresh: unused.

    Returns:
        A scalar tensor: the mean of the summed losses.
    """
    anchors_per_cell = anchors.shape[0]
    pred_shape = [-1, ypre.shape[-3], ypre.shape[-2], anchors_per_cell,
                  ypre.shape[-1] // anchors_per_cell]
    ypre = K.reshape(ypre, shape=pred_shape)
    true_shape = [-1] + [ypre.shape[axis] for axis in range(1, 5)]
    ytrue = K.reshape(ytrue, shape=true_shape)

    ytrue, ypre = get_ytrue_ypre(ytrue, ypre, anchors, batch_size)

    true_box, pred_box = ytrue[..., :4], ypre[..., :4]
    object_mask = K.expand_dims(ytrue[..., 4], 4)
    IOU = get_IOU(true_box, pred_box, input_size)

    loss_box = get_loss_box(true_box, pred_box, box_scale, object_mask)
    loss_con = get_loss_con(ytrue[..., 4], ypre[..., 4], noobj_scale, object_mask, IOU)
    loss_class = get_loss_c(ytrue[..., 5:], ypre[..., 5:], object_mask)

    return tf.reduce_mean(loss_box + loss_con + loss_class)

def fn_loss(ytrues, ypres):
    """Loss function for one output scale with fixed hyper-parameters.

    The anchor group is chosen from the prediction's grid size
    ``ypres.shape[1]``: 13 selects the largest anchors (index 2), 26 the
    middle group and 52 the smallest (index 0).

    Args:
        ytrues: target tensor of the scale.
        ypres: network output of the scale.

    Returns:
        The square root of the ``lossCalculator`` result, as a one-element
        tensor.
    """
    ignore_thresh = 0.5
    noobj_scale = 0.5
    box_scale = 1
    input_size = 416
    batch_size = 1
    anchors = np.array([
        [[10, 13], [16, 30], [33, 23]],
        [[30, 61], [62, 45], [59, 119]],
        [[116, 90], [156, 198], [373, 326]],
    ])

    scale_anchors = anchors[2-ypres.shape[1]//26]
    loss = lossCalculator(ytrues, ypres, scale_anchors, batch_size, input_size,
                          box_scale, noobj_scale, ignore_thresh)
    return tf.sqrt([loss])