基于keras的YOLOv3的代码详解

默认输入图片尺寸为[416,416]。

# coding: utf-8

from __future__ import division, print_function

import tensorflow as tf
import numpy as np
import argparse
import cv2

from utils.misc_utils import parse_anchors, read_class_names
from utils.nms_utils import gpu_nms
from utils.plot_utils import get_color_table, plot_one_box

from model import yolov3

# 设置命令行参数，具体可参见每一个命令行参数的含义
parser = argparse.ArgumentParser(description="YOLO-V3 test single image test procedure.")
parser.add_argument("input_image", type=str,
                    help="The path of the input image.")
parser.add_argument("--anchor_path", type=str, default="./data/yolo_anchors.txt",
                    help="The path of the anchor txt file.")
parser.add_argument("--new_size", nargs='*', type=int, default=[416, 416],
                    help="Resize the input image with `new_size`, size format: [width, height]")
parser.add_argument("--class_name_path", type=str, default="./data/coco.names",
                    help="The path of the class names.")
parser.add_argument("--restore_path", type=str, default="./data/darknet_weights/yolov3.ckpt",
                    help="The path of the weights to restore.")
args = parser.parse_args()

# 处理anchors，这些anchors是通过数据聚类获得，一共9个，shape为：[9, 2]。
# 需要注意的是，最后一个维度的顺序是[width, height]
args.anchors = parse_anchors(args.anchor_path)

# 处理classes， 这里是将所有的class的名称提取了出来，组成了一个列表
args.classes = read_class_names(args.class_name_path)

# 类别的数目
args.num_class = len(args.classes)

# 根据类别的数目为每一个类别分配不同的颜色，以便展示
color_table = get_color_table(args.num_class)

# 读取图片
img_ori = cv2.imread(args.input_image)

# 获取图片的尺寸
height_ori, width_ori = img_ori.shape[:2]

# resize，根据之前设定的尺寸值进行resize，默认是[416, 416]，还是[width, height]的顺序
img = cv2.resize(img_ori, tuple(args.new_size))

# 对图片像素进行一定的数据处理
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = np.asarray(img, np.float32)
img = img[np.newaxis, :] / 255.

# TF会话
with tf.Session() as sess:
    # 输入的placeholder，用于输入图片
    input_data = tf.placeholder(tf.float32, [1, args.new_size[1], args.new_size[0], 3], name='input_data')
    # 定义一个YOLOv3的类，在后面可以用来做模型建立以及loss计算等操作，参数分别是类别的数目和anchors
    yolo_model = yolov3(args.num_class, args.anchors)
    with tf.variable_scope('yolov3'):
        # 对图片进行正向传播，返回多张特征图
        pred_feature_maps = yolo_model.forward(input_data, False)
    # 对这些特征图进行处理，获得计算出的bounding box以及属于前景的概率已经每一个类别的概率分布
    pred_boxes, pred_confs, pred_probs = yolo_model.predict(pred_feature_maps)

    # 将两个概率值分别相乘就可以获得最终的概率值
    pred_scores = pred_confs * pred_probs

    # 对这些bounding boxes和概率值进行非最大抑制（NMS）就可以获得最后的bounding boxes和与其对应的概率值以及标签
    boxes, scores, labels = gpu_nms(pred_boxes, pred_scores, args.num_class, max_boxes=30, score_thresh=0.4, nms_thresh=0.5)

    # Saver类，用以保存和恢复模型
    saver = tf.train.Saver()
    # 恢复模型参数
    saver.restore(sess, args.restore_path)

    # 运行graph，获得对应tensors的具体数值，这里是[boxes, scores, labels]，对应于NMS之后获得的结果
    boxes_, scores_, labels_ = sess.run([boxes, scores, labels], feed_dict={input_data: img})

    # rescale the coordinates to the original image
    # 将坐标重新映射到原始图片上，因为前面的计算都是在resize之后的图片上进行的，所以需要进行映射
    boxes_[:, 0] *= (width_ori/float(args.new_size[0]))
    boxes_[:, 2] *= (width_ori/float(args.new_size[0]))
    boxes_[:, 1] *= (height_ori/float(args.new_size[1]))
    boxes_[:, 3] *= (height_ori/float(args.new_size[1]))

    # 输出
    print("box coords:")
    print(boxes_)
    print('*' * 30)
    print("scores:")
    print(scores_)
    print('*' * 30)
    print("labels:")
    print(labels_)

    # 绘制并展示，保存最后的结果
    for i in range(len(boxes_)):
        x0, y0, x1, y1 = boxes_[i]
        plot_one_box(img_ori, [x0, y0, x1, y1], label=args.classes[labels_[i]], color=color_table[labels_[i]])
    cv2.imshow('Detection result', img_ori)
    cv2.imwrite('detection_result.jpg', img_ori)
    cv2.waitKey(0)

（二）get_kmeans.py

这里函数的主要作用是使用kmeans聚类产生若干个anchors中心，在训练的时候使用这些作为一种先验条件。这里的聚类主要是对目标检测框的尺寸进行聚类。

# coding: utf-8
# This script is modified from https://github.com/lars76/kmeans-anchor-boxes

from __future__ import division, print_function

import numpy as np

# 计算IOU，box一个长度为2的数组，表示box的尺寸，clusters表示的是若干集群的中心，同样也是尺寸。
def iou(box, clusters):
    """
    Calculates the Intersection over Union (IoU) between a box and k clusters.
    param:
        box: tuple or array, shifted to the origin (i. e. width and height)
        clusters: numpy array of shape (k, 2) where k is the number of clusters
    return:
        numpy array of shape (k, 0) where k is the number of clusters
    """
    x = np.minimum(clusters[:, 0], box[0])
    y = np.minimum(clusters[:, 1], box[1])
    if np.count_nonzero(x == 0) > 0 or np.count_nonzero(y == 0) > 0:
        raise ValueError("Box has no area")

    intersection = x * y
    box_area = box[0] * box[1]
    cluster_area = clusters[:, 0] * clusters[:, 1]

    iou_ = intersection / (box_area + cluster_area - intersection + 1e-10)

    return iou_


def avg_iou(boxes, clusters):
    """
    Calculates the average Intersection over Union (IoU) between a numpy array of boxes and k clusters.
    param:
        boxes: numpy array of shape (r, 2), where r is the number of rows
        clusters: numpy array of shape (k, 2) where k is the number of clusters
    return:
        average IoU as a single float
    """
    # 计算平均IOU
    return np.mean([np.max(iou(boxes[i], clusters)) for i in range(boxes.shape[0])])


# 这个函数并未在任何地方被使用
def translate_boxes(boxes):
    """
    Translates all the boxes to the origin.
    param:
        boxes: numpy array of shape (r, 4)
    return:
    numpy array of shape (r, 2)
    """
    new_boxes = boxes.copy()
    for row in range(new_boxes.shape[0]):
        new_boxes[row][2] = np.abs(new_boxes[row][2] - new_boxes[row][0])
        new_boxes[row][3] = np.abs(new_boxes[row][3] - new_boxes[row][1])
    return np.delete(new_boxes, [0, 1], axis=1)


def kmeans(boxes, k, dist=np.median):
    """
    Calculates k-means clustering with the Intersection over Union (IoU) metric.
    param:
        boxes: numpy array of shape (r, 2), where r is the number of rows
        k: number of clusters
        dist: distance function
    return:
        numpy array of shape (k, 2)
    """
    # rows表示的是数据集中一共有多少个标注框
    rows = boxes.shape[0]

    # 初始化统计距离的矩阵和每一个标注框的所属集群编号，
    # 这里使用last cluster记录下一轮循环开始时标注框的集群编号，如果在这某一轮的迭代中不发生改变则算法已经收敛。
    distances = np.empty((rows, k))
    last_clusters = np.zeros((rows,))

    np.random.seed()

    # the Forgy method will fail if the whole array contains the same rows
    # 随机选择几个数据作为初始的集群中心
    clusters = boxes[np.random.choice(rows, k, replace=False)]

    # 循环
    while True:
        # 对每一个标注框,计算其与每个集群中心的距离,这里的距离采用的是(1 - 标注框与集群中心的IOU)来表示,
        # IOU数值越大, 则(1- IOU)越小， 则表示距离越接近.
        for row in range(rows):
            distances[row] = 1 - iou(boxes[row], clusters)

        # 对每个标注框选择与其距离最接近的集群中心的标号作为所属类别的编号。
        nearest_clusters = np.argmin(distances, axis=1)

        # 如果在这轮循环中所有的标注框的所属类别不再变化，则说明算法已经收敛，可以跳出循环。
        if (last_clusters == nearest_clusters).all():
            break

        # 对每一类集群，取出所有属于该集群的数据，并按照给定的方法计算集群的中心，
        # 这里默认采用中位数的方法来计算集群中心
        for cluster in range(k):
            clusters[cluster] = dist(boxes[nearest_clusters == cluster], axis=0)

        # 更新每一个标注框所属的集群类别。
        last_clusters = nearest_clusters

    # 返回所有的集群中心
    return clusters


def parse_anno(annotation_path):
    # 打开数据标记的文件
    anno = open(annotation_path, 'r')

    # 用以储存最后的提取出的所有的高度和宽度的结果，
    result = []

    # 对每一个标记图片
    for line in anno:
        # 根据空格将数据行进行分割
        s = line.strip().split(' ')

        # 按照数据的标记规则，每一行的第一个数据是编号，第二个数据是图片地址，从第三个开始才是标记框的信息。
        s = s[2:]

        # 当前图片的标记框的数目，每个标记框包含五个信息，四个坐标信息和一个类别信息
        box_cnt = len(s) // 5

        # 分别处理每一个标记框的信息，并提取标记框的高度和宽度，存入result 列表。
        for i in range(box_cnt):
            x_min, y_min, x_max, y_max = float(s[i*5+1]), float(s[i*5+2]), float(s[i*5+3]), float(s[i*5+4])
            width = x_max - x_min
            height = y_max - y_min
            assert width > 0
            assert height > 0
            result.append([width, height])

    # 将list变为numpy的数组
    result = np.asarray(result)

    # 返回
    return result


def get_kmeans(anno, cluster_num=9):

    # 使用kmeans算法计算需要的anchors
    anchors = kmeans(anno, cluster_num)

    # 计算平均IOU
    ave_iou = avg_iou(anno, anchors)

    # 格式化为int类型
    anchors = anchors.astype('int').tolist()

    # 按照面积大小排序，
    anchors = sorted(anchors, key=lambda x: x[0] * x[1])

    # 返回
    return anchors, ave_iou


if __name__ == '__main__':
    annotation_path = "./data/my_data/train.txt"
    anno_result = parse_anno(annotation_path)
    anchors, ave_iou = get_kmeans(anno_result, 9)

    # 格式化输出anchors数据
    anchor_string = ''
    for anchor in anchors:
        anchor_string += '{},{}, '.format(anchor[0], anchor[1])
    anchor_string = anchor_string[:-2]

    print('anchors are:')
    print(anchor_string)
    print('the average iou is:')
    print(ave_iou)

（三）model.py

这里函数和类的主要作用是对YOLO模型进行封装，类中的函数主要包括：

模型的简历
特征图信息和anchors的联合使用
loss的计算


# coding=utf-8
# for better understanding about yolov3 architecture, refer to this website (in Chinese):
# https://blog.csdn.net/leviopku/article/details/82660381

from __future__ import division, print_function

import tensorflow as tf

slim = tf.contrib.slim

from utils.layer_utils import conv2d, darknet53_body, yolo_block, upsample_layer


class yolov3(object):

    def __init__(self,
                 class_num,
                 anchors,
                 use_label_smooth=False,
                 use_focal_loss=False,
                 batch_norm_decay=0.999,
                 weight_decay=5e-4):
        """
        yolov3 class
        :param class_num: 类别数目
        :param anchors: anchors，一般来说是9个anchors
        :param use_label_smooth: 是否使用label smooth，默认为False
        :param use_focal_loss: 是否使用focal loss，默认为False
        :param batch_norm_decay: BN的衰减系数
        :param weight_decay: 权重衰减系数
        """
        # self.anchors = [[10, 13], [16, 30], [33, 23],
        # [30, 61], [62, 45], [59,  119],
        # [116, 90], [156, 198], [373,326]]
        self.class_num = class_num
        self.anchors = anchors
        self.batch_norm_decay = batch_norm_decay
        self.use_label_smooth = use_label_smooth
        self.use_focal_loss = use_focal_loss
        self.weight_decay = weight_decay

    def forward(self, inputs, is_training=False, reuse=False):
        """
        进行正向传播，返回的是若干特征图
        :param inputs: shape: [N, height, width, channel]
        :param is_training:
        :param reuse:
        :return:
        """

        # 获取输入图片的高度height和宽度width
        # the input img_size, form: [height, width]
        # it will be used later
        self.img_size = tf.shape(inputs)[1:3]

        # batch normalization的相关参数
        # set batch norm params
        batch_norm_params = {
            'decay': self.batch_norm_decay,
            'epsilon': 1e-05,
            'scale': True,
            'is_training': is_training,
            'fused': None,  # Use fused batch norm if possible.
        }

        # slim的arg scope，可以简化代码的编写，共用一套参数设置
        with slim.arg_scope([slim.conv2d, slim.batch_norm], reuse=reuse):
            with slim.arg_scope([slim.conv2d],
                                normalizer_fn=slim.batch_norm,
                                normalizer_params=batch_norm_params,
                                biases_initializer=None,
                                activation_fn=lambda x: tf.nn.leaky_relu(x, alpha=0.1),
                                weights_regularizer=slim.l2_regularizer(self.weight_decay)):

                # DarkNet 的主体部分，主要作用是提取图片中的各种特征信息。
                # 这里可以获取三张特征图，分别取自DarkNet的三个不同的阶段。
                # 每一个阶段对应于不同的特征粒度，结合更多的特征可以增强模型的表达能力。
                # 理论上来说特征提取网络也可以采用其他的网络结构，但是效果可能会有所差异。
                # 如果输入图片的尺寸为[416, 416]，则三张特征图的尺寸分别为
                # route_1 : [1, 52, 52, 256]
                # route_2 : [1, 26, 26, 512]
                # route_3 : [1, 13, 13, 1024]
                with tf.variable_scope('darknet53_body'):
                    route_1, route_2, route_3 = darknet53_body(inputs)

                # 根据前面的特征图，进行特征融合操作，这样可以提供更多的信息。
                with tf.variable_scope('yolov3_head'):

                    # 使用YOLO_block函数来处理得到的特征图，并返回两张特征图。
                    # 本质上，YOLO_block函数仅仅包含若干层卷积层。
                    # 其中，inter1的作用是用来后续进行特征融合，net的主要作用是用以计算后续的坐标和概率等信息。
                    inter1, net = yolo_block(route_3, 512)

                    # 进行依次卷积，主要是为了进行通道数目调整
                    feature_map_1 = slim.conv2d(net, 3 * (5 + self.class_num), 1,
                                                stride=1, normalizer_fn=None,
                                                activation_fn=None, biases_initializer=tf.zeros_initializer())
                    feature_map_1 = tf.identity(feature_map_1, name='feature_map_1')

                    # 进行一次卷积，调整通道数目为256。并进行上采样，这里的上采样主要是用最近邻插值法。
                    inter1 = conv2d(inter1, 256, 1)
                    inter1 = upsample_layer(inter1, tf.shape(route_2))
                    # 进行特征的融合，这里是通道的融合
                    concat1 = tf.concat([inter1, route_2], axis=3)

                    # 下面的和前面的过程是一致的，不再赘述。
                    inter2, net = yolo_block(concat1, 256)
                    feature_map_2 = slim.conv2d(net, 3 * (5 + self.class_num), 1,
                                                stride=1, normalizer_fn=None,
                                                activation_fn=None, biases_initializer=tf.zeros_initializer())
                    feature_map_2 = tf.identity(feature_map_2, name='feature_map_2')

                    inter2 = conv2d(inter2, 128, 1)
                    inter2 = upsample_layer(inter2, tf.shape(route_1))
                    concat2 = tf.concat([inter2, route_1], axis=3)

                    _, feature_map_3 = yolo_block(concat2, 128)
                    feature_map_3 = slim.conv2d(feature_map_3, 3 * (5 + self.class_num), 1,
                                                stride=1, normalizer_fn=None,
                                                activation_fn=None, biases_initializer=tf.zeros_initializer())
                    feature_map_3 = tf.identity(feature_map_3, name='feature_map_3')
            # 将三张特征图返回，shape分别如下：（输入图片尺寸默认为[416, 416])
            # feature_map_1: [1, 13, 13, 255]
            # feature_map_2: [1, 26, 25, 255]
            # feature_map_3: [1, 52, 52, 255]
            return feature_map_1, feature_map_2, feature_map_3

    def reorg_layer(self, feature_map, anchors):
        '''
        feature_map: a feature_map from [feature_map_1, feature_map_2, feature_map_3] returned
            from `forward` function
        anchors: shape: [3, 2]
        '''
        """需要注意的是，我们在下面的代码中会经常涉及到height， width这两个概念，在YOLOv3中，height表示的是竖直方向，
            width表示的是水平方向，同样，x的方向也表示的是水平方向，y的方向是竖直方向"""
        # NOTE: size in [h, w] format! don't get messed up!
        # 获取特征图的尺寸信息，顺序为： [height, width]
        grid_size = tf.shape(feature_map)[1:3]  # [13, 13]

        # the downscale ratio in height and weight
        # 计算此特征图和原图片的缩放尺寸，顺序为： [height, width]
        ratio = tf.cast(self.img_size / grid_size, tf.float32)

        # rescale the anchors to the feature_map
        # NOTE: the anchor is in [w, h] format!
        # 将anchors映射到特征图上,主要是大小上的映射,将anchors的尺寸分别处以下采样倍数即可
        # 需要注意的是，anchors的顺序是[width, height]！所因此下面代码中ratio的下标是反的.
        # 所以计算出的rescaled_anchors的顺序也是[width, height]。
        rescaled_anchors = [(anchor[0] / ratio[1], anchor[1] / ratio[0]) for anchor in anchors]

        # 将特征图reshape一下,主要是将最后一个通道进行分离
        feature_map = tf.reshape(feature_map, [-1, grid_size[0], grid_size[1], 3, 5 + self.class_num])

        # split the feature_map along the last dimension
        # shape info: take 416x416 input image and the 13*13 feature_map for example:
        # box_centers: [N, 13, 13, 3, 2] last_dimension: [center_x, center_y].
        # 需要注意的是这里的center_x, 和center_y的方向表示,center_x表示的是
        # box_sizes: [N, 13, 13, 3, 2] last_dimension: [width, height]
        # conf_logits: [N, 13, 13, 3, 1]
        # prob_logits: [N, 13, 13, 3, class_num]
        # 沿着最后一个数据通道进行分离,分别分离成2, 2, 1, class_num的矩阵.
        box_centers, box_sizes, conf_logits, prob_logits = tf.split(feature_map, [2, 2, 1, self.class_num], axis=-1)

        # 将box的中心数据限制在（0， 1）的范围之内，
        # 因为YOLO将图片分成了一个一个的格子，每一个格子的长宽被设置为1，这里的中心数据本质上是相对于格子左上角的偏移。
        box_centers = tf.nn.sigmoid(box_centers)

        # use some broadcast tricks to get the mesh coordinates
        # grid_x: [0, 1, 2, ..., width - 1]
        grid_x = tf.range(grid_size[1], dtype=tf.int32)
        # grid_y: [0, 1, 2, ..., height - 1]
        grid_y = tf.range(grid_size[0], dtype=tf.int32)

        # grid_x: [[0, 1, 2, ..., width - 1],
        #          [0, 1, 2, ..., width - 1],
        #          ...
        #          [0, 1, 2, ..., width - 1]]
        # grid_y: [[0, 0, 0, ..., 0],
        #          [1, 1, 1, ..., 1],
        #          ...
        #          [height - 1, height - 1, height - 1, ..., height - 1]]
        grid_x, grid_y = tf.meshgrid(grid_x, grid_y)
        x_offset = tf.reshape(grid_x, (-1, 1)) # [0, 1, 2, .., width - 1, 0, 1, 2, ..width - 1, ......, 0, 1, 2, .. width - 1]
        y_offset = tf.reshape(grid_y, (-1, 1)) # [0, 0, 0, .., 0, 1, 1, 1, ...1, ......, height -1, height -1, .., height - 1]

        # x_y_offset: [[0, 0],
        #              [1, 0],
        #              ...
        #              [width - 1, 0],
        #              [0, 1],
        #              [1, 1],
        #              ...
        #              [width - 1, 1],
        #              ......
        #              [0, height - 1],
        #              [1, height - 1],
        #              ...
        #              [width - 1, height - 1]]
        x_y_offset = tf.concat([x_offset, y_offset], axis=-1)
        # shape: [13, 13, 1, 2] 、[height, width, 1, 2]
        x_y_offset = tf.cast(tf.reshape(x_y_offset, [grid_size[0], grid_size[1], 1, 2]), tf.float32)

        # get the absolute box coordinates on the feature_map
        # broadcast机制： [N, height, width, 3, 2] = [N, height, width, 3, 2] + [height, width, 1, 2]
        box_centers = box_centers + x_y_offset

        # rescale to the original image scale
        # 将box的中心重新映射到原始尺寸的图片上。
        # 在前面的代码中，最后一个维度的顺序一直是[width, height]的格式，二ratio的顺序是[height, width]，
        # 因此这是需要对ratio取反遍历，结果的顺序依然是[width, height]。
        box_centers = box_centers * ratio[::-1]

        # avoid getting possible nan value with tf.clip_by_value
        # 和前面的过程一样，这里对box的尺寸进行变换，最后一维度的顺序依然是[width, height]
        box_sizes = tf.exp(box_sizes) * rescaled_anchors
        # box_sizes = tf.clip_by_value(tf.exp(box_sizes), 1e-9, 100) * rescaled_anchors
        # rescale to the original image scale
        # 一样是将box的尺寸重新映射到原始图片上
        box_sizes = box_sizes * ratio[::-1]

        # shape: [N, 13, 13, 3, 4]、[N, height, width, 3, 4]
        # last dimension: (center_x, center_y, w, h)
        boxes = tf.concat([box_centers, box_sizes], axis=-1)

        # shape:
        # x_y_offset: [13, 13, 1, 2], [height, width, 1, 2]
        # boxes: [N, 13, 13, 3, 4], rescaled to the original image scale
        # conf_logits: [N, 13, 13, 3, 1]、 [N, height, width, 3, 1]
        # prob_logits: [N, 13, 13, 3, class_num]、 [N, height, width, 3, class_num]
        return x_y_offset, boxes, conf_logits, prob_logits

    def predict(self, feature_maps):
        '''
        Receive the returned feature_maps from `forward` function,
        the produce the output predictions at the test stage.
        '''
        #
        feature_map_1, feature_map_2, feature_map_3 = feature_maps

        # 将特征图和不同尺寸的anchors相结合，缩放程度大的特征图和大尺寸的anchors相结合，
        # 反之，缩放程度小的特征图和小尺寸的anchors相结合
        feature_map_anchors = [(feature_map_1, self.anchors[6:9]),
                               (feature_map_2, self.anchors[3:6]),
                               (feature_map_3, self.anchors[0:3])]

        # 利用特征图和其对应的anchors计算每一张特征图的预测回归框，置信程度，分类概率等
        reorg_results = [self.reorg_layer(feature_map, anchors) for (feature_map, anchors) in feature_map_anchors]

        def _reshape(result):
            # 取出每一个特征图对应的所有信息，包括预测回归框，置信程度，分类概率等
            x_y_offset, boxes, conf_logits, prob_logits = result

            # 获得特征图的尺寸，[height, width]
            grid_size = tf.shape(x_y_offset)[:2]

            # 将boxes， 前景置信度，分类概率展开
            boxes = tf.reshape(boxes, [-1, grid_size[0] * grid_size[1] * 3, 4])
            conf_logits = tf.reshape(conf_logits, [-1, grid_size[0] * grid_size[1] * 3, 1])
            prob_logits = tf.reshape(prob_logits, [-1, grid_size[0] * grid_size[1] * 3, self.class_num])
            # shape: (take 416*416 input image and feature_map_1 for example),
            # boxes: [N, 13*13*3, 4] , [N, height * width * anchor_num, 4]
            # conf_logits: [N, 13*13*3, 1], [N, height * width * anchor_num, 1]
            # prob_logits: [N, 13*13*3, class_num], [N, height * width * anchor_num, class_num]
            return boxes, conf_logits, prob_logits

        boxes_list, confs_list, probs_list = [], [], []
        for result in reorg_results:
            # 对每个特征图的偏移量，boxes，前景置信度，分类概率等进行处理（主要是reshape），得到boxes，前景置信度，分类概率。
            boxes, conf_logits, prob_logits = _reshape(result)

            # 对置信度和概率进行sigmoid处理，保证数值位于0~1之间
            confs = tf.sigmoid(conf_logits)
            probs = tf.sigmoid(prob_logits)

            # 将所有的boxes， 前景置信度，分类概率保存起来
            boxes_list.append(boxes)
            confs_list.append(confs)
            probs_list.append(probs)

        # collect results on three scales
        # take 416*416 input image for example:
        # shape: [N, (13*13+26*26+52*52)*3, 4]、[N, box_num, 4]
        boxes = tf.concat(boxes_list, axis=1)
        # shape: [N, (13*13+26*26+52*52)*3, 1]、[N, box_num, 1]
        confs = tf.concat(confs_list, axis=1)
        # shape: [N, (13*13+26*26+52*52)*3, class_num]、[N, box_num, class_num]
        probs = tf.concat(probs_list, axis=1)

        # 接下来处理boxes，我们需要将存储格式为中心加尺寸的box数据变换成左上角和右下角的坐标。
        center_x, center_y, width, height = tf.split(boxes, [1, 1, 1, 1], axis=-1)
        x_min = center_x - width / 2
        y_min = center_y - height / 2
        x_max = center_x + width / 2
        y_max = center_y + height / 2

        boxes = tf.concat([x_min, y_min, x_max, y_max], axis=-1)

        # 返回boxes，前景置信度，以及分类概率
        return boxes, confs, probs

    def loss_layer(self, feature_map_i, y_true, anchors):
        '''
        calc loss function from a certain scale
        input:
            feature_map_i: feature maps of a certain scale. shape: [N, 13, 13, 3*(5 + num_class)] etc.
            y_true: y_ture from a certain scale. shape: [N, 13, 13, 3, 5 + num_class + 1] etc.
            anchors: shape [9, 2]
        '''

        # size in [h, w] format! don't get messed up!
        # 获取特征图的尺寸，这里的顺序是[height, width]
        grid_size = tf.shape(feature_map_i)[1:3]

        # the downscale ratio in height and weight
        # 计算下采样的倍数，使用的是原始图片的尺寸除以特征图的尺寸，所以顺序依然是[height, width]
        ratio = tf.cast(self.img_size / grid_size, tf.float32)

        # N: batch_size
        # 样本数目，或者说batch size，这里转换成了浮点数
        N = tf.cast(tf.shape(feature_map_i)[0], tf.float32)

        # 根据特征图和每一个特征图对应的anchors计算预测的Bboxes，每一个框的概率以及每一个框属于前景的概率。
        # 这里返回的第一个参数是每一张特征图上的偏移量。
        # x_y_offset: [height, width, 1, 2]
        # pred_boxes: [N, height, width, 3, 4]
        # pred_conf_logits: [N, height, width, 3, 1]
        # pred_prob_logits: [N, height, width, 3, 80(num_class)]
        x_y_offset, pred_boxes, pred_conf_logits, pred_prob_logits = self.reorg_layer(feature_map_i, anchors)

        ###########
        # get mask
        ###########
        # shape: take 416x416 input image and 13*13 feature_map for example:
        # [N, 13, 13, 3, 1]
        # y true的最后一维的格式是[4, 1, 80, 1],分别表示4位坐标， 1位前景标志位，80个分类标记，1位mix up标记位
        # y_true的最后一个维度的4号位（由0开始计数）上存储的是当前位置是否是一个有效的前景.
        # 如果某一个目标的中心落入框中，则是一个有效的前景，当前位是1，否则当前位置是0.
        # 以13 * 13的特征图为例，object mask的shape是[N, 13, 13, 3, 1] ([N, height, width, 3, 1]).
        object_mask = y_true[..., 4:5]

        # shape: [N, 13, 13, 3, 4] & [N, 13, 13, 3] ==> [V, 4]
        # V: num of true gt box
        # 根据上面计算出来的有效前景框，提取有效的ground truth前景框的坐标，
        # valid true boxes的shape：[V, 4]， 这里的V表示的是有效的ground truth前景框的数目。
        valid_true_boxes = tf.boolean_mask(y_true[..., 0:4], tf.cast(object_mask[..., 0], 'bool'))

        # shape: [V, 2]
        # 将gt目标框的中心和高度宽度分离成两个矩阵，每个矩阵的shape都是[V, 2]
        valid_true_box_xy = valid_true_boxes[:, 0:2]
        valid_true_box_wh = valid_true_boxes[:, 2:4]

        # shape: [N, 13, 13, 3, 2]
        # 同样，我们将特征图预测的每个位置的目标框的中心坐标和高度宽度提取出来。
        # pred boxes的最后一个维度是[2, 2, 1, 80, 1],
        # 分别表示预测的边界框的中心位置（2），预测的边界框的高度宽度（2），预测的边界框的前景置信度（1），分类置信度（80），mixup权重（1）
        pred_box_xy = pred_boxes[..., 0:2]
        pred_box_wh = pred_boxes[..., 2:4]

        # calc iou
        # shape: [N, 13, 13, 3, V]
        # 计算在每个位置上，每个预测的目标框和V个gt目标框之间的iou，返回相对应的矩阵。
        iou = self.broadcast_iou(valid_true_box_xy, valid_true_box_wh, pred_box_xy, pred_box_wh)

        # shape: [N, 13, 13, 3]
        # 这一步相当于是为每一个预测的目标框匹配一个最佳的iou。
        # 当然有些预测的目标框是不和任何的gt目标框相交的，此时它的最佳匹配的iou就是0.
        best_iou = tf.reduce_max(iou, axis=-1)

        # get_ignore_mask
        # 计算出那些和任何一个gt目标边界框的iou都小于0.5的预测目标框的标记。
        # 虽然某些框和目标有一定的重叠，但是重叠部分不是很大，我们忽略掉这些框
        # shape：[N, 13, 13, 3]
        ignore_mask = tf.cast(best_iou < 0.5, tf.float32)
        # shape: [N, 13, 13, 3, 1]
        # 扩展出最后一个维度，这个ignore mask后面计算损失会用到
        ignore_mask = tf.expand_dims(ignore_mask, -1)

        # get xy coordinates in one cell from the feature_map
        # numerical range: 0 ~ 1
        # shape: [N, 13, 13, 3, 2]
        # 计算gt目标框和预测的目标框相对于网格坐标的偏移量。
        true_xy = y_true[..., 0:2] / ratio[::-1] - x_y_offset
        pred_xy = pred_box_xy / ratio[::-1] - x_y_offset

        # get_tw_th
        # numerical range: 0 ~ 1
        # shape: [N, 13, 13, 3, 2]
        # 计算gt目标框和预测的目标框相对于anchors的大小缩放量
        true_tw_th = y_true[..., 2:4] / anchors
        pred_tw_th = pred_box_wh / anchors
        
        # for numerical stability
        # 为了保证数据的稳定性，因为log(0)会趋向于负无穷大，因此将0设置为1，log之后就会变成0，可以看作不影响。
        true_tw_th = tf.where(condition=tf.equal(true_tw_th, 0),
                              x=tf.ones_like(true_tw_th), y=true_tw_th)
        pred_tw_th = tf.where(condition=tf.equal(pred_tw_th, 0),
                              x=tf.ones_like(pred_tw_th), y=pred_tw_th)
        
        # 取对数，这里使用了范围的限制，小于1e-9的会强制变成1e-9，大于1e9的数据会变成1e9。
        # shape: [N, 13, 13, 3, 2]
        true_tw_th = tf.log(tf.clip_by_value(true_tw_th, 1e-9, 1e9))
        pred_tw_th = tf.log(tf.clip_by_value(pred_tw_th, 1e-9, 1e9))

        # box size punishment: 
        # box with smaller area has bigger weight. This is taken from the yolo darknet C source code.
        # shape: [N, 13, 13, 3, 1]
        # 对于目标框尺寸的惩罚，尺寸较小的框具有较大的权重。
        box_loss_scale = 2. - (y_true[..., 2:3] / tf.cast(self.img_size[1], tf.float32)) * (
                    y_true[..., 3:4] / tf.cast(self.img_size[0], tf.float32))

        ############
        # loss_part
        ############
        
        # mix_up weight
        # [N, 13, 13, 3, 1]
        # mix up 权重
        mix_w = y_true[..., -1:]
        
        # shape: [N, 13, 13, 3, 1]
        # 这里计算目标框的中心偏移的损失和高度宽度的损失，这里使用了均方和的方式计算。
        # 从式子中可以看出，我们关注的只有object mask为1的目标，即有效的目标框，其他的目标框就被忽略了。
        xy_loss = tf.reduce_sum(tf.square(true_xy - pred_xy) * object_mask * box_loss_scale * mix_w) / N
        wh_loss = tf.reduce_sum(tf.square(true_tw_th - pred_tw_th) * object_mask * box_loss_scale * mix_w) / N

        # shape: [N, 13, 13, 3, 1]
        # 前景的正样本mask，这里直接使用了object mask，因为这一部分肯定是正确的前景
        conf_pos_mask = object_mask
        
        # 前景的负样本mask
        # 这里的采样法是没有任何一个gt目标框的中心落入框中，并且和任何一个gt目标框的iou都小于0.5的框作为前景采样的负样本。
        # 这里的iou控制就是使用的ignore mask
        conf_neg_mask = (1 - object_mask) * ignore_mask
        
        # 使用交叉熵公式计算最后的损失，唯一的区别就是采样的方式，一个是正样本采样，一个是负样本采样
        conf_loss_pos = conf_pos_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask,
                                                                                logits=pred_conf_logits)
        conf_loss_neg = conf_neg_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask,
                                                                                logits=pred_conf_logits)
        
        # TODO: may need to balance the pos-neg by multiplying some weights
        # 二者相加就是最后的前景分类的损失
        conf_loss = conf_loss_pos + conf_loss_neg
        
        # 是否使用focal loss，默认为False
        if self.use_focal_loss:
            alpha = 1.0
            gamma = 2.0
            # TODO: alpha should be a mask array if needed
            # Focal loss的计算，这不是YOLO的中点，在此省略
            focal_mask = alpha * tf.pow(tf.abs(object_mask - tf.sigmoid(pred_conf_logits)), gamma)
            conf_loss *= focal_mask
        
        # 将结果和mis up权重相乘，并取均值作为最后的损失标量
        conf_loss = tf.reduce_sum(conf_loss * mix_w) / N

        # shape: [N, 13, 13, 3, 1]
        # whether to use label smooth
        # 是否使用label smooth，默认为False
        if self.use_label_smooth:
            delta = 0.01
            label_target = (1 - delta) * y_true[..., 5:-1] + delta * 1. / self.class_num
        else:
            label_target = y_true[..., 5:-1]
        
        # 分类损失，这里仍然使用的是交叉熵损失。这里还是只对有效的前景框计算损失。最后仍然要和mix up权重相乘
        class_loss = object_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_target,
                                                                           logits=pred_prob_logits) * mix_w
        # 取均值作为最后的分类损失的标量
        class_loss = tf.reduce_sum(class_loss) / N
        
        # 返回最后的所有损失
        return xy_loss, wh_loss, conf_loss, class_loss

    def compute_loss(self, y_pred, y_true):
        '''
        param:
            y_pred: returned feature_map list by `forward` function: [feature_map_1, feature_map_2, feature_map_3]
            y_true: input y_true by the tf.data pipeline
        '''

        # 以下的四个变量分别用来保存四个方面的loss。
        loss_xy, loss_wh, loss_conf, loss_class = 0., 0., 0., 0.

        # 对anchors进行分组，因为每一层特征图都对应三个不同尺度的anchors。
        anchor_group = [self.anchors[6:9], self.anchors[3:6], self.anchors[0:3]]

        # 对每一张特征图和其对应的真实值以及其对应的anchors计算损失。
        # 一共有三张特征图，故一共存在三个不同尺度的损失。
        # calc loss in 3 scales
        for i in range(len(y_pred)):
            # 分别计算损失
            result = self.loss_layer(y_pred[i], y_true[i], anchor_group[i])
            loss_xy += result[0]
            loss_wh += result[1]
            loss_conf += result[2]
            loss_class += result[3]
        total_loss = loss_xy + loss_wh + loss_conf + loss_class
        return [total_loss, loss_xy, loss_wh, loss_conf, loss_class]

    def broadcast_iou(self, true_box_xy, true_box_wh, pred_box_xy, pred_box_wh):
        '''
        maintain an efficient way to calculate the ios matrix between ground truth true boxes and the predicted boxes
        note: here we only care about the size match
        '''
        # shape:
        # true_box_??: [V, 2]
        # pred_box_??: [N, 13, 13, 3, 2]

        # shape: [N, 13, 13, 3, 1, 2]
        pred_box_xy = tf.expand_dims(pred_box_xy, -2)
        pred_box_wh = tf.expand_dims(pred_box_wh, -2)

        # shape: [1, V, 2]
        true_box_xy = tf.expand_dims(true_box_xy, 0)
        true_box_wh = tf.expand_dims(true_box_wh, 0)

        # [N, 13, 13, 3, 1, 2] & [1, V, 2] ==> [N, 13, 13, 3, V, 2]
        intersect_mins = tf.maximum(pred_box_xy - pred_box_wh / 2.,
                                    true_box_xy - true_box_wh / 2.)
        intersect_maxs = tf.minimum(pred_box_xy + pred_box_wh / 2.,
                                    true_box_xy + true_box_wh / 2.)
        intersect_wh = tf.maximum(intersect_maxs - intersect_mins, 0.)

        # shape: [N, 13, 13, 3, V]
        intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
        # shape: [N, 13, 13, 3, 1]
        pred_box_area = pred_box_wh[..., 0] * pred_box_wh[..., 1]
        # shape: [1, V]
        true_box_area = true_box_wh[..., 0] * true_box_wh[..., 1]

        # [N, 13, 13, 3, V]
        iou = intersect_area / (pred_box_area + true_box_area - intersect_area + 1e-10)

        return iou

（四）layer_utils.py

这里函数的主要作用是对卷积等操作做出一定的个性化封装，方便代码的编写。主要包括：

卷积的封装
darknet网络结构的定义
resize的定义，默认是最近邻方法
在主体网络的基础上做的YOLO的附加的卷积操作，为后面的特征融合做准备

# coding: utf-8

from __future__ import division, print_function

import numpy as np
import tensorflow as tf

slim = tf.contrib.slim


def conv2d(inputs, filters, kernel_size, strides=1):
    # 对conv2d做一定的个性化封装，方便代码的编写和阅读
    def _fixed_padding(inputs, kernel_size):
        pad_total = kernel_size - 1
        pad_beg = pad_total // 2
        pad_end = pad_total - pad_beg

        padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
                                        [pad_beg, pad_end], [0, 0]], mode='CONSTANT')
        return padded_inputs

    if strides > 1:
        inputs = _fixed_padding(inputs, kernel_size)
    inputs = slim.conv2d(inputs, filters, kernel_size, stride=strides,
                         padding=('SAME' if strides == 1 else 'VALID'))
    return inputs


def darknet53_body(inputs):
    """
    darknet的主体网络框架
    :param inputs: 
    :return: 三张不同尺度的特征图
    """
    def res_block(inputs, filters):
        shortcut = inputs
        net = conv2d(inputs, filters * 1, 1)
        net = conv2d(net, filters * 2, 3)

        net = net + shortcut

        return net

    # first two conv2d layers
    net = conv2d(inputs, 32, 3, strides=1)
    net = conv2d(net, 64, 3, strides=2)

    # res_block * 1
    net = res_block(net, 32)

    net = conv2d(net, 128, 3, strides=2)

    # res_block * 2
    for i in range(2):
        net = res_block(net, 64)

    net = conv2d(net, 256, 3, strides=2)

    # res_block * 8
    for i in range(8):
        net = res_block(net, 128)

    route_1 = net
    net = conv2d(net, 512, 3, strides=2)

    # res_block * 8
    for i in range(8):
        net = res_block(net, 256)

    route_2 = net
    net = conv2d(net, 1024, 3, strides=2)

    # res_block * 4
    for i in range(4):
        net = res_block(net, 512)
    route_3 = net

    return route_1, route_2, route_3


def yolo_block(inputs, filters):
    """
    在darknet主体网络提取特征的基础上增加的若干卷积层，为了后面的特征融合做准备
    :param inputs: 
    :param filters: 
    :return: 
    """
    net = conv2d(inputs, filters * 1, 1)
    net = conv2d(net, filters * 2, 3)
    net = conv2d(net, filters * 1, 1)
    net = conv2d(net, filters * 2, 3)
    net = conv2d(net, filters * 1, 1)
    route = net
    net = conv2d(net, filters * 2, 3)
    return route, net


def upsample_layer(inputs, out_shape):
    """
    这一部分主要是对特征图进行resize，默认使用最近邻方法
    :param inputs: 
    :param out_shape: 
    :return: 
    """
    new_height, new_width = out_shape[1], out_shape[2]
    # NOTE: here height is the first
    # TODO: Do we need to set `align_corners` as True?
    inputs = tf.image.resize_nearest_neighbor(inputs, (new_height, new_width), name='upsampled')
    return inputs

（五）nms_utils.py

这一部分代码主要是非最大值抑制（NMS）的实现，原理都是相同，过程大致如下：

首先按照目标的置信度从大到小排序
取出当前最大的置信度的目标框
计算剩下的目标框和取出的目标框的iou
依次检查iou的大小，如果iou高于一定的阈值，则说明对应的目标框被取出的目标框抑制了，因此只留下iou小于一定阈值的框。
重复2~4步骤，直至处理完所有的目标框
返回所有取出的目标框，就是NMS的结果

需要注意的是，NMS只针对于一类类别的数据，如果有多个类别，则需要分别处理。

# coding: utf-8

from __future__ import division, print_function

import numpy as np
import tensorflow as tf


def gpu_nms(boxes, scores, num_classes, max_boxes=50, score_thresh=0.5, nms_thresh=0.5):
    """
    Perform NMS on GPU using TensorFlow.

    params:
        boxes: tensor of shape [1, 10647, 4] # 10647=(13*13+26*26+52*52)*3, for input 416*416 image
        scores: tensor of shape [1, 10647, num_classes], score=conf*prob
        num_classes: total number of classes
        max_boxes: integer, maximum number of predicted boxes you'd like, default is 50
        score_thresh: if [ highest class probability score < score_threshold]
                        then get rid of the corresponding box
        nms_thresh: real value, "intersection over union" threshold used for NMS filtering
    """

    boxes_list, label_list, score_list = [], [], []
    max_boxes = tf.constant(max_boxes, dtype='int32')

    # since we do nms for single image, then reshape it
    boxes = tf.reshape(boxes, [-1, 4])  # '-1' means we don't konw the exact number of boxes
    score = tf.reshape(scores, [-1, num_classes])

    # Step 1: Create a filtering mask based on "box_class_scores" by using "threshold".
    mask = tf.greater_equal(score, tf.constant(score_thresh))
    # Step 2: Do non_max_suppression for each class
    for i in range(num_classes):
        # Step 3: Apply the mask to scores, boxes and pick them out
        filter_boxes = tf.boolean_mask(boxes, mask[:, i])
        filter_score = tf.boolean_mask(score[:, i], mask[:, i])
        nms_indices = tf.image.non_max_suppression(boxes=filter_boxes,
                                                   scores=filter_score,
                                                   max_output_size=max_boxes,
                                                   iou_threshold=nms_thresh, name='nms_indices')
        label_list.append(tf.ones_like(tf.gather(filter_score, nms_indices), 'int32') * i)
        boxes_list.append(tf.gather(filter_boxes, nms_indices))
        score_list.append(tf.gather(filter_score, nms_indices))

    boxes = tf.concat(boxes_list, axis=0)
    score = tf.concat(score_list, axis=0)
    label = tf.concat(label_list, axis=0)

    return boxes, score, label


def py_nms(boxes, scores, max_boxes=50, iou_thresh=0.5):
    """
    Pure Python NMS baseline.

    Arguments: boxes: shape of [-1, 4], the value of '-1' means that dont know the
                      exact number of boxes
               scores: shape of [-1,]
               max_boxes: representing the maximum of boxes to be selected by non_max_suppression
               iou_thresh: representing iou_threshold for deciding to keep boxes
    """
    assert boxes.shape[1] == 4 and len(scores.shape) == 1

    # 下面几行的代码主要是用于求解每个box的面积，然后按照每个box的score的大小进行排序
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]

    areas = (x2 - x1) * (y2 - y1)
    # 按照每个box的score大小进行排序，这里返回的是排序之后的box的index。
    # 本质上order储存的是需要处理的box的索引
    order = scores.argsort()[::-1]

    # keep用于储存保留下来的box的索引index
    keep = []

    # 如果还存在没有被处理的box的索引
    while order.size > 0:
        # 由于之前进行了排序，所以order的第一个肯定是score最高的
        i = order[0]
        # 将这个索引保存起来
        keep.append(i)

        # 下面的代码主要是求解第一个box和剩下的所有的box的IOU，
        # 因为第一个是目标box，所以在order的选取上需要加上[1:]，取遍剩下的所有的box
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        # IOU计算
        ovr = inter / (areas[i] + areas[order[1:]] - inter)

        # 将和目标box的IOU小于一定阈值的box的索引取出，因为高于这一阈值的box都已经被目标box抑制了
        inds = np.where(ovr <= iou_thresh)[0]
        # 然后更新我们的order，重复下一轮循环。
        order = order[inds + 1]

    # 最后返回给定数目的box的索引
    return keep[:max_boxes]


def cpu_nms(boxes, scores, num_classes, max_boxes=50, score_thresh=0.5, iou_thresh=0.5):
    """
    Perform NMS on CPU.
    Arguments:
        boxes: shape [1, 10647, 4]
        scores: shape [1, 10647, num_classes]
    """

    boxes = boxes.reshape(-1, 4)
    scores = scores.reshape(-1, num_classes)
    # Picked bounding boxes
    picked_boxes, picked_score, picked_label = [], [], []

    for i in range(num_classes):
        indices = np.where(scores[:, i] >= score_thresh)
        filter_boxes = boxes[indices]
        filter_scores = scores[:, i][indices]
        if len(filter_boxes) == 0:
            continue
        # do non_max_suppression on the cpu
        indices = py_nms(filter_boxes, filter_scores,
                         max_boxes=max_boxes, iou_thresh=iou_thresh)
        picked_boxes.append(filter_boxes[indices])
        picked_score.append(filter_scores[indices])
        picked_label.append(np.ones(len(indices), dtype='int32') * i)
    if len(picked_boxes) == 0:
        return None, None, None

    boxes = np.concatenate(picked_boxes, axis=0)
    score = np.concatenate(picked_score, axis=0)
    label = np.concatenate(picked_label, axis=0)

    return boxes, score, label

（六）train.py

这一部分代码主要是训练模型的入口，按照要求准备号训练数据之后，就可以从这里开始训练了。

# coding: utf-8

from __future__ import division, print_function

import tensorflow as tf
import numpy as np
import logging
from tqdm import trange

import args

from utils.data_utils import get_batch_data
from utils.misc_utils import shuffle_and_overwrite, make_summary, config_learning_rate, config_optimizer, AverageMeter
from utils.eval_utils import evaluate_on_cpu, evaluate_on_gpu, get_preds_gpu, voc_eval, parse_gt_rec
from utils.nms_utils import gpu_nms

from model import yolov3

# setting loggers
# 设置日志记录
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S', filename=args.progress_log_path, filemode='w')

# setting placeholders
# 整个网络的数据输入入口

# 是否是训练阶段，针对BN等操作有用
is_training = tf.placeholder(tf.bool, name="phase_train")

# 这个数据输入入口未被使用，原因不明
handle_flag = tf.placeholder(tf.string, [], name='iterator_handle_flag')

# register the gpu nms operation here for the following evaluation scheme
# 为了后面的模型评估的计算，这里首先定义好在gpu上的nms操作
pred_boxes_flag = tf.placeholder(tf.float32, [1, None, None])
pred_scores_flag = tf.placeholder(tf.float32, [1, None, None])
gpu_nms_op = gpu_nms(pred_boxes_flag, pred_scores_flag, args.class_num, args.nms_topk, args.score_threshold, args.nms_threshold)

##################
# tf.data pipeline
##################
# 输入输入流，我们是从一个文本文件读入数据，因此，可以使用TextLineDataset类来帮助数据读入
train_dataset = tf.data.TextLineDataset(args.train_file)
# 随机打乱
train_dataset = train_dataset.shuffle(args.train_img_cnt)
# 设定batch size
train_dataset = train_dataset.batch(args.batch_size)
# 自定义输入的返回格式，因为文本文件中的数据不一定就是正式的使用数据，可以自定义真正的数据读取操作
train_dataset = train_dataset.map(
    lambda x: tf.py_func(get_batch_data,
                         inp=[x, args.class_num, args.img_size, args.anchors, 'train', args.multi_scale_train, args.use_mix_up],
                         Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),
    num_parallel_calls=args.num_threads
)
# 预先读取
train_dataset = train_dataset.prefetch(args.prefetech_buffer)

# 和训练数据的读取类似,这里读取的是验证集的数据
val_dataset = tf.data.TextLineDataset(args.val_file)
val_dataset = val_dataset.batch(1)
val_dataset = val_dataset.map(
    lambda x: tf.py_func(get_batch_data,
                         inp=[x, args.class_num, args.img_size, args.anchors, 'val', False, False],
                         Tout=[tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),
    num_parallel_calls=args.num_threads
)
val_dataset.prefetch(args.prefetech_buffer)

# 定义迭代器
iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
train_init_op = iterator.make_initializer(train_dataset)
val_init_op = iterator.make_initializer(val_dataset)

# get an element from the chosen dataset iterator
# 利用迭代器获取数据.由于之前我们自定义了数据的读取方式,这里返回的正是我们希望的数据
image_ids, image, y_true_13, y_true_26, y_true_52 = iterator.get_next()
y_true = [y_true_13, y_true_26, y_true_52]

# tf.data pipeline will lose the data `static` shape, so we need to set it manually
# 手动设置shape
image_ids.set_shape([None])
image.set_shape([None, None, None, 3])
for y in y_true:
    y.set_shape([None, None, None, None, None])

##################
# Model definition
##################
# 模型定义,这一部分和预测时的一致.
yolo_model = yolov3(args.class_num, args.anchors, args.use_label_smooth, args.use_focal_loss, args.batch_norm_decay, args.weight_decay)
with tf.variable_scope('yolov3'):
    pred_feature_maps = yolo_model.forward(image, is_training=is_training)

# 计算损失
loss = yolo_model.compute_loss(pred_feature_maps, y_true)

# 计算预测的结果
y_pred = yolo_model.predict(pred_feature_maps)

# 正则化的损失
l2_loss = tf.losses.get_regularization_loss()

# setting restore parts and vars to update
# 定义Saver,
saver_to_restore = tf.train.Saver(var_list=tf.contrib.framework.get_variables_to_restore(include=args.restore_part))
update_vars = tf.contrib.framework.get_variables_to_restore(include=args.update_part)

# 这一部分是为了tensor board可视化做的准备,主要是一些曲线,反映loss的变化
tf.summary.scalar('train_batch_statistics/total_loss', loss[0])
tf.summary.scalar('train_batch_statistics/loss_xy', loss[1])
tf.summary.scalar('train_batch_statistics/loss_wh', loss[2])
tf.summary.scalar('train_batch_statistics/loss_conf', loss[3])
tf.summary.scalar('train_batch_statistics/loss_class', loss[4])
tf.summary.scalar('train_batch_statistics/loss_l2', l2_loss)
tf.summary.scalar('train_batch_statistics/loss_ratio', l2_loss / loss[0])

# global step
global_step = tf.Variable(float(args.global_step), trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])

# 是否使用warm up,默认是True,主要是定义学习率的方法上有些区别
if args.use_warm_up:
    learning_rate = tf.cond(tf.less(global_step, args.train_batch_num * args.warm_up_epoch), 
                            lambda: args.learning_rate_init * global_step / (args.train_batch_num * args.warm_up_epoch),
                            lambda: config_learning_rate(args, global_step - args.train_batch_num * args.warm_up_epoch))
else:
    learning_rate = config_learning_rate(args, global_step)
tf.summary.scalar('learning_rate', learning_rate)

#
if not args.save_optimizer:
    saver_to_save = tf.train.Saver()
    saver_best = tf.train.Saver()

# 优化器
optimizer = config_optimizer(args.optimizer_name, learning_rate)

if args.save_optimizer:
    saver_to_save = tf.train.Saver()
    saver_best = tf.train.Saver()

# set dependencies for BN ops
# 为BN操作设置依赖
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(loss[0] + l2_loss, var_list=update_vars, global_step=global_step)

# 设置会话Session
with tf.Session() as sess:
    # 初始化全局的variable
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])

    saver_to_restore.restore(sess, args.restore_path)
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    print('\n----------- start to train -----------\n')

    best_mAP = -np.Inf

    # 开始循环训练
    for epoch in range(args.total_epoches):

        sess.run(train_init_op)

        # 定义记录数据的类，主要是保存当前为止的所有数据的均值
        loss_total, loss_xy, loss_wh, loss_conf, loss_class = AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter()

        # 对每一个bacth size
        for i in trange(args.train_batch_num):
            _, summary, __y_pred, __y_true, __loss, __global_step, __lr = sess.run(
                [train_op, merged, y_pred, y_true, loss, global_step, learning_rate],
                feed_dict={is_training: True})

            writer.add_summary(summary, global_step=__global_step)

            # 更新均值
            loss_total.update(__loss[0], len(__y_pred[0]))
            loss_xy.update(__loss[1], len(__y_pred[0]))
            loss_wh.update(__loss[2], len(__y_pred[0]))
            loss_conf.update(__loss[3], len(__y_pred[0]))
            loss_class.update(__loss[4], len(__y_pred[0]))

            # 每隔一段时间进行模型的评估，这里主要计算的是recall和precision
            # 这里计算的是训练集上的评估结果
            if __global_step % args.train_evaluation_step == 0 and __global_step > 0:
                # recall, precision = evaluate_on_cpu(__y_pred, __y_true, args.class_num, args.nms_topk, args.score_threshold, args.eval_threshold)
                recall, precision = evaluate_on_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, __y_pred, __y_true, args.class_num, args.eval_threshold)

                info = "Epoch: {}, global_step: {} | loss: total: {:.2f}, xy: {:.2f}, wh: {:.2f}, conf: {:.2f}, class: {:.2f} | ".format(
                        epoch, int(__global_step), loss_total.average, loss_xy.average, loss_wh.average, loss_conf.average, loss_class.average)
                info += 'Last batch: rec: {:.3f}, prec: {:.3f} | lr: {:.5g}'.format(recall, precision, __lr)
                print(info)
                logging.info(info)

                writer.add_summary(make_summary('evaluation/train_batch_recall', recall), global_step=__global_step)
                writer.add_summary(make_summary('evaluation/train_batch_precision', precision), global_step=__global_step)

                if np.isnan(loss_total.average):
                    print('****' * 10)
                    raise ArithmeticError(
                        'Gradient exploded! Please train again and you may need modify some parameters.')

        # 重置相关的均值记录类
        tmp_total_loss = loss_total.average
        loss_total.reset()
        loss_xy.reset()
        loss_wh.reset()
        loss_conf.reset()
        loss_class.reset()

        # 保存模型
        # NOTE: this is just demo. You can set the conditions when to save the weights.
        if epoch % args.save_epoch == 0 and epoch > 0:
            if tmp_total_loss <= 2.:
                saver_to_save.save(sess, args.save_dir + 'model-epoch_{}_step_{}_loss_{:.4f}_lr_{:.5g}'.format(epoch, int(__global_step), loss_total.last_avg, __lr))

        # 验证集用以评估模型，这一部分和前面类似
        # switch to validation dataset for evaluation
        if epoch % args.val_evaluation_epoch == 0 and epoch > 0:
            sess.run(val_init_op)

            val_loss_total, val_loss_xy, val_loss_wh, val_loss_conf, val_loss_class = \
                AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter()

            val_preds = []

            for j in trange(args.val_img_cnt):
                __image_ids, __y_pred, __loss = sess.run([image_ids, y_pred, loss],
                                                         feed_dict={is_training: False})
                pred_content = get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, __image_ids, __y_pred)
                val_preds.extend(pred_content)
                val_loss_total.update(__loss[0])
                val_loss_xy.update(__loss[1])
                val_loss_wh.update(__loss[2])
                val_loss_conf.update(__loss[3])
                val_loss_class.update(__loss[4])

            # calc mAP
            # 计算mAP
            rec_total, prec_total, ap_total = AverageMeter(), AverageMeter(), AverageMeter()
            gt_dict = parse_gt_rec(args.val_file, args.img_size)

            info = '======> Epoch: {}, global_step: {}, lr: {:.6g} <======\n'.format(epoch, __global_step, __lr)

            for ii in range(args.class_num):
                npos, nd, rec, prec, ap = voc_eval(gt_dict, val_preds, ii, iou_thres=args.eval_threshold, use_07_metric=False)
                info += 'EVAL: Class {}: Recall: {:.4f}, Precision: {:.4f}, AP: {:.4f}\n'.format(ii, rec, prec, ap)
                rec_total.update(rec, npos)
                prec_total.update(prec, nd)
                ap_total.update(ap, 1)

            mAP = ap_total.avg
            info += 'EVAL: Recall: {:.4f}, Precison: {:.4f}, mAP: {:.4f}\n'.format(rec_total.avg, prec_total.avg, mAP)
            info += 'EVAL: loss: total: {:.2f}, xy: {:.2f}, wh: {:.2f}, conf: {:.2f}, class: {:.2f}\n'.format(
                val_loss_total.avg, val_loss_xy.avg, val_loss_wh.avg, val_loss_conf.avg, val_loss_class.avg)
            print(info)
            logging.info(info)

            if mAP > best_mAP:
                best_mAP = mAP
                saver_best.save(sess, args.save_dir + 'best_model_Epoch_{}_step_{}_mAP_{:.4f}_loss_{:.4f}_lr_{:.7g}'.format(
                                   epoch, __global_step, best_mAP, val_loss_total.last_avg, __lr))
            
            writer.add_summary(make_summary('evaluation/val_mAP', mAP), global_step=epoch)
            writer.add_summary(make_summary('evaluation/val_recall', rec_total.last_avg), global_step=epoch)
            writer.add_summary(make_summary('evaluation/val_precision', prec_total.last_avg), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/total_loss', val_loss_total.last_avg), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/loss_xy', val_loss_xy.last_avg), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/loss_wh', val_loss_wh.last_avg), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/loss_conf', val_loss_conf.last_avg), global_step=epoch)
            writer.add_summary(make_summary('validation_statistics/loss_class', val_loss_class.last_avg), global_step=epoch)

（七）data_utils.py

这一部分代码主要是准备训练用的数据。算得上是YOLO模型中另一个十分重要的部分。

# coding: utf-8

from __future__ import division, print_function

import numpy as np
import cv2
import sys
from utils.data_aug import *
import random

PY_VERSION = sys.version_info[0]
iter_cnt = 0


def parse_line(line):
    '''
    Given a line from the training/test txt file, return parsed info.
    return:
        line_idx: int64
        pic_path: string.
        boxes: shape [N, 4], N is the ground truth count, elements in the second
            dimension are [x_min, y_min, x_max, y_max]
        labels: shape [N]. class index.
    '''
    """
    这一部分代码的主要功能是对给定的数据字符串进行处理，提取出其中的有效信息，包括如下：
    1. 图片索引
    2. 图片路径
    3. 每一个目标框的坐标，
    4. 每一个目标框的label
    """
    if 'str' not in str(type(line)):
        line = line.decode()
    # 按照空格划分数据
    s = line.strip().split(' ')

    # 第一个数据是图片索引
    line_idx = int(s[0])

    # 第二个数据是图片的路径
    pic_path = s[1]

    # 去除掉前两个数据之后，剩下的就和目标框有关系了
    s = s[2:]

    # 每一个目标框都包含五个数据，4个坐标信息和1个label信息，因此数据总数除以5之后就是目标框的总数目
    box_cnt = len(s) // 5

    # 存储数据的list
    boxes = []
    labels = []

    # 对每一个目标框
    for i in range(box_cnt):
        # 提取出label以及四个坐标数据
        label, x_min, y_min, x_max, y_max = int(s[i * 5]), float(s[i * 5 + 1]), float(s[i * 5 + 2]), float(
            s[i * 5 + 3]), float(s[i * 5 + 4])
        boxes.append([x_min, y_min, x_max, y_max])
        labels.append(label)

    # numpy处理一下
    boxes = np.asarray(boxes, np.float32)
    labels = np.asarray(labels, np.int64)

    # 返回
    return line_idx, pic_path, boxes, labels


def process_box(boxes, labels, img_size, class_num, anchors):
    '''
    Generate the y_true label, i.e. the ground truth feature_maps in 3 different scales.
    params:
        boxes: [N, 5] shape, float32 dtype. `x_min, y_min, x_max, y_mix, mixup_weight`.
        labels: [N] shape, int64 dtype.
        class_num: int64 num.
        anchors: [9, 4] shape, float32 dtype.
    '''
    """
    这一部分是数据预处理中最重要的一部分,因为这里才是生成最后的y true的地方
    """
    # anchor的编号,分别对应于每一个不同尺寸的特征图,
    # 大尺寸的特征图对应的anchor是6,7,8,中尺寸的特征图对应的是3,4,5,小尺寸的对应的是0,1,2
    anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]

    # convert boxes form:
    # shape: [N, 2]
    # (x_center, y_center)
    # 计算目标框的中心坐标
    box_centers = (boxes[:, 0:2] + boxes[:, 2:4]) / 2
    # (width, height)
    # 计算目标框的大小
    box_sizes = boxes[:, 2:4] - boxes[:, 0:2]

    # [13, 13, 3, 5+num_class+1] `5` means coords and labels. `1` means mix up weight.
    # 储存数据的矩阵,初始全部数据都是0,分别对应的是三个不同尺寸的特征图
    # 矩阵的shape: [height, width , 3, 4 + 1 + num_class + 1],
    # 最后一维的第一个1表示的是前景后景的标志位,最后一个1表示的是mix up的权重.
    y_true_13 = np.zeros((img_size[1] // 32, img_size[0] // 32, 3, 6 + class_num), np.float32)
    y_true_26 = np.zeros((img_size[1] // 16, img_size[0] // 16, 3, 6 + class_num), np.float32)
    y_true_52 = np.zeros((img_size[1] // 8, img_size[0] // 8, 3, 6 + class_num), np.float32)

    # mix up weight default to 1.
    # mix up的权重默认值设置为1
    y_true_13[..., -1] = 1.
    y_true_26[..., -1] = 1.
    y_true_52[..., -1] = 1.

    # 将他们放在一起,可以统一操作
    y_true = [y_true_13, y_true_26, y_true_52]

    # [N, 1, 2]
    # 扩展一维,shape: [N, 1, 2]
    # 需要注意的是,这里的N表示的是目标框的数目,而不是样本的数据.
    box_sizes = np.expand_dims(box_sizes, 1)

    # broadcast tricks
    # [N, 1, 2] & [9, 2] ==> [N, 9, 2]
    # 使用numpy的广播机制,很容易计算出目标框和anchor之间的交集部分.
    mins = np.maximum(- box_sizes / 2, - anchors / 2)
    maxs = np.minimum(box_sizes / 2, anchors / 2)

    # [N, 9, 2]
    whs = maxs - mins

    # [N, 9]
    # 计算每一个目标框和每一个anchor的iou值
    iou = (whs[:, :, 0] * whs[:, :, 1]) / (
                box_sizes[:, :, 0] * box_sizes[:, :, 1] + anchors[:, 0] * anchors[:, 1] - whs[:, :, 0] * whs[:, :,
                                                                                                         1] + 1e-10)
    # [N]
    # 计算每一个目标框和某一个anchor之间的最佳iou值,并返回最佳iou值对应的下标索引
    best_match_idx = np.argmax(iou, axis=1)

    # 这个字典是为了后续的计算方便才定义的
    ratio_dict = {1.: 8., 2.: 16., 3.: 32.}

    for i, idx in enumerate(best_match_idx):

        # idx: 0,1,2 ==> 2; 3,4,5 ==> 1; 6,7,8 ==> 0
        # 根据上面的下标索引,下面的代码可以计算出该目标框应该对应与哪一张特征图.
        # 因为不同的anchor对应于不同尺寸的特征图,
        # 所以如果一个目标框和其中一个anchor具有最大的iou,那么我们应该将该目标框和这个anchor对应的特征图联系起来.
        feature_map_group = 2 - idx // 3

        # scale ratio: 0,1,2 ==> 8; 3,4,5 ==> 16; 6,7,8 ==> 32
        # 这里就是利用了前面定义的字典,方便的获取缩放倍数
        ratio = ratio_dict[np.ceil((idx + 1) / 3.)]

        # 计算目标框的中心.这里是指缩放之后的中心
        x = int(np.floor(box_centers[i, 0] / ratio))
        y = int(np.floor(box_centers[i, 1] / ratio))

        # 根据特征图的编号,获取anchor的下标索引
        k = anchors_mask[feature_map_group].index(idx)

        # 类别标记
        c = labels[i]
        # print(feature_map_group, '|', y,x,k,c)

        # 分别将数据添加到合适的位置,其中需要注意的是k的使用,它表明的是目标框对应的是哪个anchor
        # 目标框的中心
        y_true[feature_map_group][y, x, k, :2] = box_centers[i]
        # 目标框的尺寸
        y_true[feature_map_group][y, x, k, 2:4] = box_sizes[i]
        # 前景标记
        y_true[feature_map_group][y, x, k, 4] = 1.
        # 类别标记
        y_true[feature_map_group][y, x, k, 5 + c] = 1.
        # mix up权重
        y_true[feature_map_group][y, x, k, -1] = boxes[i, -1]

    # 当我们处理好所有的目标框之后就返回
    return y_true_13, y_true_26, y_true_52


def parse_data(line, class_num, img_size, anchors, mode):
    '''
    param:
        line: a line from the training/test txt file
        class_num: totol class nums.
        img_size: the size of image to be resized to. [width, height] format.
        anchors: anchors.
        mode: 'train' or 'val'. When set to 'train', data_augmentation will be applied.
    '''

    # 如果line不是一个list，说明这里的line是一个str
    if not isinstance(line, list):
        # 直接处理即可，返回图片索引，图片路径，以及gt目标框的坐标和对应的labels
        img_idx, pic_path, boxes, labels = parse_line(line)

        # 根据图片路径读取图片
        img = cv2.imread(pic_path)

        # expand the 2nd dimension, mix up weight default to 1.
        # 扩展矩阵的维度，这里主要是在每一行的末尾添加一个表示mix up权重的信息，此处默认设置为1
        boxes = np.concatenate((boxes, np.full(shape=(boxes.shape[0], 1), fill_value=1., dtype=np.float32)), axis=-1)

    else:
        # the mix up case
        # 如果line表示的是一个list，说明需要使用mix up策略

        # 处理第一张图片
        _, pic_path1, boxes1, labels1 = parse_line(line[0])
        # 读取第一张图片
        img1 = cv2.imread(pic_path1)
        # 处理第二张图片
        img_idx, pic_path2, boxes2, labels2 = parse_line(line[1])
        # 读取第二张图片
        img2 = cv2.imread(pic_path2)

        # 将他们混合在一起
        img, boxes = mix_up(img1, img2, boxes1, boxes2)

        labels = np.concatenate((labels1, labels2))

    # 如果是训练阶段，则会做一些数据增强的操作，如随机颜色抖动，随机裁剪，随机翻转等操作
    if mode == 'train':
        # random color jittering
        # NOTE: applying color distort may lead to bad performance sometimes
        # img = random_color_distort(img)

        # random expansion with prob 0.5
        if np.random.uniform(0, 1) > 0.5:
            img, boxes = random_expand(img, boxes, 2)

        # random cropping
        h, w, _ = img.shape
        boxes, crop = random_crop_with_constraints(boxes, (w, h))
        x0, y0, w, h = crop
        img = img[y0: y0+h, x0: x0+w]

        # resize with random interpolation
        h, w, _ = img.shape
        interp = np.random.randint(0, 5)
        img, boxes = resize_with_bbox(img, boxes, img_size[0], img_size[1], interp)

        # random horizontal flip
        h, w, _ = img.shape
        img, boxes = random_flip(img, boxes, px=0.5)
    else:
        img, boxes = resize_with_bbox(img, boxes, img_size[0], img_size[1], interp=1)

    # 将颜色的通道顺序进行更改
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)

    # 规范化数据至0~1
    # the input of yolo_v3 should be in range 0~1
    img = img / 255.

    # 将给出的gt 目标框进行处理，返回对应的gt矩阵，用以后面的损失计算。
    y_true_13, y_true_26, y_true_52 = process_box(boxes, labels, img_size, class_num, anchors)

    # 返回
    return img_idx, img, y_true_13, y_true_26, y_true_52


def get_batch_data(batch_line, class_num, img_size, anchors, mode, multi_scale=False, mix_up=False, interval=10):
    '''
    generate a batch of imgs and labels
    param:
        batch_line: a batch of lines from train/val.txt files
        class_num: num of total classes.
        img_size: the image size to be resized to. format: [width, height].
        anchors: anchors. shape: [9, 2].
        mode: 'train' or 'val'. if set to 'train', data augmentation will be applied.
        multi_scale: whether to use multi_scale training, img_size varies from [320, 320] to [640, 640] by default. Note that it will take effect only when mode is set to 'train'.
        interval: change the scale of image every interval batches. Note that it's indeterministic because of the multi threading.
    '''


    # 全局的计数器
    global iter_cnt

    # multi_scale training
    # 是否使用多种尺寸进行训练， 默认是False
    if multi_scale and mode == 'train':
        # 设置随机数种子
        random.seed(iter_cnt // interval)
        # 设定选择范围，并随机采样
        random_img_size = [[x * 32, x * 32] for x in range(10, 20)]
        img_size = random.sample(random_img_size, 1)[0]

    # 计数器加1
    iter_cnt += 1

    # 用以保存数据的list
    img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch = [], [], [], [], []

    # mix up strategy
    # 是否使用mix up策略，默认是False
    if mix_up and mode == 'train':
        mix_lines = []
        batch_line = batch_line.tolist()
        for idx, line in enumerate(batch_line):
            if np.random.uniform(0, 1) < 0.5:
                mix_lines.append([line, random.sample(batch_line[:idx] + batch_line[idx+1:], 1)[0]])
            else:
                mix_lines.append(line)
        batch_line = mix_lines

    # 对一个batch中的数据，这里的line一般指的是一行文本数据
    for line in batch_line:
        # 处里数据中的信息，主要是数据索引（一般用不上）图片的像素矩阵，不同特征图所对应的gt信息。
        img_idx, img, y_true_13, y_true_26, y_true_52 = parse_data(line, class_num, img_size, anchors, mode)

        # 附加到这些list的末尾
        img_idx_batch.append(img_idx)
        img_batch.append(img)
        y_true_13_batch.append(y_true_13)
        y_true_26_batch.append(y_true_26)
        y_true_52_batch.append(y_true_52)

    # 使用numpy处理一下
    img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch = np.asarray(img_idx_batch, np.int64), np.asarray(img_batch), np.asarray(y_true_13_batch), np.asarray(y_true_26_batch), np.asarray(y_true_52_batch)

    # 返回
    return img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch