Caffe代码分析-crop_layer.cu

因为要修改Caffe crop layer GPU部分的代码，现将自己对这部分GPU代码的理解总结一下，请大家多多指教！

crop layer完成的功能（以类似Python/NumPy切片的方式表示，下标从0开始，区间左闭右开）：输入A的形状为（N，C，H，W），Reference的形状为（n，c，h，w），Offsets为（o1，o2，o3，o4），则 cropped_A = A[o1:o1+n, o2:o2+c, o3:o3+h, o4:o4+w]

先代码，后解释

#include <vector>

#include "caffe/layers/crop_layer.hpp"

namespace caffe {

// Translates a flat index into the cropped (destination) blob into the flat
// index of the matching element in the uncropped (source) blob.
//
//   index         flat offset of an element in the destination blob
//   ndims         number of axes; each array below is read at 0 .. ndims-1
//   src_strides   per-axis strides of the source blob
//   dest_strides  per-axis strides of the destination blob
//   offsets       per-axis crop offset added to each destination coordinate
//
// Returns the flat offset into the source blob (0 when ndims is 0).
__device__ int compute_uncropped_index(
    int index,
    const int ndims,
    const int* src_strides,
    const int* dest_strides,
    const int* offsets) {
  int remaining = index;
  int uncropped = 0;
  for (int axis = 0; axis < ndims; ++axis) {
    const int stride = dest_strides[axis];
    // Peel off this axis' coordinate; the remainder belongs to later axes.
    const int coord = remaining / stride;
    remaining %= stride;
    // Shift by the crop offset and re-linearise with the source stride.
    uncropped += (coord + offsets[axis]) * src_strides[axis];
  }
  return uncropped;
}

// Forward crop kernel: for every destination element, finds the matching
// source element with compute_uncropped_index and copies it across.
//
//   nthreads      upper bound handed to CUDA_KERNEL_LOOP, i.e. the number of
//                 destination elements to fill
//   ndims         number of axes described by the three index arrays
//   src_strides   per-axis strides of the source blob
//   dest_strides  per-axis strides of the destination blob
//   offsets       per-axis crop offsets
//   src           source data (read only)
//   dest          destination data (written at every looped index)
//
// NOTE(review): CUDA_KERNEL_LOOP is a Caffe macro defined outside this file;
// presumably it distributes indices [0, nthreads) over the launched threads.
template <typename Dtype>
__global__ void crop_kernel_forward(const int nthreads,
    const int ndims,
    const int* src_strides,
    const int* dest_strides,
    const int* offsets,
    const Dtype* src, Dtype* dest) {
  CUDA_KERNEL_LOOP(out_idx, nthreads) {
    dest[out_idx] = src[compute_uncropped_index(
        out_idx, ndims, src_strides, dest_strides, offsets)];
  }
}

// Backward crop kernel: scatters each element of the cropped blob back to
// the position it was taken from in the uncropped blob. Source positions
// outside the crop window are not touched by this kernel.
//
//   nthreads      upper bound handed to CUDA_KERNEL_LOOP, i.e. the number of
//                 elements in the cropped blob
//   ndims         number of axes described by the three index arrays
//   src_strides   per-axis strides of the uncropped blob
//   dest_strides  per-axis strides of the cropped blob
//   offsets       per-axis crop offsets
//   src           uncropped buffer (written)
//   dest          cropped buffer (read only)
//
// NOTE(review): CUDA_KERNEL_LOOP is a Caffe macro defined outside this file;
// presumably it distributes indices [0, nthreads) over the launched threads.
template <typename Dtype>
__global__ void crop_kernel_backward(const int nthreads,
    const int ndims,
    const int* src_strides,
    const int* dest_strides,
    const int* offsets,
    Dtype* src, const Dtype* dest) {
  CUDA_KERNEL_LOOP(crop_idx, nthreads) {
    src[compute_uncropped_index(
        crop_idx, ndims, src_strides, dest_strides, offsets)] = dest[crop_idx];
  }
}

// GPU forward pass: copies the cropped window of bottom[0] into top[0].
//
//   bottom  bottom[0] is the blob being cropped
//   top     top[0] receives the cropped data; its element count sets the
//           amount of work given to the kernel
//
// The stride and offset arrays passed to the kernel are the layer members
// src_strides_, dest_strides_ and offsets, which are filled in outside this
// function.
template <typename Dtype>
void CropLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  const int n = top[0]->count();
  if (n == 0) {
    // Nothing to copy. Skipping also avoids asking CUDA for a launch sized
    // for zero elements (CAFFE_GET_BLOCKS is defined elsewhere; presumably it
    // would yield an empty grid here, which is an invalid configuration).
    return;
  }
  // NOLINT_NEXT_LINE(whitespace/operators)
  crop_kernel_forward<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(n,
      bottom[0]->num_axes(),
      src_strides_.gpu_data(),
      dest_strides_.gpu_data(),
      offsets.gpu_data(),
      bottom_data, top_data);
  // Kernel launches do not return an error code; surface launch failures
  // here instead of letting them show up at some unrelated later CUDA call.
  CUDA_POST_KERNEL_CHECK;
}

// GPU backward pass: routes the gradient of top[0] back into the cropped
// window of bottom[0]'s gradient; everything outside the window is zero.
//
//   top             top[0] supplies the incoming gradient
//   propagate_down  only entry 0 is consulted; when it is false the function
//                   does nothing
//   bottom          bottom[0]'s diff is overwritten with the result
template <typename Dtype>
void CropLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) {
    // No gradient requested: do not touch (or even fetch) the diff buffers.
    return;
  }
  const Dtype* top_diff = top[0]->gpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
  const int n = top[0]->count();

  // The kernel writes only positions inside the crop window, so clear the
  // whole bottom gradient first.
  caffe_gpu_set(bottom[0]->count(), static_cast<Dtype>(0), bottom_diff);
  if (n == 0) {
    // Empty top: the zeroed gradient is already the final answer, and there
    // is nothing to launch.
    return;
  }
  // NOLINT_NEXT_LINE(whitespace/operators)
  crop_kernel_backward<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(n,
      bottom[0]->num_axes(),
      src_strides_.gpu_data(),
      dest_strides_.gpu_data(),
      offsets.gpu_data(),
      bottom_diff, top_diff);
  // Kernel launches do not return an error code; surface launch failures
  // here instead of letting them show up at some unrelated later CUDA call.
  CUDA_POST_KERNEL_CHECK;
}

INSTANTIATE_LAYER_GPU_FUNCS(CropLayer);

}  // namespace caffe

我将分析的重点放在Forward_gpu函数上，该函数在获取bottom、top data的指针之后，调用GPU端程序crop_kernel_forward。

其参数含义如下：

nthreads：n×c×h×w（即top[0]->count()，输出元素的总数）
ndims：4（即bottom[0]->num_axes()）
src_strides：（C×H×W，H×W，W，1）
dest_strides：（c×h×w，h×w，w，1）
offsets：（o1, o2, o3, o4）
src：源指针
dest：目的指针

可以理解为src是A矩阵，dest就是我们需要的cropped_A矩阵。

crop_kernel_forward函数将每一个输出数据映射到一个线程，先通过compute_uncropped_index函数计算src_index，然后进行赋值。这里的重点是compute_uncropped_index，下面我通过函数注释的方式解析一下该函数的具体含义。

// Annotated walk-through: converts a flat index into the cropped
// (destination) blob into the flat index of the same element in the
// uncropped (source) blob, one axis at a time.
__device__ int compute_uncropped_index(
    int index,
    const int ndims,
    const int* src_strides,
    const int* dest_strides,
    const int* offsets) {
  int dest_index = index; // start from the flat destination index passed in by the caller
  int src_index = 0; // accumulates the flat source index
  for (int i = 0; i < ndims; ++i) { // handle each axis in turn
      int coord = dest_index / dest_strides[i];// coord is the destination coordinate along axis i
      dest_index -= coord * dest_strides[i];// remove axis i's contribution; the rest belongs to later axes
      src_index += src_strides[i] * (coord + offsets[i]);// add the source offset contributed by coord plus offsets[i]
  }
  return src_index;
}