CNN5 调用 C实现pool2d im2col col2im Python 调用 C/C++实现卷积

1. swig实现

在Python 调用 C/C++实现卷积中，尝试了python通过swig调用c提高性能的方法。

以下为pool2d im2col col2im三个函数在swig下的配置。

%module t

%include <stdint.i>

%typemap(in,numinputs=0,noblock=1) size_t *l1
{
  size_t templen1;
  $1 = &templen1;
}

%typemap(in,numinputs=0,noblock=1) size_t *l2
{
  size_t templen2;
  $1 = &templen2;
}

%typemap(in,numinputs=0,noblock=1) size_t *l3
{
  size_t templen3;
  $1 = &templen3;
}

%typemap(in,numinputs=0,noblock=1) size_t *l4
{
  size_t templen4;
  $1 = &templen4;
}


%typemap(in) double*
{
    int i,j,k,l,index=0;
    if (!PySequence_Check($input))
        {
          PyErr_SetString(PyExc_TypeError,"Expecting a sequence");
          return NULL;
        }
    int l1=PyObject_Length($input);
    PyObject *t,*t1,*t2,*t3,*t4;
    t= PySequence_GetItem($input,0);
    int l2=PyObject_Length(t);
    t = PySequence_GetItem(t,0);
    int l3=PyObject_Length(t);
    t = PySequence_GetItem(t,0);
    int l4=PyObject_Length(t);
    $1 = (double *) malloc(l1*l2*l3*l4*sizeof(double));
    for(i=0;i<l1;i++)
    {
      t1 = PySequence_GetItem($input,i);
       for(j=0;j<l2;j++)
       {
           t2 = PySequence_GetItem(t1,j);
            for(k=0;k<l3;k++)
            {
                t3 = PySequence_GetItem(t2,k);
                 for(l=0;l<l4;l++)
                 {
                     t4 = PySequence_GetItem(t3,l);
                      if (!PyNumber_Check(t4))
                        {
                            PyErr_SetString(PyExc_ValueError,"Expecting a 4 dim sequence of doubles");
                            free($1);
                            return NULL;
                        }
                      *($1+index)= (double)PyFloat_AsDouble(t4);
                      ++index;
                 }
            }
       }
    }
}


%typemap(out) double* pool2d
{
    int i,j,k,l,index=0;
    $result = PyList_New(templen1);
      for (i = 0; i < templen1; i++)
      {
        PyObject *t2= PyList_New(templen2);
        for (j = 0; j < templen2; j++)
        {
            PyObject *t1= PyList_New(templen3);
            for (k = 0; k < templen3; k++)
            {
                PyObject *t= PyList_New(templen4);
                for (l = 0; l < templen4; l++)
                {
                    PyObject *o = PyFloat_FromDouble((double)$1[index]);
                    PyList_SetItem(t,l,o);
                    ++index;
                }
                PyList_SetItem(t1,k,t);
            }
            PyList_SetItem(t2,j,t1);
        }
        PyList_SetItem($result,i,t2);
      }
    delete [] $1;
}

%typemap(out) double* im2col
{
    int i,j,index=0;
    $result = PyList_New(templen1);
    for (i = 0; i < templen1; i++)
    {
        PyObject *t= PyList_New(templen2);
        for (j = 0; j < templen2; j++)
        {
            PyObject *o = PyFloat_FromDouble((double)$1[index]);
            PyList_SetItem(t,j,o);
            ++index;
        }
        PyList_SetItem($result,i,t);
    }
    delete [] $1;
}


%typemap(out) double* col2im
{
    int i,j,k,l,index=0;
    $result = PyList_New(templen1);
      for (i = 0; i < templen1; i++)
      {
        PyObject *t2= PyList_New(templen2);
        for (j = 0; j < templen2; j++)
        {
            PyObject *t1= PyList_New(templen3);
            for (k = 0; k < templen3; k++)
            {
                PyObject *t= PyList_New(templen4);
                for (l = 0; l < templen4; l++)
                {
                    PyObject *o = PyFloat_FromDouble((double)$1[index]);
                    PyList_SetItem(t,l,o);
                    ++index;
                }
                PyList_SetItem(t1,k,t);
            }
            PyList_SetItem(t2,j,t1);
        }
        PyList_SetItem($result,i,t2);
      }
    delete [] $1;
}


%typemap(freearg) double *a
{
   if ($1) free($1);
}

%inline %{
double* pool2d(const double* const bottom_data, const int num, const int channels,
    const int height, const int width, const int pooled_height,size_t *l1,size_t *l2,size_t *l3,size_t *l4)
{
    const int w = width;
    const int h = height;
    const int m = channels;
    const int n = num;
    const int d = pooled_height;
    const int zh = h / d + h % d;
    const int zw = w / d + w % d;
    double* top_data;
    top_data=new double [n*m*zh*zw*sizeof(double)];
    int i,j,k,o,u,v,index,index2=0;
    double s;

    for (k = 0; k < n; ++k)
        for (o = 0; o < m; ++o)
            for (i = 0; i < zh; ++i)
                for (j = 0; j < zw; ++j)
                {
                    index=k*m*h*w+o*h*w+d*i*w+d*j;
                    s=-10000.0;
                    for (u = 0; u < d&&(u+d*i)<h; ++u)
                        for (v = 0; v < d&&(v+d*j)<w; ++v)
                            if (*(bottom_data+index+u*w+v)>s)
                                s=*(bottom_data+index+u*w+v);
                    *(top_data+index2)=s;
                    ++index2;
                }
    *l1=n;
    *l2=m;
    *l3=zh;
    *l4=zw;
    return top_data;
}


double* im2col(const double* const bottom_data, const int num, const int channels,
    const int height, const int width, const int filter_height,
    const int filter_width,size_t *l1,size_t *l2)
{
    const int w = width;
    const int h = height;
    const int m = channels;
    const int n = num;
    const int vh = filter_height;
    const int vw = filter_width;
    const int t = h - vh + 1;
    const int kh = t *t;
    const int kw = vh * vh;
    double* top_data;
    top_data=new double [n*m*kh*kw*sizeof(double)];
    int k,o,i,j,u,v,index,index2;
    for (k = 0; k < n; ++k)
        for (o = 0; o < m; ++o)
            for (i = 0; i < t; ++i)
                for (j = 0; j < t; ++j)
                {
                    index=k*kh*kw*m+(i*t+j)*kw*m+o*kw;
                    index2=k*m*h*w+o*h*w;
                    for (u = 0; u < vh; ++u)
                        for (v = 0; v < vw; ++v)
                        {
                            *(top_data+index+u*vh+v)=
                            *(bottom_data+index2+(i+u)*w+j+v);
                        }

                }

    *l1=n*kh;
    *l2=kw*m;
    return top_data;
}

double* col2im(const double* const bottom_data, const int num, const int channels,
    const int height, const int width, const int filter_height,
    const int filter_width, size_t *l1,size_t *l2,size_t *l3,size_t *l4)
{
    const int w = width;
    const int h = height;
    const int m = channels;
    const int n = num;
    const int vh = filter_height;
    const int vw = filter_width;
    const int t = h - vh + 1;
    const int kh = t *t;
    const int kw = vh * vh;
    int k,o,i,j,u,v,index,rh,b;
    double* top_data;
    top_data=new double [n*m*h*w*sizeof(double)];
    for (k = 0; k < n; ++k)
        for (o = 0; o < m; ++o)
            for (i = 0; i < h; ++i)
            {
                rh=vh * (i<(vh - 1)?i:(vh-1));
                b = ((i + 1 - vh)> 0?(i + 1 - vh):0) * t;
                index=k*m*h*h+o*h*h+i*h;
                for (u = 0; u < vh; ++u)
                {
                    *(top_data+index+u)=
                    *(bottom_data+k*kh*kw*m+b*kw*m+rh+u);
                }
                for (v = 0; v < t-1; ++v)
                {
                    *(top_data+index+vh+v)=
                    *(bottom_data+k*kh*kw*m+(b+1+v)*kw*m+rh+vh-1);
                }
            }
    *l1=n;
    *l2=m;
    *l3=h;
    *l4=w;
    return top_data;

}

%}

2.使用效果

只有pool2d提升很大，使用另两个函数速度不仅没有提升，反而有所下降。

# coding:utf8
import cPickle
import numpy as np
import time
import t

"""
cost per 500 data:
default 22s
+pool2d 10s
+im2col 11s
+col2im 12s

"""


class ConvPoolLayer(object):
    def __init__(self, image_shape,filter_shape,poolsize=(2,2)):
        self.filter_shape = filter_shape
        self.image_shape = image_shape
        self.weights = np.random.normal(loc=0, scale=np.sqrt(1.0/np.prod(filter_shape[1:])),
                                  size=(np.prod(filter_shape[1:]),filter_shape[0]))
        self.b = np.random.normal(loc=0, scale=0.2, size=(1,filter_shape[0],1,1))
        self.poolsize = poolsize
        self.samp_shape=(image_shape[-2] - filter_shape[-2] + 1,image_shape[-1] - filter_shape[-1] + 1)
        self.out_shape=(self.samp_shape[0]/poolsize[0],self.samp_shape[1]/poolsize[1])
        self.v=0

    def im2col(self,a):
        n, m, h, w = self.image_shape
        vn, vm, vh, vw = self.filter_shape
        z=t.im2col(a,n,m,h,w,vh,vw)
        z = np.array(z)
        return z

    def col2im(self,a):
        n, m, h, w = self.image_shape
        vn, vm, vh, vw = self.filter_shape
        z=t.col2im(a,n,m,h,w,vh,vw)
        z = np.array(z)
        return z

    def fw_shape(self,a):
        res = np.reshape(a, (self.image_shape[0], -1, self.filter_shape[0]))
        res = np.rollaxis(res, 2, 1)
        res = np.reshape(res, (self.image_shape[0], -1, self.samp_shape[0], self.samp_shape[1]))
        return res

    def bp_shape(self,a):
        res = np.reshape(a, (self.image_shape[0], self.filter_shape[0], -1))
        res = np.rollaxis(res, 1, 3)
        res = np.reshape(res, (-1, self.filter_shape[0]))
        return res

    def feedforward(self, a):
        z = self.im2col(a)
        res = np.dot(z, self.weights)
        res = self.fw_shape(res)
        self.out = self.relu(res+self.b)
        return np.array(self.pool2d(self.out))

    def backprop(self, x, dnext,eta=0.001,weight_decay=0,momentum=0.9):
        if dnext.ndim<3:
            dnext = np.reshape(dnext,(self.image_shape[0],self.filter_shape[0], self.out_shape[0], -1))
        u = self.relu_prime(self.out)
        dnext = np.multiply(u,self.up(dnext,self.poolsize[0]))
        delta = self.bp_shape(dnext)/self.image_shape[0]
        x=self.im2col(x)
        out_delta = np.dot(delta, self.weights.T)
        out_delta=self.col2im(out_delta)
        w = np.dot(x.T, delta)
        w = eta * w+weight_decay*self.weights**2
        self.v = momentum * self.v - w
        self.weights += self.v
        self.b -= eta * np.reshape(np.sum(delta,0),(self.b.shape))
        return out_delta

    def pool2d(self,input, ds=(2, 2)):
        n, m, h, w = np.shape(input)
        return t.pool2d(input,n,m,h,w,self.poolsize[0])

    def up(self,a,n):
        b=np.ones((n,n))
        return np.kron(a,b)

    def relu(self,z):
        return np.maximum(z, 0.0)

    def relu_prime(self,z):
        z[z>0]=1
        return z

class SoftmaxLayer(object):
    def __init__(self, in_num=100,out_num=10):
        self.weights = np.random.randn(in_num, out_num)/np.sqrt(out_num)
        self.v=0

    def feedforward(self, input):
        self.out=self.softmax(np.dot(input, self.weights))
        return self.out

    def backprop(self, input, y,eta=0.001,weight_decay=0,momentum=0.9):
        o=self.out
        delta =o-y
        out_delta=np.dot(delta,self.weights.T)
        w = np.dot(input.T,delta)
        w=eta*w+weight_decay*self.weights**2
        self.v = momentum * self.v - w
        self.weights += self.v
        return out_delta

    def softmax(self,a):
        m = np.exp(a)
        return m / (np.sum(m,axis=1)[:,np.newaxis])

class FullLayer(object):
    def __init__(self, in_num=720,out_num=100):
        self.in_num=in_num
        self.out_num=out_num
        self.biases = np.random.randn(out_num)
        self.weights = np.random.randn(in_num, out_num)/np.sqrt(out_num)
        self.v=0

    def feedforward(self, x):
        if x.ndim>2:
            x = np.reshape(x, (len(x), self.in_num))
        self.out = self.sigmoid(np.dot(x, self.weights)+self.biases)
        return self.out

    def backprop(self, x,delta,eta=0.001,weight_decay=0,momentum=0.9):
        if x.ndim>2:
            x = np.reshape(x, (len(x), self.in_num))
        sp=self.sigmoid_prime(self.out)
        delta = delta * sp
        out_delta = np.dot(delta, self.weights.T)
        w = np.dot( x.T,delta)
        w=eta*w+weight_decay*self.weights**2
        self.v=momentum*self.v-w
        self.weights +=self.v
        self.biases -= eta*np.sum(delta,0)
        return out_delta

    def sigmoid(self,z):
        return 1.0/(1.0+np.exp(-z))

    def sigmoid_prime(self,z):
        return z*(1-z)

class Network(object):
    def __init__(self, layers):
        self.layers=layers
        self.num_layers = len(layers)
        self.a=[]

    def feedforward(self, x):
        self.a.append(x)
        for layer in self.layers:
            x=layer.feedforward(x)
            self.a.append(x)
        return x

    def SGD(self, training_data, test_data,epochs, mini_batch_size, lr=0.001,weight_decay=0.0005,momentum=0.9):
        self.n = len(training_data[0])
        self.mini_batch_size=mini_batch_size
        self.weight_decay = weight_decay
        self.momentum=momentum
        self.lr = lr
        rate=np.exp(np.log(0.9)/2000)**mini_batch_size
        cx=range(epochs)
        for j in cx:
            for k in xrange(0, self.n , mini_batch_size):
                self.lr=self.lr*rate
                batch_x = np.array(training_data[0][k:k + mini_batch_size])
                batch_y = training_data[1][k:k + mini_batch_size]
                self.backprop(batch_x,batch_y)
                if k%500==0:
                    print "Epoch {0}:{1},test:{2},cost={3},lr={4}".format(j,k,
                    self.evaluate([test_data[0],test_data[1]]),self.cost,self.lr)
                    print time.time()

    def backprop(self, x_in, y):
        self.feedforward(x_in)
        for i in range(self.num_layers):
            delta=self.layers[-i-1].backprop(self.a[-i-2],y,eta=self.lr,
                            weight_decay=self.weight_decay,momentum=self.momentum)
            y=delta

    def evaluate(self, test_data):
        x,y=test_data
        num=len(x)
        x=[self.feedforward(np.array(x[size*i:size*i+size])) for i in range((num/size))]
        x=np.reshape(x,(num,np.shape(x)[-1]))
        xp = np.argmax(x, axis=1)
        yp= np.argmax(y, axis=1) if y[0].ndim else y
        self.cost = -np.mean(np.log(x)[np.arange(num),yp])
        return np.mean(yp == xp)*100


if __name__ == '__main__':
        def get_data(data):
            return [np.reshape(x, (1,28,28)) for x in data[0]]

        def get_label(i):
            c = np.zeros((10))
            c[i] = 1
            return c

        f = open('data/mnist.pkl', 'rb')
        training_data, validation_data, test_data = cPickle.load(f)
        training_inputs = get_data(training_data)
        training_label=[get_label(y_) for y_ in training_data[1]]
        test_inputs = get_data(test_data)
        test = zip(test_inputs,test_data[1])
        size=50
        net = Network([ConvPoolLayer(image_shape=[size,1,28,28],filter_shape=[20,1,5,5],poolsize=(2,2)),
                       ConvPoolLayer(image_shape=[size,20,12,12],filter_shape=[40,20,5,5], poolsize=(2,2)),
                       FullLayer(in_num=40*4*4,out_num=100),
                       SoftmaxLayer(in_num=100,out_num=10)])
        net.SGD([training_inputs,training_label],[test_inputs[:500],test_data[1][:500]],
                epochs=3,mini_batch_size=size, lr=0.005,weight_decay=0,momentum=0.9)

3.内存

上面的swig配置内存泄露严重，以下经过修改有所降低，但仍然不断增长。

可能是由于返回时创建了太多PyObject。

%module t

%include <stdint.i>

%typemap(in,numinputs=0,noblock=1) size_t *l1
{
  size_t templen1;
  $1 = &templen1;
}

%typemap(in,numinputs=0,noblock=1) size_t *l2
{
  size_t templen2;
  $1 = &templen2;
}

%typemap(in,numinputs=0,noblock=1) size_t *l3
{
  size_t templen3;
  $1 = &templen3;
}

%typemap(in,numinputs=0,noblock=1) size_t *l4
{
  size_t templen4;
  $1 = &templen4;
}

%typemap(in) double*
{
    int i,j,k,l,index=0;
    if (!PySequence_Check($input))
        {
          PyErr_SetString(PyExc_TypeError,"Expecting a sequence");
          return NULL;
        }
    int len1=PyObject_Length($input);
    PyObject *t;
    t = PySequence_GetItem($input,0);
    int len2=PyObject_Length(t);
    t = PySequence_GetItem(t,0);
    int len3=PyObject_Length(t);
    t = PySequence_GetItem(t,0);
    int len4=PyObject_Length(t);
    $1 = (double *) malloc(len1*len2*len3*len4*sizeof(double));
    for(i=0;i<len1;i++)
    {
       PyObject *t1 = PySequence_GetItem($input,i);
       for(j=0;j<len2;j++)
       {
            PyObject *t2 = PySequence_GetItem(t1,j);
            for(k=0;k<len3;k++)
            {
                 PyObject *t3 = PySequence_GetItem(t2,k);
                 for(l=0;l<len4;l++)
                 {
                     PyObject *t4 = PySequence_GetItem(t3,l);
                     if (!PyNumber_Check(t4))
                     {
                            PyErr_SetString(PyExc_ValueError,"Expecting a 4 dim sequence of doubles");
                            free($1);
                            Py_DECREF(t1);
                            Py_DECREF(t2);
                            Py_DECREF(t3);
                            Py_XDECREF(t4);
                            Py_DECREF($input);
                            return NULL;
                      }
                      *($1+index)= (double)PyFloat_AsDouble(t4);
                      ++index;
                      Py_DECREF(t4);
                 }
                 Py_DECREF(t3);
            }
            Py_DECREF(t2);
       }
       Py_DECREF(t1);
    }
    Py_DECREF($input);
    //Py_DECREF(t);
}


%typemap(out) double* pool2d
{
    int i,j,k,l,index=0;
    $result = PyList_New(templen1);
      for (i = 0; i < templen1; i++)
      {
        PyObject *t2= PyList_New(templen2);
        for (j = 0; j < templen2; j++)
        {
            PyObject *t1= PyList_New(templen3);
            for (k = 0; k < templen3; k++)
            {
                PyObject *t= PyList_New(templen4);
                for (l = 0; l < templen4; l++)
                {
                    PyObject *o = PyFloat_FromDouble((double)$1[index]);
                    PyList_SetItem(t,l,o);
                    ++index;
                }
                PyList_SetItem(t1,k,t);
            }
            PyList_SetItem(t2,j,t1);
        }
        PyList_SetItem($result,i,t2);
      }
    delete [] $1;
}

%typemap(out) double* im2col
{
    int i,j,index=0;
    $result = PyList_New(templen1);
    for (i = 0; i < templen1; i++)
    {
        PyObject *t= PyList_New(templen2);
        for (j = 0; j < templen2; j++)
        {
            PyObject *o = PyFloat_FromDouble((double)$1[index]);
            PyList_SetItem(t,j,o);
            ++index;
        }
        PyList_SetItem($result,i,t);
    }
    delete [] $1;
}


%typemap(out) double* col2im
{
    int i,j,k,l,index=0;
    $result = PyList_New(templen1);
      for (i = 0; i < templen1; i++)
      {
        PyObject *t2= PyList_New(templen2);
        for (j = 0; j < templen2; j++)
        {
            PyObject *t1= PyList_New(templen3);
            for (k = 0; k < templen3; k++)
            {
                PyObject *t= PyList_New(templen4);
                for (l = 0; l < templen4; l++)
                {
                    PyObject *o = PyFloat_FromDouble((double)$1[index]);
                    PyList_SetItem(t,l,o);
                    ++index;
                }
                PyList_SetItem(t1,k,t);
            }
            PyList_SetItem(t2,j,t1);
        }
        PyList_SetItem($result,i,t2);
      }
    delete [] $1;
}


%typemap(freearg) double *a
{
   if ($1) free($1);
}

%inline %{
double* pool2d(const double* const bottom_data, const int num, const int channels,
    const int height, const int width, const int pooled_height,size_t *l1,size_t *l2,size_t *l3,size_t *l4)
{
    const int w = width;
    const int h = height;
    const int m = channels;
    const int n = num;
    const int d = pooled_height;
    const int zh = h / d + h % d;
    const int zw = w / d + w % d;
    double* top_data;
    top_data=new double [n*m*zh*zw*sizeof(double)];
    int i,j,k,o,u,v,index,index2=0;
    double s;

    for (k = 0; k < n; ++k)
        for (o = 0; o < m; ++o)
            for (i = 0; i < zh; ++i)
                for (j = 0; j < zw; ++j)
                {
                    index=k*m*h*w+o*h*w+d*i*w+d*j;
                    s=-10000.0;
                    for (u = 0; u < d&&(u+d*i)<h; ++u)
                        for (v = 0; v < d&&(v+d*j)<w; ++v)
                            if (*(bottom_data+index+u*w+v)>s)
                                s=*(bottom_data+index+u*w+v);
                    *(top_data+index2)=s;
                    ++index2;
                }
    *l1=n;
    *l2=m;
    *l3=zh;
    *l4=zw;
    return top_data;
}


double* im2col(const double* const bottom_data, const int num, const int channels,
    const int height, const int width, const int filter_height,
    const int filter_width,size_t *l1,size_t *l2)
{
    const int w = width;
    const int h = height;
    const int m = channels;
    const int n = num;
    const int vh = filter_height;
    const int vw = filter_width;
    const int t = h - vh + 1;
    const int kh = t *t;
    const int kw = vh * vh;
    double* top_data;
    top_data=new double [n*m*kh*kw*sizeof(double)];
    int k,o,i,j,u,v,index,index2;
    for (k = 0; k < n; ++k)
        for (o = 0; o < m; ++o)
            for (i = 0; i < t; ++i)
                for (j = 0; j < t; ++j)
                {
                    index=k*kh*kw*m+(i*t+j)*kw*m+o*kw;
                    index2=k*m*h*w+o*h*w;
                    for (u = 0; u < vh; ++u)
                        for (v = 0; v < vw; ++v)
                        {
                            *(top_data+index+u*vh+v)=
                            *(bottom_data+index2+(i+u)*w+j+v);
                        }

                }

    *l1=n*kh;
    *l2=kw*m;
    return top_data;
}


double* col2im(const double* const bottom_data, const int num, const int channels,
    const int height, const int width, const int filter_height,
    const int filter_width, size_t *l1,size_t *l2,size_t *l3,size_t *l4)
{
    const int w = width;
    const int h = height;
    const int m = channels;
    const int n = num;
    const int vh = filter_height;
    const int vw = filter_width;
    const int t = h - vh + 1;
    const int kh = t *t;
    const int kw = vh * vh;
    int k,o,i,j,u,v,index,rh,b;
    double* top_data;
    top_data=new double [n*m*h*w*sizeof(double)];
    for (k = 0; k < n; ++k)
        for (o = 0; o < m; ++o)
            for (i = 0; i < h; ++i)
            {
                rh=vh * (i<(vh - 1)?i:(vh-1));
                b = ((i + 1 - vh)> 0?(i + 1 - vh):0) * t;
                index=k*m*h*h+o*h*h+i*h;
                for (u = 0; u < vh; ++u)
                {
                    *(top_data+index+u)=
                    *(bottom_data+k*kh*kw*m+b*kw*m+rh+u);
                }
                for (v = 0; v < t-1; ++v)
                {
                    *(top_data+index+vh+v)=
                    *(bottom_data+k*kh*kw*m+(b+1+v)*kw*m+rh+vh-1);
                }
            }
    *l1=n;
    *l2=m;
    *l3=h;
    *l4=w;
    return top_data;

}

%}