/* generic/THTensorConv.c - platform/external/pytorch - Git at Google */

 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/THTensorConv.c"
 #else


 /*
   2D input, 2D kernel : valid cross-correlation (kernel applied unflipped).

   r_     : output, written as consecutive rows of
            ((ir-kr)/sr + 1) x ((ic-kc)/sc + 1) values; results are
            accumulated into it (it is not cleared here)
   alpha  : scale applied to every contribution
   t_     : input image, ir rows of ic values
   k_     : kernel, kr rows of kc values
   sr, sc : row / column stride of the kernel over the input
 */
 void THTensor_(validXCorr2Dptr)(real *r_,
                                        real alpha,
                                        real *t_, long ir, long ic,
                                        real *k_, long kr, long kc,
                                        long sr, long sc)
 {
   const long out_rows = (ir - kr) / sr + 1;
   const long out_cols = (ic - kc) / sc + 1;
   long row, col, krow, kcol;

   if ((sc == 1) && (out_cols >= 4)) {
     /* Row-at-a-time path: each kernel tap is applied to a whole output row
        through THVector_(add) (presumably r_[i] += c * x[i] -- confirm
        against THVector). */
     for (row = 0; row < out_rows; row++) {
       real *in_line = t_ + row*sr*ic;
       real *k_line = k_;
       for (krow = 0; krow < kr; krow++) {
         for (kcol = 0; kcol < kc; kcol++) {
           THVector_(add)(r_, in_line + kcol, alpha*k_line[kcol], out_cols);
         }
         in_line += ic; /* next input line */
         k_line += kc;  /* next kernel line */
       }
       r_ += out_cols;
     }
     return;
   }

   /* Scalar path: one dot product between the kernel and an input window
      per output element. */
   for (row = 0; row < out_rows; row++) {
     for (col = 0; col < out_cols; col++) {
       real *in_line = t_ + row*sr*ic + col*sc;
       real *k_line = k_;
       real acc = 0;
       for (krow = 0; krow < kr; krow++) {
         for (kcol = 0; kcol < kc; kcol++) {
           acc += in_line[kcol]*k_line[kcol];
         }
         in_line += ic; /* next input line */
         k_line += kc;  /* next kernel line */
       }
       /* accumulate into the output and move to the next element */
       *r_ += alpha*acc;
       r_++;
     }
   }
 }

 /*
   2D input, 2D kernel : valid convolution (kernel applied flipped, i.e. it
   is read from its last element backwards).

   r_     : output, written as consecutive rows of
            ((ir-kr)/sr + 1) x ((ic-kc)/sc + 1) values; results are
            accumulated into it (it is not cleared here)
   alpha  : scale applied to every contribution
   t_     : input image, ir rows of ic values
   k_     : kernel, kr rows of kc values
   sr, sc : row / column stride of the kernel over the input
 */
 void THTensor_(validConv2Dptr)(real *r_,
                                       real alpha,
                                       real *t_, long ir, long ic,
                                       real *k_, long kr, long kc,
                                       long sr, long sc)
 {
   const long out_rows = (ir - kr) / sr + 1;
   const long out_cols = (ic - kc) / sc + 1;
   real *k_last = k_ + kr*kc - 1; /* last kernel element */
   long row, col, krow, kcol;

   if ((sc == 1) && (out_cols >= 4)) {
     /* Row-at-a-time path: each kernel tap is applied to a whole output row
        through THVector_(add) (presumably r_[i] += c * x[i] -- confirm
        against THVector). */
     for (row = 0; row < out_rows; row++) {
       real *in_line = t_ + row*sr*ic;
       for (krow = 0; krow < kr; krow++) {
         /* kernel lines are visited bottom-up, each one right-to-left */
         real *k_line = k_last - krow*kc;
         for (kcol = 0; kcol < kc; kcol++) {
           THVector_(add)(r_, in_line + kcol, alpha*k_line[-kcol], out_cols);
         }
         in_line += ic; /* next input line */
       }
       r_ += out_cols;
     }
     return;
   }

   /* Scalar path: one dot product between the flipped kernel and an input
      window per output element. */
   for (row = 0; row < out_rows; row++) {
     for (col = 0; col < out_cols; col++) {
       real *in_line = t_ + row*sr*ic + col*sc;
       real acc = 0;
       for (krow = 0; krow < kr; krow++) {
         real *k_line = k_last - krow*kc;
         for (kcol = 0; kcol < kc; kcol++) {
           acc += in_line[kcol]*k_line[-kcol];
         }
         in_line += ic; /* next input line */
       }
       /* accumulate into the output and move to the next element */
       *r_ += alpha*acc;
       r_++;
     }
   }
 }

 /*
   2D input, 2D kernel : full convolution.

   Every input element scatters a scaled copy of the kernel into the output
   at offset (row*sr, col*sc).

   r_     : output with rows of (ic-1)*sc + kc values; results are
            accumulated into it (it is not cleared here)
   alpha  : scale applied to every contribution
   t_     : input image, ir rows of ic values, read sequentially
   k_     : kernel, kr rows of kc values
   sr, sc : row / column step in the output per input element
 */
 void THTensor_(fullConv2Dptr)(real *r_,
                                      real alpha,
                                      real *t_, long ir, long ic,
                                      real *k_, long kr, long kc,
                                      long sr, long sc)
 {
   const long out_cols = (ic - 1) * sc + kc;
   long row, col, krow, kcol;

   if ((sc == 1) && (ic >= 4)) {
     /* Row-at-a-time path: a whole input row is added, scaled by one kernel
        tap, into a shifted output row through THVector_(add) (presumably
        y[i] += c * x[i] -- confirm against THVector). */
     for (row = 0; row < ir; row++) {
       real *out_line = r_ + row*sr*out_cols;
       real *k_line = k_;
       for (krow = 0; krow < kr; krow++) {
         for (kcol = 0; kcol < kc; kcol++) {
           THVector_(add)(out_line + kcol, t_, alpha*k_line[kcol], ic);
         }
         out_line += out_cols; /* next output line */
         k_line += kc;         /* next kernel line */
       }
       t_ += ic;
     }
     return;
   }

   /* Scalar path: outer product between one input element and the kernel. */
   for (row = 0; row < ir; row++) {
     for (col = 0; col < ic; col++) {
       real *out_line = r_ + row*sr*out_cols + col*sc;
       real *k_line = k_;
       for (krow = 0; krow < kr; krow++) {
         real scaled = *t_ * alpha;
         for (kcol = 0; kcol < kc; kcol++) {
           out_line[kcol] += scaled * k_line[kcol];
         }
         out_line += out_cols; /* next output line */
         k_line += kc;         /* next kernel line */
       }
       t_++;
     }
   }
 }

 /*
   2D input, 2D kernel : full cross-correlation.

   Same scatter scheme as the full convolution, except that the kernel is
   read from its last element backwards.

   r_     : output with rows of (ic-1)*sc + kc values; results are
            accumulated into it (it is not cleared here)
   alpha  : scale applied to every contribution
   t_     : input image, ir rows of ic values, read sequentially
   k_     : kernel, kr rows of kc values
   sr, sc : row / column step in the output per input element
 */
 void THTensor_(fullXCorr2Dptr)(real *r_,
                                       real alpha,
                                       real *t_, long ir, long ic,
                                       real *k_, long kr, long kc,
                                       long sr, long sc)
 {
   const long out_cols = (ic - 1) * sc + kc;
   real *k_last = k_ + kr*kc - 1; /* last kernel element */
   long row, col, krow, kcol;

   if ((sc == 1) && (ic >= 4)) {
     /* Row-at-a-time path: a whole input row is added, scaled by one kernel
        tap, into a shifted output row through THVector_(add) (presumably
        y[i] += c * x[i] -- confirm against THVector). */
     for (row = 0; row < ir; row++) {
       real *out_line = r_ + row*sr*out_cols;
       for (krow = 0; krow < kr; krow++) {
         /* kernel lines are visited bottom-up, each one right-to-left */
         real *k_line = k_last - krow*kc;
         for (kcol = 0; kcol < kc; kcol++) {
           THVector_(add)(out_line + kcol, t_, k_line[-kcol]*alpha, ic);
         }
         out_line += out_cols; /* next output line */
       }
       t_ += ic;
     }
     return;
   }

   /* Scalar path: outer product between one input element and the reversed
      kernel. */
   for (row = 0; row < ir; row++) {
     for (col = 0; col < ic; col++) {
       real *out_line = r_ + row*sr*out_cols + col*sc;
       for (krow = 0; krow < kr; krow++) {
         real *k_line = k_last - krow*kc;
         real scaled = *t_ * alpha;
         for (kcol = 0; kcol < kc; kcol++) {
           out_line[kcol] += scaled * k_line[-kcol];
         }
         out_line += out_cols; /* next output line */
       }
       t_++;
     }
   }
 }

 /*
   2D input, 2D kernel : "reversed" valid cross-correlation.

   The roles of the loops are swapped compared to validXCorr2Dptr: the outer
   loops walk the kernel elements, and every kernel element adds a scaled,
   shifted window of the input to the whole output plane.  The stride is
   applied to the kernel position (offset yy*sr, xx*sc in the input), so the
   output is (ir - (kr-1)*sr) x (ic - (kc-1)*sc).

   With sr = sc = 1 this matches validXCorr2Dptr; with other strides it is
   useful for derivatives w.r.t. a kernel that was applied with stride
   sr,sc != 1.

   Results are accumulated into r_ (it is not cleared here).
 */
 void THTensor_(validXCorr2DRevptr)(real *r_,
                                           real alpha,
                                           real *t_, long ir, long ic,
                                           real *k_, long kr, long kc,
                                           long sr, long sc)
 {
   const long out_rows = ir - (kr - 1) * sr;
   const long out_cols = ic - (kc - 1) * sc;
   /* Same selection rule as before: the THVector_(add) route is taken only
      for unit column stride and kernels at least 4 columns wide. */
   const int use_vector = (sc == 1) && (kc >= 4);
   long krow, kcol, row, col;

   for (krow = 0; krow < kr; krow++) {
     for (kcol = 0; kcol < kc; kcol++) {
       real *out_line = r_;
       real *in_line = t_ + krow*sr*ic + kcol*sc;
       /* kernel elements are consumed sequentially */
       real weight = *k_ * alpha;
       k_++;

       for (row = 0; row < out_rows; row++) {
         if (use_vector) {
           /* presumably out_line[i] += weight * in_line[i] -- confirm
              against THVector */
           THVector_(add)(out_line, in_line, weight, out_cols);
         } else {
           for (col = 0; col < out_cols; col++)
             out_line[col] += weight * in_line[col];
         }
         in_line += ic;
         out_line += out_cols;
       }
     }
   }
 }
 /*
   3D input, 3D kernel : valid cross-correlation (kernel applied unflipped).

   r_         : output, written sequentially as
                ((it-kt)/st + 1) x ((ir-kr)/sr + 1) x ((ic-kc)/sc + 1)
                values; results are accumulated into it (not cleared here)
   alpha      : scale applied to every result
   t_         : input volume, it slices of ir x ic values
   k_         : kernel, kt slices of kr x kc values
   st, sr, sc : depth / row / column stride of the kernel over the input
 */
 void THTensor_(validXCorr3Dptr)(real *r_,
                                        real alpha,
                                        real *t_, long it, long ir, long ic,
                                        real *k_, long kt, long kr, long kc,
                                        long st, long sr, long sc)
 {
   const long out_depth = (it - kt) / st + 1;
   const long out_rows = (ir - kr) / sr + 1;
   const long out_cols = (ic - kc) / sc + 1;
   long slice, row, col;

   for (slice = 0; slice < out_depth; slice++) {
     for (row = 0; row < out_rows; row++) {
       for (col = 0; col < out_cols; col++) {
         /* Dot product in three dimensions between the kernel and the
            input window starting at (slice*st, row*sr, col*sc). */
         real *in_line = t_ + slice*st*ir*ic + row*sr*ic + col*sc;
         real *k_line = k_;
         real acc = 0;
         long kslice, krow, kcol;

         for (kslice = 0; kslice < kt; kslice++) {
           for (krow = 0; krow < kr; krow++) {
             for (kcol = 0; kcol < kc; kcol++) {
               acc += in_line[kcol]*k_line[kcol];
             }
             in_line += ic; /* next input line */
             k_line += kc;  /* next kernel line */
           }
           in_line += (ir-kr)*ic; /* skip to the window's next input slice */
         }

         /* accumulate into the output and move to the next element */
         *r_ += acc*alpha;
         r_++;
       }
     }
   }
 }

 /*
   3D input, 3D kernel : valid convolution (kernel applied flipped, i.e. it
   is read from its last element backwards).

   r_         : output, written sequentially as
                ((it-kt)/st + 1) x ((ir-kr)/sr + 1) x ((ic-kc)/sc + 1)
                values; results are accumulated into it (not cleared here)
   alpha      : scale applied to every result
   t_         : input volume, it slices of ir x ic values
   k_         : kernel, kt slices of kr x kc values
   st, sr, sc : depth / row / column stride of the kernel over the input
 */
 void THTensor_(validConv3Dptr)(real *r_,
                                       real alpha,
                                       real *t_, long it, long ir, long ic,
                                       real *k_, long kt, long kr, long kc,
                                       long st, long sr, long sc)
 {
   const long out_depth = (it - kt) / st + 1;
   const long out_rows = (ir - kr) / sr + 1;
   const long out_cols = (ic - kc) / sc + 1;
   long slice, row, col;

   for (slice = 0; slice < out_depth; slice++) {
     for (row = 0; row < out_rows; row++) {
       for (col = 0; col < out_cols; col++) {
         /* Dot product in three dimensions between the reversed kernel and
            the input window starting at (slice*st, row*sr, col*sc). */
         real *in_line = t_ + slice*st*ir*ic + row*sr*ic + col*sc;
         real *k_line = k_ + kt*kr*kc - 1; /* last kernel element */
         real acc = 0;
         long kslice, krow, kcol;

         for (kslice = 0; kslice < kt; kslice++) {
           for (krow = 0; krow < kr; krow++) {
             for (kcol = 0; kcol < kc; kcol++) {
               acc += in_line[kcol]*k_line[-kcol];
             }
             in_line += ic; /* next input line */
             k_line -= kc;  /* previous kernel line */
           }
           in_line += (ir-kr)*ic; /* skip to the window's next input slice */
         }

         /* accumulate into the output and move to the next element */
         *r_ += alpha*acc;
         r_++;
       }
     }
   }
 }


 /*
   3D input, 3D kernel : full convolution.

   Every input element scatters a scaled copy of the kernel into the output
   at offset (zz*st, yy*sr, xx*sc).

   r_         : output with slices of ((ir-1)*sr + kr) x ((ic-1)*sc + kc)
                values; results are accumulated into it (not cleared here)
   alpha      : scale applied to every contribution
   t_         : input volume, it slices of ir x ic values, read sequentially
   k_         : kernel, kt slices of kr x kc values
   st, sr, sc : depth / row / column step in the output per input element
 */
 void THTensor_(fullConv3Dptr)(real *r_,
                                      real alpha,
                                      real *t_, long it, long ir, long ic,
                                      real *k_, long kt, long kr, long kc,
                                      long st, long sr, long sc)
 {
   const long out_rows = (ir - 1) * sr + kr;
   const long out_cols = (ic - 1) * sc + kc;
   long slice, row, col;

   for (slice = 0; slice < it; slice++) {
     for (row = 0; row < ir; row++) {
       for (col = 0; col < ic; col++) {
         /* Outer product in three dimensions between one input element and
            the kernel. */
         real *out_line = r_ + slice*st*out_rows*out_cols + row*sr*out_cols + col*sc;
         real *k_line = k_;
         long kslice, krow, kcol;

         for (kslice = 0; kslice < kt; kslice++) {
           for (krow = 0; krow < kr; krow++) {
             real scaled = *t_ * alpha;
             for (kcol = 0; kcol < kc; kcol++) {
               out_line[kcol] += scaled * k_line[kcol];
             }
             out_line += out_cols; /* next output line */
             k_line += kc;         /* next kernel line */
           }
           out_line += (out_rows-kr)*out_cols; /* next output slice */
         }
         t_++;
       }
     }
   }
 }

 /*
   3D input, 3D kernel : full cross-correlation.

   Same scatter scheme as the full 3D convolution, except that the kernel is
   read from its last element backwards.

   r_         : output with slices of ((ir-1)*sr + kr) x ((ic-1)*sc + kc)
                values; results are accumulated into it (not cleared here)
   alpha      : scale applied to every contribution
   t_         : input volume, it slices of ir x ic values, read sequentially
   k_         : kernel, kt slices of kr x kc values
   st, sr, sc : depth / row / column step in the output per input element
 */
 void THTensor_(fullXCorr3Dptr)(real *r_,
                                       real alpha,
                                       real *t_, long it, long ir, long ic,
                                       real *k_, long kt, long kr, long kc,
                                       long st, long sr, long sc)
 {
   const long out_rows = (ir - 1) * sr + kr;
   const long out_cols = (ic - 1) * sc + kc;
   long slice, row, col;

   for (slice = 0; slice < it; slice++) {
     for (row = 0; row < ir; row++) {
       for (col = 0; col < ic; col++) {
         /* Outer product in three dimensions between one input element and
            the reversed kernel. */
         real *out_line = r_ + slice*st*out_rows*out_cols + row*sr*out_cols + col*sc;
         real *k_line = k_ + kt*kr*kc -1; /* last kernel element */
         long kslice, krow, kcol;

         for (kslice = 0; kslice < kt; kslice++) {
           for (krow = 0; krow < kr; krow++) {
             real scaled = *t_ * alpha;
             for (kcol = 0; kcol < kc; kcol++) {
               out_line[kcol] += scaled * k_line[-kcol];
             }
             out_line += out_cols; /* next output line */
             k_line -= kc;         /* previous kernel line */
           }
           out_line += (out_rows-kr)*out_cols; /* next output slice */
         }
         t_++;
       }
     }
   }
 }

 /*
   3D input, 3D kernel : "reversed" valid cross-correlation.

   The outer loops walk the kernel elements; every kernel element adds a
   scaled, shifted window of the input to the whole output volume.  The
   stride is applied to the kernel position (offset zz*st, yy*sr, xx*sc in
   the input), so the output is
   (it - (kt-1)*st) x (ir - (kr-1)*sr) x (ic - (kc-1)*sc).

   With unit strides this matches validXCorr3Dptr; with other strides it is
   useful for derivatives w.r.t. a kernel that was applied with a stride
   != 1.

   Results are accumulated into r_ (it is not cleared here).
 */
 void THTensor_(validXCorr3DRevptr)(real *r_,
                                           real alpha,
                                           real *t_, long it, long ir, long ic,
                                           real *k_, long kt, long kr, long kc,
                                           long st, long sr, long sc)
 {
   const long out_depth = it - (kt - 1) * st;
   const long out_rows = ir - (kr - 1) * sr;
   const long out_cols = ic - (kc - 1) * sc;
   long kslice, krow, kcol;

   for (kslice = 0; kslice < kt; kslice++) {
     for (krow = 0; krow < kr; krow++) {
       for (kcol = 0; kcol < kc; kcol++) {
         real *out_line = r_;
         real *in_line = t_ + kslice*st*ir*ic + krow*sr*ic + kcol*sc;
         /* kernel elements are consumed sequentially */
         real weight = *k_ * alpha;
         long slice, row, col;
         k_++;

         for (slice = 0; slice < out_depth; slice++) {
           for (row = 0; row < out_rows; row++) {
             for (col = 0; col < out_cols; col++)
               out_line[col] += weight * in_line[col];
             in_line += ic;
             out_line += out_cols;
           }
           in_line += (ir-out_rows)*ic; /* next input slice */
         }
       }
     }
   }
 }

 /*
   Dispatch a single 2D plane operation to one of the four pointer kernels.

   vf : first character selects 'V' (valid) or 'F' (full)
   xc : first character selects 'X' (cross-correlation) or 'C' (convolution)

   All remaining arguments are forwarded unchanged.
 */
 void THTensor_(conv2d)(real* output_data,
                        real alpha,
                        real* ptr_input, long nInputRows, long nInputCols,
                        real* ptr_weight, long nKernelRows, long nKernelCols,
                        long srow, long scol,
                        const char *vf, const char *xc)
 {
   THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'");
   THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'");

   if (*vf == 'F' && *xc == 'X') {
     THTensor_(fullXCorr2Dptr)(output_data, alpha,
                               ptr_input,  nInputRows,  nInputCols,
                               ptr_weight, nKernelRows, nKernelCols,
                               srow, scol);
   } else if (*vf == 'F') {
     THTensor_(fullConv2Dptr)(output_data, alpha,
                              ptr_input,  nInputRows,  nInputCols,
                              ptr_weight, nKernelRows, nKernelCols,
                              srow, scol);
   } else if (*xc == 'X') {
     THTensor_(validXCorr2Dptr)(output_data, alpha,
                                ptr_input,  nInputRows,  nInputCols,
                                ptr_weight, nKernelRows, nKernelCols,
                                srow, scol);
   } else {
     THTensor_(validConv2Dptr)(output_data, alpha,
                               ptr_input,  nInputRows,  nInputCols,
                               ptr_weight, nKernelRows, nKernelCols,
                               srow, scol);
   }
 }

 /*
   Dispatch a single 3D volume operation to one of the four pointer kernels.

   vf : first character selects 'V' (valid) or 'F' (full)
   xc : first character selects 'X' (cross-correlation) or 'C' (convolution)

   All remaining arguments are forwarded unchanged.
 */
 void THTensor_(conv3d)(real* output_data,
                        real alpha,
                        real* ptr_input, long nInputDepth, long nInputRows, long nInputCols,
                        real* ptr_weight, long nKernelDepth, long nKernelRows, long nKernelCols,
                        long sdepth, long srow, long scol,
                        const char *vf, const char *xc)
 {
   THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can be 'V' or 'F'");
   THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can be 'X' or 'C'");

   if (*vf == 'F' && *xc == 'X') {
     THTensor_(fullXCorr3Dptr)(output_data, alpha,
                               ptr_input, nInputDepth, nInputRows,  nInputCols,
                               ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                               sdepth, srow, scol);
   } else if (*vf == 'F') {
     THTensor_(fullConv3Dptr)(output_data, alpha,
                              ptr_input, nInputDepth, nInputRows,  nInputCols,
                              ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                              sdepth, srow, scol);
   } else if (*xc == 'X') {
     THTensor_(validXCorr3Dptr)(output_data, alpha,
                                ptr_input, nInputDepth, nInputRows,  nInputCols,
                                ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                                sdepth, srow, scol);
   } else {
     THTensor_(validConv3Dptr)(output_data, alpha,
                               ptr_input, nInputDepth, nInputRows,  nInputCols,
                               ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                               sdepth, srow, scol);
   }
 }

 /*
   Output length along one dimension for input length x, kernel length k and
   stride s: (x-k)/s + 1 for a valid ('V') operation, (x-1)*s + k for a
   full ('F') one.  Only the first character of vf is inspected.
 */
 long THTensor_(convsize)(long x, long k, long s, const char* vf)
 {
   THArgCheck(*vf == 'V' || *vf == 'F', 1, "type of convolution can be 'V' or 'F'");
   return (*vf == 'V') ? ((x-k)/s + 1) : ((x-1)*s + k);
 }


 /*
   3D input, 3D kernel, 4D output
   like rank1 update
   A <- xx' + beta*A
   for sr,sc=1 this is equivalent to conv2Dger, but otherwise it is useful for
   calculating derivatives wrt a kernel that is applied with stride sr,sc != 1

   r_         : result, resized to
                nKernelPlane x nInputPlane x nOutputRows x nOutputCols with
                nOutputRows = nInputRows - (nKernelRows-1)*srow and
                nOutputCols = nInputCols - (nKernelCols-1)*scol
   beta       : scale applied to the previous content of r_; the result is
                zero-filled instead when r_ was empty, beta is 0, or the
                resize changed its element count
   alpha      : scale applied to every new contribution
   t_         : input,  nInputPlane  x nInputRows  x nInputCols
   k_         : kernel, nKernelPlane x nKernelRows x nKernelCols
   srow, scol : strides (must be >= 1)

   Output plane (k, i) receives validXCorr2DRevptr(input plane i, kernel
   plane k).
 */
 void THTensor_(conv2DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol)
 {
   long nInputPlane, nInputRows, nInputCols;
   long nKernelPlane, nKernelRows, nKernelCols;
   long nOutputRows, nOutputCols;
   long istride0, kstride0;
   THTensor *input;
   THTensor *kernel;
   real *input_data;
   real *weight_data;
   real *output_data;
   long nelem;
   long k;

   THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
   THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
   THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
   THArgCheck(scol >= 1, 6, "Stride should be a positive integer");

   /* work on contiguous versions; both are released at the end */
   input = THTensor_(newContiguous)(t_);
   kernel = THTensor_(newContiguous)(k_);

   nInputPlane = input->size[0];
   istride0    = input->stride[0];
   nInputRows  = input->size[1];
   nInputCols  = input->size[2];

   kstride0 = kernel->stride[0];
   nKernelPlane = kernel->size[0];
   nKernelRows = kernel->size[1];
   nKernelCols = kernel->size[2];

   /* NOTE(review): with a stride > 1 the output size computed below can be
      <= 0 even when this check passes -- confirm callers guarantee a
      positive output size. */
   THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv2DRevger : Input image is smaller than kernel");

   nOutputRows = nInputRows - (nKernelRows - 1) * srow;
   nOutputCols = nInputCols - (nKernelCols - 1) * scol;

   /* remember the old element count to detect a resize that changed it */
   nelem = THTensor_(nElement)(r_);
   THTensor_(resize4d)(r_,nKernelPlane, nInputPlane, nOutputRows, nOutputCols);

   input_data = THTensor_(data)(input);
   weight_data = THTensor_(data)(kernel);
   output_data = THTensor_(data)(r_);

   if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
   {
     /* nothing meaningful to keep: clear the result, one plane per iteration */
 #pragma omp parallel for private(k)
     for (k = 0; k < r_->size[0]*r_->size[1]; k++)
     {
       real* ptr_output = output_data + k*nOutputCols*nOutputRows;
       long l;
       for (l = 0; l < nOutputRows*nOutputCols; l++)
         ptr_output[l] = 0.0;
     }
   }
   else if (beta != 1)
   {
     /* scale the previous result by beta, one plane per iteration */
 #pragma omp parallel for private(k)
     for (k = 0; k < r_->size[0]*r_->size[1]; k++)
     {
       real* ptr_output = output_data + k*nOutputCols*nOutputRows;
       long l;
       for (l = 0; l < nOutputRows*nOutputCols; l++)
         ptr_output[l] *= beta;
     }
   }

   /* each iteration of k writes only the output planes (k, *) */
 #pragma omp parallel for private(k)
   for(k = 0; k < nKernelPlane; k++)
   {
     long i;
     /* get kernel */
     real *ptr_weight = weight_data+k*kstride0;

     for(i = 0; i < nInputPlane; i++)
     {
       /* get output */
       real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows;
       /* get input */
       real *ptr_input = input_data+i*istride0;

       /* do image, kernel convolution */
       THTensor_(validXCorr2DRevptr)(ptr_output,
                                     alpha,
                                     ptr_input,  nInputRows,  nInputCols,
                                     ptr_weight, nKernelRows, nKernelCols,
                                     srow, scol);
     }
   }
   THTensor_(free)(input);
   THTensor_(free)(kernel);
 }


 /*
   4D input, 4D kernel, 4D output
   batched version of conv2DRevger: like a rank1 update
   A <- xx' + beta*A
   where the contributions of all batch entries are summed.
   It is useful for calculating derivatives wrt a kernel that is applied
   with stride sr,sc != 1

   r_         : result, resized to
                nKernelPlane x nInputPlane x nOutputRows x nOutputCols with
                nOutputRows = nInputRows - (nKernelRows-1)*srow and
                nOutputCols = nInputCols - (nKernelCols-1)*scol
   beta       : scale applied to the previous content of r_; the result is
                zero-filled instead when r_ was empty, beta is 0, or the
                resize changed its element count
   alpha      : scale applied to every new contribution
   t_         : input,  nbatch x nInputPlane  x nInputRows  x nInputCols
   k_         : kernel, nbatch x nKernelPlane x nKernelRows x nKernelCols
                (the first dimension must match the input's)
   srow, scol : strides (must be >= 1)

   Output plane (k, i) accumulates, over every batch entry p,
   validXCorr2DRevptr(input[p][i], kernel[p][k]).
 */
 void THTensor_(conv2DRevgerm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol)
 {
   long nbatch, nInputPlane, nInputRows, nInputCols;
   long nKernelPlane, nKernelRows, nKernelCols;
   long nOutputRows, nOutputCols;
   long istride0, kstride0, istride1, kstride1;
   THTensor *input;
   THTensor *kernel;
   real *input_data;
   real *weight_data;
   real *output_data;
   long nelem;
   long k;

   THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
   THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
   THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
   THArgCheck(scol >= 1, 6, "Stride should be a positive integer");

   /* work on contiguous versions; both are released at the end */
   input = THTensor_(newContiguous)(t_);
   kernel = THTensor_(newContiguous)(k_);

   istride0    = input->stride[0];
   istride1    = input->stride[1];
   nbatch      = input->size[0];
   nInputPlane = input->size[1];
   nInputRows  = input->size[2];
   nInputCols  = input->size[3];

   kstride0 = kernel->stride[0];
   kstride1 = kernel->stride[1];
   nKernelPlane = kernel->size[1];
   nKernelRows = kernel->size[2];
   nKernelCols = kernel->size[3];

   /* NOTE(review): with a stride > 1 the output size computed below can be
      <= 0 even when the size check passes -- confirm callers guarantee a
      positive output size. */
   THArgCheck(nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv2DRevgerm : Input image is smaller than kernel");
   THArgCheck(kernel->size[0] == input->size[0] , 2, "conv2DRevgerm : Input batch and kernel batch is not same size");

   nOutputRows = nInputRows - (nKernelRows - 1) * srow;
   nOutputCols = nInputCols - (nKernelCols - 1) * scol;

   /* remember the old element count to detect a resize that changed it */
   nelem = THTensor_(nElement)(r_);
   THTensor_(resize4d)(r_,nKernelPlane, nInputPlane, nOutputRows, nOutputCols);

   input_data = THTensor_(data)(input);
   weight_data = THTensor_(data)(kernel);
   output_data = THTensor_(data)(r_);

   if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
   {
     /* nothing meaningful to keep: clear the result, one plane per iteration */
 #pragma omp parallel for private(k)
     for (k = 0; k < r_->size[0]*r_->size[1]; k++)
     {
       real* ptr_output = output_data + k*nOutputCols*nOutputRows;
       long l;
       for (l = 0; l < nOutputRows*nOutputCols; l++)
         ptr_output[l] = 0.0;
     }
   }
   else if (beta != 1)
   {
     /* scale the previous result by beta, one plane per iteration */
 #pragma omp parallel for private(k)
     for (k = 0; k < r_->size[0]*r_->size[1]; k++)
     {
       real* ptr_output = output_data + k*nOutputCols*nOutputRows;
       long l;
       for (l = 0; l < nOutputRows*nOutputCols; l++)
         ptr_output[l] *= beta;
     }
   }

   /* each iteration of k writes only the output planes (k, *); the batch
      loop is innermost, so all batch contributions to one plane are added
      within the same iteration of k */
 #pragma omp parallel for private(k)
   for(k = 0; k < nKernelPlane; k++)
   {
     long i;
     for(i = 0; i < nInputPlane; i++)
     {
       long p;
       for(p = 0; p < nbatch; p++)
       {
         /* get kernel */
         real *ptr_weight = weight_data + p*kstride0 + k*kstride1;
         /* get output */
         real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows;
         /* get input */
         real *ptr_input = input_data + p*istride0 + i*istride1;

         /* do image, kernel convolution */
         THTensor_(validXCorr2DRevptr)(ptr_output,
                                       alpha,
                                       ptr_input,  nInputRows,  nInputCols,
                                       ptr_weight, nKernelRows, nKernelCols,
                                       srow, scol);
       }
     }
   }
   THTensor_(free)(input);
   THTensor_(free)(kernel);
 }


 /*
   3D input, 3D kernel, 4D output
   like rank1 update
   A <- xx' + beta*A

   r_         : result, resized to
                nKernelPlane x nInputPlane x nOutputRows x nOutputCols; the
                output size is (n-1)*stride + k per dimension for 'F' and
                (n-k)/stride + 1 for 'V'
   beta       : scale applied to the previous content of r_; the result is
                zero-filled instead when r_ was empty, beta is 0, or the
                resize changed its element count
   alpha      : scale applied to every new contribution
   t_         : input,  nInputPlane  x nInputRows  x nInputCols
   k_         : kernel, nKernelPlane x nKernelRows x nKernelCols
   srow, scol : strides (must be >= 1)
   vf         : first character selects 'V' (valid) or 'F' (full)
   xc         : first character selects 'X' (cross-correlation) or
                'C' (convolution)

   Output plane (k, i) receives the selected operation applied to input
   plane i and kernel plane k.
 */
 void THTensor_(conv2Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
 {
   long nInputPlane, nInputRows, nInputCols;
   long nKernelPlane, nKernelRows, nKernelCols;
   long nOutputRows, nOutputCols;
   long istride0, kstride0;

   THTensor *input;
   THTensor *kernel;
   real *input_data;
   real *weight_data;
   real *output_data;
   long nelem;
   long k;

   THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
   THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
   THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
   THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
   THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'");
   THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'");

   /* work on contiguous versions; both are released at the end */
   input = THTensor_(newContiguous)(t_);
   kernel = THTensor_(newContiguous)(k_);

   nInputPlane = input->size[0];
   istride0    = input->stride[0];
   nInputRows  = input->size[1];
   nInputCols  = input->size[2];

   kstride0 = kernel->stride[0];
   nKernelPlane = kernel->size[0];
   nKernelRows = kernel->size[1];
   nKernelCols = kernel->size[2];

   /* a kernel larger than the image is only acceptable in full mode */
   THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dger : Input image is smaller than kernel");

   if (*vf == 'F') {
     nOutputRows = (nInputRows - 1) * srow + nKernelRows;
     nOutputCols = (nInputCols - 1) * scol + nKernelCols;
   } else { /* valid */
     nOutputRows = (nInputRows - nKernelRows) / srow + 1;
     nOutputCols = (nInputCols - nKernelCols) / scol + 1;
   }

   /* remember the old element count to detect a resize that changed it */
   nelem = THTensor_(nElement)(r_);
   THTensor_(resize4d)(r_, nKernelPlane, nInputPlane, nOutputRows, nOutputCols);

   input_data = THTensor_(data)(input);
   weight_data = THTensor_(data)(kernel);
   output_data = THTensor_(data)(r_);

   if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
   {
     /* nothing meaningful to keep: clear the result, one plane per iteration */
 #pragma omp parallel for private(k)
     for (k = 0; k < r_->size[0]*r_->size[1]; k++)
     {
       real* ptr_output = output_data + k*nOutputCols*nOutputRows;
       long l;
       for (l = 0; l < nOutputRows*nOutputCols; l++)
         ptr_output[l] = 0.0;
     }
   }
   else if (beta != 1)
   {
     /* scale the previous result by beta, one plane per iteration */
 #pragma omp parallel for private(k)
     for (k = 0; k < r_->size[0]*r_->size[1]; k++)
     {
       real* ptr_output = output_data + k*nOutputCols*nOutputRows;
       long l;
       for (l = 0; l < nOutputRows*nOutputCols; l++)
         ptr_output[l] *= beta;
     }
   }

   /* each iteration of k writes only the output planes (k, *) */
 #pragma omp parallel for private(k)
   for(k = 0; k < nKernelPlane; k++)
   {
     long i;
     /* get kernel */
     real *ptr_weight = weight_data+k*kstride0;

     for(i = 0; i < nInputPlane; i++)
     {
       /* get output */
       real *ptr_output = output_data + k*nInputPlane*nOutputCols*nOutputRows + i*nOutputCols*nOutputRows;
       /* get input */
       real *ptr_input = input_data+i*istride0;

       /* do image, kernel convolution */
       if (*vf == 'F') {
         if (*xc == 'X') {
           THTensor_(fullXCorr2Dptr)(ptr_output,
                                     alpha,
                                     ptr_input,  nInputRows,  nInputCols,
                                     ptr_weight, nKernelRows, nKernelCols,
                                     srow, scol);
         } else {
           THTensor_(fullConv2Dptr)(ptr_output,
                                    alpha,
                                    ptr_input,  nInputRows,  nInputCols,
                                    ptr_weight, nKernelRows, nKernelCols,
                                    srow, scol);
         }
       } else {
         if (*xc == 'X') {
           THTensor_(validXCorr2Dptr)(ptr_output,
                                      alpha,
                                      ptr_input,  nInputRows,  nInputCols,
                                      ptr_weight, nKernelRows, nKernelCols,
                                      srow, scol);
         } else {
           THTensor_(validConv2Dptr)(ptr_output,
                                     alpha,
                                     ptr_input,  nInputRows,  nInputCols,
                                     ptr_weight, nKernelRows, nKernelCols,
                                     srow, scol);
         }
       }
     }
   }
   THTensor_(free)(input);
   THTensor_(free)(kernel);
 }


 /*
   3D input, 4D kernel, 3D output
   matrix vector product like
   y <- Ax + beta*y
 */
 void THTensor_(conv2Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
 {
   /*
     For every output plane k, accumulates into r_[k] the result of applying
     kernel slice k_[k][i] to input plane t_[i], for all input planes i.
     vf selects 'V'alid or 'F'ull output size, xc selects 'X' (xcorr) or 'C'
     (conv) and thereby which *2Dptr routine is called; alpha is forwarded to
     that routine.
     r_ is resized to nOutputPlane x nOutputRows x nOutputCols. It is zeroed
     when it was empty, when beta == 0 or when its element count changed, and
     multiplied by beta otherwise (unless beta == 1).
   */
   long nInputPlane, nInputRows, nInputCols;
   long nKernelRows, nKernelCols;
   long nOutputPlane, nOutputRows, nOutputCols;
   long istride0, kstride0, kstride1;
   long planeSize;
   THTensor *input;
   THTensor *kernel;
   real *input_data;
   real *weight_data;
   real *output_data;
   long nelem;
   long k;

   THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
   THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
   THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
   THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
   THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'");
   THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'");

   /* Sizes are read from the caller's tensors so that every shape check runs
      before any reference is acquired below; if THArgCheck does not return
      on failure, nothing is left un-freed. The contiguous copies are
      expected to have the same sizes as t_ and k_. */
   nInputPlane  = t_->size[0];
   nInputRows   = t_->size[1];
   nInputCols   = t_->size[2];
   nOutputPlane = k_->size[0];
   nKernelRows  = k_->size[2];
   nKernelCols  = k_->size[3];

   THArgCheck(k_->size[1] == nInputPlane, 2, "invalid number of input planes");
   THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmv : Input image is smaller than kernel");

   if (*vf == 'F') {
     nOutputRows = (nInputRows - 1) * srow + nKernelRows;
     nOutputCols = (nInputCols - 1) * scol + nKernelCols;
   } else { /* valid */
     nOutputRows = (nInputRows - nKernelRows) / srow + 1;
     nOutputCols = (nInputCols - nKernelCols) / scol + 1;
   }
   planeSize = nOutputRows * nOutputCols;

   input = THTensor_(newContiguous)(t_);
   /* The kernel is only copied when its two innermost dimensions are not
      densely packed; the outer strides are honoured through kstride0/1. */
   if (k_->stride[3] != 1 || k_->stride[2] != k_->size[3]) {
     kernel = THTensor_(newContiguous)(k_);
   } else {
     THTensor_(retain)(k_);
     kernel = k_;
   }

   istride0 = input->stride[0];
   kstride0 = kernel->stride[0];
   kstride1 = kernel->stride[1];

   /* element count must be sampled before the resize to detect a change */
   nelem = THTensor_(nElement)(r_);
   THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols);

   input_data = THTensor_(data)(input);
   weight_data = THTensor_(data)(kernel);
   /* NOTE(review): r_ is addressed as a dense block from here on; this
      presumes resize3d leaves it contiguous -- confirm. */
   output_data = THTensor_(data)(r_);

   if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
   {
     /*THTensor_(zero)(r_);*/
 #pragma omp parallel for private(k)
     for (k = 0; k < r_->size[0]; k++)
     {
       real *ptr_output = output_data + k*planeSize;
       long l;
       for (l = 0; l < planeSize; l++)
         ptr_output[l] = 0.0;
     }
   }
   else if (beta != 1)
   {
     /*THTensor_(mul)(r_, beta);*/
 #pragma omp parallel for private(k)
     for (k = 0; k < r_->size[0]; k++)
     {
       real *ptr_output = output_data + k*planeSize;
       long l;
       for (l = 0; l < planeSize; l++)
         ptr_output[l] *= beta;
     }
   }

   /* Each iteration writes only to its own output plane. */
 #pragma omp parallel for private(k)
   for(k = 0; k < nOutputPlane; k++)
   {
     long i;
     real *ptr_output = output_data + k*planeSize;
     for(i = 0; i < nInputPlane; i++)
     {
       real *ptr_weight = weight_data + k*kstride0 + i*kstride1;
       real *ptr_input = input_data + i*istride0;

       /* do image, kernel convolution */
       if (*vf == 'F')
       {
         if (*xc == 'X')
         {
           THTensor_(fullXCorr2Dptr)(ptr_output,
                                     alpha,
                                     ptr_input,  nInputRows,  nInputCols,
                                     ptr_weight, nKernelRows, nKernelCols,
                                     srow, scol);
         }
         else
         {
           THTensor_(fullConv2Dptr)(ptr_output,
                                    alpha,
                                    ptr_input,  nInputRows,  nInputCols,
                                    ptr_weight, nKernelRows, nKernelCols,
                                    srow, scol);
         }
       }
       else
       {
         if (*xc == 'X')
         {
           THTensor_(validXCorr2Dptr)(ptr_output,
                                      alpha,
                                      ptr_input,  nInputRows,  nInputCols,
                                      ptr_weight, nKernelRows, nKernelCols,
                                      srow, scol);
         }
         else
         {
           THTensor_(validConv2Dptr)(ptr_output,
                                     alpha,
                                     ptr_input,  nInputRows,  nInputCols,
                                     ptr_weight, nKernelRows, nKernelCols,
                                     srow, scol);
         }
       }
     }
   }
   THTensor_(free)(input);
   THTensor_(free)(kernel);
 }


 /*
   4D input (batch of 3D inputs), 4D kernel, 4D output
   matrix matrix product like: the same kernel bank is applied to every
   batch entry
   y <- Ax + beta*y
 */
 void THTensor_(conv2Dmm)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
 {
   long nInputPlane, nInputRows, nInputCols;
   long nKernelRows, nKernelCols;
   long nOutputPlane, nOutputRows, nOutputCols;
   long kstride0, kstride1;
   long inPlaneSize, outPlaneSize, outBatchSize;
   THTensor *input;
   THTensor *kernel;
   long nbatch;
   long nelem;
   real *input_data;
   real *weight_data;
   real *output_data;
   long p;

   THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
   THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
   THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
   THArgCheck(scol >= 1, 6, "Stride should be a positive integer");
   THArgCheck(*vf == 'V' || *vf == 'F', 7, "type of convolution can 'V' or 'F'");
   THArgCheck(*xc == 'C' || *xc == 'X', 7, "type of convolution can 'X' or 'C'");

   /* Sizes are read from the caller's tensors so that every shape check runs
      before any reference is acquired below; if THArgCheck does not return
      on failure, nothing is left un-freed. The contiguous copies are
      expected to have the same sizes as t_ and k_. */
   nbatch       = t_->size[0];
   nInputPlane  = t_->size[1];
   nInputRows   = t_->size[2];
   nInputCols   = t_->size[3];
   nOutputPlane = k_->size[0];
   nKernelRows  = k_->size[2];
   nKernelCols  = k_->size[3];

   THArgCheck(k_->size[1] == nInputPlane, 2, "invalid number of input planes");
   THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmm : Input image is smaller than kernel");

   if (*vf == 'F') {
     nOutputRows = (nInputRows - 1) * srow + nKernelRows;
     nOutputCols = (nInputCols - 1) * scol + nKernelCols;
   } else { /* valid */
     nOutputRows = (nInputRows - nKernelRows) / srow + 1;
     nOutputCols = (nInputCols - nKernelCols) / scol + 1;
   }
   inPlaneSize  = nInputRows * nInputCols;
   outPlaneSize = nOutputRows * nOutputCols;
   outBatchSize = nOutputPlane * outPlaneSize;

   input = THTensor_(newContiguous)(t_);
   /* The kernel is only copied when its two innermost dimensions are not
      densely packed; the outer strides are honoured through kstride0/1. */
   if (k_->stride[3] != 1 || k_->stride[2] != k_->size[3]) {
     kernel = THTensor_(newContiguous)(k_);
   } else {
     THTensor_(retain)(k_);
     kernel = k_;
   }

   kstride0 = kernel->stride[0];
   kstride1 = kernel->stride[1];

   /* element count must be sampled before the resize to detect a change */
   nelem = THTensor_(nElement)(r_);
   THTensor_(resize4d)(r_, nbatch, nOutputPlane, nOutputRows, nOutputCols);

   input_data = THTensor_(data)(input);
   weight_data = THTensor_(data)(kernel);
   /* NOTE(review): r_ is addressed as a dense block from here on; this
      presumes resize4d leaves it contiguous -- confirm. */
   output_data = THTensor_(data)(r_);

   if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
   {
     /*THTensor_(zero)(r_);*/
 #pragma omp parallel for private(p)
     for (p = 0; p < r_->size[0]; p++)
     {
       long k;
       for (k = 0; k < r_->size[1]; k++)
       {
         real *ptr_output = output_data + p*outBatchSize + k*outPlaneSize;
         long l;
         for (l = 0; l < outPlaneSize; l++)
           ptr_output[l] = 0.0;
       }
     }
   }
   else if (beta != 1)
   {
     /*THTensor_(mul)(r_, beta);*/
 #pragma omp parallel for private(p)
     for (p = 0; p < r_->size[0]; p++)
     {
       long k;
       for (k = 0; k < r_->size[1]; k++)
       {
         real *ptr_output = output_data + p*outBatchSize + k*outPlaneSize;
         long l;
         for (l = 0; l < outPlaneSize; l++)
           ptr_output[l] *= beta;
       }
     }
   }

   /* Each iteration writes only to the output planes of its own batch entry. */
 #pragma omp parallel for private(p)
   for(p = 0; p < nbatch; p++)
   {
     long k;
     for(k = 0; k < nOutputPlane; k++)
     {
       long i;
       real *ptr_output = output_data + p*outBatchSize + k*outPlaneSize;
       for(i = 0; i < nInputPlane; i++)
       {
         real *ptr_weight = weight_data + k*kstride0 + i*kstride1;
         real *ptr_input = input_data + p*nInputPlane*inPlaneSize + i*inPlaneSize;

         /* do image, kernel convolution */
         if (*vf == 'F')
         {
           if (*xc == 'X')
           {
             THTensor_(fullXCorr2Dptr)(ptr_output,
                                       alpha,
                                       ptr_input,  nInputRows,  nInputCols,
                                       ptr_weight, nKernelRows, nKernelCols,
                                       srow, scol);
           }
           else
           {
             THTensor_(fullConv2Dptr)(ptr_output,
                                      alpha,
                                      ptr_input,  nInputRows,  nInputCols,
                                      ptr_weight, nKernelRows, nKernelCols,
                                      srow, scol);
           }
         }
         else
         {
           if (*xc == 'X')
           {
             THTensor_(validXCorr2Dptr)(ptr_output,
                                        alpha,
                                        ptr_input,  nInputRows,  nInputCols,
                                        ptr_weight, nKernelRows, nKernelCols,
                                        srow, scol);
           }
           else
           {
             THTensor_(validConv2Dptr)(ptr_output,
                                       alpha,
                                       ptr_input,  nInputRows,  nInputCols,
                                       ptr_weight, nKernelRows, nKernelCols,
                                       srow, scol);
           }
         }
       }
     }
   }
   THTensor_(free)(input);
   THTensor_(free)(kernel);
 }


 /*
   2D input, 2D kernel, 2D output
   scalar multiplication like
   y <- x*y + beta*y
   r_ is resized to the output size given by THTensor_(convsize); it is
   zeroed when it was empty, when beta == 0 or when its element count
   changed, and multiplied by beta otherwise. The convolution itself (and
   the use of alpha, vf, xc) is delegated to THTensor_(conv2d).
 */
 void THTensor_(conv2Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
 {
   THTensor *input;
   THTensor *kernel;
   long nInputRows;
   long nInputCols;
   long nKernelRows;
   long nKernelCols;
   long nOutputRows, nOutputCols;
   real *ptr_input;
   real *ptr_weight;
   real *output_data;
   long nelem;

   THArgCheck(t_->nDimension == 2 , 3, "input: 2D Tensor expected");
   THArgCheck(k_->nDimension == 2 , 4, "kernel: 2D Tensor expected");
   THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
   THArgCheck(scol >= 1, 6, "Stride should be a positive integer");

   /* Sizes are read from the caller's tensors so that the shape check (and
      convsize) run before any reference is acquired below; if a check does
      not return on failure, nothing is left un-freed. The contiguous copies
      are expected to have the same sizes as t_ and k_. */
   nInputRows  = t_->size[0];
   nInputCols  = t_->size[1];
   nKernelRows = k_->size[0];
   nKernelCols = k_->size[1];

   THArgCheck((nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dmul : Input image is smaller than kernel");

   nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
   nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);

   input = THTensor_(newContiguous)(t_);
   kernel = THTensor_(newContiguous)(k_);

   /* element count must be sampled before the resize to detect a change */
   nelem = THTensor_(nElement)(r_);
   THTensor_(resize2d)(r_, nOutputRows, nOutputCols);
   if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
   {
     THTensor_(zero)(r_);
   }
   else if (beta != 1)
   {
     THTensor_(mul)(r_, r_, beta);
   }

   ptr_input = THTensor_(data)(input);
   ptr_weight = THTensor_(data)(kernel);
   output_data = THTensor_(data)(r_);

   /* do image, kernel convolution */
   THTensor_(conv2d)(output_data,
                     alpha,
                     ptr_input, nInputRows, nInputCols,
                     ptr_weight, nKernelRows, nKernelCols,
                     srow, scol, vf, xc);
   THTensor_(free)(input);
   THTensor_(free)(kernel);
 }

 /*
   3D input, 3D kernel, 3D output
   component wise multiplication like
   y <- y.*x + beta*y
   Input plane k is paired with kernel plane k and written to output plane
   k; input and kernel must have the same number of planes. The per-plane
   convolution (and the use of alpha, vf, xc) is delegated to
   THTensor_(conv2d).
 */
 void THTensor_(conv2Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, long srow, long scol, const char *vf, const char *xc)
 {
   long nInputPlane, nInputRows, nInputCols;
   long nKernelRows, nKernelCols;
   long nOutputPlane, nOutputRows, nOutputCols;
   long istride0, kstride0;
   long planeSize;
   THTensor *input;
   THTensor *kernel;
   real *input_data;
   real *weight_data;
   real *output_data;
   long nelem;
   long k;

   THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
   THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
   THArgCheck(srow >= 1, 5, "Stride should be a positive integer");
   THArgCheck(scol >= 1, 6, "Stride should be a positive integer");

   /* Sizes are read from the caller's tensors so that every shape check runs
      before any reference is acquired below; if a check does not return on
      failure, nothing is left un-freed. The contiguous copies are expected
      to have the same sizes as t_ and k_. */
   nInputPlane  = t_->size[0];
   nInputRows   = t_->size[1];
   nInputCols   = t_->size[2];
   nOutputPlane = k_->size[0];
   nKernelRows  = k_->size[1];
   nKernelCols  = k_->size[2];

   THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
   THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv2Dcmul : Input image is smaller than kernel");

   nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
   nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
   planeSize = nOutputRows * nOutputCols;

   input = THTensor_(newContiguous)(t_);
   kernel = THTensor_(newContiguous)(k_);
   istride0 = input->stride[0];
   kstride0 = kernel->stride[0];

   /* element count must be sampled before the resize to detect a change */
   nelem = THTensor_(nElement)(r_);
   THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols);

   if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
   {
     THTensor_(zero)(r_);
   }
   else if (beta != 1)
   {
     THTensor_(mul)(r_, r_, beta);
   }

   input_data = THTensor_(data)(input);
   weight_data = THTensor_(data)(kernel);
   output_data = THTensor_(data)(r_);

   for(k = 0; k < nOutputPlane; k++)
   {
     real *ptr_weight = weight_data + k*kstride0;
     real *ptr_input = input_data + k*istride0;
     real *ptr_output = output_data + k*planeSize;

     /* do image, kernel convolution */
     THTensor_(conv2d)(ptr_output,
                       alpha,
                       ptr_input, nInputRows, nInputCols,
                       ptr_weight, nKernelRows, nKernelCols,
                       srow, scol, vf, xc);
   }
   THTensor_(free)(input);
   THTensor_(free)(kernel);
 }

 /*
   3D input, 3D kernel, 3D output
   component wise multiplication like with a permutation map
   y <- y.*x + beta*y
   Row k of map holds a 1-based (from, to) pair: kernel plane k is applied
   to input plane `from` and accumulated into output plane `to`. The
   per-plane convolution (and the use of alpha, vf, xc) is delegated to
   THTensor_(conv2d).
 */
 void THTensor_(conv2Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map, long srow, long scol, const char *vf, const char *xc)
 {
   long nInputPlane, nInputRows, nInputCols;
   long nKernelRows, nKernelCols;
   long nOutputPlane, nOutputRows, nOutputCols;
   long istride0, kstride0;
   long planeSize;
   THTensor *input;
   THTensor *kernel;
   real *input_data;
   real *weight_data;
   real *output_data;
   long nmaps;
   long nelem;
   long k;

   THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
   THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
   THArgCheck(map->nDimension == 2 , 4, "map: 2D Tensor expected");
   THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
   THArgCheck(scol >= 1, 7, "Stride should be a positive integer");

   /* Sizes are read from the caller's tensors so that every check runs
      before any reference is acquired below; if a check does not return on
      failure, nothing is left un-freed. The contiguous copies are expected
      to have the same sizes as t_ and k_. */
   nInputPlane  = t_->size[0];
   nInputRows   = t_->size[1];
   nInputCols   = t_->size[2];
   nOutputPlane = k_->size[0];
   nKernelRows  = k_->size[1];
   nKernelCols  = k_->size[2];

   THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
   THArgCheck( (nInputRows >= nKernelRows && nInputCols >= nKernelCols)
               || *vf == 'F', 2, "conv2Dmap : Input image is smaller than kernel");

   /* The map rows are turned into raw pointer offsets further down, so
      reject anything that would address a plane that does not exist. */
   nmaps = map->size[0];
   THArgCheck(nmaps <= nOutputPlane, 4, "map: more rows than kernel planes");
   for(k = 0; k < nmaps; k++)
   {
     long from = (long)THTensor_(get2d)(map,k,0)-1;
     long to   = (long)THTensor_(get2d)(map,k,1)-1;
     THArgCheck(from >= 0 && from < nInputPlane, 4, "map: input plane index out of range");
     THArgCheck(to >= 0 && to < nOutputPlane, 4, "map: output plane index out of range");
   }

   nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
   nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
   planeSize = nOutputRows * nOutputCols;

   input = THTensor_(newContiguous)(t_);
   kernel = THTensor_(newContiguous)(k_);
   istride0 = input->stride[0];
   kstride0 = kernel->stride[0];

   /* element count must be sampled before the resize to detect a change */
   nelem = THTensor_(nElement)(r_);
   THTensor_(resize3d)(r_, nOutputPlane, nOutputRows, nOutputCols);

   if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
   {
     THTensor_(zero)(r_);
   }
   else if (beta != 1)
   {
     THTensor_(mul)(r_, r_, beta);
   }

   input_data = THTensor_(data)(input);
   weight_data = THTensor_(data)(kernel);
   output_data = THTensor_(data)(r_);

   for(k = 0; k < nmaps; k++)
   {
     /* map entries are 1-based plane numbers */
     long from = (long)THTensor_(get2d)(map,k,0)-1;
     long to   = (long)THTensor_(get2d)(map,k,1)-1;

     real *ptr_weight = weight_data + k*kstride0;
     real *ptr_input = input_data + from*istride0;
     real *ptr_output = output_data + to*planeSize;

     /* do image, kernel convolution */
     THTensor_(conv2d)(ptr_output,
                       alpha,
                       ptr_input, nInputRows, nInputCols,
                       ptr_weight, nKernelRows, nKernelCols,
                       srow, scol, vf, xc);
   }
   THTensor_(free)(input);
   THTensor_(free)(kernel);
 }

 /*
   4D input, 4D kernel, 5D output
   like rank1 update
   A <- xx' + beta*A
   for sr,sc=1 this is equivalent to xcorr2Dger, but otherwise it is useful for
   calculating derivatives wrt a kernel that is applied with stride sr,sc != 1
   The output is nKernelPlane x nInputPlane x depth x rows x cols, where each
   spatial extent is input - (kernel - 1) * stride; every (kernel plane,
   input plane) pair is handed to THTensor_(validXCorr3DRevptr).
 */
 void THTensor_(conv3DRevger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
                              long sdepth, long srow, long scol)
 {
   long nInputPlane, nInputDepth, nInputRows, nInputCols;
   long nKernelPlane, nKernelDepth, nKernelRows, nKernelCols;
   long nOutputDepth, nOutputRows, nOutputCols;
   long istride0, kstride0;
   long volumeSize;
   THTensor *input;
   THTensor *kernel;
   real *input_data;
   real *weight_data;
   real *output_data;
   long nelem;
   long k, i;

   THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
   THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
   THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
   THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
   THArgCheck(scol >= 1, 7, "Stride should be a positive integer");

   /* Sizes are read from the caller's tensors so that every shape check runs
      before any reference is acquired below; if a check does not return on
      failure, nothing is left un-freed. The contiguous copies are expected
      to have the same sizes as t_ and k_. */
   nInputPlane  = t_->size[0];
   nInputDepth  = t_->size[1];
   nInputRows   = t_->size[2];
   nInputCols   = t_->size[3];
   nKernelPlane = k_->size[0];
   nKernelDepth = k_->size[1];
   nKernelRows  = k_->size[2];
   nKernelCols  = k_->size[3];

   THArgCheck(nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols , 2, "conv3DRevger : Input image is smaller than kernel");

   nOutputDepth = nInputDepth - (nKernelDepth - 1) * sdepth;
   nOutputRows = nInputRows - (nKernelRows - 1) * srow;
   nOutputCols = nInputCols - (nKernelCols - 1) * scol;

   /* With a stride above 1 the strided kernel footprint can exceed the input
      even though input >= kernel; that would give a non-positive size. */
   THArgCheck(nOutputDepth >= 1 && nOutputRows >= 1 && nOutputCols >= 1, 2, "conv3DRevger : Input image is smaller than strided kernel");
   volumeSize = nOutputDepth * nOutputCols * nOutputRows;

   input = THTensor_(newContiguous)(t_);
   kernel = THTensor_(newContiguous)(k_);
   istride0 = input->stride[0];
   kstride0 = kernel->stride[0];

   /* element count must be sampled before the resize to detect a change */
   nelem = THTensor_(nElement)(r_);
   THTensor_(resize5d)(r_,nKernelPlane, nInputPlane, nOutputDepth, nOutputRows, nOutputCols);

   if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
   {
     THTensor_(zero)(r_);
   }
   else if (beta != 1)
   {
     THTensor_(mul)(r_, r_, beta);
   }

   input_data = THTensor_(data)(input);
   weight_data = THTensor_(data)(kernel);
   output_data = THTensor_(data)(r_);

   for(k = 0; k < nKernelPlane; k++)
   {
     real *ptr_weight = weight_data + k*kstride0;

     for(i = 0; i < nInputPlane; i++)
     {
       real *ptr_input = input_data + i*istride0;
       /* output volumes are laid out kernel-plane major, input-plane minor */
       real *ptr_output = output_data + (k*nInputPlane + i)*volumeSize;

       /* do image, kernel convolution */
       THTensor_(validXCorr3DRevptr)(ptr_output,
                                     alpha,
                                     ptr_input,  nInputDepth, nInputRows,  nInputCols,
                                     ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                                     sdepth, srow, scol);
     }
   }
   THTensor_(free)(input);
   THTensor_(free)(kernel);
 }


 /*
   4D input, 4D kernel, 5D output
   like rank1 update
   A <- xx' + beta*A
   The output is nKernelPlane x nInputPlane x depth x rows x cols; every
   (kernel plane, input plane) pair is handed to THTensor_(conv3d), which
   also receives alpha, vf and xc.
 */
 void THTensor_(conv3Dger)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
                           long sdepth, long srow, long scol, const char *vf, const char *xc)
 {
   long nInputPlane, nInputDepth, nInputRows, nInputCols;
   long nKernelPlane, nKernelDepth, nKernelRows, nKernelCols;
   long nOutputDepth, nOutputRows, nOutputCols;
   long istride0, kstride0;
   long volumeSize;
   THTensor *input;
   THTensor *kernel;
   real *input_data;
   real *weight_data;
   real *output_data;
   long nelem;
   long k, i;

   THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
   THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
   THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
   THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
   THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
   THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'");
   THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'");

   /* Sizes are read from the caller's tensors so that every shape check runs
      before any reference is acquired below; if a check does not return on
      failure, nothing is left un-freed. The contiguous copies are expected
      to have the same sizes as t_ and k_. */
   nInputPlane  = t_->size[0];
   nInputDepth  = t_->size[1];
   nInputRows   = t_->size[2];
   nInputCols   = t_->size[3];
   nKernelPlane = k_->size[0];
   nKernelDepth = k_->size[1];
   nKernelRows  = k_->size[2];
   nKernelCols  = k_->size[3];

   THArgCheck((nInputDepth >= nKernelDepth
               && nInputRows >= nKernelRows
               && nInputCols >= nKernelCols)
              || *vf == 'F', 2, "conv3Dger : Input image is smaller than kernel");

   nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
   nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
   nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
   volumeSize = nOutputDepth * nOutputCols * nOutputRows;

   input = THTensor_(newContiguous)(t_);
   kernel = THTensor_(newContiguous)(k_);
   istride0 = input->stride[0];
   kstride0 = kernel->stride[0];

   /* element count must be sampled before the resize to detect a change */
   nelem = THTensor_(nElement)(r_);
   THTensor_(resize5d)(r_,nKernelPlane, nInputPlane, nOutputDepth, nOutputRows, nOutputCols);

   if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
   {
     THTensor_(zero)(r_);
   }
   else if (beta != 1)
   {
     THTensor_(mul)(r_, r_, beta);
   }

   input_data = THTensor_(data)(input);
   weight_data = THTensor_(data)(kernel);
   output_data = THTensor_(data)(r_);

   for(k = 0; k < nKernelPlane; k++)
   {
     real *ptr_weight = weight_data + k*kstride0;

     for(i = 0; i < nInputPlane; i++)
     {
       real *ptr_input = input_data + i*istride0;
       /* output volumes are laid out kernel-plane major, input-plane minor */
       real *ptr_output = output_data + (k*nInputPlane + i)*volumeSize;

       /* do image, kernel convolution */
       THTensor_(conv3d)(ptr_output,
                         alpha,
                         ptr_input,  nInputDepth, nInputRows,  nInputCols,
                         ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                         sdepth, srow, scol, vf, xc);
     }
   }
   THTensor_(free)(input);
   THTensor_(free)(kernel);
 }

 /*
   4D input, 5D kernel, 4D output
   matrix vector product like
   y <- Ax + beta*y
   For every output plane k, kernel slice k_[k][i] is applied to input plane
   t_[i] for all i and accumulated into r_[k]; the per-volume work (and the
   use of alpha, vf, xc) is delegated to THTensor_(conv3d).
 */
 void THTensor_(conv3Dmv)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
                          long sdepth, long srow, long scol, const char *vf, const char *xc)
 {
   long nInputPlane, nInputDepth, nInputRows, nInputCols;
   long nKernelDepth, nKernelRows, nKernelCols;
   long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
   long istride0, kstride0, kstride1;
   long volumeSize;
   THTensor *input;
   THTensor *kernel;
   real *input_data;
   real *weight_data;
   real *output_data;
   long nelem;
   long k, i;

   THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
   THArgCheck(k_->nDimension == 5 , 4, "kernel: 5D Tensor expected");
   THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
   THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
   THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
   THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'");
   THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'");

   /* Sizes are read from the caller's tensors so that every shape check runs
      before any reference is acquired below; if a check does not return on
      failure, nothing is left un-freed. The contiguous copies are expected
      to have the same sizes as t_ and k_. */
   nInputPlane  = t_->size[0];
   nInputDepth  = t_->size[1];
   nInputRows   = t_->size[2];
   nInputCols   = t_->size[3];
   nOutputPlane = k_->size[0];
   nKernelDepth = k_->size[2];
   nKernelRows  = k_->size[3];
   nKernelCols  = k_->size[4];

   THArgCheck(k_->size[1] == nInputPlane, 2, "invalid number of input planes");
   THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmv : Input image is smaller than kernel");

   nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
   nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
   nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
   volumeSize = nOutputDepth * nOutputCols * nOutputRows;

   input = THTensor_(newContiguous)(t_);
   /* The kernel is only copied when its two innermost dimensions are not
      densely packed; the outer strides are honoured through kstride0/1. */
   if (k_->stride[4] != 1 || k_->stride[3] != k_->size[4]) {
     kernel = THTensor_(newContiguous)(k_);
   } else {
     THTensor_(retain)(k_);
     kernel = k_;
   }

   istride0 = input->stride[0];
   kstride0 = kernel->stride[0];
   kstride1 = kernel->stride[1];

   /* element count must be sampled before the resize to detect a change */
   nelem = THTensor_(nElement)(r_);
   THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols);

   if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
   {
     THTensor_(zero)(r_);
   }
   else if (beta != 1)
   {
     THTensor_(mul)(r_, r_, beta);
   }

   input_data = THTensor_(data)(input);
   weight_data = THTensor_(data)(kernel);
   output_data = THTensor_(data)(r_);

   for(k = 0; k < nOutputPlane; k++)
   {
     /* all input planes accumulate into the same output volume */
     real *ptr_output = output_data + k*volumeSize;

     for(i = 0; i < nInputPlane; i++)
     {
       real *ptr_weight = weight_data + k*kstride0 + i*kstride1;
       real *ptr_input = input_data + i*istride0;

       /* do image, kernel convolution */
       THTensor_(conv3d)(ptr_output,
                         alpha,
                         ptr_input,  nInputDepth, nInputRows,  nInputCols,
                         ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                         sdepth, srow, scol, vf, xc);
     }
   }
   THTensor_(free)(input);
   THTensor_(free)(kernel);
 }

 /*
   3D input, 3D kernel, 3D output
   scalar multiplication like
   y <- x*y + beta*y
   r_ is resized to the output size given by THTensor_(convsize); it is
   zeroed when it was empty, when beta == 0 or when its element count
   changed, and multiplied by beta otherwise. The convolution itself (and
   the use of alpha, vf, xc) is delegated to THTensor_(conv3d).
 */
 void THTensor_(conv3Dmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
                           long sdepth, long srow, long scol, const char *vf, const char *xc)
 {
   THTensor *input;
   THTensor *kernel;
   long nInputDepth;
   long nInputRows;
   long nInputCols;
   long nKernelDepth;
   long nKernelRows;
   long nKernelCols;
   long nOutputDepth, nOutputRows, nOutputCols;
   real *ptr_input;
   real *ptr_weight;
   real *output_data;
   long nelem;

   THArgCheck(t_->nDimension == 3 , 3, "input: 3D Tensor expected");
   THArgCheck(k_->nDimension == 3 , 4, "kernel: 3D Tensor expected");
   THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
   THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
   THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
   THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'");
   THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'");

   /* Sizes are read from the caller's tensors so that every shape check runs
      before any reference is acquired below; if a check does not return on
      failure, nothing is left un-freed. The contiguous copies are expected
      to have the same sizes as t_ and k_. */
   nInputDepth  = t_->size[0];
   nInputRows   = t_->size[1];
   nInputCols   = t_->size[2];
   nKernelDepth = k_->size[0];
   nKernelRows  = k_->size[1];
   nKernelCols  = k_->size[2];

   THArgCheck((nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dmul : Input image is smaller than kernel");

   nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
   nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
   nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);

   input = THTensor_(newContiguous)(t_);
   kernel = THTensor_(newContiguous)(k_);

   /* element count must be sampled before the resize to detect a change */
   nelem = THTensor_(nElement)(r_);
   THTensor_(resize3d)(r_, nOutputDepth, nOutputRows, nOutputCols);
   if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
   {
     THTensor_(zero)(r_);
   }
   else if (beta != 1)
   {
     THTensor_(mul)(r_, r_, beta);
   }

   ptr_input = THTensor_(data)(input);
   ptr_weight = THTensor_(data)(kernel);
   output_data = THTensor_(data)(r_);

   /* do image, kernel convolution */
   THTensor_(conv3d)(output_data,
                     alpha,
                     ptr_input,  nInputDepth, nInputRows,  nInputCols,
                     ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                     sdepth, srow, scol, vf, xc);
   THTensor_(free)(input);
   THTensor_(free)(kernel);
 }

 /*
   4D input, 4D kernel, 4D output
   component wise multiplication like
   y <- y.*x + beta*y
   Input plane k is paired with kernel plane k and written to output plane
   k; input and kernel must have the same number of planes. The per-volume
   convolution (and the use of alpha, vf, xc) is delegated to
   THTensor_(conv3d).
 */
 void THTensor_(conv3Dcmul)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_,
                            long sdepth, long srow, long scol, const char *vf, const char *xc)
 {
   long nInputPlane, nInputDepth, nInputRows, nInputCols;
   long nKernelDepth, nKernelRows, nKernelCols;
   long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
   long istride0, kstride0;
   long volumeSize;

   THTensor *input;
   THTensor *kernel;
   real *input_data;
   real *weight_data;
   real *output_data;
   long nelem;
   long k;

   /* Messages state the rank actually required (4D); argument numbers
      follow the other conv3D* entry points with this signature. */
   THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
   THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
   /* sdepth is used as a stride below, so it gets the same check as the
      row and column strides */
   THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
   THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
   THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
   THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'");
   THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'");

   /* Sizes are read from the caller's tensors so that every shape check runs
      before any reference is acquired below; if a check does not return on
      failure, nothing is left un-freed. The contiguous copies are expected
      to have the same sizes as t_ and k_. */
   nInputPlane  = t_->size[0];
   nInputDepth  = t_->size[1];
   nInputRows   = t_->size[2];
   nInputCols   = t_->size[3];
   nOutputPlane = k_->size[0];
   nKernelDepth = k_->size[1];
   nKernelRows  = k_->size[2];
   nKernelCols  = k_->size[3];

   THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
   THArgCheck( (nInputDepth >= nKernelDepth && nInputRows >= nKernelRows && nInputCols >= nKernelCols) || *vf == 'F', 2, "conv3Dcmul : Input image is smaller than kernel");

   nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
   nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
   nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
   volumeSize = nOutputDepth * nOutputCols * nOutputRows;

   input = THTensor_(newContiguous)(t_);
   kernel = THTensor_(newContiguous)(k_);
   istride0 = input->stride[0];
   kstride0 = kernel->stride[0];

   /* element count must be sampled before the resize to detect a change */
   nelem = THTensor_(nElement)(r_);
   THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols);

   if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
   {
     THTensor_(zero)(r_);
   }
   else if (beta != 1)
   {
     THTensor_(mul)(r_, r_, beta);
   }

   input_data = THTensor_(data)(input);
   weight_data = THTensor_(data)(kernel);
   output_data = THTensor_(data)(r_);

   for(k = 0; k < nOutputPlane; k++)
   {
     real *ptr_weight = weight_data + k*kstride0;
     real *ptr_input = input_data + k*istride0;
     real *ptr_output = output_data + k*volumeSize;

     /* do image, kernel convolution */
     THTensor_(conv3d)(ptr_output,
                       alpha,
                       ptr_input,  nInputDepth, nInputRows,  nInputCols,
                       ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                       sdepth, srow, scol, vf, xc);
   }
   THTensor_(free)(input);
   THTensor_(free)(kernel);
 }

 /*
   4D input, 4D kernel, 4D output
   component wise multiplication like with a permutation map
   y <- y.*x + beta*y
   Row k of map holds a 1-based (from, to) pair: kernel plane k is applied
   to input plane `from` and accumulated into output plane `to`. The
   per-volume convolution (and the use of alpha, vf, xc) is delegated to
   THTensor_(conv3d).
 */
 void THTensor_(conv3Dmap)(THTensor *r_, real beta, real alpha, THTensor *t_, THTensor *k_, THTensor *map,
                           long sdepth, long srow, long scol, const char *vf, const char *xc)
 {
   long nInputPlane, nInputDepth, nInputRows, nInputCols;
   long nKernelDepth, nKernelRows, nKernelCols;
   long nOutputPlane, nOutputDepth, nOutputRows, nOutputCols;
   long istride0, kstride0;
   long volumeSize;

   THTensor *input;
   THTensor *kernel;
   long nelem;
   real *input_data;
   real *weight_data;
   real *output_data;
   long nmaps;
   long k;

   THArgCheck(t_->nDimension == 4 , 3, "input: 4D Tensor expected");
   THArgCheck(k_->nDimension == 4 , 4, "kernel: 4D Tensor expected");
   THArgCheck(map->nDimension == 2 , 4, "map: 2D Tensor expected");
   /* sdepth is used as a stride below, so it gets the same check as the
      row and column strides */
   THArgCheck(sdepth >= 1, 5, "Stride should be a positive integer");
   THArgCheck(srow >= 1, 6, "Stride should be a positive integer");
   THArgCheck(scol >= 1, 7, "Stride should be a positive integer");
   THArgCheck(*vf == 'V' || *vf == 'F', 8, "type of convolution can 'V' or 'F'");
   THArgCheck(*xc == 'C' || *xc == 'X', 8, "type of convolution can 'X' or 'C'");

   /* Sizes are read from the caller's tensors so that every check runs
      before any reference is acquired below; if a check does not return on
      failure, nothing is left un-freed. The contiguous copies are expected
      to have the same sizes as t_ and k_. */
   nInputPlane  = t_->size[0];
   nInputDepth  = t_->size[1];
   nInputRows   = t_->size[2];
   nInputCols   = t_->size[3];
   nOutputPlane = k_->size[0];
   nKernelDepth = k_->size[1];
   nKernelRows  = k_->size[2];
   nKernelCols  = k_->size[3];

   THArgCheck(nOutputPlane == nInputPlane, 2, "invalid number of input/kernel planes");
   THArgCheck((nInputDepth >= nKernelDepth
               && nInputRows >= nKernelRows
               && nInputCols >= nKernelCols) || *vf == 'F',
              2, "conv3Dmap : Input image is smaller than kernel");

   /* The map rows are turned into raw pointer offsets further down, so
      reject anything that would address a plane that does not exist. */
   nmaps = map->size[0];
   THArgCheck(nmaps <= nOutputPlane, 4, "map: more rows than kernel planes");
   for(k = 0; k < nmaps; k++)
   {
     long from = (long)THTensor_(get2d)(map,k,0)-1;
     long to   = (long)THTensor_(get2d)(map,k,1)-1;
     THArgCheck(from >= 0 && from < nInputPlane, 4, "map: input plane index out of range");
     THArgCheck(to >= 0 && to < nOutputPlane, 4, "map: output plane index out of range");
   }

   nOutputDepth = THTensor_(convsize)(nInputDepth, nKernelDepth, sdepth, vf);
   nOutputRows = THTensor_(convsize)(nInputRows, nKernelRows, srow, vf);
   nOutputCols = THTensor_(convsize)(nInputCols, nKernelCols, scol, vf);
   volumeSize = nOutputDepth * nOutputRows * nOutputCols;

   input = THTensor_(newContiguous)(t_);
   kernel = THTensor_(newContiguous)(k_);
   istride0 = input->stride[0];
   kstride0 = kernel->stride[0];

   /* element count must be sampled before the resize to detect a change */
   nelem = THTensor_(nElement)(r_);
   THTensor_(resize4d)(r_, nOutputPlane, nOutputDepth, nOutputRows, nOutputCols);

   if (nelem == 0 || beta == 0 || nelem != THTensor_(nElement)(r_))
   {
     THTensor_(zero)(r_);
   }
   else if (beta != 1)
   {
     THTensor_(mul)(r_, r_, beta);
   }

   input_data = THTensor_(data)(input);
   weight_data = THTensor_(data)(kernel);
   output_data = THTensor_(data)(r_);

   for(k = 0; k < nmaps; k++)
   {
     /* map entries are 1-based plane numbers */
     long from = (long)THTensor_(get2d)(map,k,0)-1;
     long to   = (long)THTensor_(get2d)(map,k,1)-1;

     real *ptr_weight = weight_data + k*kstride0;
     real *ptr_input = input_data + from*istride0;
     real *ptr_output = output_data + to*volumeSize;

     /* do image, kernel convolution */
     THTensor_(conv3d)(ptr_output,
                       alpha,
                       ptr_input,  nInputDepth, nInputRows,  nInputCols,
                       ptr_weight, nKernelDepth, nKernelRows, nKernelCols,
                       sdepth, srow, scol, vf, xc);
   }
   THTensor_(free)(input);
   THTensor_(free)(kernel);
 }

 #endif