/* generic/VolumetricConvolutionMM.c - platform/external/pytorch - Git at Google */

 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/VolumetricConvolutionMM.c"
 #else

 /*
  * Accumulate the unfolded buffer `finput` back into the volume `input`.
  *
  * Both tensors are addressed through their raw data pointers as dense
  * arrays (tensor strides are not consulted):
  *   finput: [nInputPlane][kT][kH][kW][outputDepth][outputHeight][outputWidth]
  *   input:  [nInputPlane][inputDepth][inputHeight][inputWidth]
  *
  * For every kernel offset (kt, kh, kw) and output position (t, y, x) the
  * finput element is added (THVector_(add) with scale 1 and length 1) to the
  * input element at (t*dT - pT + kt, y*dH - pH + kh, x*dW - pW + kw).
  * When any padding is positive, positions outside the input volume are
  * skipped. When no padding is positive, the padding terms are ignored and
  * no bounds check is performed.
  *
  * note: due to write issues, this one cannot be parallelized as well as
  * unfolded_copy
  */
 static void THNN_(unfolded_acc_vol)(
   THTensor *finput, THTensor *input,
   int kT, int kW, int kH,
   int dT, int dW, int dH,
   int pT, int pW, int pH,
   int nInputPlane,
   int inputDepth, int inputWidth, int inputHeight,
   int outputDepth, int outputWidth, int outputHeight)
 {
   real *input_data = THTensor_(data)(input);
   real *finput_data = THTensor_(data)(finput);
   const int hasPadding = (pT > 0 || pH > 0 || pW > 0);
   const int outSlice = outputHeight*outputWidth;   /* elements per output depth slice */
   const int outVolume = outputDepth*outSlice;      /* elements per unfolded column    */
   const int inSlice = inputHeight*inputWidth;      /* elements per input depth slice  */
   int plane;

   /* The OpenMP parallel-for over planes is disabled in this file. */
   for (plane = 0; plane < nInputPlane; plane++)
   {
     real *volume = input_data + plane*(inputDepth*inSlice);
     int kt, kh, kw, t, y, x;

     for (kt = 0; kt < kT; kt++)
     {
       for (kh = 0; kh < kH; kh++)
       {
         for (kw = 0; kw < kW; kw++)
         {
           /* Column of finput holding kernel offset (kt, kh, kw) of this plane. */
           real *column = finput_data
             + plane * (kT*kH*kW*outVolume)
             + kt    * (kH*kW*outVolume)
             + kh    * (kW*outVolume)
             + kw    * outVolume;

           if (hasPadding)
           {
             for (t = 0; t < outputDepth; t++)
             {
               const int it = t*dT - pT + kt;
               if (it < 0 || it >= inputDepth)
                 continue;   /* whole depth slice lies in the padding */

               for (y = 0; y < outputHeight; y++)
               {
                 const int iy = y*dH - pH + kh;
                 if (iy < 0 || iy >= inputHeight)
                   continue; /* whole row lies in the padding */

                 for (x = 0; x < outputWidth; x++)
                 {
                   const int ix = x*dW - pW + kw;
                   if (ix >= 0 && ix < inputWidth)
                   {
                     THVector_(add)(volume + it*inSlice + iy*inputWidth + ix, column + t*outSlice + y*outputWidth + x, 1, 1);
                   }
                 }
               }
             }
           }
           else
           {
             for (t = 0; t < outputDepth; t++)
             {
               const int it = t*dT + kt;
               for (y = 0; y < outputHeight; y++)
               {
                 const int iy = y*dH + kh;
                 for (x = 0; x < outputWidth; x++)
                 {
                   const int ix = x*dW + kw;
                   THVector_(add)(volume + it*inSlice + iy*inputWidth + ix, column + t*outSlice + y*outputWidth + x, 1, 1);
                 }
               }
             }
           }
         }
       }
     }
   }
 }

 /*
  * Unfold the volume `input` into the column buffer `finput`.
  *
  * Both tensors are addressed through their raw data pointers as dense
  * arrays (tensor strides are not consulted):
  *   input:  [nInputPlane][inputDepth][inputHeight][inputWidth]
  *   finput: [nInputPlane*kT*kH*kW][outputDepth][outputHeight][outputWidth]
  *
  * Column k corresponds to (plane, kt, kh, kw), obtained by decomposing k
  * with kw varying fastest. Element (t, y, x) of that column receives the
  * input value at (t*dT - pT + kt, y*dH - pH + kh, x*dW - pW + kw).
  * When any padding is positive, out-of-range positions are written as
  * all-zero bytes. When no padding is positive, the padding terms are
  * ignored and no bounds check is performed.
  */
 static void THNN_(unfolded_copy_vol)(
   THTensor *finput, THTensor *input,
   int kT, int kW, int kH,
   int dT, int dW, int dH,
   int pT, int pW, int pH,
   int nInputPlane,
   int inputDepth, int inputWidth, int inputHeight,
   int outputDepth, int outputWidth, int outputHeight)
 {
   real *input_data = THTensor_(data)(input);
   real *finput_data = THTensor_(data)(finput);
   const int hasPadding = (pT > 0 || pH > 0 || pW > 0);
   const int kernelArea = kH*kW;
   const int kernelVolume = kT*kH*kW;
   const int outSlice = outputHeight*outputWidth;
   const int outVolume = outputDepth*outSlice;
   const int inSlice = inputHeight*inputWidth;
   const long nColumns = nInputPlane*kernelVolume;
   long k;

   /* The OpenMP parallel-for over columns is disabled in this file. */
   for (k = 0; k < nColumns; k++)
   {
     /* Split the flat column index into (plane, kt, kh, kw). */
     const int plane = k / kernelVolume;
     const int inKernel = k % kernelVolume;
     const int kt = inKernel / kernelArea;
     const int kh = (inKernel % kernelArea) / kW;
     const int kw = (inKernel % kernelArea) % kW;
     /* k == ((plane*kT + kt)*kH + kh)*kW + kw, so columns are consecutive. */
     real *dst = finput_data + k*outVolume;
     real *src = input_data + plane*(inputDepth*inSlice);
     int t, y, x;

     if (hasPadding)
     {
       for (t = 0; t < outputDepth; t++)
       {
         const int it = t*dT - pT + kt;
         const int depthInside = (it >= 0 && it < inputDepth);

         for (y = 0; y < outputHeight; y++)
         {
           const int iy = y*dH - pH + kh;
           const int rowInside = depthInside && (iy >= 0 && iy < inputHeight);
           real *outRow = dst + t*outSlice + y*outputWidth;

           for (x = 0; x < outputWidth; x++)
           {
             const int ix = x*dW - pW + kw;
             if (rowInside && ix >= 0 && ix < inputWidth)
               memcpy(outRow + x, src + it*inSlice + iy*inputWidth + ix, sizeof(real));
             else
               memset(outRow + x, 0, sizeof(real));
           }
         }
       }
     }
     else
     {
       for (t = 0; t < outputDepth; t++)
       {
         const int it = t*dT + kt;
         for (y = 0; y < outputHeight; y++)
         {
           const int iy = y*dH + kh;
           real *outRow = dst + t*outSlice + y*outputWidth;

           for (x = 0; x < outputWidth; x++)
           {
             const int ix = x*dW + kw;
             memcpy(outRow + x, src + it*inSlice + iy*inputWidth + ix, sizeof(real));
           }
         }
       }
     }
   }
 }

 /*
  * Forward pass for a single (non-batched) frame.
  *
  * Steps, in order:
  *   1. unfold `input` into `finput` with unfolded_copy_vol;
  *   2. create a 2D view of `output`'s storage with sizes
  *      nOutputPlane x (outputDepth*outputHeight*outputWidth);
  *   3. fill each output plane with the matching entry of `bias`;
  *   4. call addmm(output2d, 1, output2d, 1, weight, finput), i.e. add the
  *      product of `weight` and `finput` on top of the bias
  *      (NOTE(review): relies on TH's addmm(beta, t, alpha, m1, m2)
  *      convention - confirm);
  *   5. release the 2D view.
  *
  * The fill addresses `output` through its storage pointer, storageOffset
  * and stride[0] only.
  */
 static void THNN_(VolumetricConvolutionMM_updateOutput_frame)(
   THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput,
   int kT, int kW, int kH,
   int dT, int dW, int dH,
   int pT,int pW, int pH,
   long nInputPlane, long inputDepth, long inputWidth, long inputHeight,
   long nOutputPlane, long outputDepth, long outputWidth, long outputHeight)
 {
   const long outputVolume = outputDepth*outputHeight*outputWidth;
   THTensor *output2d;
   real *outputBase;
   long plane;

   THNN_(unfolded_copy_vol)(
     finput, input,
     kT, kW, kH,
     dT, dW, dH,
     pT, pW, pH,
     nInputPlane,
     inputDepth, inputWidth, inputHeight,
     outputDepth, outputWidth, outputHeight
   );

   output2d = THTensor_(newWithStorage2d)(
     output->storage, output->storageOffset,
     nOutputPlane, -1,
     outputVolume, -1
   );

   /* Seed every output plane with its bias value before the product is added. */
   outputBase = output->storage->data + output->storageOffset;
   for (plane = 0; plane < nOutputPlane; plane++)
   {
     THVector_(fill)(
       outputBase + output->stride[0]*plane,
       THTensor_(get1d)(bias, plane),
       outputVolume
     );
   }

   THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);

   THTensor_(free)(output2d);
 }

 /*
  * Forward pass of the volumetric convolution.
  *
  * input   4D (plane x depth x height x width) or 5D (batch first) tensor.
  * output  resized here to nOutputPlane x oD x oH x oW (plus a leading batch
  *         dimension in batch mode) and overwritten.
  * weight  nOutputPlane is taken from weight->size[0].
  * bias    one value per output plane, read with get1d.
  * finput  scratch buffer, resized here to
  *         (kT*kW*kH*nInputPlane) x (oD*oH*oW), with a leading batch
  *         dimension in batch mode.
  * kT,kW,kH / dT,dW,dH / pT,pW,pH  kernel size, stride and padding along
  *         depth, width and height.
  *
  * Each output extent is (in + 2*pad - kernel) / stride + 1 using integer
  * division. An error is raised through THArgCheck/THError when the input
  * is not 4D/5D, when a stride is not positive, or when any computed output
  * extent is smaller than 1.
  *
  * `state` is not used by this function.
  */
 void THNN_(VolumetricConvolutionMM_updateOutput)(
   THNNState *state,
   THTensor *input,
   THTensor *output,
   THTensor *weight,
   THTensor *bias,
   THTensor *finput,
   int kT, int kW, int kH,
   int dT, int dW, int dH,
   int pT, int pW, int pH)
 {
   int dimf = 0;
   int dimt = 1;
   int dimh = 2;
   int dimw = 3;

   long nInputPlane;
   long inputDepth;
   long inputHeight;
   long inputWidth;
   long nOutputPlane;
   long outputDepth;
   long outputHeight;
   long outputWidth;

   THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
     "4D or 5D(batch mode) tensor expected"
   );

   /* The output-size formula below divides by each stride. */
   THArgCheck(dT > 0 && dW > 0 && dH > 0, 10,
     "stride should be greater than zero"
   );

   /* In batch mode every dimension index shifts by one. */
   if (input->nDimension == 5)
   {
     dimf++;
     dimt++;
     dimh++;
     dimw++;
   }

   nInputPlane = input->size[dimf];
   inputDepth = input->size[dimt];
   inputHeight  = input->size[dimh];
   inputWidth   = input->size[dimw];
   nOutputPlane = weight->size[0];
   outputDepth  = (inputDepth + 2*pT - kT) / dT + 1;
   outputHeight = (inputHeight + 2*pH - kH) / dH + 1;
   outputWidth  = (inputWidth + 2*pW - kW) / dW + 1;

   if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
   {
     /* All arguments are long, so the conversions must be %ld. */
     THError(
       "Given input size: (%ldx%ldx%ldx%ld). Calculated output size: (%ldx%ldx%ldx%ld). Output size is too small",
       nInputPlane, inputDepth, inputHeight, inputWidth,
       nOutputPlane, outputDepth, outputHeight, outputWidth
     );
   }

   if (input->nDimension == 4)
   {
     THTensor_(resize2d)(finput, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
     THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);

     THNN_(VolumetricConvolutionMM_updateOutput_frame)(
       input, output, weight, bias, finput,
       kT, kW, kH,
       dT, dW, dH,
       pT, pW, pH,
       nInputPlane, inputDepth, inputWidth, inputHeight,
       nOutputPlane, outputDepth, outputWidth, outputHeight
     );
   }
   else
   {
     long T = input->size[0];
     long t;

     THTensor_(resize3d)(finput, T, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
     THTensor_(resize5d)(output, T, nOutputPlane, outputDepth, outputHeight, outputWidth);

     /* The OpenMP parallel-for over the batch is disabled in this file. */
     for (t = 0; t < T; t++)
     {
       /* Per-sample views; each is released at the end of the iteration. */
       THTensor *input_t = THTensor_(newSelect)(input, 0, t);
       THTensor *output_t = THTensor_(newSelect)(output, 0, t);
       THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);

       THNN_(VolumetricConvolutionMM_updateOutput_frame)(
         input_t, output_t, weight, bias, finput_t,
         kT, kW, kH,
         dT, dW, dH,
         pT, pW, pH,
         nInputPlane, inputDepth, inputWidth, inputHeight,
         nOutputPlane, outputDepth, outputWidth, outputHeight
       );

       THTensor_(free)(input_t);
       THTensor_(free)(output_t);
       THTensor_(free)(finput_t);
     }
   }
 }

 /*
  * Backward pass w.r.t. the input for a single (non-batched) frame.
  *
  * Steps, in order:
  *   1. view gradOutput's storage as a 2D tensor of sizes
  *      size[0] x (size[1]*size[2]*size[3]);
  *   2. call addmm(fgradInput, 0, fgradInput, 1, weight, gradOutput2d),
  *      i.e. overwrite fgradInput with the product of `weight` and the view
  *      (NOTE(review): relies on TH's addmm(beta, t, alpha, m1, m2)
  *      convention - confirm). The public caller in this file passes
  *      `weight` already transposed;
  *   3. zero gradInput and accumulate fgradInput into it with
  *      unfolded_acc_vol.
  *
  * Tensor sizes are narrowed to int when handed to unfolded_acc_vol, whose
  * parameters are int.
  */
 static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
   THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput,
   int kT, int kW, int kH,
   int dT, int dW, int dH,
   int pT, int pW, int pH)
 {
   /* Geometry of the input gradient: plane x depth x height x width. */
   const int nInputPlane = (int)gradInput->size[0];
   const int inputDepth  = (int)gradInput->size[1];
   const int inputHeight = (int)gradInput->size[2];
   const int inputWidth  = (int)gradInput->size[3];
   /* Geometry of the output gradient: plane x depth x height x width. */
   const int outputDepth  = (int)gradOutput->size[1];
   const int outputHeight = (int)gradOutput->size[2];
   const int outputWidth  = (int)gradOutput->size[3];

   THTensor *gradOutput2d = THTensor_(newWithStorage2d)(
     gradOutput->storage, gradOutput->storageOffset,
     gradOutput->size[0], -1,
     gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
   );

   THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
   THTensor_(free)(gradOutput2d);

   /* unfolded_acc_vol only adds, so the destination must start from zero. */
   THTensor_(zero)(gradInput);

   THNN_(unfolded_acc_vol)(
     fgradInput, gradInput,
     kT, kW, kH,
     dT, dW, dH,
     pT, pW, pH,
     nInputPlane, inputDepth, inputWidth, inputHeight,
     outputDepth, outputWidth, outputHeight
   );
 }

 /*
  * Backward pass of the volumetric convolution w.r.t. its input.
  *
  * gradInput is resized like `input` and fgradInput like `finput`. `weight`
  * must be 2D and its first size must equal the plane dimension of
  * gradOutput (dimension 1 when input is 5D, dimension 0 otherwise).
  *
  * `weight` is transposed in place (dimensions 0 and 1) for the duration of
  * the computation and transposed back before returning. A 4D input is
  * handled as one frame; any other input is iterated over its first
  * dimension, one frame per index.
  *
  * `state` is not used by this function.
  */
 void THNN_(VolumetricConvolutionMM_updateGradInput)(
   THNNState *state,
   THTensor *input,
   THTensor *gradOutput,
   THTensor *gradInput,
   THTensor *weight,
   THTensor *finput,
   THTensor *fgradInput,
   int kT, int kW, int kH,
   int dT, int dW, int dH,
   int pT, int pW, int pH)
 {
   int nOutputPlane;

   /* The weight tensor implicitly defines the plane counts and kernel size. */
   THArgCheck(weight->nDimension == 2, 4,
     "2D weight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))"
   );

   nOutputPlane = (int)weight->size[0];

   THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 1,
     "Number of output features is not equal to nOutputPlane"
   );

   THTensor_(resizeAs)(gradInput, input);
   THTensor_(resizeAs)(fgradInput, finput);

   /* Temporarily swap weight's two dimensions; undone at the end. */
   THTensor_(transpose)(weight, weight, 0, 1);

   if (input->nDimension == 4)
   {
     THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
       gradInput, gradOutput, weight, fgradInput,
       kT, kW, kH,
       dT, dW, dH,
       pT, pW, pH
     );
   }
   else
   {
     const long batchSize = input->size[0];
     long sample;

     /* The OpenMP parallel-for over the batch is disabled in this file. */
     for (sample = 0; sample < batchSize; sample++)
     {
       /* Per-sample views; each is released at the end of the iteration. */
       THTensor *gradInput_s = THTensor_(newSelect)(gradInput, 0, sample);
       THTensor *gradOutput_s = THTensor_(newSelect)(gradOutput, 0, sample);
       THTensor *fgradInput_s = THTensor_(newSelect)(fgradInput, 0, sample);

       THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
         gradInput_s, gradOutput_s, weight, fgradInput_s,
         kT, kW, kH,
         dT, dW, dH,
         pT, pW, pH
       );

       THTensor_(free)(gradInput_s);
       THTensor_(free)(gradOutput_s);
       THTensor_(free)(fgradInput_s);
     }
   }

   THTensor_(transpose)(weight, weight, 0, 1);
 }

 /*
  * Accumulate parameter gradients for a single (non-batched) frame.
  *
  * gradOutput's storage is viewed as a 2D tensor of sizes
  * size[0] x (size[1]*size[2]*size[3]).
  *   - gradWeight is updated with
  *     addmm(gradWeight, 1, gradWeight, scale, gradOutput2d, finput) while
  *     `finput` is transposed in place; the transpose is undone right after
  *     (NOTE(review): relies on TH's addmm(beta, t, alpha, m1, m2)
  *     convention - confirm).
  *   - gradBias[i] is increased by scale times the sum of row i of the view.
  *     Rows are read through the view's storage pointer, storageOffset and
  *     stride[0].
  */
 static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)(
   THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, real scale)
 {
   real *biasGrad;
   long plane;
   THTensor *gradOutput2d = THTensor_(newWithStorage2d)(
     gradOutput->storage, gradOutput->storageOffset,
     gradOutput->size[0], -1,
     gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
   );

   /* Weight gradient: finput is transposed only for this product. */
   THTensor_(transpose)(finput, finput, 0, 1);
   THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
   THTensor_(transpose)(finput, finput, 0, 1);

   /* Bias gradient: one row sum per output plane. */
   biasGrad = gradBias->storage->data + gradBias->storageOffset;
   for (plane = 0; plane < gradBias->size[0]; plane++)
   {
     const real *row = gradOutput2d->storage->data
       + gradOutput2d->storageOffset
       + plane*gradOutput2d->stride[0];
     real total = 0;
     long col;

     for (col = 0; col < gradOutput2d->size[1]; col++)
     {
       total += row[col];
     }

     biasGrad[plane] += scale * total;
   }

   THTensor_(free)(gradOutput2d);
 }

 /*
  * Accumulate gradients w.r.t. weight and bias.
  *
  * Argument checks: gradWeight must be 2D; gradBias must be 1D with
  * gradWeight->size[0] entries; the plane dimension of gradOutput
  * (dimension 1 when input is 5D, dimension 0 otherwise) must equal
  * gradWeight->size[0].
  *
  * A 4D input is handled as one frame; any other input is iterated over its
  * first dimension, one frame per index, all accumulating into the same
  * gradWeight and gradBias.
  *
  * `state` is not used by this function.
  */
 void THNN_(VolumetricConvolutionMM_accGradParameters)(
   THNNState *state,
   THTensor *input,
   THTensor *gradOutput,
   THTensor *gradWeight,
   THTensor *gradBias,
   THTensor *finput,
   real scale)
 {
   int nOutputPlane;

   THArgCheck(gradWeight->nDimension == 2, 4,
     "2D gradWeight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))"
   );

   nOutputPlane = (int)gradWeight->size[0];

   THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
     "gradBias tensor has wrong size"
   );

   THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 3,
     "Number of output features is not equal to nOutputPlane"
   );

   if (input->nDimension == 4)
   {
     /* Non-batch mode: the tensors are a single frame. */
     THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
   }
   else
   {
     /* Batch mode: accumulate one sample at a time. */
     const long batchSize = input->size[0];
     long sample;

     for (sample = 0; sample < batchSize; sample++)
     {
       THTensor *gradOutput_s = THTensor_(newSelect)(gradOutput, 0, sample);
       THTensor *finput_s = THTensor_(newSelect)(finput, 0, sample);

       THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_s, gradWeight, gradBias, finput_s, scale);

       THTensor_(free)(gradOutput_s);
       THTensor_(free)(finput_s);
     }
   }
 }

 #endif
	#ifndef TH_GENERIC_FILE
	#define TH_GENERIC_FILE "generic/VolumetricConvolutionMM.c"
	#else

	/*
	 * Accumulate the unfolded buffer `finput` back into the volume `input`.
	 *
	 * Both tensors are addressed through their raw data pointers as dense
	 * arrays (tensor strides are not consulted):
	 *   finput: [nInputPlane][kT][kH][kW][outputDepth][outputHeight][outputWidth]
	 *   input:  [nInputPlane][inputDepth][inputHeight][inputWidth]
	 *
	 * For every kernel offset (kt, kh, kw) and output position (t, y, x) the
	 * finput element is added (THVector_(add) with scale 1 and length 1) to the
	 * input element at (t*dT - pT + kt, y*dH - pH + kh, x*dW - pW + kw).
	 * When any padding is positive, positions outside the input volume are
	 * skipped. When no padding is positive, the padding terms are ignored and
	 * no bounds check is performed.
	 *
	 * note: due to write issues, this one cannot be parallelized as well as
	 * unfolded_copy
	 */
	static void THNN_(unfolded_acc_vol)(
	  THTensor *finput, THTensor *input,
	  int kT, int kW, int kH,
	  int dT, int dW, int dH,
	  int pT, int pW, int pH,
	  int nInputPlane,
	  int inputDepth, int inputWidth, int inputHeight,
	  int outputDepth, int outputWidth, int outputHeight)
	{
	  int nip;
	  real *input_data = THTensor_(data)(input);
	  real *finput_data = THTensor_(data)(finput);

	  /* The OpenMP parallel-for over planes is disabled in this file. */
	  for (nip = 0; nip < nInputPlane; nip++)
	  {
	    int kt, kw, kh, t, y, x, it, ix, iy;
	    for (kt = 0; kt < kT; kt++)
	    {
	      for (kh = 0; kh < kH; kh++)
	      {
	        for (kw = 0; kw < kW; kw++)
	        {
	          /* Column of finput holding kernel offset (kt, kh, kw) of this plane. */
	          real *src = finput_data
	            + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
	            + kt  * (kH*kW*outputDepth*outputHeight*outputWidth)
	            + kh  * (kW*outputDepth*outputHeight*outputWidth)
	            + kw  * (outputDepth*outputHeight*outputWidth);

	          real *dst = input_data + nip*(inputDepth*inputHeight*inputWidth);
	          if (pT > 0 || pH > 0 || pW > 0)
	          {
	            for (t = 0; t < outputDepth; t++)
	            {
	              it = t*dT - pT + kt;
	              for (y = 0; y < outputHeight; y++)
	              {
	                iy = y*dH - pH + kh;
	                for (x = 0; x < outputWidth; x++)
	                {
	                  ix = x*dW - pW + kw;
	                  /* Positions that fall into the padding contribute nothing. */
	                  if (it >= 0 && it < inputDepth && iy >= 0 && iy < inputHeight && ix >= 0 && ix < inputWidth)
	                  {
	                    THVector_(add)(dst+it*inputHeight*inputWidth+iy*inputWidth+ix, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
	                  }
	                }
	              }
	            }
	          }
	          else
	          {
	            for (t = 0; t < outputDepth; t++)
	            {
	              it = t*dT + kt;
	              for (y = 0; y < outputHeight; y++)
	              {
	                iy = y*dH + kh;
	                for (x = 0; x < outputWidth; x++)
	                {
	                  ix = x*dW + kw;
	                  THVector_(add)(dst+it*inputHeight*inputWidth+iy*inputWidth+ix, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
	                }
	              }
	            }
	          }
	        }
	      }
	    }
	  }
	}

	/*
	 * Unfold the volume `input` into the column buffer `finput`.
	 *
	 * Both tensors are addressed through their raw data pointers as dense
	 * arrays (tensor strides are not consulted):
	 *   input:  [nInputPlane][inputDepth][inputHeight][inputWidth]
	 *   finput: [nInputPlane*kT*kH*kW][outputDepth][outputHeight][outputWidth]
	 *
	 * Column k corresponds to (plane, kt, kh, kw), obtained by decomposing k
	 * with kw varying fastest. Element (t, y, x) of that column receives the
	 * input value at (t*dT - pT + kt, y*dH - pH + kh, x*dW - pW + kw).
	 * When any padding is positive, out-of-range positions are written as
	 * all-zero bytes. When no padding is positive, the padding terms are
	 * ignored and no bounds check is performed.
	 */
	static void THNN_(unfolded_copy_vol)(
	  THTensor *finput, THTensor *input,
	  int kT, int kW, int kH,
	  int dT, int dW, int dH,
	  int pT, int pW, int pH,
	  int nInputPlane,
	  int inputDepth, int inputWidth, int inputHeight,
	  int outputDepth, int outputWidth, int outputHeight)
	{
	  long k;
	  real *input_data = THTensor_(data)(input);
	  real *finput_data = THTensor_(data)(finput);

	  /* The OpenMP parallel-for over columns is disabled in this file. */
	  for (k = 0; k < nInputPlane*kT*kH*kW; k++)
	  {
	    /* Split the flat column index into (plane, kt, kh, kw). */
	    int nip = k / (kT*kH*kW);
	    int rest = k % (kT*kH*kW);
	    int kt = rest / (kH*kW);
	    rest = rest % (kH*kW);
	    int kh = rest / kW;
	    int kw = rest % kW;
	    int t,x,y,it,ix,iy;
	    real *dst = finput_data
	      + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
	      + kt  * (kH*kW*outputDepth*outputHeight*outputWidth)
	      + kh  * (kW*outputDepth*outputHeight*outputWidth)
	      + kw  * (outputDepth*outputHeight*outputWidth);
	    real *src = input_data + nip*(inputDepth*inputHeight*inputWidth);

	    if (pT > 0 || pH > 0 || pW > 0)
	    {
	      for (t = 0; t < outputDepth; t++)
	      {
	        it = t*dT - pT + kt;
	        for (y = 0; y < outputHeight; y++)
	        {
	          iy = y*dH - pH + kh;
	          for (x = 0; x < outputWidth; x++)
	          {
	            ix = x*dW - pW + kw;
	            /* Padding positions are written as zero bytes. */
	            if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth)
	              memset(dst+t*outputHeight*outputWidth+y*outputWidth+x, 0, sizeof(real)*(1));
	            else
	              memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1));
	          }
	        }
	      }
	    }
	    else
	    {
	      for (t = 0; t < outputDepth; t++)
	      {
	        it = t*dT + kt;
	        for (y = 0; y < outputHeight; y++)
	        {
	          iy = y*dH + kh;
	          for (x = 0; x < outputWidth; x++)
	          {
	            ix = x*dW + kw;
	            memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1));
	          }
	        }
	      }
	    }
	  }
	}

	/*
	 * Forward pass for a single (non-batched) frame.
	 *
	 * Steps, in order:
	 *   1. unfold `input` into `finput` with unfolded_copy_vol;
	 *   2. create a 2D view of `output`'s storage with sizes
	 *      nOutputPlane x (outputDepth*outputHeight*outputWidth);
	 *   3. fill each output plane with the matching entry of `bias`;
	 *   4. call addmm(output2d, 1, output2d, 1, weight, finput), i.e. add the
	 *      product of `weight` and `finput` on top of the bias
	 *      (NOTE(review): relies on TH's addmm(beta, t, alpha, m1, m2)
	 *      convention - confirm);
	 *   5. release the 2D view.
	 *
	 * The fill addresses `output` through its storage pointer, storageOffset
	 * and stride[0] only.
	 */
	static void THNN_(VolumetricConvolutionMM_updateOutput_frame)(
	  THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, THTensor *finput,
	  int kT, int kW, int kH,
	  int dT, int dW, int dH,
	  int pT,int pW, int pH,
	  long nInputPlane, long inputDepth, long inputWidth, long inputHeight,
	  long nOutputPlane, long outputDepth, long outputWidth, long outputHeight)
	{
	  long i;
	  THTensor *output2d;

	  THNN_(unfolded_copy_vol)(
	    finput, input,
	    kT, kW, kH,
	    dT, dW, dH,
	    pT, pW, pH,
	    nInputPlane,
	    inputDepth, inputWidth, inputHeight,
	    outputDepth, outputWidth, outputHeight
	  );

	  output2d = THTensor_(newWithStorage2d)(
	    output->storage, output->storageOffset, nOutputPlane, -1,
	    outputDepth*outputHeight*outputWidth, -1
	  );

	  /* Seed every output plane with its bias value before the product is added. */
	  for (i = 0; i < nOutputPlane; i++)
	  {
	    THVector_(fill)(
	      output->storage->data+output->storageOffset+output->stride[0]*i,
	      THTensor_(get1d)(bias, i),
	      outputDepth*outputHeight*outputWidth
	    );
	  }

	  THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);

	  THTensor_(free)(output2d);
	}

	/*
	 * Forward pass of the volumetric convolution.
	 *
	 * input   4D (plane x depth x height x width) or 5D (batch first) tensor.
	 * output  resized here to nOutputPlane x oD x oH x oW (plus a leading batch
	 *         dimension in batch mode) and overwritten.
	 * weight  nOutputPlane is taken from weight->size[0].
	 * bias    one value per output plane, read with get1d.
	 * finput  scratch buffer, resized here to
	 *         (kT*kW*kH*nInputPlane) x (oD*oH*oW), with a leading batch
	 *         dimension in batch mode.
	 * kT,kW,kH / dT,dW,dH / pT,pW,pH  kernel size, stride and padding along
	 *         depth, width and height.
	 *
	 * Each output extent is (in + 2*pad - kernel) / stride + 1 using integer
	 * division. An error is raised through THArgCheck/THError when the input
	 * is not 4D/5D, when a stride is not positive, or when any computed output
	 * extent is smaller than 1.
	 *
	 * `state` is not used by this function.
	 */
	void THNN_(VolumetricConvolutionMM_updateOutput)(
	  THNNState *state,
	  THTensor *input,
	  THTensor *output,
	  THTensor *weight,
	  THTensor *bias,
	  THTensor *finput,
	  int kT, int kW, int kH,
	  int dT, int dW, int dH,
	  int pT, int pW, int pH)
	{
	  int dimf = 0;
	  int dimt = 1;
	  int dimh = 2;
	  int dimw = 3;

	  long nInputPlane;
	  long inputDepth;
	  long inputHeight;
	  long inputWidth;
	  long nOutputPlane;
	  long outputDepth;
	  long outputHeight;
	  long outputWidth;

	  THArgCheck(input->nDimension == 4 || input->nDimension == 5, 2,
	    "4D or 5D(batch mode) tensor expected"
	  );

	  /* The output-size formula below divides by each stride. */
	  THArgCheck(dT > 0 && dW > 0 && dH > 0, 10,
	    "stride should be greater than zero"
	  );

	  /* In batch mode every dimension index shifts by one. */
	  if (input->nDimension == 5)
	  {
	    dimf++;
	    dimt++;
	    dimh++;
	    dimw++;
	  }

	  nInputPlane = input->size[dimf];
	  inputDepth = input->size[dimt];
	  inputHeight = input->size[dimh];
	  inputWidth = input->size[dimw];
	  nOutputPlane = weight->size[0];
	  outputDepth = (inputDepth + 2*pT - kT) / dT + 1;
	  outputHeight = (inputHeight + 2*pH - kH) / dH + 1;
	  outputWidth = (inputWidth + 2*pW - kW) / dW + 1;

	  if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
	  {
	    /* All arguments are long, so the conversions must be %ld. */
	    THError(
	      "Given input size: (%ldx%ldx%ldx%ld). Calculated output size: (%ldx%ldx%ldx%ld). Output size is too small",
	      nInputPlane, inputDepth, inputHeight, inputWidth,
	      nOutputPlane, outputDepth, outputHeight, outputWidth
	    );
	  }

	  if (input->nDimension == 4)
	  {
	    THTensor_(resize2d)(finput, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
	    THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);

	    THNN_(VolumetricConvolutionMM_updateOutput_frame)(
	      input, output, weight, bias, finput,
	      kT, kW, kH,
	      dT, dW, dH,
	      pT, pW, pH,
	      nInputPlane, inputDepth, inputWidth, inputHeight,
	      nOutputPlane, outputDepth, outputWidth, outputHeight
	    );
	  }
	  else
	  {
	    long T = input->size[0];
	    long t;

	    THTensor_(resize3d)(finput, T, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
	    THTensor_(resize5d)(output, T, nOutputPlane, outputDepth, outputHeight, outputWidth);

	    /* The OpenMP parallel-for over the batch is disabled in this file. */
	    for (t = 0; t < T; t++)
	    {
	      /* Per-sample views; each is released at the end of the iteration. */
	      THTensor *input_t = THTensor_(newSelect)(input, 0, t);
	      THTensor *output_t = THTensor_(newSelect)(output, 0, t);
	      THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);

	      THNN_(VolumetricConvolutionMM_updateOutput_frame)(
	        input_t, output_t, weight, bias, finput_t,
	        kT, kW, kH,
	        dT, dW, dH,
	        pT, pW, pH,
	        nInputPlane, inputDepth, inputWidth, inputHeight,
	        nOutputPlane, outputDepth, outputWidth, outputHeight
	      );

	      THTensor_(free)(input_t);
	      THTensor_(free)(output_t);
	      THTensor_(free)(finput_t);
	    }
	  }
	}

	/*
	 * Backward pass w.r.t. the input for a single (non-batched) frame.
	 *
	 * Steps, in order:
	 *   1. view gradOutput's storage as a 2D tensor of sizes
	 *      size[0] x (size[1]*size[2]*size[3]);
	 *   2. call addmm(fgradInput, 0, fgradInput, 1, weight, gradOutput2d),
	 *      i.e. overwrite fgradInput with the product of `weight` and the view
	 *      (NOTE(review): relies on TH's addmm(beta, t, alpha, m1, m2)
	 *      convention - confirm). The public caller in this file passes
	 *      `weight` already transposed;
	 *   3. zero gradInput and accumulate fgradInput into it with
	 *      unfolded_acc_vol.
	 *
	 * Tensor sizes are narrowed to int when handed to unfolded_acc_vol, whose
	 * parameters are int.
	 */
	static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
	  THTensor *gradInput, THTensor *gradOutput, THTensor *weight, THTensor *fgradInput,
	  int kT, int kW, int kH,
	  int dT, int dW, int dH,
	  int pT, int pW, int pH)
	{
	  THTensor *gradOutput2d = THTensor_(newWithStorage2d)(
	    gradOutput->storage, gradOutput->storageOffset,
	    gradOutput->size[0], -1,
	    gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
	  );

	  THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
	  THTensor_(free)(gradOutput2d);

	  /* unfolded_acc_vol only adds, so the destination must start from zero. */
	  THTensor_(zero)(gradInput);

	  /* Width (size[3]) is passed before height (size[2]) to match the callee. */
	  THNN_(unfolded_acc_vol)(
	    fgradInput, gradInput,
	    kT, kW, kH,
	    dT, dW, dH,
	    pT, pW, pH,
	    gradInput->size[0], gradInput->size[1], gradInput->size[3], gradInput->size[2],
	    gradOutput->size[1], gradOutput->size[3], gradOutput->size[2]
	  );
	}

	/*
	 * Backward pass of the volumetric convolution w.r.t. its input.
	 *
	 * gradInput is resized like `input` and fgradInput like `finput`. `weight`
	 * must be 2D and its first size must equal the plane dimension of
	 * gradOutput (dimension 1 when input is 5D, dimension 0 otherwise).
	 *
	 * `weight` is transposed in place (dimensions 0 and 1) for the duration of
	 * the computation and transposed back before returning. A 4D input is
	 * handled as one frame; any other input is iterated over its first
	 * dimension, one frame per index.
	 *
	 * `state` is not used by this function.
	 */
	void THNN_(VolumetricConvolutionMM_updateGradInput)(
	  THNNState *state,
	  THTensor *input,
	  THTensor *gradOutput,
	  THTensor *gradInput,
	  THTensor *weight,
	  THTensor *finput,
	  THTensor *fgradInput,
	  int kT, int kW, int kH,
	  int dT, int dW, int dH,
	  int pT, int pW, int pH)
	{
	  int nOutputPlane;

	  /* The weight tensor implicitly defines the plane counts and kernel size. */
	  THArgCheck(weight->nDimension == 2, 4,
	    "2D weight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))"
	  );

	  nOutputPlane = (int)weight->size[0];

	  THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 1,
	    "Number of output features is not equal to nOutputPlane"
	  );

	  THTensor_(resizeAs)(gradInput, input);
	  THTensor_(resizeAs)(fgradInput, finput);

	  /* Temporarily swap weight's two dimensions; undone at the end. */
	  THTensor_(transpose)(weight, weight, 0, 1);

	  if (input->nDimension == 4)
	  {
	    THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
	      gradInput, gradOutput, weight, fgradInput,
	      kT, kW, kH,
	      dT, dW, dH,
	      pT, pW, pH
	    );
	  }
	  else
	  {
	    const long batchSize = input->size[0];
	    long sample;

	    /* The OpenMP parallel-for over the batch is disabled in this file. */
	    for (sample = 0; sample < batchSize; sample++)
	    {
	      /* Per-sample views; each is released at the end of the iteration. */
	      THTensor *gradInput_s = THTensor_(newSelect)(gradInput, 0, sample);
	      THTensor *gradOutput_s = THTensor_(newSelect)(gradOutput, 0, sample);
	      THTensor *fgradInput_s = THTensor_(newSelect)(fgradInput, 0, sample);

	      THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
	        gradInput_s, gradOutput_s, weight, fgradInput_s,
	        kT, kW, kH,
	        dT, dW, dH,
	        pT, pW, pH
	      );

	      THTensor_(free)(gradInput_s);
	      THTensor_(free)(gradOutput_s);
	      THTensor_(free)(fgradInput_s);
	    }
	  }

	  THTensor_(transpose)(weight, weight, 0, 1);
	}

	/*
	 * Accumulate parameter gradients for a single (non-batched) frame.
	 *
	 * gradOutput's storage is viewed as a 2D tensor of sizes
	 * size[0] x (size[1]*size[2]*size[3]).
	 *   - gradWeight is updated with
	 *     addmm(gradWeight, 1, gradWeight, scale, gradOutput2d, finput) while
	 *     `finput` is transposed in place; the transpose is undone right after
	 *     (NOTE(review): relies on TH's addmm(beta, t, alpha, m1, m2)
	 *     convention - confirm).
	 *   - gradBias[i] is increased by scale times the sum of row i of the view.
	 *     Rows are read through the view's storage pointer, storageOffset and
	 *     stride[0].
	 */
	static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)(
	  THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, THTensor *finput, real scale)
	{
	  long i;
	  THTensor *gradOutput2d = THTensor_(newWithStorage2d)(
	    gradOutput->storage, gradOutput->storageOffset,
	    gradOutput->size[0], -1,
	    gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
	  );

	  /* Weight gradient: finput is transposed only for this product. */
	  THTensor_(transpose)(finput, finput, 0, 1);
	  THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
	  THTensor_(transpose)(finput, finput, 0, 1);

	  /* Bias gradient: one row sum per output plane. */
	  for (i = 0; i < gradBias->size[0]; i++)
	  {
	    long k;
	    real sum = 0;
	    real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
	    for (k = 0; k < gradOutput2d->size[1]; k++)
	      sum += data[k];

	    (gradBias->storage->data + gradBias->storageOffset)[i] += scale * sum;
	  }

	  THTensor_(free)(gradOutput2d);
	}

	/*
	 * Accumulate gradients w.r.t. weight and bias.
	 *
	 * Argument checks: gradWeight must be 2D; gradBias must be 1D with
	 * gradWeight->size[0] entries; the plane dimension of gradOutput
	 * (dimension 1 when input is 5D, dimension 0 otherwise) must equal
	 * gradWeight->size[0].
	 *
	 * A 4D input is handled as one frame; any other input is iterated over its
	 * first dimension, one frame per index, all accumulating into the same
	 * gradWeight and gradBias.
	 *
	 * `state` is not used by this function.
	 */
	void THNN_(VolumetricConvolutionMM_accGradParameters)(
	  THNNState *state,
	  THTensor *input,
	  THTensor *gradOutput,
	  THTensor *gradWeight,
	  THTensor *gradBias,
	  THTensor *finput,
	  real scale)
	{
	  int nOutputPlane;

	  THArgCheck(gradWeight->nDimension == 2, 4,
	    "2D gradWeight tensor is expected (nOutputPlane x (nInputPlane * kT * kH * kW))"
	  );

	  nOutputPlane = (int)gradWeight->size[0];

	  THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
	    "gradBias tensor has wrong size"
	  );

	  THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 5 ? 1 : 0], 3,
	    "Number of output features is not equal to nOutputPlane"
	  );

	  if (input->nDimension == 4)
	  {
	    /* Non-batch mode: the tensors are a single frame. */
	    THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
	  }
	  else
	  {
	    /* Batch mode: accumulate one sample at a time. */
	    const long batchSize = input->size[0];
	    long sample;

	    for (sample = 0; sample < batchSize; sample++)
	    {
	      THTensor *gradOutput_s = THTensor_(newSelect)(gradOutput, 0, sample);
	      THTensor *finput_s = THTensor_(newSelect)(finput, 0, sample);

	      THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_s, gradWeight, gradBias, finput_s, scale);

	      THTensor_(free)(gradOutput_s);
	      THTensor_(free)(finput_s);
	    }
	  }
	}

	#endif