generic/SpatialFullConvolution.c - platform/external/pytorch - Git at Google

 #ifndef TH_GENERIC_FILE
 #define TH_GENERIC_FILE "generic/SpatialFullConvolution.c"
 #else

 /*
  * im2col: unfold an image into a patch matrix.
  *
  * data_im is read as [channel][row][col] with extents
  * channels x height x width. data_col is written as
  * [channels * kernel_h * kernel_w][height_col][width_col], where
  * height_col and width_col are derived below from the padding, stride and
  * dilation arguments. Kernel taps that land outside the image produce 0.
  */
 static void THNN_(im2col)(const real* data_im, const int channels,
       const int height, const int width, const int kernel_h, const int kernel_w,
       const int pad_h, const int pad_w,
       const int stride_h, const int stride_w,
       const int dilation_h, const int dilation_w,
       real* data_col) {
   /* Extent covered by one dilated kernel along each axis. */
   const int reach_h = dilation_h * (kernel_h - 1) + 1;
   const int reach_w = dilation_w * (kernel_w - 1) + 1;
   const int height_col = (height + 2 * pad_h - reach_h) / stride_h + 1;
   const int width_col = (width + 2 * pad_w - reach_w) / stride_w + 1;
   const int n_rows = channels * kernel_h * kernel_w;
   /* The three loops visit data_col in storage order, so a running
    * pointer reaches the same elements as the explicit index would. */
   real *dst = data_col;
   int row, oy, ox;

   for (row = 0; row < n_rows; ++row) {
     /* Decompose the column-matrix row into (channel, kernel y, kernel x). */
     const int kx = row % kernel_w;
     const int ky = (row / kernel_w) % kernel_h;
     const int plane = row / kernel_h / kernel_w;

     for (oy = 0; oy < height_col; ++oy) {
       const int iy = oy * stride_h - pad_h + ky * dilation_h;

       for (ox = 0; ox < width_col; ++ox) {
         const int ix = ox * stride_w - pad_w + kx * dilation_w;

         if (iy < 0 || ix < 0 || iy >= height || ix >= width) {
           *dst = 0;
         } else {
           *dst = data_im[(plane * height + iy) * width + ix];
         }
         ++dst;
       }
     }
   }
 }

 /*
  * col2im: fold a patch matrix back into an image.
  *
  * data_im (channels x height x width) is zeroed first; every entry of
  * data_col ([channels * kernel_h * kernel_w][height_col][width_col]) whose
  * target pixel lies inside the image is then added to that pixel, so
  * overlapping patches accumulate. Entries that map outside the image are
  * skipped.
  */
 static void THNN_(col2im)(const real* data_col, const int channels,
       const int height, const int width, const int kernel_h, const int kernel_w,
       const int pad_h, const int pad_w,
       const int stride_h, const int stride_w,
       const int dilation_h, const int dilation_w,
       real* data_im) {
   /* Extent covered by one dilated kernel along each axis. */
   const int reach_h = dilation_h * (kernel_h - 1) + 1;
   const int reach_w = dilation_w * (kernel_w - 1) + 1;
   const int height_col = (height + 2 * pad_h - reach_h) / stride_h + 1;
   const int width_col = (width + 2 * pad_w - reach_w) / stride_w + 1;
   const int n_rows = channels * kernel_h * kernel_w;
   /* data_col is consumed in storage order, one element per inner step. */
   const real *src = data_col;
   int row, oy, ox;

   /* Start from an all-zero image; everything below accumulates into it. */
   memset(data_im, 0, sizeof(real) * height * width * channels);

   for (row = 0; row < n_rows; ++row) {
     /* Decompose the column-matrix row into (channel, kernel y, kernel x). */
     const int kx = row % kernel_w;
     const int ky = (row / kernel_w) % kernel_h;
     const int plane = row / kernel_h / kernel_w;

     for (oy = 0; oy < height_col; ++oy) {
       const int iy = oy * stride_h - pad_h + ky * dilation_h;

       for (ox = 0; ox < width_col; ++ox, ++src) {
         const int ix = ox * stride_w - pad_w + kx * dilation_w;

         if (iy < 0 || iy >= height || ix < 0 || ix >= width) {
           continue;
         }
         data_im[(plane * height + iy) * width + ix] += *src;
       }
     }
   }
 }

 /*
  * Forward pass of SpatialFullConvolution.
  *
  * For every sample: a GEMM of the input slice with `weight` fills the
  * `columns` scratch tensor, col2im folds those columns into the output
  * slice, and a second GEMM adds `bias` (multiplied by a vector of ones)
  * on top of the result.
  *
  * A 3D input is viewed as a batch of one for the duration of the call;
  * `input` and `output` are resized back to 3D before returning.
  * `output`, `columns` and `ones` are resized here as needed. `state` is
  * not referenced by this function.
  */
 void THNN_(SpatialFullConvolution_updateOutput)(
     THNNState *state,
     THTensor *input,
     THTensor *output,
     THTensor *weight,
     THTensor *bias,
     THTensor *columns,
     THTensor *ones,
     int kW, int kH,
     int dW, int dH,
     int padW, int padH,
     int adjW, int adjH)
 {
   int nInputPlane = THTensor_(size)(weight,0);
   int nOutputPlane = THTensor_(size)(weight,1);
   int unbatched;
   int sample;
   long inputHeight, inputWidth, outputHeight, outputWidth, batchSize;
   THTensor *inputSlice;
   THTensor *outputSlice;

   THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");

   unbatched = (input->nDimension == 3);
   if (unbatched) {
     THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
     /* Treat the single sample as a batch of one. */
     THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
   } else {
     THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
   }

   /* Spatial geometry of the input and of the result. */
   inputHeight  = input->size[2];
   inputWidth   = input->size[3];
   outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
   outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
   batchSize    = input->size[0];

   /* Size the result and the per-sample column scratch buffer. */
   THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
   THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);

   /* `ones` feeds the bias GEMM below. It is only ever grown here and
    * refilled with 1 whenever it is regrown, which is what allows it to
    * be shared between modules. */
   if (!(ones->nDimension == 2 && ones->size[0]*ones->size[1] >= outputHeight*outputWidth)) {
     THTensor_(resize2d)(ones, outputHeight, outputWidth);
     THTensor_(fill)(ones, 1);
   }

   /* Per-sample views into input and output. */
   inputSlice = THTensor_(new)();
   outputSlice = THTensor_(new)();

   for (sample = 0; sample < batchSize; sample++) {
     /* GEMM dimensions; the BLAS routine works on column-major matrices,
      * hence the swapped-looking argument order below
      * (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm). */
     long colRows  = weight->size[1] * weight->size[2] * weight->size[3];
     long colCols  = columns->size[1];
     long inPlanes = weight->size[0];
     /* Dimensions of the bias GEMM. */
     long biasRows  = nOutputPlane;
     long planeSize = outputHeight * outputWidth;
     long biasInner = 1;

     THTensor_(select)(inputSlice, input, 0, sample);
     THTensor_(select)(outputSlice, output, 0, sample);

     /* Fill `columns` from the input slice and the weights. */
     THBlas_(gemm)(
         'n', 't',
         colCols, colRows, inPlanes,
         1,
         THTensor_(data)(inputSlice), colCols,
         THTensor_(data)(weight), colRows,
         0,
         THTensor_(data)(columns), colCols
     );

     /* Fold the columns into this sample's output (dilation fixed at 1). */
     THNN_(col2im)(
       THTensor_(data)(columns),
       nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
       1, 1,
       THTensor_(data)(outputSlice)
     );

     /* Accumulate the bias on top of the folded output (beta == 1). */
     THBlas_(gemm)(
         't', 'n',
         planeSize, biasRows, biasInner,
         1,
         THTensor_(data)(ones), biasInner,
         THTensor_(data)(bias), biasInner,
         1,
         THTensor_(data)(outputSlice), planeSize
     );
   }

   /* Release the slice views. */
   THTensor_(free)(inputSlice);
   THTensor_(free)(outputSlice);

   /* Undo the temporary batch dimension. */
   if (unbatched) {
     THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
     THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
   }
 }

 /*
  * Backward pass of SpatialFullConvolution with respect to the input.
  *
  * For every sample, im2col unfolds the gradOutput slice into
  * `gradColumns`, and a GEMM of those columns with `weight` overwrites
  * the matching slice of `gradInput` (beta == 0).
  *
  * 3D `input`/`gradOutput` are viewed as a batch of one for the duration
  * of the call; `gradOutput`, `input` and `gradInput` are resized back to
  * 3D before returning. `input` is only consulted for its sizes. `state`
  * is not referenced by this function.
  */
 void THNN_(SpatialFullConvolution_updateGradInput)(
     THNNState *state,
     THTensor *input,
     THTensor *gradOutput,
     THTensor *gradInput,
     THTensor *weight,
     THTensor *gradColumns,
     int kW, int kH,
     int dW, int dH,
     int padW, int padH,
     int adjW, int adjH)
 {
   int nInputPlane = THTensor_(size)(weight,0);
   int nOutputPlane = THTensor_(size)(weight,1);
   int unbatched;
   int sample;
   long inputHeight, inputWidth, outputHeight, outputWidth, batchSize;
   THTensor *gradInputSlice;
   THTensor *gradOutputSlice;

   THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");

   unbatched = (input->nDimension == 3);
   if (unbatched) {
     /* Treat the single sample as a batch of one. */
     THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
     THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
   }

   /* Spatial geometry of the input and of the forward output. */
   inputHeight  = input->size[2];
   inputWidth   = input->size[3];
   outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
   outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
   batchSize    = input->size[0];

   /* Size the result and the per-sample column scratch buffer. */
   THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
   THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);

   /* Per-sample views into gradInput and gradOutput. */
   gradInputSlice = THTensor_(new)();
   gradOutputSlice = THTensor_(new)();

   for (sample = 0; sample < batchSize; sample++) {
     /* GEMM dimensions; the BLAS routine works on column-major matrices
      * (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm). */
     long inPlanes = weight->size[0];
     long colCols  = gradColumns->size[1];
     long colRows  = weight->size[1] * weight->size[2] * weight->size[3];

     THTensor_(select)(gradInputSlice, gradInput, 0, sample);
     THTensor_(select)(gradOutputSlice, gradOutput, 0, sample);

     /* Unfold this sample's gradOutput into columns (dilation fixed at 1). */
     THNN_(im2col)(
       THTensor_(data)(gradOutputSlice),
       nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
       1, 1,
       THTensor_(data)(gradColumns)
     );

     /* Overwrite this sample's gradInput from the columns and the weights. */
     THBlas_(gemm)(
         'n', 'n',
         colCols, inPlanes, colRows,
         1,
         THTensor_(data)(gradColumns), colCols,
         THTensor_(data)(weight), colRows,
         0,
         THTensor_(data)(gradInputSlice), colCols
     );
   }

   /* Release the slice views. */
   THTensor_(free)(gradInputSlice);
   THTensor_(free)(gradOutputSlice);

   /* Undo the temporary batch dimension. */
   if (unbatched) {
     THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
     THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
     THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
   }
 }


 /*
  * Accumulates parameter gradients of SpatialFullConvolution.
  *
  * For every sample, im2col unfolds the gradOutput slice into `columns`;
  * a GEMM of those columns with the input slice, scaled by `scale`, is
  * added to `gradWeight`, and a GEMV of the gradOutput slice with `ones`,
  * scaled by `scale`, is added to `gradBias` (beta == 1 in both calls, so
  * existing gradient values are kept and accumulated into).
  *
  * 3D `input`/`gradOutput` are viewed as a batch of one for the duration
  * of the call and resized back to 3D before returning. `columns` and
  * `ones` are resized here as needed. `state` is not referenced by this
  * function.
  */
 void THNN_(SpatialFullConvolution_accGradParameters)(
     THNNState *state,
     THTensor *input,
     THTensor *gradOutput,
     THTensor *gradWeight,
     THTensor *gradBias,
     THTensor *columns,
     THTensor *ones,
     int kW, int kH,
     int dW, int dH,
     int padW, int padH,
     int adjW, int adjH,
     real scale)
 {
   int nInputPlane = THTensor_(size)(gradWeight,0);
   int nOutputPlane = THTensor_(size)(gradWeight,1);
   int unbatched;
   int sample;
   long inputHeight, inputWidth, outputHeight, outputWidth, batchSize;
   THTensor *inputSlice;
   THTensor *gradOutputSlice;

   THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");

   unbatched = (input->nDimension == 3);
   if (unbatched) {
     /* Treat the single sample as a batch of one. */
     THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
     THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
   }

   /* Spatial geometry of the input and of the forward output. */
   inputHeight  = input->size[2];
   inputWidth   = input->size[3];
   outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
   outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
   batchSize    = input->size[0];

   /* `ones` feeds the bias GEMV below; grow and refill it when too small. */
   if (!(ones->nDimension == 2 && ones->size[0]*ones->size[1] >= outputHeight*outputWidth)) {
     THTensor_(resize2d)(ones, outputHeight, outputWidth);
     THTensor_(fill)(ones, 1);
   }

   /* Size the per-sample column scratch buffer. */
   THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);

   /* Per-sample views into input and gradOutput. */
   inputSlice = THTensor_(new)();
   gradOutputSlice = THTensor_(new)();

   for (sample = 0; sample < batchSize; sample++) {
     long colRows, inPlanes, colCols;
     long biasCount, planeSize;

     THTensor_(select)(inputSlice, input, 0, sample);
     THTensor_(select)(gradOutputSlice, gradOutput, 0, sample);

     /* Unfold this sample's gradOutput into columns (dilation fixed at 1). */
     THNN_(im2col)(
       THTensor_(data)(gradOutputSlice),
       nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
       1, 1,
       THTensor_(data)(columns)
     );

     /* GEMM dimensions; the BLAS routine works on column-major matrices
      * (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm). */
     colRows  = columns->size[0];     /* nOutputPlane * kh * kw */
     inPlanes = inputSlice->size[0];  /* nInputPlane */
     colCols  = columns->size[1];     /* inputHeight * inputWidth */

     /* gradWeight += scale * (product of columns and input slice). */
     THBlas_(gemm)(
         't', 'n',
         colRows, inPlanes, colCols,
         scale,
         THTensor_(data)(columns), colCols,
         THTensor_(data)(inputSlice), colCols,
         1,
         THTensor_(data)(gradWeight), colRows
     );

     /* Dimensions of the bias GEMV (also column-major). */
     biasCount = nOutputPlane;
     planeSize = outputHeight * outputWidth;

     /* gradBias += scale * (gradOutput slice applied to the ones vector). */
     THBlas_(gemv)(
         't',
         planeSize, biasCount,
         scale,
         THTensor_(data)(gradOutputSlice), planeSize,
         THTensor_(data)(ones), 1,
         1,
         THTensor_(data)(gradBias), 1
     );
   }

   /* Release the slice views. */
   THTensor_(free)(inputSlice);
   THTensor_(free)(gradOutputSlice);

   /* Undo the temporary batch dimension. */
   if (unbatched) {
     THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
     THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
   }
 }

 #endif
	#ifndef TH_GENERIC_FILE
	#define TH_GENERIC_FILE "generic/SpatialFullConvolution.c"
	#else

	/*
	 * im2col: unfold an image into a patch matrix.
	 *
	 * data_im is read as [channel][row][col] with extents
	 * channels x height x width. data_col is written as
	 * [channels * kernel_h * kernel_w][height_col][width_col]. Kernel taps
	 * that land outside the image produce 0.
	 */
	static void THNN_(im2col)(const real* data_im, const int channels,
	    const int height, const int width, const int kernel_h, const int kernel_w,
	    const int pad_h, const int pad_w,
	    const int stride_h, const int stride_w,
	    const int dilation_h, const int dilation_w,
	    real* data_col) {
	  /* Extent covered by one dilated kernel along each axis. */
	  const int reach_h = dilation_h * (kernel_h - 1) + 1;
	  const int reach_w = dilation_w * (kernel_w - 1) + 1;
	  const int height_col = (height + 2 * pad_h - reach_h) / stride_h + 1;
	  const int width_col = (width + 2 * pad_w - reach_w) / stride_w + 1;
	  const int n_rows = channels * kernel_h * kernel_w;
	  int row, oy, ox;

	  for (row = 0; row < n_rows; ++row) {
	    /* Decompose the column-matrix row into (channel, kernel y, kernel x). */
	    const int kx = row % kernel_w;
	    const int ky = (row / kernel_w) % kernel_h;
	    const int plane = row / kernel_h / kernel_w;

	    for (oy = 0; oy < height_col; ++oy) {
	      const int iy = oy * stride_h - pad_h + ky * dilation_h;
	      const int row_ok = (iy >= 0 && iy < height);

	      for (ox = 0; ox < width_col; ++ox) {
	        const int ix = ox * stride_w - pad_w + kx * dilation_w;
	        real value = 0;

	        if (row_ok && ix >= 0 && ix < width) {
	          value = data_im[(plane * height + iy) * width + ix];
	        }
	        data_col[(row * height_col + oy) * width_col + ox] = value;
	      }
	    }
	  }
	}

	/*
	 * col2im: fold a patch matrix back into an image.
	 *
	 * data_im (channels x height x width) is zeroed first; every entry of
	 * data_col whose target pixel lies inside the image is then added to
	 * that pixel, so overlapping patches accumulate. Entries that map
	 * outside the image are skipped.
	 */
	static void THNN_(col2im)(const real* data_col, const int channels,
	    const int height, const int width, const int kernel_h, const int kernel_w,
	    const int pad_h, const int pad_w,
	    const int stride_h, const int stride_w,
	    const int dilation_h, const int dilation_w,
	    real* data_im) {
	  /* Extent covered by one dilated kernel along each axis. */
	  const int reach_h = dilation_h * (kernel_h - 1) + 1;
	  const int reach_w = dilation_w * (kernel_w - 1) + 1;
	  const int height_col = (height + 2 * pad_h - reach_h) / stride_h + 1;
	  const int width_col = (width + 2 * pad_w - reach_w) / stride_w + 1;
	  const int n_rows = channels * kernel_h * kernel_w;
	  int row, oy, ox;

	  /* Start from an all-zero image; everything below accumulates into it. */
	  memset(data_im, 0, sizeof(real) * height * width * channels);

	  for (row = 0; row < n_rows; ++row) {
	    /* Decompose the column-matrix row into (channel, kernel y, kernel x). */
	    const int kx = row % kernel_w;
	    const int ky = (row / kernel_w) % kernel_h;
	    const int plane = row / kernel_h / kernel_w;

	    for (oy = 0; oy < height_col; ++oy) {
	      const int iy = oy * stride_h - pad_h + ky * dilation_h;

	      if (iy < 0 || iy >= height) {
	        /* The whole output row falls outside the image. */
	        continue;
	      }
	      for (ox = 0; ox < width_col; ++ox) {
	        const int ix = ox * stride_w - pad_w + kx * dilation_w;

	        if (ix < 0 || ix >= width) {
	          continue;
	        }
	        data_im[(plane * height + iy) * width + ix] +=
	          data_col[(row * height_col + oy) * width_col + ox];
	      }
	    }
	  }
	}

	/*
	 * Forward pass of SpatialFullConvolution.
	 *
	 * For every sample: a GEMM of the input slice with `weight` fills the
	 * `columns` scratch tensor, col2im folds those columns into the output
	 * slice, and a second GEMM adds `bias` (multiplied by a vector of ones)
	 * on top of the result.
	 *
	 * A 3D input is viewed as a batch of one for the duration of the call;
	 * `input` and `output` are resized back to 3D before returning.
	 * `output`, `columns` and `ones` are resized here as needed. `state` is
	 * not referenced by this function.
	 */
	void THNN_(SpatialFullConvolution_updateOutput)(
	    THNNState *state,
	    THTensor *input,
	    THTensor *output,
	    THTensor *weight,
	    THTensor *bias,
	    THTensor *columns,
	    THTensor *ones,
	    int kW, int kH,
	    int dW, int dH,
	    int padW, int padH,
	    int adjW, int adjH)
	{
	  int nInputPlane = THTensor_(size)(weight,0);
	  int nOutputPlane = THTensor_(size)(weight,1);

	  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");

	  int batch = 1;
	  if (input->nDimension == 3) {
	    THArgCheck(input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match");
	    // Force batch
	    batch = 0;
	    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
	  } else {
	    THArgCheck(input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
	  }

	  long inputWidth   = input->size[3];
	  long inputHeight  = input->size[2];
	  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
	  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;

	  // Batch size + input planes
	  long batchSize = input->size[0];

	  // Resize output
	  THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);

	  // Resize temporary columns
	  THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);

	  // Define a buffer of ones, for bias accumulation
	  // Note: this buffer can be shared with other modules, it only ever gets increased,
	  // and always contains ones.
	  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
	    // Resize plane and fill with ones...
	    THTensor_(resize2d)(ones, outputHeight, outputWidth);
	    THTensor_(fill)(ones, 1);
	  }

	  // Helpers: per-sample views into input and output
	  THTensor *input_n = THTensor_(new)();
	  THTensor *output_n = THTensor_(new)();

	  int elt;
	  // For each elt in batch, do:
	  for (elt = 0; elt < batchSize; elt ++) {
	    // Matrix multiply per output:
	    THTensor_(select)(input_n, input, 0, elt);
	    THTensor_(select)(output_n, output, 0, elt);

	    // M,N,K are dims of matrix A and B
	    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
	    long m = weight->size[1] * weight->size[2] * weight->size[3];
	    long n = columns->size[1];
	    long k = weight->size[0];

	    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
	    THBlas_(gemm)(
	        'n', 't',
	        n, m, k,
	        1,
	        THTensor_(data)(input_n), n,
	        THTensor_(data)(weight), m,
	        0,
	        THTensor_(data)(columns), n
	    );

	    // Fold the columns into this sample's output (dilation fixed at 1):
	    THNN_(col2im)(
	      THTensor_(data)(columns),
	      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
	      1, 1,
	      THTensor_(data)(output_n)
	    );

	    // Do Bias after:
	    // M,N,K are dims of matrix A and B
	    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
	    long m_ = nOutputPlane;
	    long n_ = outputHeight * outputWidth;
	    long k_ = 1;

	    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
	    // beta == 1, so the bias is added to the folded output.
	    THBlas_(gemm)(
	        't', 'n',
	        n_, m_, k_,
	        1,
	        THTensor_(data)(ones), k_,
	        THTensor_(data)(bias), k_,
	        1,
	        THTensor_(data)(output_n), n_
	    );

	  }

	  // Free
	  THTensor_(free)(input_n);
	  THTensor_(free)(output_n);

	  // Undo the temporary batch dimension
	  if (batch == 0) {
	    THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
	    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
	  }
	}

	/*
	 * Backward pass of SpatialFullConvolution with respect to the input.
	 *
	 * For every sample, im2col unfolds the gradOutput slice into
	 * `gradColumns`, and a GEMM of those columns with `weight` overwrites
	 * the matching slice of `gradInput` (beta == 0).
	 *
	 * 3D `input`/`gradOutput` are viewed as a batch of one for the duration
	 * of the call; `gradOutput`, `input` and `gradInput` are resized back to
	 * 3D before returning. `input` is only consulted for its sizes. `state`
	 * is not referenced by this function.
	 */
	void THNN_(SpatialFullConvolution_updateGradInput)(
	    THNNState *state,
	    THTensor *input,
	    THTensor *gradOutput,
	    THTensor *gradInput,
	    THTensor *weight,
	    THTensor *gradColumns,
	    int kW, int kH,
	    int dW, int dH,
	    int padW, int padH,
	    int adjW, int adjH)
	{
	  int nInputPlane = THTensor_(size)(weight,0);
	  int nOutputPlane = THTensor_(size)(weight,1);

	  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");

	  int batch = 1;
	  if (input->nDimension == 3) {
	    // Force batch
	    batch = 0;
	    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
	    THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
	  }

	  long inputWidth   = input->size[3];
	  long inputHeight  = input->size[2];
	  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
	  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;

	  // Batch size + input planes
	  long batchSize = input->size[0];

	  // Resize output
	  THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);

	  // Resize temporary columns
	  THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);

	  // Helpers: per-sample views into gradInput and gradOutput
	  THTensor *gradInput_n = THTensor_(new)();
	  THTensor *gradOutput_n = THTensor_(new)();

	  int elt;
	  // For each elt in batch, do:
	  for (elt = 0; elt < batchSize; elt ++) {
	    // Matrix multiply per sample:
	    THTensor_(select)(gradInput_n, gradInput, 0, elt);
	    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);

	    // Extract columns (dilation fixed at 1):
	    THNN_(im2col)(
	      THTensor_(data)(gradOutput_n),
	      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
	      1, 1,
	      THTensor_(data)(gradColumns)
	    );

	    // M,N,K are dims of matrix A and B
	    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
	    long m = weight->size[0];
	    long n = gradColumns->size[1];
	    long k = weight->size[1] * weight->size[2] * weight->size[3];

	    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
	    THBlas_(gemm)(
	        'n', 'n',
	        n, m, k,
	        1,
	        THTensor_(data)(gradColumns), n,
	        THTensor_(data)(weight), k,
	        0,
	        THTensor_(data)(gradInput_n), n
	    );
	  }

	  // Free
	  THTensor_(free)(gradInput_n);
	  THTensor_(free)(gradOutput_n);

	  // Undo the temporary batch dimension
	  if (batch == 0) {
	    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
	    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
	    THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
	  }
	}


	/*
	 * Accumulates parameter gradients of SpatialFullConvolution.
	 *
	 * For every sample, im2col unfolds the gradOutput slice into `columns`;
	 * a GEMM of those columns with the input slice, scaled by `scale`, is
	 * added to `gradWeight`, and a GEMV of the gradOutput slice with `ones`,
	 * scaled by `scale`, is added to `gradBias` (beta == 1 in both calls, so
	 * existing gradient values are kept and accumulated into).
	 *
	 * 3D `input`/`gradOutput` are viewed as a batch of one for the duration
	 * of the call and resized back to 3D before returning. `columns` and
	 * `ones` are resized here as needed. `state` is not referenced by this
	 * function.
	 */
	void THNN_(SpatialFullConvolution_accGradParameters)(
	    THNNState *state,
	    THTensor *input,
	    THTensor *gradOutput,
	    THTensor *gradWeight,
	    THTensor *gradBias,
	    THTensor *columns,
	    THTensor *ones,
	    int kW, int kH,
	    int dW, int dH,
	    int padW, int padH,
	    int adjW, int adjH,
	    real scale)
	{
	  int nInputPlane = THTensor_(size)(gradWeight,0);
	  int nOutputPlane = THTensor_(size)(gradWeight,1);

	  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor is expected");

	  int batch = 1;
	  if (input->nDimension == 3) {
	    // Force batch
	    batch = 0;
	    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
	    THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
	  }

	  long inputWidth   = input->size[3];
	  long inputHeight  = input->size[2];
	  long outputWidth  = (inputWidth - 1) * dW - 2*padW + kW + adjW;
	  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;

	  // Batch size + input planes
	  long batchSize = input->size[0];

	  // Define a buffer of ones, for bias accumulation
	  if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
	    // Resize plane and fill with ones...
	    THTensor_(resize2d)(ones, outputHeight, outputWidth);
	    THTensor_(fill)(ones, 1);
	  }

	  // Resize temporary columns
	  THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);

	  // Helpers: per-sample views into input and gradOutput
	  THTensor *input_n = THTensor_(new)();
	  THTensor *gradOutput_n = THTensor_(new)();

	  int elt;
	  // For each elt in batch, do:
	  for (elt = 0; elt < batchSize; elt ++) {
	    // Matrix multiply per output:
	    THTensor_(select)(input_n, input, 0, elt);
	    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);

	    // Extract columns (dilation fixed at 1):
	    THNN_(im2col)(
	      THTensor_(data)(gradOutput_n),
	      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
	      1, 1,
	      THTensor_(data)(columns)
	    );

	    // M,N,K are dims of matrix A and B
	    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
	    long n = columns->size[0];   // nOutputPlane * kh * kw
	    long m = input_n->size[0];   // nInputPlane
	    long k = columns->size[1];   // inputHeight * inputWidth

	    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
	    // alpha == scale, beta == 1: the scaled product is added to gradWeight.
	    THBlas_(gemm)(
	        't', 'n',
	        n, m, k,
	        scale,
	        THTensor_(data)(columns), k,
	        THTensor_(data)(input_n), k,
	        1,
	        THTensor_(data)(gradWeight), n
	    );

	    // Do Bias:
	    // M,N,K are dims of matrix A and B
	    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
	    long m_ = nOutputPlane;
	    long k_ = outputHeight * outputWidth;

	    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
	    // alpha == scale, beta == 1: the scaled result is added to gradBias.
	    THBlas_(gemv)(
	        't',
	        k_, m_,
	        scale,
	        THTensor_(data)(gradOutput_n), k_,
	        THTensor_(data)(ones), 1,
	        1,
	        THTensor_(data)(gradBias), 1
	    );
	  }

	  // Free
	  THTensor_(free)(input_n);
	  THTensor_(free)(gradOutput_n);

	  // Undo the temporary batch dimension
	  if (batch == 0) {
	    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
	    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
	  }
	}

	#endif