SpatialMaxPooling.cu - platform/external/pytorch - Git at Google

 #include "THCUNN.h"
 #include "common.h"

 // kernels borrowed from Caffe
 template <typename Dtype>
 __global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data,
     const int num, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
     const int kernel_h, const int kernel_w, const int stride_h,
     const int stride_w, const int pad_h, const int pad_w, Dtype* top_data,
     Dtype* top_mask) {
   CUDA_KERNEL_LOOP(index, nthreads) {
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
     int c = (index / pooled_width / pooled_height) % channels;
     int n = index / pooled_width / pooled_height / channels;
     int hstart = ph * stride_h - pad_h;
     int wstart = pw * stride_w - pad_w;
     int hend = min(hstart + kernel_h, height);
     int wend = min(wstart + kernel_w, width);
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
     Dtype maxval = -FLT_MAX;
     int maxidx = -1;
     bottom_data += (n * channels + c) * height * width;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
         if (bottom_data[h * width + w] > maxval) {
           maxidx = h * width + w;
           maxval = bottom_data[maxidx];
         }
       }
     }
     top_data[index] = maxval;
     top_mask[index] = maxidx + 1;
   }
 }


 template <typename Dtype>
 __global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff,
     const Dtype* top_mask, const int num, const int channels,
     const int height, const int width, const int pooled_height,
     const int pooled_width, const int kernel_h, const int kernel_w,
     const int stride_h, const int stride_w, const int pad_h, const int pad_w,
     Dtype* bottom_diff) {
   CUDA_KERNEL_LOOP(index, nthreads) {
     // find out the local index
     // find out the local offset
     int w = index % width;
     int h = (index / width) % height;
     int c = (index / width / height) % channels;
     int n = index / width / height / channels;
     int phstart =
         (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
     int phend = min((h + pad_h) / stride_h + 1, pooled_height);
     int pwstart =
         (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
     int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
     Dtype gradient = 0;
     int offset = (n * channels + c) * pooled_height * pooled_width;
     top_diff += offset;
     top_mask += offset;
     for (int ph = phstart; ph < phend; ++ph) {
       for (int pw = pwstart; pw < pwend; ++pw) {
 	if (top_mask[ph * pooled_width + pw] - 1 == h * width + w) {
 	  gradient += top_diff[ph * pooled_width + pw];
 	}
       }
     }
     bottom_diff[index] = gradient;
   }
 }

 void THNN_CudaSpatialMaxPooling_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode)
 {

   THCUNN_assertSameGPU(state, 3, input, output, indices);
   THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");

   long nInputCols, nInputRows, nInputPlane, batchSize;
   long nOutputCols, nOutputRows;

   if (input->nDimension == 3) {
     nInputCols = input->size[2];
     nInputRows = input->size[1];
     nInputPlane = input->size[0];
     batchSize = 1;
   }
   else
   {
     nInputCols = input->size[3];
     nInputRows = input->size[2];
     nInputPlane = input->size[1];
     batchSize = input->size[0];
   }

   THArgCheck(nInputCols >= kW - padW && nInputRows >= kH - padH, 2, "input image smaller than kernel size");
   THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");

   if(ceil_mode) {
     nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
     nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
   }
   else {
     nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
     nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
   }

   if (padW || padH)
   {
     // ensure that the last pooling starts inside the image
     if ((nOutputRows - 1)*dH >= nInputRows + padH)
       --nOutputRows;
     if ((nOutputCols  - 1)*dW >= nInputCols  + padW)
       --nOutputCols;
   }

   input = THCudaTensor_newContiguous(state, input);
   float* input_data = THCudaTensor_data(state, input);

   THCudaTensor_resize4d(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols);
   THCudaTensor_resizeAs(state, indices, output);

   float* indices_data = THCudaTensor_data(state, indices);
   float* output_data = THCudaTensor_data(state, output);

   int count = THCudaTensor_nElement(state, output);

   MaxPoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
       (count, input_data,
       batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
       kH, kW, dH, dW, padH, padW, output_data, indices_data);

   if(input->nDimension == 3)
     THCudaTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols);

   THCudaTensor_free(state, input);

   // check for errors
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     printf("error in SpatialMaxPooling.updateOutput: %s\n", cudaGetErrorString(err));
     THError("aborting");
   }
 }

 void THNN_CudaSpatialMaxPooling_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput, THCudaTensor *gradInput, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode)
 {
   THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput);

   input = THCudaTensor_newContiguous(state, input);
   gradOutput = THCudaTensor_newContiguous(state, gradOutput);

   long nInputCols, nInputRows, nInputPlane, batchSize;
   long nOutputCols, nOutputRows;

   if (input->nDimension == 3) {
     nInputCols = input->size[2];
     nInputRows = input->size[1];
     nInputPlane = input->size[0];
     batchSize = 1;
   }
   else
   {
     nInputCols = input->size[3];
     nInputRows = input->size[2];
     nInputPlane = input->size[1];
     batchSize = input->size[0];
   }

   if(ceil_mode) {
     nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
     nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
   }
   else {
     nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
     nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
   }


   gradOutput = THCudaTensor_newContiguous(state, gradOutput);
   THCudaTensor_resizeAs(state, gradInput, input);

   int count = THCudaTensor_nElement(state, input);

   MaxPoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
       (count,
       THCudaTensor_data(state, gradOutput),
       THCudaTensor_data(state, indices),
       batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
       kH, kW, dH, dW, padH, padW,
       THCudaTensor_data(state, gradInput));

   THCudaTensor_free(state, gradOutput);

   // check for errors
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     printf("error in SpatialMaxPooling.updateGradInput: %s\n", cudaGetErrorString(err));
     THError("aborting");
   }
   // clean
   THCudaTensor_free(state, input);
   THCudaTensor_free(state, gradOutput);
 }
	#include "THCUNN.h"
	#include "common.h"

	// kernels borrowed from Caffe
	template <typename Dtype>
	__global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data,
	const int num, const int channels, const int height,
	const int width, const int pooled_height, const int pooled_width,
	const int kernel_h, const int kernel_w, const int stride_h,
	const int stride_w, const int pad_h, const int pad_w, Dtype* top_data,
	Dtype* top_mask) {
	CUDA_KERNEL_LOOP(index, nthreads) {
	int pw = index % pooled_width;
	int ph = (index / pooled_width) % pooled_height;
	int c = (index / pooled_width / pooled_height) % channels;
	int n = index / pooled_width / pooled_height / channels;
	int hstart = ph * stride_h - pad_h;
	int wstart = pw * stride_w - pad_w;
	int hend = min(hstart + kernel_h, height);
	int wend = min(wstart + kernel_w, width);
	hstart = max(hstart, 0);
	wstart = max(wstart, 0);
	Dtype maxval = -FLT_MAX;
	int maxidx = -1;
	bottom_data += (n * channels + c) * height * width;
	for (int h = hstart; h < hend; ++h) {
	for (int w = wstart; w < wend; ++w) {
	if (bottom_data[h * width + w] > maxval) {
	maxidx = h * width + w;
	maxval = bottom_data[maxidx];
	}
	}
	}
	top_data[index] = maxval;
	top_mask[index] = maxidx + 1;
	}
	}


	template <typename Dtype>
	__global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff,
	const Dtype* top_mask, const int num, const int channels,
	const int height, const int width, const int pooled_height,
	const int pooled_width, const int kernel_h, const int kernel_w,
	const int stride_h, const int stride_w, const int pad_h, const int pad_w,
	Dtype* bottom_diff) {
	CUDA_KERNEL_LOOP(index, nthreads) {
	// find out the local index
	// find out the local offset
	int w = index % width;
	int h = (index / width) % height;
	int c = (index / width / height) % channels;
	int n = index / width / height / channels;
	int phstart =
	(h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;
	int phend = min((h + pad_h) / stride_h + 1, pooled_height);
	int pwstart =
	(w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / stride_w + 1;
	int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
	Dtype gradient = 0;
	int offset = (n * channels + c) * pooled_height * pooled_width;
	top_diff += offset;
	top_mask += offset;
	for (int ph = phstart; ph < phend; ++ph) {
	for (int pw = pwstart; pw < pwend; ++pw) {
	if (top_mask[ph * pooled_width + pw] - 1 == h * width + w) {
	gradient += top_diff[ph * pooled_width + pw];
	}
	}
	}
	bottom_diff[index] = gradient;
	}
	}

	void THNN_CudaSpatialMaxPooling_updateOutput(THCState state, THCudaTensor input, THCudaTensor output, THCudaTensor indices, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode)
	{

	THCUNN_assertSameGPU(state, 3, input, output, indices);
	THArgCheck(input->nDimension == 3 \|\| input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");

	long nInputCols, nInputRows, nInputPlane, batchSize;
	long nOutputCols, nOutputRows;

	if (input->nDimension == 3) {
	nInputCols = input->size[2];
	nInputRows = input->size[1];
	nInputPlane = input->size[0];
	batchSize = 1;
	}
	else
	{
	nInputCols = input->size[3];
	nInputRows = input->size[2];
	nInputPlane = input->size[1];
	batchSize = input->size[0];
	}

	THArgCheck(nInputCols >= kW - padW && nInputRows >= kH - padH, 2, "input image smaller than kernel size");
	THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");

	if(ceil_mode) {
	nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
	nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
	}
	else {
	nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
	nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
	}

	if (padW \|\| padH)
	{
	// ensure that the last pooling starts inside the image
	if ((nOutputRows - 1)*dH >= nInputRows + padH)
	--nOutputRows;
	if ((nOutputCols - 1)*dW >= nInputCols + padW)
	--nOutputCols;
	}

	input = THCudaTensor_newContiguous(state, input);
	float* input_data = THCudaTensor_data(state, input);

	THCudaTensor_resize4d(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols);
	THCudaTensor_resizeAs(state, indices, output);

	float* indices_data = THCudaTensor_data(state, indices);
	float* output_data = THCudaTensor_data(state, output);

	int count = THCudaTensor_nElement(state, output);

	MaxPoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
	(count, input_data,
	batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
	kH, kW, dH, dW, padH, padW, output_data, indices_data);

	if(input->nDimension == 3)
	THCudaTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols);

	THCudaTensor_free(state, input);

	// check for errors
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
	printf("error in SpatialMaxPooling.updateOutput: %s\n", cudaGetErrorString(err));
	THError("aborting");
	}
	}

	void THNN_CudaSpatialMaxPooling_updateGradInput(THCState state, THCudaTensor input, THCudaTensor gradOutput, THCudaTensor gradInput, THCudaTensor *indices, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode)
	{
	THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput);

	input = THCudaTensor_newContiguous(state, input);
	gradOutput = THCudaTensor_newContiguous(state, gradOutput);

	long nInputCols, nInputRows, nInputPlane, batchSize;
	long nOutputCols, nOutputRows;

	if (input->nDimension == 3) {
	nInputCols = input->size[2];
	nInputRows = input->size[1];
	nInputPlane = input->size[0];
	batchSize = 1;
	}
	else
	{
	nInputCols = input->size[3];
	nInputRows = input->size[2];
	nInputPlane = input->size[1];
	batchSize = input->size[0];
	}

	if(ceil_mode) {
	nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
	nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
	}
	else {
	nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
	nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
	}


	gradOutput = THCudaTensor_newContiguous(state, gradOutput);
	THCudaTensor_resizeAs(state, gradInput, input);

	int count = THCudaTensor_nElement(state, input);

	MaxPoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
	(count,
	THCudaTensor_data(state, gradOutput),
	THCudaTensor_data(state, indices),
	batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
	kH, kW, dH, dW, padH, padW,
	THCudaTensor_data(state, gradInput));

	THCudaTensor_free(state, gradOutput);

	// check for errors
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
	printf("error in SpatialMaxPooling.updateGradInput: %s\n", cudaGetErrorString(err));
	THError("aborting");
	}
	// clean
	THCudaTensor_free(state, input);
	THCudaTensor_free(state, gradOutput);
	}