generic/SpatialUpSamplingNearest.cu - platform/external/pytorch - Git at Google

 #ifndef THC_GENERIC_FILE
 #define THC_GENERIC_FILE "generic/SpatialUpSamplingNearest.cu"
 #else

 #include "../common.h"

 static inline void THNN_(SpatialUpSamplingNearest_shapeCheck)
                         (THCState *state,THCTensor *input, THCTensor *gradOutput,
                          int scale_factor) {
   THArgCheck(input != NULL, 2, "4D input tensor expected but got NULL");
   THArgCheck(scale_factor > 1, 4,
              "scale_factor must be greater than 1, but got: %d", scale_factor);
   THCUNN_argCheck(state, input->nDimension == 3 || input->nDimension == 4, 2, input,
                   "3D or 4D input tensor expected but got: %s");
   if (input->nDimension == 3) {
     int nChannels    = THCTensor_(size)(state, input, 0);
     int inputHeight  = THCTensor_(size)(state, input, 1);
     int inputWidth   = THCTensor_(size)(state, input, 2);
     int outputHeight = inputHeight * scale_factor;
     int outputWidth  = inputWidth  * scale_factor;
     if (gradOutput != NULL) {
       THCUNN_check_dim_size(state, gradOutput, 3, 0, nChannels);
       THCUNN_check_dim_size(state, gradOutput, 3, 1, outputHeight);
       THCUNN_check_dim_size(state, gradOutput, 3, 2, outputWidth);
     }
   } else {
     int nBatch       = THCTensor_(size)(state, input, 0);
     int nChannels    = THCTensor_(size)(state, input, 1);
     int inputHeight  = THCTensor_(size)(state, input, 2);
     int inputWidth   = THCTensor_(size)(state, input, 3);
     int outputHeight = inputHeight * scale_factor;
     int outputWidth  = inputWidth  * scale_factor;
     if (gradOutput != NULL) {
       THCUNN_check_dim_size(state, gradOutput, 4, 0, nBatch);
       THCUNN_check_dim_size(state, gradOutput, 4, 1, nChannels);
       THCUNN_check_dim_size(state, gradOutput, 4, 2, outputHeight);
       THCUNN_check_dim_size(state, gradOutput, 4, 3, outputWidth);
     }
   }
 }

 void THNN_(SpatialUpSamplingNearest_updateOutput)(
            THCState *state,
            THCTensor *input,
            THCTensor *output,
            int scale_factor)
 {
   THCTensor_(zero)(state, output);

   THCUNN_assertSameGPU(state, 2, input, output);
   THNN_(SpatialUpSamplingNearest_shapeCheck)(state, input, NULL, scale_factor);
   int inputHeight = THCTensor_(size)(state, input, input->nDimension-2);
   int inputWidth  = THCTensor_(size)(state, input,  input->nDimension-1);
   int outputHeight = inputHeight * scale_factor;
   int outputWidth = inputWidth * scale_factor;

    if (input->nDimension == 3) {
      THCTensor_(resize3d)(state, output,
                           THCTensor_(size)(state, input, 0),
                           outputHeight, outputWidth);
    } else {
      THCTensor_(resize4d)(state, output,
                           THCTensor_(size)(state, input, 0),
                           THCTensor_(size)(state, input, 1),
                           outputHeight, outputWidth);
   }

   input = THCTensor_(newContiguous)(state, input);
   // This is for allocating output Tensor
   long no_elements = 1;
   for(int i = 0; i < input->nDimension; i++){
     no_elements *= input->size[i];
   }
   no_elements *= scale_factor * scale_factor;

   int d1;
   int d2;
   int d3;

   if (input->nDimension == 3) {
     d1 = output->size[0];
     d2 = output->size[1];
     d3 = output->size[2];
   } else {
     d1 = output->size[1];
     d2 = output->size[2];
     d3 = output->size[3];
   }

   real *input_data = THCTensor_(data)(state, input);
   real *output_data = THCTensor_(data)(state, output);

   // cuda blocks & threads:
   long nthreads = 256;
   // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
   // 65535 for SM 2.x, 2^32 -1 for >= 3.0
   // TODO: When we move to SM 3.5 we should update this
   long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
   long n_yblocks = (long)ceil((float)no_elements / (float)(n_xblocks * nthreads));
   if (n_yblocks > 65535) {
     THError("Input size is too large!  aborting");
   }
   dim3 blocks(n_xblocks, n_yblocks);
   dim3 threads(nthreads);

   // kernel:
   upscale<<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data, no_elements, scale_factor, d1, d2, d3);
   THCudaCheck(cudaGetLastError());

   // final cut:
   THCTensor_(free)(state, input);
 }

 void THNN_(SpatialUpSamplingNearest_updateGradInput)(
            THCState *state,
            THCTensor *input,
            THCTensor *gradOutput,
            THCTensor *gradInput,
            int scale_factor)
 {

   THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
   THNN_(SpatialUpSamplingNearest_shapeCheck)(state, input, gradOutput, scale_factor);
   THCTensor_(resizeAs)(state, gradInput, input);

   THCTensor_(zero)(state, gradInput);

   real *gradInput_data = THCTensor_(data)(state, gradInput);
   real *gradOutput_data = THCTensor_(data)(state, gradOutput);

   long no_elements = 1;
   for(int i = 0; i < gradInput->nDimension; i++){
     no_elements *= gradInput->size[i];
   }

   int d1;
   int d2;
   int d3;

   if (gradInput->nDimension == 3) {
     d1 = gradInput->size[0];
     d2 = gradInput->size[1];
     d3 = gradInput->size[2];
   } else {
     d1 = gradInput->size[1];
     d2 = gradInput->size[2];
     d3 = gradInput->size[3];
   }

   // cuda blocks & threads:
   long nthreads = 256;
   // Max number of blocks: http://en.wikipedia.org/wiki/CUDA
   // 65535 for SM 2.x, 2^32 -1 for >= 3.0
   // TODO: When we move to SM 3.5 we should update this
   long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
   long n_yblocks = (long)ceil((float)no_elements / (float)(n_xblocks * nthreads));
   if (n_yblocks > 65535) {
     THError("Input size is too large!  aborting");
   }
   dim3 blocks(n_xblocks, n_yblocks);
   dim3 threads(nthreads);

   // kernel:
   downscale<real ,accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data, no_elements,
     scale_factor, d1, d2, d3);
   THCudaCheck(cudaGetLastError());
 }

 #endif
	#ifndef THC_GENERIC_FILE
	#define THC_GENERIC_FILE "generic/SpatialUpSamplingNearest.cu"
	#else

	#include "../common.h"

	static inline void THNN_(SpatialUpSamplingNearest_shapeCheck)
	(THCState state,THCTensor input, THCTensor *gradOutput,
	int scale_factor) {
	THArgCheck(input != NULL, 2, "4D input tensor expected but got NULL");
	THArgCheck(scale_factor > 1, 4,
	"scale_factor must be greater than 1, but got: %d", scale_factor);
	THCUNN_argCheck(state, input->nDimension == 3 \|\| input->nDimension == 4, 2, input,
	"3D or 4D input tensor expected but got: %s");
	if (input->nDimension == 3) {
	int nChannels = THCTensor_(size)(state, input, 0);
	int inputHeight = THCTensor_(size)(state, input, 1);
	int inputWidth = THCTensor_(size)(state, input, 2);
	int outputHeight = inputHeight * scale_factor;
	int outputWidth = inputWidth * scale_factor;
	if (gradOutput != NULL) {
	THCUNN_check_dim_size(state, gradOutput, 3, 0, nChannels);
	THCUNN_check_dim_size(state, gradOutput, 3, 1, outputHeight);
	THCUNN_check_dim_size(state, gradOutput, 3, 2, outputWidth);
	}
	} else {
	int nBatch = THCTensor_(size)(state, input, 0);
	int nChannels = THCTensor_(size)(state, input, 1);
	int inputHeight = THCTensor_(size)(state, input, 2);
	int inputWidth = THCTensor_(size)(state, input, 3);
	int outputHeight = inputHeight * scale_factor;
	int outputWidth = inputWidth * scale_factor;
	if (gradOutput != NULL) {
	THCUNN_check_dim_size(state, gradOutput, 4, 0, nBatch);
	THCUNN_check_dim_size(state, gradOutput, 4, 1, nChannels);
	THCUNN_check_dim_size(state, gradOutput, 4, 2, outputHeight);
	THCUNN_check_dim_size(state, gradOutput, 4, 3, outputWidth);
	}
	}
	}

	void THNN_(SpatialUpSamplingNearest_updateOutput)(
	THCState *state,
	THCTensor *input,
	THCTensor *output,
	int scale_factor)
	{
	THCTensor_(zero)(state, output);

	THCUNN_assertSameGPU(state, 2, input, output);
	THNN_(SpatialUpSamplingNearest_shapeCheck)(state, input, NULL, scale_factor);
	int inputHeight = THCTensor_(size)(state, input, input->nDimension-2);
	int inputWidth = THCTensor_(size)(state, input, input->nDimension-1);
	int outputHeight = inputHeight * scale_factor;
	int outputWidth = inputWidth * scale_factor;

	if (input->nDimension == 3) {
	THCTensor_(resize3d)(state, output,
	THCTensor_(size)(state, input, 0),
	outputHeight, outputWidth);
	} else {
	THCTensor_(resize4d)(state, output,
	THCTensor_(size)(state, input, 0),
	THCTensor_(size)(state, input, 1),
	outputHeight, outputWidth);
	}

	input = THCTensor_(newContiguous)(state, input);
	// This is for allocating output Tensor
	long no_elements = 1;
	for(int i = 0; i < input->nDimension; i++){
	no_elements *= input->size[i];
	}
	no_elements = scale_factor scale_factor;

	int d1;
	int d2;
	int d3;

	if (input->nDimension == 3) {
	d1 = output->size[0];
	d2 = output->size[1];
	d3 = output->size[2];
	} else {
	d1 = output->size[1];
	d2 = output->size[2];
	d3 = output->size[3];
	}

	real *input_data = THCTensor_(data)(state, input);
	real *output_data = THCTensor_(data)(state, output);

	// cuda blocks & threads:
	long nthreads = 256;
	// Max number of blocks: http://en.wikipedia.org/wiki/CUDA
	// 65535 for SM 2.x, 2^32 -1 for >= 3.0
	// TODO: When we move to SM 3.5 we should update this
	long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
	long n_yblocks = (long)ceil((float)no_elements / (float)(n_xblocks * nthreads));
	if (n_yblocks > 65535) {
	THError("Input size is too large! aborting");
	}
	dim3 blocks(n_xblocks, n_yblocks);
	dim3 threads(nthreads);

	// kernel:
	upscale<<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (input_data, output_data, no_elements, scale_factor, d1, d2, d3);
	THCudaCheck(cudaGetLastError());

	// final cut:
	THCTensor_(free)(state, input);
	}

	void THNN_(SpatialUpSamplingNearest_updateGradInput)(
	THCState *state,
	THCTensor *input,
	THCTensor *gradOutput,
	THCTensor *gradInput,
	int scale_factor)
	{

	THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
	THNN_(SpatialUpSamplingNearest_shapeCheck)(state, input, gradOutput, scale_factor);
	THCTensor_(resizeAs)(state, gradInput, input);

	THCTensor_(zero)(state, gradInput);

	real *gradInput_data = THCTensor_(data)(state, gradInput);
	real *gradOutput_data = THCTensor_(data)(state, gradOutput);

	long no_elements = 1;
	for(int i = 0; i < gradInput->nDimension; i++){
	no_elements *= gradInput->size[i];
	}

	int d1;
	int d2;
	int d3;

	if (gradInput->nDimension == 3) {
	d1 = gradInput->size[0];
	d2 = gradInput->size[1];
	d3 = gradInput->size[2];
	} else {
	d1 = gradInput->size[1];
	d2 = gradInput->size[2];
	d3 = gradInput->size[3];
	}

	// cuda blocks & threads:
	long nthreads = 256;
	// Max number of blocks: http://en.wikipedia.org/wiki/CUDA
	// 65535 for SM 2.x, 2^32 -1 for >= 3.0
	// TODO: When we move to SM 3.5 we should update this
	long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535);
	long n_yblocks = (long)ceil((float)no_elements / (float)(n_xblocks * nthreads));
	if (n_yblocks > 65535) {
	THError("Input size is too large! aborting");
	}
	dim3 blocks(n_xblocks, n_yblocks);
	dim3 threads(nthreads);

	// kernel:
	downscale<real ,accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>> (gradInput_data, gradOutput_data, no_elements,
	scale_factor, d1, d2, d3);
	THCudaCheck(cudaGetLastError());
	}

	#endif