#include "THCUNN.h"
#include "common.h"
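
// Cross-map (across-channel) Local Response Normalization for THCUNN.
// The kernels mirror the sliding-window formulation of Caffe's LRN layer:
//   scale[n][c][h][w]  = k + (alpha / size) * sum_{c'} input[n][c'][h][w]^2,
//     where c' ranges over the window of `size` channels centred on c,
//   output[n][c][h][w] = input[n][c][h][w] * scale[n][c][h][w]^(-beta).

// LRNFillScale: one thread per (n, h, w) position. Each thread slides the
// channel window along the channel axis, maintaining a running sum of squares
// (accum_scale) and writing k + alpha_over_size * accum_scale at the window
// centre (channel index head - post_pad).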
template <typename Dtype>
__global__ void
#if __CUDA_ARCH__ >= 320
__launch_bounds__(CUDA_NUM_THREADS)
#endif
LRNFillScale(const int nthreads, const Dtype* const in,
    const int num, const int channels, const int height,
    const int width, const int size, const Dtype alpha_over_size,
    const Dtype k, Dtype* const scale) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // find out the local offset
    const int w = index % width;
    const int h = (index / width) % height;
    const int n = index / width / height;
    const int offset = (n * channels * height + h) * width + w;
    const int step = height * width;
    const Dtype* const in_off = in + offset;
    Dtype* const scale_off = scale + offset;
    int head = 0;
    const int pre_pad = (size - 1) / 2;
    const int post_pad = size - pre_pad - 1;
    Dtype accum_scale = 0;
    // fill the scale at [n, :, h, w]
    // accumulate values
    while (head < post_pad && head < channels) {
      accum_scale += in_off[head * step] * in_off[head * step];
      ++head;
    }
    // both add and subtract
    while (head < channels) {
      accum_scale += in_off[head * step] * in_off[head * step];
      if (head - size >= 0) {
        accum_scale -= in_off[(head - size) * step]
                       * in_off[(head - size) * step];
      }
      scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
      ++head;
    }
    // subtract only
    while (head < channels + post_pad) {
      if (head - size >= 0) {
        accum_scale -= in_off[(head - size) * step]
                       * in_off[(head - size) * step];
      }
      scale_off[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
      ++head;
    }
  }
}
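
// LRNComputeOutput: elementwise output = input * scale^(-beta). This kernel
// is float-only, matching the float THCudaTensor wrappers below.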
__global__ void LRNComputeOutput(const int nthreads, const float* in,
    const float* scale, const float negative_beta, float* out) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    out[index] = in[index] * pow(scale[index], negative_beta);
  }
}
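
// LRNComputeDiff: gradient with respect to the input. As in the forward pass,
// each thread walks the channel axis for one (n, h, w) position and computes
//   bottom_diff = top_diff * scale^(-beta)
//                 - cache_ratio * bottom_data * accum_ratio,
// where accum_ratio is a running sum of top_diff * top_data / scale over the
// local window and cache_ratio = 2 * alpha * beta / size, precomputed on the
// host.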
template <typename Dtype>
__global__ void LRNComputeDiff(const int nthreads,
    const Dtype* const bottom_data, const Dtype* const top_data,
    const Dtype* const scale, const Dtype* const top_diff,
    const int num, const int channels, const int height,
    const int width, const int size, const Dtype negative_beta,
    const Dtype cache_ratio, Dtype* const bottom_diff) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // find out the local offset
    const int w = index % width;
    const int h = (index / width) % height;
    const int n = index / width / height;
    const int offset = (n * channels * height + h) * width + w;
    const int step = height * width;
    const Dtype* const bottom_off = bottom_data + offset;
    const Dtype* const top_off = top_data + offset;
    const Dtype* const scale_off = scale + offset;
    const Dtype* const top_diff_off = top_diff + offset;
    Dtype* const bottom_diff_off = bottom_diff + offset;
    int head = 0;
    const int pre_pad = size - (size + 1) / 2;
    const int post_pad = size - pre_pad - 1;
    Dtype accum_ratio = 0;
    // accumulate values
    while (head < post_pad && head < channels) {
      accum_ratio += top_diff_off[head * step] * top_off[head * step] /
          scale_off[head * step];
      ++head;
    }
    // both add and subtract
    while (head < channels) {
      accum_ratio += top_diff_off[head * step] * top_off[head * step] /
          scale_off[head * step];
      if (head - size >= 0) {
        accum_ratio -= top_diff_off[(head - size) * step] *
            top_off[(head - size) * step] / scale_off[(head - size) * step];
      }
      bottom_diff_off[(head - post_pad) * step] =
          top_diff_off[(head - post_pad) * step]
            * pow(scale_off[(head - post_pad) * step], negative_beta)
          - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
      ++head;
    }
    // subtract only
    while (head < channels + post_pad) {
      if (head - size >= 0) {
        accum_ratio -= top_diff_off[(head - size) * step] *
            top_off[(head - size) * step] / scale_off[(head - size) * step];
      }
      bottom_diff_off[(head - post_pad) * step] =
          top_diff_off[(head - post_pad) * step]
            * pow(scale_off[(head - post_pad) * step], negative_beta)
          - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio;
      ++head;
    }
  }
}
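
// Forward pass: scale and output are resized to match input, LRNFillScale is
// launched with one thread per (n, h, w) position, then LRNComputeOutput with
// one thread per element. Accepts 3D (C x H x W) or 4D (N x C x H x W) input.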
extern "C"
void LRNforward(THCState* state, THCudaTensor* input, THCudaTensor* output,
    THCudaTensor* scale, int local_size, float alpha, float beta, float k)
{
  THCudaTensor_resizeAs(state, output, input);
  THCudaTensor_resizeAs(state, scale, input);

  int batchSize;
  int nInputPlane;
  int imsize_h;
  int imsize_w;

  if (input->nDimension == 3) {
    batchSize = 1;
    nInputPlane = input->size[0];
    imsize_h = input->size[1];
    imsize_w = input->size[2];
  } else {
    batchSize = input->size[0];
    nInputPlane = input->size[1];
    imsize_h = input->size[2];
    imsize_w = input->size[3];
  }

  input = THCudaTensor_newContiguous(state, input);
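
  // LRNFillScale needs one thread per (n, h, w) position, while
  // LRNComputeOutput needs one thread per element, hence n_threads is scaled
  // by nInputPlane between the two launches.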
  int n_threads = batchSize * imsize_h * imsize_w;
  LRNFillScale<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
      n_threads, THCudaTensor_data(state, input), batchSize, nInputPlane, imsize_h, imsize_w, local_size,
      alpha / local_size, k, THCudaTensor_data(state, scale));
  n_threads *= nInputPlane;
  THCudaCheck(cudaGetLastError());
  LRNComputeOutput<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
      n_threads, THCudaTensor_data(state, input), THCudaTensor_data(state, scale), -beta, THCudaTensor_data(state, output));
  THCudaCheck(cudaGetLastError());

  THCudaTensor_free(state, input);
}
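
// Backward pass: consumes the scale buffer computed by LRNforward, so it must
// run after the forward pass on the same input. gradInput is resized to match
// input; output, scale and gradOutput are assumed to have the same shape as
// input.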
extern "C"
void LRNbackward(THCState* state, THCudaTensor* input, THCudaTensor* output,
    THCudaTensor* gradOutput, THCudaTensor* gradInput, THCudaTensor* scale,
    int local_size, float alpha, float beta, float k)
{
  THCudaTensor_resizeAs(state, gradInput, input);

  int batchSize;
  int nInputPlane;
  int imsize_h;
  int imsize_w;

  if (input->nDimension == 3) {
    batchSize = 1;
    nInputPlane = input->size[0];
    imsize_h = input->size[1];
    imsize_w = input->size[2];
  } else {
    batchSize = input->size[0];
    nInputPlane = input->size[1];
    imsize_h = input->size[2];
    imsize_w = input->size[3];
  }

  input = THCudaTensor_newContiguous(state, input);
  gradOutput = THCudaTensor_newContiguous(state, gradOutput);
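
  // One thread per (n, h, w) position; each thread covers the full channel
  // extent for its location.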
  int n_threads = batchSize * imsize_h * imsize_w;
  LRNComputeDiff<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
      n_threads, THCudaTensor_data(state, input), THCudaTensor_data(state, output),
      THCudaTensor_data(state, scale), THCudaTensor_data(state, gradOutput), batchSize, nInputPlane, imsize_h, imsize_w,
      local_size, -beta, float(2. * alpha * beta / local_size),
      THCudaTensor_data(state, gradInput));
  THCudaCheck(cudaGetLastError());

  THCudaTensor_free(state, input);
  THCudaTensor_free(state, gradOutput);
}
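
// THNN entry points: thin wrappers around LRNforward / LRNbackward.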
void THNN_CudaSpatialCrossMapLRN_updateOutput(
    THCState *state,
    THCudaTensor *input,
    THCudaTensor *output,
    THCudaTensor *scale,
    int size,
    float alpha,
    float beta,
    float k)
{
  LRNforward(state, input, output, scale, size, alpha, beta, k);
}
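
// Note the argument order: the THNN interface passes output last among the
// tensors, while LRNbackward expects (input, output, gradOutput, gradInput,
// scale).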
void THNN_CudaSpatialCrossMapLRN_updateGradInput(
    THCState *state,
    THCudaTensor *input,
    THCudaTensor *gradOutput,
    THCudaTensor *gradInput,
    THCudaTensor *scale,
    THCudaTensor *output,
    int size,
    float alpha,
    float beta,
    float k)
{
  LRNbackward(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k);
}