#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include "SharedMem.cuh"
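
/*
 * Numerically stable log-softmax. For one vector x of `classes` (or
 * `classSize`) entries, every kernel below computes
 *
 *   logsoftmax(x)_i = x_i - m - log( sum_j exp(x_j - m) ),   m = max_j x_j
 *
 * Subtracting the max before exponentiating keeps every exp() argument at or
 * below zero, so the sum of exponentials cannot overflow even for half
 * inputs; sums are additionally accumulated in AccumT (e.g. float when T is
 * half).
 */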

template <typename T, typename AccumT>
__global__ void cunn_SpatialLogSoftMax_updateOutput_kernel(T *output, T *input, int classSize, int height, int width)
{
  // one block per sample in the batch; threads stride over the H*W
  // spatial positions
  int batchIndex = blockIdx.x;
  int index = threadIdx.x;

  while (index < height*width) {
    int y = index / width;
    int x = index % width;

    // calculate input starting index in cuda layout (B x H x W x C)
    int inputStartIndex =
      (height*width*classSize)*batchIndex +
      (width*classSize)*y +
      (classSize)*x;

    // running max over the classes at this position (numerical stability)
    T maxInput = input[inputStartIndex];
    for (int i = 1; i < classSize; i++) {
      T value = input[inputStartIndex + i];
      maxInput = THCNumerics<T>::ge(maxInput, value) ? maxInput : value;
    }

    // sum of exp(x - max) over the classes, accumulated in AccumT
    AccumT sum = 0;
    for (int i = 0; i < classSize; i++) {
      sum += THCNumerics<T>::exp(input[inputStartIndex + i] - maxInput);
    }
    T logsum = maxInput + ScalarConvert<AccumT, T>::to(THCNumerics<AccumT>::log(sum));

    for (int i = 0; i < classSize; i++) {
      // calculate output index in torch layout (B x C x H x W)
      int outputIndex =
        (classSize*height*width)*batchIndex +
        (height*width)*i +
        (width)*y +
        x;
      output[outputIndex] = input[inputStartIndex + i] - logsum;
    }
    index += blockDim.x;
  }
}
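
// A minimal launch sketch for the kernel above (illustrative only: the real
// launch code lives in generic/LogSoftMax.cu, and `out`, `in`, `batchSize`
// and `stream` are hypothetical names). The kernel expects its input in the
// channels-last (B x H x W x C) layout it indexes:
//
//   dim3 grid(batchSize);  // one block per sample
//   dim3 block(1024);      // threads stride over the H*W positions
//   cunn_SpatialLogSoftMax_updateOutput_kernel<float, float>
//     <<<grid, block, 0, stream>>>(out, in, classSize, height, width);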
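// The backward kernels rely on the standard log-softmax gradient identity:
// with y = logsoftmax(x) and upstream gradient g = dL/dy,
//
//   dL/dx_i = g_i - exp(y_i) * sum_j g_j
//
// which follows from dy_i/dx_k = delta_ik - softmax(x)_k together with
// exp(y_i) = softmax(x)_i.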
template <typename T, typename AccumT>
__global__ void cunn_SpatialLogSoftMax_updateGradInput_kernel(T *gradInput, T *output, T *gradOutput, int classSize, int height, int width)
{
  // one block per sample in the batch; threads stride over the H*W
  // spatial positions
  int batchIndex = blockIdx.x;
  int index = threadIdx.x;

  while (index < height*width) {
    int y = index / width;
    int x = index % width;

    // calculate output starting index in cuda layout (B x H x W x C)
    int outputStartIndex =
      (height*width*classSize)*batchIndex +
      (width*classSize)*y +
      (classSize)*x;

    // sum of gradOutput over the classes at this position
    AccumT sum = 0;
    for (int i = 0; i < classSize; i++) {
      sum += gradOutput[outputStartIndex + i];
    }

    for (int i = 0; i < classSize; i++) {
      // calculate input index in torch layout (B x C x H x W)
      int inputIndex =
        (classSize*height*width)*batchIndex +
        (height*width)*i +
        (width)*y +
        x;
      gradInput[inputIndex] = ScalarConvert<AccumT, T>::to(
        gradOutput[outputStartIndex + i] - THCNumerics<T>::exp(output[outputStartIndex + i]) * sum);
    }
    index += blockDim.x;
  }
}

// running-max reduction functor
template <typename T, typename AccumT>
struct MaxFloat
{
  __device__ __forceinline__ AccumT operator()(AccumT max, T v) const
  {
    return fmaxType(max, v);
  }
};

// plain-sum reduction functor
template<typename T, typename AccumT>
struct SumFloat
{
  __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const
  {
    return sum + v;
  }
};

// sum-of-exponentials functor; max_k is subtracted from each input first
// so that exp() never overflows
template<typename T, typename AccumT>
struct SumExpFloat
{
  __device__ __forceinline__ SumExpFloat(T v)
    : max_k(v)
  {}

  __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const
  {
    return sum + THCNumerics<T>::exp(v - max_k);
  }

  const T max_k;
};

// identity finalizer for blockReduce
template<typename AccumT>
struct NoFinal
{
  __device__ __forceinline__ AccumT operator()(AccumT v) const
  {
    return v;
  }
};

// log-sum-exp finalizer: turns the reduced sum s into max_k + log(s)
template<typename AccumT>
struct LSMFinal
{
  __device__ __forceinline__ LSMFinal(AccumT m)
    : max_k(m)
  {}

  __device__ __forceinline__ AccumT operator()(AccumT v) const
  {
    return max_k + THCNumerics<AccumT>::log(v);
  }

  const AccumT max_k;
};
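
// Taken together these functors implement the stable log-softmax reduction:
// a MaxFloat pass yields m = max_j x_j, a SumExpFloat pass seeded with m
// yields s = sum_j exp(x_j - m), and LSMFinal turns the block-wide s into
// m + log(s), the term subtracted from every input. NoFinal is the identity
// finalizer for reductions that need no such post-processing.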

template <template<typename, typename> class Reduction, template<typename> class Finalize, typename AccumT>
__device__ __forceinline__ AccumT
blockReduce(AccumT* smem, AccumT val,
            const Reduction<AccumT, AccumT>& r,
            AccumT defaultVal,
            const Finalize<AccumT>& f)
{
  // To avoid RaW races from chaining blockReduce calls together, we
  // need a sync here
  __syncthreads();

  smem[threadIdx.x] = val;

  __syncthreads();

  AccumT warpVal = defaultVal;

  // First warp performs per-warp reductions over all warps' partials
  if ((threadIdx.x / 32) == 0) // only threads in the first warp take this branch
  {
    int lane = threadIdx.x % 32; // from 0 to 31

    // one lane per warp in the block; with fewer than 1024 threads, only
    // the lanes that map to an existing warp do any work (this assumes
    // blockDim.x is a multiple of 32)
    if (lane < blockDim.x / 32)
    {
#pragma unroll
      for (int i = 0; i < 32; ++i)
      {
        warpVal = r(warpVal, smem[lane * 32 + i]);
      }

      smem[lane] = warpVal;
    }
  }

  __syncthreads();

  // Thread 0 reduces the per-warp partials and applies the finalizer
  AccumT blockVal = defaultVal;

  if (threadIdx.x == 0)
  {
    for (int i = 0; i < blockDim.x / 32; ++i)
    {
      blockVal = r(blockVal, smem[i]);
    }

    smem[0] = f(blockVal);
  }

  // Sync and broadcast the finalized value to every thread
  __syncthreads();
  return smem[0];
}
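
// Worked example with illustrative numbers: for blockDim.x = 256 there are 8
// warps, so 256 partials land in smem. Lanes 0..7 of the first warp each
// fold one warp's 32 partials (smem[lane*32] .. smem[lane*32 + 31]) into
// smem[lane]; thread 0 then folds those 8 values, applies f, and stores the
// result in smem[0], which the final __syncthreads() broadcasts to everyone.
// Note the scheme requires blockDim.x to be a multiple of 32.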

// convenience overload: blockReduce with no finalization step
template <template<typename, typename> class Reduction, typename AccumT>
__device__ __forceinline__ AccumT
blockReduce(AccumT* smem, AccumT val,
            const Reduction<AccumT, AccumT>& r,
            AccumT defaultVal)
{
  return blockReduce<Reduction, NoFinal, AccumT>(smem, val, r, defaultVal, NoFinal<AccumT>());
}

// per-thread strided reduction over `size` elements with ILP-way manual
// unrolling: each thread issues ILP independent loads per iteration so the
// memory accesses can overlap
template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT>
__device__ __forceinline__ AccumT
ilpReduce(T* data,
          int size,
          const Reduction<T, AccumT>& r,
          AccumT defaultVal)
{
  AccumT threadVal = defaultVal;
  int offset = threadIdx.x;

  // number of trailing elements that don't fill a whole ILP * blockDim.x tile
  int last = size % (ILP * blockDim.x);

  // Body (unrolled ILP times)
  for (; offset < size - last; offset += blockDim.x * ILP)
  {
    T tmp[ILP];

#pragma unroll
    for (int j = 0; j < ILP; ++j)
    {
      tmp[j] = data[offset + j * blockDim.x];
    }

#pragma unroll
    for (int j = 0; j < ILP; ++j)
    {
      threadVal = r(threadVal, tmp[j]);
    }
  }

  // Epilogue: plain strided loop over the remainder
  for (; offset < size; offset += blockDim.x)
  {
    threadVal = r(threadVal, data[offset]);
  }

  return threadVal;
}
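
// Worked example with illustrative numbers: for ILP = 2, blockDim.x = 128
// and size = 1000, a full tile covers ILP * blockDim.x = 256 elements and
// last = 1000 % 256 = 232. The unrolled body handles elements 0..767 (three
// tiles), issuing both loads per thread before either value is consumed so
// the two memory accesses can be in flight at once; the epilogue then walks
// the remaining 232 elements one per thread per step.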

template <int ILP, typename T, typename AccumT>
__global__ void
cunn_LogSoftMax_updateOutput_kernel(T *output, T *input, int classes)
{
  SharedMem<AccumT> smem;
  AccumT *buffer = smem.getPointer();
  // forward pointers to batch[blockIdx.x]
  // each block handles a sample in the mini-batch
  input += blockIdx.x * classes;
  output += blockIdx.x * classes;

  // each thread computes a partial max over its strided slice of the classes
  AccumT threadMax = ilpReduce<MaxFloat, ILP, T, AccumT>(
      input, classes, MaxFloat<T, AccumT>(), -THCNumerics<AccumT>::max());
  // reduce the partials to the max over this sample's classes
  AccumT max_k = blockReduce<MaxFloat, AccumT>(
      buffer, threadMax, MaxFloat<AccumT, AccumT>(), -THCNumerics<AccumT>::max());
  T max_k_non_accum = ScalarConvert<AccumT, T>::to(max_k);

  // same two-step pattern for sum(exp(x - max)); LSMFinal converts the
  // block-wide sum into max + log(sum)
  AccumT threadExp = ilpReduce<SumExpFloat, ILP, T, AccumT>(
      input, classes, SumExpFloat<T, AccumT>(max_k_non_accum), AccumT(0));
  T logsum_k = ScalarConvert<AccumT, T>::to(
      blockReduce<SumFloat, LSMFinal, AccumT>(
          buffer, threadExp, SumFloat<AccumT, AccumT>(), AccumT(0), LSMFinal<AccumT>(max_k)));

  // Write the output, x_i - logsum, with hand-rolled ILP
  int offset = threadIdx.x;

  int last = classes % (ILP * blockDim.x);
  for (; offset < classes - last; offset += blockDim.x * ILP)
  {
    T tmp[ILP];

#pragma unroll
    for (int j = 0; j < ILP; ++j) {
      tmp[j] = input[offset + j * blockDim.x];
    }

#pragma unroll
    for (int j = 0; j < ILP; ++j)
    {
      output[offset + j * blockDim.x] = tmp[j] - logsum_k;
    }
  }

  for (; offset < classes; offset += blockDim.x)
  {
    output[offset] = input[offset] - logsum_k;
  }
}
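
// A minimal launch sketch for the kernel above (illustrative only: the real
// launch code lives in generic/LogSoftMax.cu, and `out`, `in`, `batchSize`
// and `stream` are hypothetical names). blockReduce stages one AccumT per
// thread, so the dynamic shared-memory size must be blockDim.x * sizeof(AccumT):
//
//   dim3 grid(batchSize);   // one block per sample
//   dim3 block(1024);       // must be a multiple of 32
//   size_t smemSize = block.x * sizeof(float);
//   cunn_LogSoftMax_updateOutput_kernel<2, float, float>
//     <<<grid, block, smemSize, stream>>>(out, in, classes);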

template <int ILP, typename T, typename AccumT>
__global__ void
cunn_LogSoftMax_updateGradInput_kernel(T *gradInput,
                                       T *output,
                                       T *gradOutput,
                                       int classes)
{
  SharedMem<AccumT> smem;
  AccumT *buffer = smem.getPointer();
  // each block handles a sample in the mini-batch
  gradInput += blockIdx.x * classes;
  output += blockIdx.x * classes;
  gradOutput += blockIdx.x * classes;

  // block-wide sum of gradOutput over the classes
  AccumT threadSum = ilpReduce<SumFloat, ILP, T, AccumT>(
      gradOutput, classes, SumFloat<T, AccumT>(), AccumT(0));
  T sum_k = ScalarConvert<AccumT, T>::to(
      blockReduce<SumFloat, AccumT>(
          buffer, threadSum, SumFloat<AccumT, AccumT>(), AccumT(0)));

  // Update gradInput, gradOutput_i - exp(output_i) * sum, with hand-rolled ILP
  int offset = threadIdx.x;
  int last = classes % (ILP * blockDim.x);
  for (; offset < classes - last; offset += blockDim.x * ILP)
  {
    T tmpGradOutput[ILP];
    T tmpOutput[ILP];

#pragma unroll
    for (int j = 0; j < ILP; ++j)
    {
      tmpGradOutput[j] = gradOutput[offset + j * blockDim.x];
      tmpOutput[j] = output[offset + j * blockDim.x];
    }

#pragma unroll
    for (int j = 0; j < ILP; ++j)
    {
      gradInput[offset + j * blockDim.x] =
        tmpGradOutput[j] - THCNumerics<T>::exp(tmpOutput[j]) * sum_k;
    }
  }

  for (; offset < classes; offset += blockDim.x)
  {
    gradInput[offset] =
      gradOutput[offset] - THCNumerics<T>::exp(output[offset]) * sum_k;
  }
}
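
// The backward kernel is launched the same way as the forward one: one block
// per sample, blockDim.x a multiple of 32, and blockDim.x * sizeof(AccumT)
// bytes of dynamic shared memory for blockReduce's scratch buffer.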

#include "generic/LogSoftMax.cu"
#include "THCGenerateFloatTypes.h"