#include "THCUNN.h"
#include "common.h"
struct MaxFloat
{
__device__ __forceinline__ float operator()(float max, float v) const
{
return fmaxf(max, v);
}
};
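// Binary reduction functor: running sum of the elements seen so far.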
struct SumFloat
{
__device__ __forceinline__ float operator()(float sum, float v) const
{
return sum + v;
}
};
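// Binary reduction functor: accumulates sum(exp(v - max_k)), the shifted
// exponential sum used for a numerically stable log-sum-exp.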
struct SumExpFloat
{
__device__ __forceinline__ SumExpFloat(float v)
: max_k(v)
{}
__device__ __forceinline__ float operator()(float sum, float v) const
{
return sum + expf(v - max_k);
}
const float max_k;
};
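// Finalize functor that leaves the reduced value unchanged.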
struct NoFinal
{
__device__ __forceinline__ float operator()(float v) const
{
return v;
}
};
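// Finalize functor that turns the shifted exponential sum into
// log-sum-exp: max_k + log(sum(exp(v - max_k))).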
struct LSMFinal
{
__device__ __forceinline__ LSMFinal(float m)
: max_k(m)
{}
__device__ __forceinline__ float operator()(float v) const
{
return max_k + logf(v);
}
const float max_k;
};
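// Block-wide reduction over the values held by each thread. Every thread
// stores its value into shared memory, each lane of the first warp reduces
// one 32-element chunk, thread 0 combines the per-warp results and applies
// the Finalize functor, and the result is broadcast to all threads through
// smem[0]. Assumes blockDim.x is a multiple of 32 and at most 1024.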
template <typename Reduction, typename Finalize>
__device__ __forceinline__ float
blockReduce(float* smem, float val,
const Reduction& r,
float defaultVal,
const Finalize& f)
{
// To avoid RaW races from chaining blockReduce calls together, we
// need a sync here
__syncthreads();
smem[threadIdx.x] = val;
__syncthreads();
float warpVal = defaultVal;
// First warp will perform per-warp reductions for the remaining warps
if ((threadIdx.x / 32) == 0) // only threads in the first warp enter this branch
{
int lane = threadIdx.x % 32; // from 0 to 31
// with fewer than 1024 threads per block, only the lanes that map to an existing warp participate
if (lane < blockDim.x / 32)
{
#pragma unroll
for (int i = 0; i < 32; ++i)
{
warpVal = r(warpVal, smem[lane * 32 + i]);
}
smem[lane] = warpVal;
}
}
__syncthreads();
// First thread will perform a reduction of the above per-warp reductions
float blockVal = defaultVal;
if (threadIdx.x == 0)
{
for (int i = 0; i < blockDim.x / 32; ++i)
{
blockVal = r(blockVal, smem[i]);
}
smem[0] = f(blockVal);
}
// Sync and broadcast
__syncthreads();
return smem[0];
}
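// Convenience overload without a finalize step.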
template <typename Reduction>
__device__ __forceinline__ float
blockReduce(float* smem, float val,
const Reduction& r,
float defaultVal)
{
return blockReduce<Reduction, NoFinal>(smem, val, r, defaultVal, NoFinal());
}
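// Per-thread strided reduction over `data`, unrolled ILP-fold to expose
// instruction-level parallelism; the tail that does not fill a full
// ILP * blockDim.x chunk is handled by the epilogue loop.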
template <typename Reduction, int ILP>
__device__ __forceinline__ float
ilpReduce(float* data,
int size,
const Reduction& r,
float defaultVal)
{
float threadVal = defaultVal;
int offset = threadIdx.x;
int last = size % (ILP * blockDim.x);
// Body (unroll by ILP times)
for (; offset < size - last; offset += blockDim.x * ILP)
{
float tmp[ILP];
#pragma unroll
for (int j = 0; j < ILP; ++j)
{
tmp[j] = data[offset + j * blockDim.x];
}
#pragma unroll
for (int j = 0; j < ILP; ++j)
{
threadVal = r(threadVal, tmp[j]);
}
}
// Epilogue
for (; offset < size; offset += blockDim.x)
{
threadVal = r(threadVal, data[offset]);
}
return threadVal;
}
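// Forward kernel: one block per sample. Computes
// output[i] = input[i] - (max_k + log(sum_j exp(input[j] - max_k))),
// i.e. the numerically stable log-softmax over the class dimension.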
template <int ILP>
__global__ void
cunn_LogSoftMax_updateOutput_kernel(float *output, float *input, int classes)
{
extern __shared__ float buffer[];
// forward pointers to batch[blockIdx.x]
// each block handles a sample in the mini-batch
input += blockIdx.x * classes;
output += blockIdx.x * classes;
// per-thread max over this sample's classes
float threadMax =
ilpReduce<MaxFloat, ILP>(input, classes, MaxFloat(), -FLT_MAX);
// block-wide max, i.e. the max over all classes of this sample
float max_k =
blockReduce<MaxFloat>(buffer, threadMax, MaxFloat(), -FLT_MAX);
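// shifted exponential sum, reduced across the block and finalized by
// LSMFinal into logsum_k = max_k + log(sum_j exp(input[j] - max_k))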
float threadExp =
ilpReduce<SumExpFloat, ILP>(input, classes, SumExpFloat(max_k), 0.0f);
float logsum_k =
blockReduce<SumFloat, LSMFinal>(
buffer, threadExp, SumFloat(), 0.0f, LSMFinal(max_k));
// Write the log-softmax output (hand-unrolled ILP)
int offset = threadIdx.x;
int last = classes % (ILP * blockDim.x);
for (; offset < classes - last; offset += blockDim.x * ILP)
{
float tmp[ILP];
#pragma unroll
for (int j = 0; j < ILP; ++j) {
tmp[j] = input[offset + j * blockDim.x];
}
#pragma unroll
for (int j = 0; j < ILP; ++j)
{
output[offset + j * blockDim.x] = tmp[j] - logsum_k;
}
}
for (; offset < classes; offset += blockDim.x)
{
output[offset] = input[offset] - logsum_k;
}
}
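// Backward kernel: one block per sample. Computes
// gradInput[i] = gradOutput[i] - exp(output[i]) * sum_j gradOutput[j].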
template <int ILP>
__global__ void
cunn_LogSoftMax_updateGradInput_kernel(float *gradInput,
float *output,
float *gradOutput,
int classes)
{
extern __shared__ float buffer[];
gradInput += blockIdx.x * classes;
output += blockIdx.x * classes;
gradOutput += blockIdx.x * classes;
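// block-wide sum of gradOutput over this sample's classes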
float threadSum =
ilpReduce<SumFloat, 4>(gradOutput, classes, SumFloat(), 0.0f);
float sum_k =
blockReduce<SumFloat>(buffer, threadSum, SumFloat(), 0.0f);
// Update gradInput (hand-unrolled ILP)
int offset = threadIdx.x;
int last = classes % (ILP * blockDim.x);
for (; offset < classes - last; offset += blockDim.x * ILP)
{
float tmpGradOutput[ILP];
float tmpOutput[ILP];
#pragma unroll
for (int j = 0; j < ILP; ++j)
{
tmpGradOutput[j] = gradOutput[offset + j * blockDim.x];
tmpOutput[j] = output[offset + j * blockDim.x];
}
#pragma unroll
for (int j = 0; j < ILP; ++j)
{
gradInput[offset + j * blockDim.x] =
tmpGradOutput[j] - __expf(tmpOutput[j]) * sum_k;
}
}
for (; offset < classes; offset += blockDim.x)
{
gradInput[offset] =
gradOutput[offset] - __expf(output[offset]) * sum_k;
}
}
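// Host entry point for the forward pass: accepts a 1D (classes) or
// 2D (batch x classes) tensor and launches one 1024-thread block per sample,
// with one float of shared memory per thread for the block reductions.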
void THNN_CudaLogSoftMax_updateOutput(THCState *state, THCudaTensor *input, THCudaTensor *output)
{
THCUNN_assertSameGPU(state, 2, input, output);
input = THCudaTensor_newContiguous(state, input);
THCudaTensor_resizeAs(state, output, input);
int batchSize = 1;
int classSize = 0;
if (THCudaTensor_nDimension(state, input) == 1)
{
classSize = THCudaTensor_size(state, input, 0);
}
else if (THCudaTensor_nDimension(state, input) == 2)
{
batchSize = THCudaTensor_size(state, input, 0);
classSize = THCudaTensor_size(state, input, 1);
}
else
{
THError("vector or matrix expected");
}
dim3 grid(batchSize);
dim3 block(1024);
cunn_LogSoftMax_updateOutput_kernel<2>
<<<grid, block, block.x * sizeof(float), THCState_getCurrentStream(state)>>>(
THCudaTensor_data(state, output),
THCudaTensor_data(state, input),
classSize
);
THCudaCheck(cudaGetLastError());
THCudaTensor_free(state, input);
}
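// Host entry point for the backward pass; mirrors the forward launch
// configuration (one 1024-thread block per sample).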
void THNN_CudaLogSoftMax_updateGradInput(THCState *state, THCudaTensor *input, THCudaTensor *gradOutput,
THCudaTensor *gradInput, THCudaTensor *output)
{
THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
output = THCudaTensor_newContiguous(state, output);
gradOutput = THCudaTensor_newContiguous(state, gradOutput);
THCudaTensor_resizeAs(state, gradInput, output);
int batchSize = 1;
int classSize = 0;
if (THCudaTensor_nDimension(state, gradInput) == 1)
{
classSize = THCudaTensor_size(state, gradInput, 0);
}
else if (THCudaTensor_nDimension(state, gradInput) == 2)
{
batchSize = THCudaTensor_size(state, gradInput, 0);
classSize = THCudaTensor_size(state, gradInput, 1);
}
else
{
THError("vector or matrix expected");
}
dim3 grid(batchSize);
dim3 block(1024);
cunn_LogSoftMax_updateGradInput_kernel<2>
<<<grid, block, block.x * sizeof(float), THCState_getCurrentStream(state)>>>(
THCudaTensor_data(state, gradInput),
THCudaTensor_data(state, output),
THCudaTensor_data(state, gradOutput),
classSize
);
THCudaCheck(cudaGetLastError());
THCudaTensor_free(state, gradOutput);
THCudaTensor_free(state, output);
}