#include "THCUNN.h"
#include "common.h"

#include "THCThrustAllocator.cuh"
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/transform_reduce.h>
#if CUDA_VERSION >= 7000
#include <thrust/system/cuda/execution_policy.h>
#endif
#include <thrust/unique.h>
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include "THCTensorSort.cuh"

const int WARP_SIZE = 32;

__device__ __forceinline__ bool warpHasCollision(int val)
{
  // Compare this lane's value to the values held by the next 16 lanes,
  // wrapping around at 32. If any pair of lanes holds the same value,
  // then there is a collision within the warp.
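  // Checking forward distances 1..16 from every lane covers every pair of
  // lanes: a pair at forward distance d > 16 is seen by the other lane at
  // distance 32 - d <= 16. For example, equal values in lanes 3 and 27
  // (distance 24) are caught when lane 27 looks 8 lanes ahead
  // (27 + 8 == 35, which wraps to lane 3).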
  bool dup = false;
  const int laneId = threadIdx.x % 32;

#if __CUDA_ARCH__ >= 300

  #pragma unroll
  for (int i = 1; i <= 16; i++)
  {
    dup |= (__shfl(val, (laneId + i) % 32) == val);
  }

#else

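  // Pre-sm_30 fallback: exchange values through shared memory instead of
  // __shfl. The array size assumes at most 128 threads per block, and the
  // loop below only reads slots belonging to this warp (offset..offset+31),
  // relying on warp-synchronous execution rather than an explicit barrier.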
  volatile __shared__ int values[128];
  values[threadIdx.x] = val;
  const int offset = threadIdx.x - laneId;

  #pragma unroll
  for (int i = 1; i <= 16; i++)
  {
    dup |= (values[offset + ((laneId + i) % 32)] == val);
  }

#endif

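  // Warp vote: if any lane saw a duplicate, every lane reports a collision,
  // so the whole warp takes the same branch in the caller.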
  return __any(dup) != 0;
}

template <typename Dtype>
__global__ void cunn_LookupTable_accGradParametersKernelByFeature(
  long *input, Dtype *gradOutput, Dtype *gradWeight, Dtype scale, ptrdiff_t numel,
  long stride, int paddingValue) {

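  // The indexing below assumes a 128-thread launch (four warps per block):
  // warp w of block b owns feature dimension b * 4 + w.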
  const int featureDim = blockIdx.x * 4 + threadIdx.x / 32;
  if (featureDim >= stride) {
    return;
  }

  // The strategy here is that each warp handles a single feature
  // dimension. Within that feature dimension, points in the
  // [batch][element] dimension can overlap, and we need to determine
  // whether threads want to add to the gradient in a colliding manner.
  // Typically one would use floating-point atomicAdd() to resolve such
  // collisions, but that makes the result non-deterministic whenever
  // collisions occur. Non-determinism is really bad for this code,
  // especially in RNNs, and is prone to snowballing error.
  // In order to get a deterministic order of execution, we handle
  // non-colliding updates separately from colliding ones. Colliding
  // updates are serialized in their order of execution by using the
  // warp-wide collision detector `warpHasCollision`.
  const int laneId = threadIdx.x % 32;
  for (ptrdiff_t i = laneId; i < numel; i += WARP_SIZE) {
    const int weightIndex = (int) (input[i] - TH_INDEX_BASE);
    if (weightIndex == paddingValue - TH_INDEX_BASE) {
      continue;
    }

    Dtype update = gradOutput[i*stride + featureDim] * scale;

    // FIXME: should we accumulate as accreal?
    // Check for collision
    if (warpHasCollision(weightIndex)) {
      // Run all lanes sequentially; warp divergence
      for (int j = 0; j < WARP_SIZE; ++j) {
        if (laneId == j) {
          gradWeight[weightIndex*stride + featureDim] += update;
        }
      }
    } else {
      // No collision; warp coherence
      gradWeight[weightIndex*stride + featureDim] += update;
    }
  }
}

template <typename Dtype, typename Acctype>
__global__ void cunn_LookupTable_accGradParametersKernel(
  long *input, long *indices, Dtype *gradOutput, Dtype *gradWeight,
  long *count, Dtype defaultScale, ptrdiff_t numel, long stride, int paddingValue) {

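  // The launch is expected to use a (WARP_SIZE, 4) thread block: the `* 4`
  // below is blockDim.y, so each block covers four sorted input positions,
  // one per warp.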
  int idx = blockIdx.x * 4 + threadIdx.y;

  // Each warp is responsible for one input into the LookupTable.
  // If the preceding input has the same value as this input, the warp
  // exits immediately. The warp also processes subsequent inputs with the
  // same value.
  //
  // Input  Warp
  // 1      <warp 1>
  // 1      <warp 1> (<warp 2> exits without doing any work)
  // 5      <warp 3>
  // 8      <warp 4>

  // Number of values processed by each thread (grain size)
  const int SZ = 4;
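  // Each lane loads and stores SZ features spaced WARP_SIZE apart, so with
  // blockDim.x == WARP_SIZE a blockIdx.y tile covers blockDim.x * SZ
  // consecutive feature dimensions of the row.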

  if (idx < numel
      && (idx == 0 || input[idx] != input[idx - 1])
      && input[idx] != paddingValue) {
    do {
      const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ;
      const int weightRow = ((int) input[idx] - TH_INDEX_BASE) * stride;
      const int gradOutputRow = ((int) indices[idx] - TH_INDEX_BASE) * stride;
      const Acctype scale = count
        ? ScalarConvert<Dtype, Acctype>::to(defaultScale) / count[idx]
        : ScalarConvert<Dtype, Acctype>::to(defaultScale);
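      // `count`, when provided (scaleGradByFreq), is expected to hold the
      // number of occurrences of input[idx], so an index that appears k
      // times receives the mean of its k gradients rather than their sum.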

      Acctype gradient[SZ];
      Acctype weight[SZ];

      #pragma unroll
      for (int ii = 0; ii < SZ; ii++)
      {
        int featureDim = startFeature + ii * WARP_SIZE;
        if (featureDim < stride)
        {
          gradient[ii] = ScalarConvert<Dtype, Acctype>::to(gradOutput[gradOutputRow + featureDim]);
          weight[ii] = ScalarConvert<Dtype, Acctype>::to(gradWeight[weightRow + featureDim]);
        }
      }

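      // Note: the accumulate loop below is unguarded; entries whose
      // featureDim falls outside the row hold unspecified values, but they
      // are never written back because the store loop re-checks the bound.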
      #pragma unroll
      for (int ii = 0; ii < SZ; ii++)
      {
        weight[ii] += gradient[ii] * scale;
      }

      #pragma unroll
      for (int ii = 0; ii < SZ; ii++)
      {
        int featureDim = startFeature + ii * WARP_SIZE;
        if (featureDim < stride)
        {
          gradWeight[weightRow + featureDim] = ScalarConvert<Acctype, Dtype>::to(weight[ii]);
        }
      }

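      // Advance through the run of duplicate indices: the same warp
      // serially accumulates every occurrence of this row, which keeps the
      // update order (and therefore the result) deterministic.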
      idx++;
    } while (idx < numel && input[idx] == input[idx - 1]);
  }
}

/*
 * Functors for the renorm path: keep the norm of each embedding row
 * within maxNorm.
 */
template <typename Dtype, typename Acctype>
struct pow_v
{
  Acctype normType;
  pow_v(Dtype v) : normType(ScalarConvert<Dtype, Acctype>::to(v)) {}
  __host__ __device__
  Acctype operator()(const Dtype& x) const {
    Acctype xA = ScalarConvert<Dtype, Acctype>::to(x);
    if (normType == 1)
      return std::abs(xA);
    else if (normType == 2)
      return xA * xA;
    else
      return std::pow(std::abs(xA), normType);
  }
};

template <typename T>
struct multiply_s
{
  T scale;
  multiply_s(T s) : scale(s) {}
  __host__ __device__
  T operator()(const T& x) const {
    return x * scale;
  }
};
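
// A sketch of how these functors are typically applied per embedding row by
// the renorm path (the actual call sites live in generic/LookupTable.cu, so
// the argument names here are illustrative):
//
//   Acctype norm = thrust::transform_reduce(
//       policy, row, row + stride,
//       pow_v<Dtype, Acctype>(normType), Acctype(0), thrust::plus<Acctype>());
//   norm = std::pow(norm, Acctype(1) / normType);
//   if (norm > maxNorm) {
//     // Rescale the row in place so its norm drops to (approximately) maxNorm.
//     thrust::transform(
//         policy, row, row + stride, row,
//         multiply_s<Dtype>(ScalarConvert<Acctype, Dtype>::to(maxNorm / (norm + 1e-7))));
//   }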

#include "generic/LookupTable.cu"
#include "THCGenerateFloatTypes.h"