torch/lib/THCUNN/LookupTableBag.cu - platform/external/pytorch - Git at Google

 #include "THCUNN.h"
 #include "common.h"

 #include "THCThrustAllocator.cuh"
 #include <thrust/device_ptr.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/transform_reduce.h>
 #if CUDA_VERSION >= 7000
 #include <thrust/system/cuda/execution_policy.h>
 #endif
 #include <thrust/unique.h>
 #include "THCHalf.h"
 #include "THCHalfAutoNumerics.cuh"
 #include "THCTensorSort.cuh"

 const int WARP_SIZE = 32;
 const int MODE_SUM = 0;
 const int MODE_MEAN = 1;

 template <typename Dtype, typename Acctype>
 __global__ void cunn_LookupTableBag_updateOutputKernel(
   int64_t *input, int64_t *offsets, Dtype *weight, Dtype *output,
   int64_t *offset2bag, int64_t numIndices, int64_t numBags, int64_t stride, int mode,
   int64_t *bag_size) {

   // the strategy here is that each bag x feature is handled by a single thread

   int64_t chunksPerBag = THCCeilDiv(stride, (int64_t) blockDim.x);
   int64_t numChunks = numBags * chunksPerBag;
   int64_t chunkOffset = blockIdx.x * blockDim.y + threadIdx.y;
   int64_t chunkStride = gridDim.x * blockDim.y;

   for (int64_t chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) {
     int64_t featureDim = (chunk % chunksPerBag) * blockDim.x + threadIdx.x;
     if (featureDim < stride) {
       int64_t bag = chunk / chunksPerBag;
       Dtype*  weightFeat = weight + featureDim;
       int64_t begin = offsets[bag] - TH_INDEX_BASE;
       int64_t end = (bag < numBags - 1) ? (offsets[bag + 1] - TH_INDEX_BASE) : numIndices;
       assert(end >= begin);
       Acctype weightFeatSum = ScalarConvert<float, Acctype>::to(0);
       int64_t bag_size_ = 0;
       for (int64_t emb = begin; emb < end; emb++) {
         const int weightRow = ((int) input[emb] - TH_INDEX_BASE) * stride;
         weightFeatSum += ScalarConvert<Dtype, Acctype>::to(weightFeat[weightRow]);
 	bag_size_ ++;
         if (featureDim == 0) {
           offset2bag[emb] = bag + TH_INDEX_BASE;
         }
       }
       if (mode == MODE_MEAN) {
 	weightFeatSum = weightFeatSum / ScalarConvert<int64_t, Acctype>::to(bag_size_);
 	bag_size[bag] = bag_size_;
       }
       output[bag * stride + featureDim] = ScalarConvert<Acctype, Dtype>::to(weightFeatSum);
     }
   }
 }

 // FIXME: removed the accGradParametersKernelByFeature case present in
 // LookupTable. That kernel is faster at small sizes (<768 indices), which
 // does not need LookupTableBag (LookupTable + Sum works fine), but would
 // still be nice to not be slow in that case.

 template <typename Dtype, typename Acctype>
 __global__ void cunn_LookupTableBag_accGradParametersKernel(
   int64_t *input, int64_t *indices, Dtype *gradOutput, Dtype *gradWeight, int64_t *offset2bag,
   int64_t *count, Dtype defaultScale, ptrdiff_t numel, int64_t stride,
   int mode, int64_t *bag_size) {

   int idx = blockIdx.x * 4 + threadIdx.y;

   // Each warp is responsible for an input into the LookupTable.
   // If the preceding input has the same as this input, then the warp
   // exits immediately. The warp also processes subsequent inputs with the
   // same value.
   //
   // Input Warp
   // 1     <warp 1>
   // 1     <warp 1> (<warp 2> exits without doing any work)
   // 5     <warp 3>
   // 8     <warp 4>

   // Number of values proceessed by each thread (grain size)
   const int SZ = 4;

   if (idx < numel
       && (idx == 0 || input[idx] != input[idx - 1])) {
     do {
       const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ;
       const int weightRow = ((int) input[idx] - TH_INDEX_BASE) * stride;

       // Note: only this line changes from LookupTable_accgradParametersKernel
       const int origRow = ((int) indices[idx] - TH_INDEX_BASE);
       const int seq_number = offset2bag[origRow] - TH_INDEX_BASE;
       const int gradOutputRow = ((int) seq_number) * stride;

       const Acctype scale = count ? ScalarConvert<Dtype, Acctype>::to(defaultScale) / count[idx] : ScalarConvert<Dtype, Acctype>::to(defaultScale);

       Acctype gradient[SZ];
       Acctype weight[SZ];

       #pragma unroll
       for (int ii = 0; ii < SZ; ii++)
       {
         int featureDim = startFeature + ii * WARP_SIZE;
         if (featureDim < stride)
         {
           gradient[ii] = ScalarConvert<Dtype, Acctype>::to(gradOutput[gradOutputRow + featureDim]);
 	  if (mode == MODE_MEAN) {
 	    gradient[ii] /= bag_size[seq_number];
 	  }
           weight[ii] = ScalarConvert<Dtype, Acctype>::to(gradWeight[weightRow + featureDim]);
         }
       }

       #pragma unroll
       for (int ii = 0; ii < SZ; ii++)
       {
         weight[ii] += gradient[ii] * scale;
       }

       #pragma unroll
       for (int ii = 0; ii < SZ; ii++)
       {
         int featureDim = startFeature + ii * WARP_SIZE;
         if (featureDim < stride)
         {
           gradWeight[weightRow + featureDim] = ScalarConvert<Acctype, Dtype>::to(weight[ii]);
         }
       }

       idx++;
     } while (idx < numel && input[idx] == input[idx - 1]);
   }
 }


 #include "generic/LookupTableBag.cu"
 #include "THCGenerateFloatTypes.h"
	#include "THCUNN.h"
	#include "common.h"

	#include "THCThrustAllocator.cuh"
	#include <thrust/device_ptr.h>
	#include <thrust/execution_policy.h>
	#include <thrust/iterator/constant_iterator.h>
	#include <thrust/transform_reduce.h>
	#if CUDA_VERSION >= 7000
	#include <thrust/system/cuda/execution_policy.h>
	#endif
	#include <thrust/unique.h>
	#include "THCHalf.h"
	#include "THCHalfAutoNumerics.cuh"
	#include "THCTensorSort.cuh"

	const int WARP_SIZE = 32;
	const int MODE_SUM = 0;
	const int MODE_MEAN = 1;

	template <typename Dtype, typename Acctype>
	__global__ void cunn_LookupTableBag_updateOutputKernel(
	int64_t input, int64_t offsets, Dtype weight, Dtype output,
	int64_t *offset2bag, int64_t numIndices, int64_t numBags, int64_t stride, int mode,
	int64_t *bag_size) {

	// the strategy here is that each bag x feature is handled by a single thread

	int64_t chunksPerBag = THCCeilDiv(stride, (int64_t) blockDim.x);
	int64_t numChunks = numBags * chunksPerBag;
	int64_t chunkOffset = blockIdx.x * blockDim.y + threadIdx.y;
	int64_t chunkStride = gridDim.x * blockDim.y;

	for (int64_t chunk = chunkOffset; chunk < numChunks; chunk += chunkStride) {
	int64_t featureDim = (chunk % chunksPerBag) * blockDim.x + threadIdx.x;
	if (featureDim < stride) {
	int64_t bag = chunk / chunksPerBag;
	Dtype* weightFeat = weight + featureDim;
	int64_t begin = offsets[bag] - TH_INDEX_BASE;
	int64_t end = (bag < numBags - 1) ? (offsets[bag + 1] - TH_INDEX_BASE) : numIndices;
	assert(end >= begin);
	Acctype weightFeatSum = ScalarConvert<float, Acctype>::to(0);
	int64_t bag_size_ = 0;
	for (int64_t emb = begin; emb < end; emb++) {
	const int weightRow = ((int) input[emb] - TH_INDEX_BASE) * stride;
	weightFeatSum += ScalarConvert<Dtype, Acctype>::to(weightFeat[weightRow]);
	bag_size_ ++;
	if (featureDim == 0) {
	offset2bag[emb] = bag + TH_INDEX_BASE;
	}
	}
	if (mode == MODE_MEAN) {
	weightFeatSum = weightFeatSum / ScalarConvert<int64_t, Acctype>::to(bag_size_);
	bag_size[bag] = bag_size_;
	}
	output[bag * stride + featureDim] = ScalarConvert<Acctype, Dtype>::to(weightFeatSum);
	}
	}
	}

	// FIXME: removed the accGradParametersKernelByFeature case present in
	// LookupTable. That kernel is faster at small sizes (<768 indices), which
	// does not need LookupTableBag (LookupTable + Sum works fine), but would
	// still be nice to not be slow in that case.

	template <typename Dtype, typename Acctype>
	__global__ void cunn_LookupTableBag_accGradParametersKernel(
	int64_t input, int64_t indices, Dtype gradOutput, Dtype gradWeight, int64_t *offset2bag,
	int64_t *count, Dtype defaultScale, ptrdiff_t numel, int64_t stride,
	int mode, int64_t *bag_size) {

	int idx = blockIdx.x * 4 + threadIdx.y;

	// Each warp is responsible for an input into the LookupTable.
	// If the preceding input has the same as this input, then the warp
	// exits immediately. The warp also processes subsequent inputs with the
	// same value.
	//
	// Input Warp
	// 1 <warp 1>
	// 1 <warp 1> (<warp 2> exits without doing any work)
	// 5 <warp 3>
	// 8 <warp 4>

	// Number of values proceessed by each thread (grain size)
	const int SZ = 4;

	if (idx < numel
	&& (idx == 0 \|\| input[idx] != input[idx - 1])) {
	do {
	const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ;
	const int weightRow = ((int) input[idx] - TH_INDEX_BASE) * stride;

	// Note: only this line changes from LookupTable_accgradParametersKernel
	const int origRow = ((int) indices[idx] - TH_INDEX_BASE);
	const int seq_number = offset2bag[origRow] - TH_INDEX_BASE;
	const int gradOutputRow = ((int) seq_number) * stride;

	const Acctype scale = count ? ScalarConvert<Dtype, Acctype>::to(defaultScale) / count[idx] : ScalarConvert<Dtype, Acctype>::to(defaultScale);

	Acctype gradient[SZ];
	Acctype weight[SZ];

	#pragma unroll
	for (int ii = 0; ii < SZ; ii++)
	{
	int featureDim = startFeature + ii * WARP_SIZE;
	if (featureDim < stride)
	{
	gradient[ii] = ScalarConvert<Dtype, Acctype>::to(gradOutput[gradOutputRow + featureDim]);
	if (mode == MODE_MEAN) {
	gradient[ii] /= bag_size[seq_number];
	}
	weight[ii] = ScalarConvert<Dtype, Acctype>::to(gradWeight[weightRow + featureDim]);
	}
	}

	#pragma unroll
	for (int ii = 0; ii < SZ; ii++)
	{
	weight[ii] += gradient[ii] * scale;
	}

	#pragma unroll
	for (int ii = 0; ii < SZ; ii++)
	{
	int featureDim = startFeature + ii * WARP_SIZE;
	if (featureDim < stride)
	{
	gradWeight[weightRow + featureDim] = ScalarConvert<Acctype, Dtype>::to(weight[ii]);
	}
	}

	idx++;
	} while (idx < numel && input[idx] == input[idx - 1]);
	}
	}


	#include "generic/LookupTableBag.cu"
	#include "THCGenerateFloatTypes.h"