#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include "THCAtomics.cuh"
#define divup(a, b) (((a) + (b) - 1) / (b))
const int THREADS_PER_BLOCK = 256;
const int THREADS_X = 32;
const int THREADS_Y = THREADS_PER_BLOCK / THREADS_X;
const int REPEAT = 32;
const int64_t NNZ_PER_BLOCK_MAX = 1024;
/* clamp MACRO */
#ifndef clamp
#define clamp(a, low, high) max(min((a), (high)), (low))
#endif
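// CUDA provides atomicExch overloads for int, unsigned int, unsigned long long int
// and float, but not for double. Emulate the double version by exchanging the raw
// 64-bit pattern through the unsigned long long overload.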
__device__ double atomicExch(double *address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long res = atomicExch(address_as_ull, __double_as_longlong(val));
return __longlong_as_double(res);
}
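// For each batch row, accumulates the dot product of its nonzero values with the
// corresponding weight rows (selected by keys + keysOffset), adds the bias, and writes
// one output per column gidx (optionally max-normalizing the values first).
// Grid layout: blockIdx.x tiles outDim, blockIdx.y selects the batch row, and
// blockIdx.z splits the row's nonzeros into chunks of nnzPerBlock; when gridDim.z > 1
// the partial sums are combined with atomicAdd.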
template<typename Ty, bool train>
__global__ static
void updateOutput(
Ty *output,
Ty *normalizedValues,
const Ty *values,
const int64_t *cumSumSizes,
const int64_t *keys,
const int64_t batchSize,
const int64_t outDim,
Ty *weight,
const Ty *bias,
const int64_t weightStride,
const int64_t keysOffset,
const int maxNormalize,
const int nnzPerBlock)
{
/*******************************************************
* Adapted from the following file in arrayfire
* https://github.com/arrayfire/arrayfire/blob/v3.4.1/src/backend/opencl/kernel/csrmm.cl
*
*******************************************************
* Original copyright notice can be seen below:
*
* Copyright (c) 2016, ArrayFire
* All rights reserved.
*
* This file is distributed under 3-clause BSD license.
* The complete license agreement can be obtained at:
* http://arrayfire.com/licenses/BSD-3-Clause
********************************************************/
const int64_t tidx = threadIdx.x;
const int64_t tidy = threadIdx.y;
const int64_t tid = tidy * blockDim.x + tidx;
const int64_t gidx = blockIdx.x * blockDim.x + tidx;
Ty *nWeight = weight;
// Offset by the number of elements specified by maxNormalize.
weight += gidx + maxNormalize;
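// When maxNormalize is enabled, the first maxNormalize entries of each weight row
// hold per-key normalization state: [0] the running max |value|, [1] its reciprocal,
// [2] a per-key scale factor (set to 1 below when the key is seen during training),
// and [3] a per-key bias added to the normalized value.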
output += gidx;
bool within_N = (gidx < outDim);
__shared__ Ty s_values[THREADS_PER_BLOCK];
__shared__ int64_t s_keys[THREADS_PER_BLOCK];
const int64_t rowId = blockIdx.y;
// if (rowId >= batchSize) return;
// Load the nonzero column offsets for current row
const int64_t batchStart = (rowId == 0 ? 0 : cumSumSizes[rowId - 1]) + blockIdx.z * nnzPerBlock;
const int64_t batchEnd = min(batchStart + nnzPerBlock, cumSumSizes[rowId]);
const int64_t batchStride = blockDim.x * blockDim.y;
Ty outVal = 0;
// Since the number of nonzero elements might be greater than the available shared memory,
// load only part of the row into shared memory, perform a partial dot product, and repeat until done.
for (int64_t id = batchStart; id < batchEnd; id += batchStride) {
// Load the current chunk of the row into local memory
int64_t lim = min(batchEnd - id, (int64_t)batchStride);
int64_t key = tid < lim ? keys[id + tid] + keysOffset : -1;
Ty val = tid < lim ? values[id + tid] : 0;
int64_t nWeightOffset = key * weightStride;
if (tid < lim && maxNormalize) {
Ty *nWeightCurr = nWeight + nWeightOffset;
if (train) {
Ty absVal = fabs(val);
Ty maxVal = nWeightCurr[0];
if (absVal > maxVal) {
// Updating maxVal and invMaxVal. Go hogwild!
Ty invAbsVal = 1.0 / absVal;
atomicExch(nWeightCurr + 0, absVal);
atomicExch(nWeightCurr + 1, invAbsVal);
}
val = clamp(val * nWeightCurr[1], -1.0, 1.0) + nWeightCurr[3];
normalizedValues[id + tid] = val;
nWeightCurr[2] = 1;
} else {
val = clamp(val * nWeightCurr[1], -1.0, 1.0) + nWeightCurr[3];
}
}
s_keys[tid] = key;
s_values[tid] = val;
__syncthreads();
// Perform a single "dot" operation for each thread
for (int64_t idy = tidy; within_N && idy < lim; idy += blockDim.y) {
outVal += s_values[idy] * weight[weightStride * s_keys[idy]];
}
__syncthreads();
}
// s_values is no longer used at this point. Reuse it for reducing outVal.
// A reduction along the y dimension now gives a single output value along x.
s_values[tid] = outVal;
for (int64_t y = blockDim.y / 2; y >= 1; y /= 2) {
__syncthreads();
if (tidy < y) s_values[tid] = s_values[tid] + s_values[tid + y * blockDim.x];
}
if (within_N && tidy == 0) {
Ty val = s_values[tid] + (blockIdx.z == 0 ? bias[gidx] : 0);
if (gridDim.z == 1) {
output[rowId * outDim] = val;
} else {
atomicAdd(output + rowId * outDim, val);
}
}
}
// This kernel takes in the following inputs:
// values of size [keysSize x 1] and gradOutput of size [batchSize x outDim],
// and generates gradWeight of size [keysSize x outDim].
// The nth block along the y dimension operates on the nonzero elements of the nth batch.
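// blockIdx.x tiles outDim; gradWeight rows are indexed by the running nonzero id, so
// the row for nonzero k of this batch receives values[k] * scale * gradOutput[bidy][gidx]
// (when maxNormalize is on, the first half receives the scaled gradOutput itself).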
template<typename Ty>
__global__ static
void accGradWeight(
Ty *gradWeight,
const Ty *gradOutput,
const Ty *values,
const int64_t *cumSumSizes,
const int64_t outDim,
const int64_t gradWeightStride,
const Ty scale,
const Ty weightDecay,
const int maxNormalize)
{
const int64_t bidy = blockIdx.y;
const int64_t tidx = threadIdx.x;
const int64_t tidy = threadIdx.y;
const int64_t tid = tidy * blockDim.x + tidx;
const int64_t ntid = blockDim.x * blockDim.y;
const int64_t gidx = blockIdx.x * blockDim.x + tidx;
// All the y threads in the block will use the same gradOutput value
gradOutput += bidy * outDim;
Ty gradOutVal = scale * (gidx < outDim ? gradOutput[gidx] : 0);
// Calculate the amount of work for the current block / batch.
const int64_t batchStart = bidy == 0 ? 0 : cumSumSizes[bidy - 1];
const int64_t batchEnd = cumSumSizes[bidy];
const int64_t batchLimit = batchEnd - batchStart;
// Number of iterations required to finish the work for the current batch.
const int64_t iters = divup(batchLimit, ntid);
// Offset the values to the current batch.
values += batchStart;
// When maxNormalize is enabled, gradWeight will be twice the size.
// The first half will contain the gradients required for maxNormalization.
// The second half will contain the gradients required for updating weights.
// If maxNormalize is false, both pointers evaluate to the same location.
Ty *gradWeight0 = gradWeight + batchStart * gradWeightStride + gidx;
Ty *gradWeight1 = gradWeight0 + (maxNormalize ? outDim : 0);
__shared__ Ty s_values[THREADS_PER_BLOCK];
// Using iters to avoid divergence + syncthreads
for (int64_t n = 0; n < iters; n++) {
int64_t off = n * ntid;
int64_t id = off + tid;
int64_t lim = min(ntid, batchLimit - off);
// Read the values required for the current iteration.
s_values[tid] = id < batchLimit ? values[id] : 0;
__syncthreads();
if (gidx < outDim) {
if (maxNormalize) {
for (int64_t idy = tidy; idy < lim; idy += blockDim.y) {
// gradOutVal is already scaled
gradWeight0[(off + idy) * gradWeightStride] = gradOutVal;
}
}
for (int64_t idy = tidy; idy < lim; idy += blockDim.y) {
gradWeight1[(off + idy) * gradWeightStride] = s_values[idy] * gradOutVal;
}
}
__syncthreads();
}
}
// The gradBias is just a reduction of gradOutput along the batches.
// There is only one block along the y dimension performing the reduction.
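// When the update template parameter is true, buffer points to the bias tensor and the
// scaled reduction is subtracted from it in place; otherwise buffer points to gradBias
// and the reduction is written out directly.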
template<typename Ty, bool update>
__global__ static
void accGradBias(
Ty *buffer,
const Ty *gradOutput,
const int64_t outDim,
const int64_t batchSize,
const Ty scale,
const Ty weightDecay)
{
const int tidx = threadIdx.x;
const int tidy = threadIdx.y;
const int tid = tidy * blockDim.x + tidx;
const int64_t idx = blockIdx.x * blockDim.x + tidx;
Ty gradBiasVal = 0;
gradOutput += idx;
__shared__ Ty s_gradBiasVals[THREADS_PER_BLOCK];
// Each thread along y calculates the partial sum.
if (idx < outDim) {
for (int64_t idy = tidy; idy < batchSize; idy += blockDim.y) {
gradBiasVal += gradOutput[idy * outDim];
}
}
s_gradBiasVals[tid] = gradBiasVal * scale;
__syncthreads();
// The reduction is performed along y.
for (int y = blockDim.y / 2; y >= 1; y /= 2) {
if (tidy < y) {
s_gradBiasVals[tid] += s_gradBiasVals[tid + y * blockDim.x];
}
__syncthreads();
}
// Write the output only from the first row of threads (tidy == 0).
if (tidy == 0 && idx < outDim) {
if (update) {
// If performing an in-place update, subtract from the bias.
Ty *bias = buffer;
bias[idx] = (bias[idx] - s_gradBiasVals[tid]);
} else {
// If just accumulating gradients, write to gradBias.
Ty *gradBias = buffer;
gradBias[idx] = s_gradBiasVals[tid];
}
}
}
// Use gradWeight from accGradWeight to update the weight.
// This kernel is launched batchSize times.
// At each step of the iteration, the weights are updated in a sparse manner.
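// blockIdx.x tiles outDim while the y dimension of the grid strides over the nonzeros
// of the given batch; each nonzero id selects both a row of gradWeight
// (id * gradWeightStride) and, through keys[id] + keysOffset, the weight row to update.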
template<typename Ty>
__global__ static
void updateWeight(
Ty *weight,
const Ty *gradWeight,
const int64_t *keys,
const int64_t *cumSumSizes,
const int64_t outDim,
const int64_t gradWeightStride,
const int64_t weightStride,
const int64_t keysOffset,
const Ty learningRate,
const Ty weightDecay,
const int maxNormalize,
const int64_t batchId)
{
int64_t gidx = blockIdx.x * blockDim.x + threadIdx.x;
int64_t gidy = blockIdx.y * blockDim.y + threadIdx.y;
// Find the limits of the work to be done
const int64_t batchStart = batchId == 0 ? 0 : cumSumSizes[batchId - 1];
const int64_t batchEnd = cumSumSizes[batchId];
// When maxNormalize is turned on, the weight tensor will contain
// an extra "maxNormalize" number of terms per output at the beginning.
// When maxNormalize is false, both pointers evaluate to the same buffer.
// When maxNormalize is true,
// - nWeight[2] will contain the individual scaling factor.
// - nWeight[3] will contain the individual bias for the normalized input.
Ty *nWeight = weight;
weight += maxNormalize + gidx;
// When maxNormalize is enabled, gradWeight will be twice the size.
// The first half will contain the gradients required for maxNormalization.
// The second half will contain the gradients required for updating weights.
// If maxNormalize is false, both pointers evaluate to the same location.
const Ty *gradWeight0 = gradWeight + gidx;
const Ty *gradWeight1 = gradWeight0 + (maxNormalize ? outDim : 0);
if (gidx >= outDim) return;
for (int64_t id = batchStart + gidy; id < batchEnd; id += blockDim.y * gridDim.y) {
Ty lr = learningRate;
Ty wd = weightDecay;
int64_t weightOffset = (keys[id] + keysOffset) * weightStride;
Ty weightVal = weight[weightOffset];
if (maxNormalize) {
Ty scale = nWeight[weightOffset + 2];
lr *= scale;
wd *= scale;
// nWeight[3] needs to be updated in the following manner for a given input:
// nWeight[3] = nWeight[3] - sum(gradWeight0[gidx] * weight[gidx]);
// Since the problem is parallelized along gidx, use atomicAdd for the update.
Ty gradNormBias = lr * weightVal * gradWeight0[id * gradWeightStride];
atomicAdd(nWeight + weightOffset + 3, -gradNormBias);
}
// Perform the regular update
Ty gradWeightVal = lr * gradWeight1[id * gradWeightStride];
if (weightDecay == 0) {
weight[weightOffset] = weightVal - gradWeightVal;
} else {
weight[weightOffset] = weightVal * (1 - wd) - gradWeightVal;
}
}
}
// This kernel is launched batchSize times.
// At each step of the iteration, the weights are updated in place in a sparse manner.
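// Fused variant of accGradWeight + updateWeight: the per-key gradient
// scale * gradOutput[batchId][gidx] * values[id] is computed on the fly and applied
// directly to the weights, so no intermediate gradWeight buffer is needed.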
template<typename Ty>
__global__ static
void accUpdateWeight(
Ty *weight,
const int64_t weightStride,
const Ty *gradOutput,
const int64_t outDim,
const Ty *values,
const int64_t *cumSumSizes,
const int64_t *keys,
const int64_t keysOffset,
const Ty scale,
const Ty weightDecay,
const int maxNormalize,
const int64_t batchId)
{
// Parallelized along outDim.
int64_t gidx = blockIdx.x * blockDim.x + threadIdx.x;
// Parallelized along the sparse input size for the current batch.
int64_t gidy = blockIdx.y * blockDim.y + threadIdx.y;
if (gidx >= outDim) return;
// Find the limits of the work to be done.
const int64_t batchStart = batchId == 0 ? 0 : cumSumSizes[batchId - 1];
const int64_t batchEnd = cumSumSizes[batchId];
gradOutput += batchId * outDim;
Ty gradOutVal = scale * (gidx < outDim ? gradOutput[gidx] : 0);
// When maxNormalize is turned on, the weight tensor will contain
// an extra "maxNormalize" number of terms per output at the beginning.
// When maxNormalize is false, both pointers evaluate to the same buffer.
// When maxNormalize is true,
// - nWeight[2] will contain the individual scaling factor.
// - nWeight[3] will contain the individual bias for the normalized input.
Ty *nWeight = weight;
weight += maxNormalize + gidx;
for (int64_t id = batchStart + gidy; id < batchEnd; id += blockDim.y * gridDim.y) {
Ty wd = weightDecay;
int64_t weightOffset = (keys[id] + keysOffset) * weightStride;
Ty gradWeightVal = gradOutVal * values[id];
Ty weightVal = weight[weightOffset];
if (maxNormalize) {
Ty nScale = nWeight[weightOffset + 2];
gradWeightVal *= nScale;
wd *= nScale;
// nWeight[3] needs to be updated in the following manner for a given input:
// nWeight[3] = nWeight[3] - sum(gradOut[gidx] * weight[gidx]);
// Since the problem is parallelized along gidx, use atomicAdd for the update.
Ty gradNormBias = nScale * weightVal * gradOutVal;
atomicAdd(nWeight + weightOffset + 3, -gradNormBias);
}
// Perform the regular update
if (weightDecay == 0) {
weight[weightOffset] = weightVal - gradWeightVal;
} else {
weight[weightOffset] = weightVal * (1 - wd) - gradWeightVal;
}
}
}
#ifdef CUDA_HALF_TENSOR
void THNN_CudaHalfIndexLinear_updateOutput(
THCState *state,
THCudaLongTensor *keys,
int64_t keysOffset,
THCudaHalfTensor *values,
THCudaLongTensor *sizes,
THCudaLongTensor *cumSumSizes,
THCudaHalfTensor *output,
THCudaHalfTensor *weight,
THCudaHalfTensor *bias,
THCudaHalfTensor *normalizedValues,
int train) {
THError("THCudaHalfTensor not supported with IndexLinear");
}
void THNN_CudaHalfIndexLinear_accGradParameters(
THCState *state,
THCudaLongTensor *keys,
int64_t keysOffset,
THCudaHalfTensor *values,
THCudaLongTensor *sizes,
THCudaLongTensor *cumSumSizes,
THCudaHalfTensor *gradOutput,
THCudaHalfTensor *gradWeight,
THCudaHalfTensor *gradBias,
THCudaHalfTensor *weight,
THCudaHalfTensor *bias,
THCudaHalfTensor* valuesBuffer,
float weightDecay,
float scale) {
THError("THCudaHalfTensor not supported with IndexLinear");
}
void THNN_CudaHalfIndexLinear_accUpdateGradParameters(
THCState *state,
THCudaLongTensor *keys,
int64_t keysOffset,
THCudaHalfTensor *values,
THCudaLongTensor *sizes,
THCudaLongTensor *cumSumSizes,
THCudaHalfTensor *gradOutput,
THCudaHalfTensor *weight,
THCudaHalfTensor *bias,
float weightDecay,
float scale) {
THError("THCudaHalfTensor not supported with IndexLinear");
}
void THNN_CudaHalfIndexLinear_updateParameters(
THCState *state,
THCudaHalfTensor *gradWeight,
THCudaHalfTensor *gradBias,
THCudaHalfTensor *weight,
THCudaHalfTensor *bias,
THCudaLongTensor *runningKeys,
THCudaLongTensor *cumSumSizes,
int64_t keysOffset,
float weightDecay,
float learningRate) {
THError("THCudaHalfTensor not supported with IndexLinear");
}
#endif
#include "generic/IndexLinear.cu"
#include "THCGenerateFloatType.h"
#include "generic/IndexLinear.cu"
#include "THCGenerateDoubleType.h"