#include "THCUNN.h"
#include "common.h"
#include "THCThrustAllocator.cuh"
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/transform_reduce.h>
#if CUDA_VERSION >= 7000
#include <thrust/system/cuda/execution_policy.h>
#endif
#include <thrust/unique.h>
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include "THCTensorSort.cuh"
const int WARP_SIZE = 32;
__device__ __forceinline__ bool warpHasCollision(int val)
{
  // Compare our value to the values stored in the next 16 lanes,
  // wrapping around at 32. If any pair of values is the same, then
  // there is a collision in the warp.
  bool dup = false;
  const int laneId = threadIdx.x % 32;

#if __CUDA_ARCH__ >= 300
  #pragma unroll
  for (int i = 1; i <= 16; i++)
  {
    dup |= (__shfl(val, (laneId + i) % 32) == val);
  }
#else
  volatile __shared__ int values[128];
  values[threadIdx.x] = val;
  const int offset = threadIdx.x - laneId;

  #pragma unroll
  for (int i = 1; i <= 16; i++)
  {
    dup |= (values[offset + ((laneId + i) % 32)] == val);
  }
#endif

  return __any(dup) != 0;
}
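// Worked example: suppose lanes 0 and 5 both hold val == 3. Lane 0 compares
// against lanes 1..16 and matches lane 5 at i == 5, so its `dup` is set.
// Because any duplicate pair is at most 16 lanes apart in one direction
// around the 32-lane ring, scanning only the next 16 lanes covers all pairs,
// and __any() then broadcasts the result so every lane returns true.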
template <typename Dtype>
__global__ void cunn_LookupTable_accGradParametersKernelByFeature(
    long *input, Dtype *gradOutput, Dtype *gradWeight, Dtype scale, ptrdiff_t numel,
    long stride, int paddingValue) {
  const int featureDim = blockIdx.x * 4 + threadIdx.x / 32;
  if (featureDim >= stride) {
    return;
  }

  // The strategy here is that each warp handles a single feature
  // dimension. Within that feature dimension, points in the
  // [batch][element] dimension can overlap, and we need to determine
  // whether threads want to add to the gradient in a colliding manner.
  // Typically one would use floating-point atomicAdd() to resolve these
  // collisions, but the order in which colliding updates land is then
  // non-deterministic. Non-determinism is really bad for this code,
  // especially in RNNs, where it is prone to snowballing error.
  // In order to get a deterministic order of execution, we handle
  // non-colliding updates separately from colliding ones. Colliding
  // updates are serialized in their order of execution by using the
  // warp-wide collision detector `warpHasCollision`.
  const int laneId = threadIdx.x % 32;
  for (ptrdiff_t i = laneId; i < numel; i += WARP_SIZE) {
    const int weightIndex = (int) (input[i] - TH_INDEX_BASE);
    if (weightIndex == paddingValue - TH_INDEX_BASE) {
      continue;
    }

    Dtype update = gradOutput[i*stride + featureDim] * scale;
    // FIXME: should we accumulate as accreal?

    // Check for collision
    if (warpHasCollision(weightIndex)) {
      // Run all lanes sequentially; warp divergence
      for (int l = 0; l < WARP_SIZE; ++l) {
        if (laneId == l) {
          gradWeight[weightIndex*stride + featureDim] += update;
        }
      }
    } else {
      // No collision; warp coherence
      gradWeight[weightIndex*stride + featureDim] += update;
    }
  }
}
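// A host-side launch consistent with the indexing above (a sketch only; the
// real launch code lives in generic/LookupTable.cu): one warp per feature
// dimension, four warps (128 threads) per block, so grid.x tiles `stride`
// in chunks of 4:
//
//   dim3 grid(THCCeilDiv(stride, 4L));
//   dim3 block(128);
//   cunn_LookupTable_accGradParametersKernelByFeature<<<grid, block, 0, stream>>>(
//     input, gradOutput, gradWeight, scale, numel, stride, paddingValue);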
template <typename Dtype, typename Acctype>
__global__ void cunn_LookupTable_accGradParametersKernel(
    long *input, long *indices, Dtype *gradOutput, Dtype *gradWeight,
    long *count, Dtype defaultScale, ptrdiff_t numel, long stride, int paddingValue) {
  int idx = blockIdx.x * 4 + threadIdx.y;

  // Each warp is responsible for an input into the LookupTable.
  // If the preceding input has the same value as this input, then the warp
  // exits immediately. The warp also processes subsequent inputs with the
  // same value.
  //
  // Input  Warp
  //   1    <warp 1>
  //   1    <warp 1> (<warp 2> exits without doing any work)
  //   5    <warp 3>
  //   8    <warp 4>

  // Number of values processed by each thread (grain size)
  const int SZ = 4;

  if (idx < numel
      && (idx == 0 || input[idx] != input[idx - 1])
      && input[idx] != paddingValue) {
    do {
      const int startFeature = threadIdx.x + blockIdx.y * blockDim.x * SZ;
      const int weightRow = ((int) input[idx] - TH_INDEX_BASE) * stride;
      const int gradOutputRow = ((int) indices[idx] - TH_INDEX_BASE) * stride;
      const Acctype scale = count ?
        ScalarConvert<Dtype, Acctype>::to(defaultScale) / count[idx] :
        ScalarConvert<Dtype, Acctype>::to(defaultScale);

      Acctype gradient[SZ];
      Acctype weight[SZ];

      #pragma unroll
      for (int ii = 0; ii < SZ; ii++)
      {
        int featureDim = startFeature + ii * WARP_SIZE;
        if (featureDim < stride)
        {
          gradient[ii] = ScalarConvert<Dtype, Acctype>::to(gradOutput[gradOutputRow + featureDim]);
          weight[ii] = ScalarConvert<Dtype, Acctype>::to(gradWeight[weightRow + featureDim]);
        }
      }

      #pragma unroll
      for (int ii = 0; ii < SZ; ii++)
      {
        weight[ii] += gradient[ii] * scale;
      }

      #pragma unroll
      for (int ii = 0; ii < SZ; ii++)
      {
        int featureDim = startFeature + ii * WARP_SIZE;
        if (featureDim < stride)
        {
          gradWeight[weightRow + featureDim] = ScalarConvert<Acctype, Dtype>::to(weight[ii]);
        }
      }

      idx++;
    } while (idx < numel && input[idx] == input[idx - 1]);
  }
}
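// A launch shape consistent with the indexing above (a sketch; the real
// launch lives in generic/LookupTable.cu). threadIdx.x is the lane within a
// warp and threadIdx.y picks one of 4 warps per block, so grid.x tiles the
// sorted indices in chunks of 4 and grid.y tiles the feature dimension in
// chunks of blockDim.x * SZ = 32 * 4 = 128. `sortedInput` / `origIndices`
// are hypothetical names for the sorted copies produced with THCTensorSort:
//
//   dim3 grid(THCCeilDiv(numel, (ptrdiff_t) 4), THCCeilDiv(stride, 128L));
//   dim3 block(32, 4);
//   cunn_LookupTable_accGradParametersKernel<Dtype, Acctype>
//     <<<grid, block, 0, stream>>>(sortedInput, origIndices, gradOutput,
//       gradWeight, count, defaultScale, numel, stride, paddingValue);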
/*
 * Functors for keeping the norm of each weight row smaller than maxNorm
 */
template <typename Dtype, typename Acctype>
struct pow_v
{
  Acctype normType;
  pow_v(Dtype v) : normType(ScalarConvert<Dtype, Acctype>::to(v)) {}

  __host__ __device__
  Acctype operator()(const Dtype& x) const {
    Acctype xA = ScalarConvert<Dtype, Acctype>::to(x);
    if (normType == 1)
      return std::abs(xA);
    else if (normType == 2)
      return xA * xA;
    else
      return std::pow(std::abs(xA), normType);
  }
};
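// A sketch of how pow_v is meant to be used (the real call sites are in
// generic/LookupTable.cu): reduce one row of `stride` weights to the sum of
// |x|^p with thrust, then recover the p-norm. `rowPtr` is a hypothetical
// device pointer to the start of the row:
//
//   thrust::device_ptr<Dtype> row(rowPtr);
//   Acctype sum = thrust::transform_reduce(
//     row, row + stride, pow_v<Dtype, Acctype>(normType),
//     (Acctype) 0, thrust::plus<Acctype>());
//   Acctype norm = std::pow(sum, Acctype(1) / normType);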
template <typename T>
struct multiply_s
{
  T scale;
  multiply_s(T s) : scale(s) {}

  __host__ __device__
  T operator()(const T& x) const {
    return x * scale;
  }
};
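// And the companion use of multiply_s: when a row's norm exceeds maxNorm,
// rescale the row in place. The small epsilon guarding against division by
// zero is an assumption here, not taken from this file:
//
//   if (norm > maxNorm) {
//     Dtype factor = ScalarConvert<Acctype, Dtype>::to(
//       maxNorm / (norm + Acctype(1e-7)));
//     thrust::transform(row, row + stride, row, multiply_s<Dtype>(factor));
//   }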
#include "generic/LookupTable.cu"
#include "THCGenerateFloatTypes.h"