#ifndef THC_REDUCE_INC
#define THC_REDUCE_INC
//
// This file contains dimension reduction operation functions and
// kernels that work on both contiguous and non-contiguous tensor
// arguments of arbitrary (up to MAX_CUTORCH_DIMS) dimensioned
// arguments without copying or temporary storage.
//
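//
// Illustrative example (not code from this file): reducing dimension 1 of a
// 4 x 3 tensor produces a 4 x 1 result; each of the 4 output slices is the
// reduction of 3 input elements, visited through the input's strides rather
// than through a temporary contiguous copy.
//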
#include "THCTensorTypeUtils.cuh"
#include "THCReduceApplyUtils.cuh"
// Threads per thread block
#define THC_NONCONTIG_REDUCE_BLOCK_SIZE (32 * 16)
template <typename IndexType>
__device__ __forceinline__ IndexType getReduceNoncontigDimSliceIndex() {
  // Each thread handles one slice
  return getLinearBlockId<IndexType>() * THC_NONCONTIG_REDUCE_BLOCK_SIZE + threadIdx.x;
}
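// For example (illustrative numbers only): with the 512-thread blocks used
// below, the thread at threadIdx.x == 3 in linear block 2 owns slice
// 2 * 512 + 3 == 1027, provided that many slices exist.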
// Kernel that handles an entire reduction of a slice of a tensor per each thread
template <typename ModifyOp,
          typename ReduceOp,
          typename T,
          typename IndexType,
          int ADims, int BDims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
kernelReduceNoncontigDim(TensorInfo<T, IndexType> out,
                         TensorInfo<T, IndexType> in,
                         IndexType reductionStride,
                         IndexType reductionSize,
                         IndexType totalSlices,
                         T init,
                         ModifyOp modifyOp,
                         ReduceOp reduceOp) {
  const IndexType sliceIndex = getReduceNoncontigDimSliceIndex<IndexType>();
  if (sliceIndex >= totalSlices) {
    return;
  }
  // Each thread picks a point in `out` and `in` for which it is
  // producing the reduction
  const IndexType outOffset =
    IndexToOffset<T, IndexType, ADims>::get(sliceIndex, out);
  const IndexType inBaseOffset =
    IndexToOffset<T, IndexType, BDims>::get(sliceIndex, in);
  // For each point in reductionSize, reduce into `r`
  IndexType inOffset = inBaseOffset;
  T r = init;
  for (IndexType i = 0; i < reductionSize; ++i) {
    r = reduceOp(r, modifyOp(in.data[inOffset]));
    inOffset += reductionStride;
  }
  // Write out reduced value
  out.data[outOffset] = r;
}
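// Worked example (assumed layout, not from this file): reducing dimension 0
// of a contiguous 4 x 3 float tensor gives reductionSize == 4 and
// reductionStride == 3, so a thread assigned slice j starts at
// inBaseOffset == j and visits in.data[j], in.data[j + 3], in.data[j + 6],
// in.data[j + 9] before writing the single result to out.data[outOffset].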
template <typename IndexType>
__device__ __forceinline__ IndexType getReduceContigDimSliceIndex() {
  // Each block handles one slice
  return getLinearBlockId<IndexType>();
}
// Kernel that handles an entire reduction of a slice of a tensor per
// each block
template <typename ModifyOp,
          typename ReduceOp,
          typename T,
          typename IndexType,
          int ADims, int BDims>
__global__ void
kernelReduceContigDim(TensorInfo<T, IndexType> out,
                      TensorInfo<T, IndexType> in,
                      IndexType reductionSize,
                      IndexType totalSlices,
                      T init,
                      ModifyOp modifyOp,
                      ReduceOp reduceOp) {
  const IndexType sliceIndex = getReduceContigDimSliceIndex<IndexType>();
  if (sliceIndex >= totalSlices) {
    return;
  }
  // Get the offset in `out` for the reduction
  const IndexType outOffset =
    IndexToOffset<T, IndexType, ADims>::get(sliceIndex, out);
  // Get the base offset in `in` for this block's reduction
  const IndexType inBaseOffset =
    IndexToOffset<T, IndexType, BDims>::get(sliceIndex, in);
  // Each thread in the block will reduce some subset of elements in
  // the slice. The elements are guaranteed contiguous starting at
  // `inBaseOffset`.
  T r = init;
  for (IndexType i = threadIdx.x; i < reductionSize; i += blockDim.x) {
    r = reduceOp(r, modifyOp(in.data[inBaseOffset + i]));
  }
  // Reduce within the block
  // FIXME: extern name
  extern __shared__ char smemChar[];
  T* smem = (T*) smemChar;
  r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init);
  if (threadIdx.x == 0) {
    // Write out reduced value
    out.data[outOffset] = r;
  }
}
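// Launch sketch (illustrative; mirrors what THC_reduceDim below sets up):
// one block per output slice, blockDim.x threads striding over the
// contiguous slice, and blockDim.x * sizeof(T) bytes of dynamic shared
// memory for reduceBlock, e.g.
//
//   kernelReduceContigDim<ModifyOp, ReduceOp, float, unsigned int, -2, -2>
//       <<<grid, block, block.x * sizeof(float), stream>>>(
//           outInfo, inInfo, reductionSize, totalSlices, init,
//           modifyOp, reduceOp);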
inline dim3 getNoncontigReduceBlock() {
  return dim3(THC_NONCONTIG_REDUCE_BLOCK_SIZE);
}
inline dim3 getContigReduceBlock(ptrdiff_t numSlices, long reductionSize) {
  // If the number of slices is low but the reduction dimension size
  // is high, then we should increase block size for greater parallelism.
  // Aim for at least 32 warps per SM (assume 15 SMs; don't bother
  // inquiring the real number for now).
  int maxWarps = 4; // better occupancy if many blocks are around
  // For numSlices > 15 * 8, there are > 32 warps active per SM.
  if (numSlices < 15 * 8) {
    maxWarps = 8;
    if (numSlices < 15 * 4) {
      maxWarps = 16;
      if (numSlices < 15 * 2) {
        maxWarps = 32;
      }
    }
  }
  // Scale up block size based on the reduction dimension size
  long warpsInReductionSize = THCCeilDiv(reductionSize, 32L);
  int numWarps = warpsInReductionSize > (long) maxWarps ?
    maxWarps : (int) warpsInReductionSize;
  return dim3(numWarps * 32);
}
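// Worked example (assumed inputs): numSlices == 20 and reductionSize == 10000
// fall into the numSlices < 15 * 2 branch, so maxWarps == 32; the reduction
// spans THCCeilDiv(10000, 32) == 313 warps, which is clamped to 32, giving a
// 32 * 32 == 1024-thread block.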
inline bool getNoncontigReduceGrid(ptrdiff_t elements, dim3& grid) {
  // One output point per thread
  return THC_getGridFromTiles(THCCeilDiv(elements,
                              (ptrdiff_t) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid);
}
inline bool getContigReduceGrid(ptrdiff_t elements, dim3& grid) {
  // One output point per block
  return THC_getGridFromTiles(elements, grid);
}
// Performs a reduction out[..., 0, ...] = reduce_i(modify(in[..., i, ...])),
// where i and out's 0 index dimension `dim` of `in` and `out`, respectively.
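//
// Usage sketch (illustrative only; IdentityOp/AddOp are hypothetical functors,
// not part of this header). Assuming the THCudaTensor specialization of
// TensorUtils is visible, a sum over dimension `dim` could look like:
//
//   struct IdentityOp {
//     __device__ __forceinline__ float operator()(float x) const { return x; }
//   };
//   struct AddOp {
//     __device__ __forceinline__ float operator()(float a, float b) const {
//       return a + b;
//     }
//   };
//   ...
//   bool ok = THC_reduceDim<THCudaTensor>(state, out, in,
//                                         IdentityOp(), AddOp(),
//                                         0.0f, dim);
//   if (!ok) { /* dims exceeded MAX_CUTORCH_DIMS; caller must handle */ }
//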
template <typename TensorType, typename ModifyOp, typename ReduceOp>
bool THC_reduceDim(THCState* state,
                   TensorType* out,
                   TensorType* in,
                   const ModifyOp& modifyOp,
                   const ReduceOp& reduceOp,
                   typename TensorUtils<TensorType>::DataType init,
                   int dim) {
  ptrdiff_t inElements = TensorUtils<TensorType>::getNumElements(state, in);
  long reductionSize = TensorUtils<TensorType>::getSize(state, in, dim);
  long reductionStride = TensorUtils<TensorType>::getStride(state, in, dim);
  ptrdiff_t outElements = inElements / reductionSize;
  if (TensorUtils<TensorType>::getDims(state, out) > MAX_CUTORCH_DIMS ||
      TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) {
    return false;
  }
  if (TensorUtils<TensorType>::getDims(state, in) == 0) {
    // Zero-dim tensor; do nothing
    return true;
  }
  // Is the reduction dimension contiguous? If so, then we can use a
  // shared memory reduction kernel to increase performance.
  bool contigReduction = (reductionStride == 1);
  dim3 block;
  dim3 grid;
  int smemSize = 0; // contiguous reduction uses smem
  if (contigReduction) {
    if (!getContigReduceGrid(outElements, grid)) {
      return false;
    }
    block = getContigReduceBlock(outElements, reductionSize);
    smemSize = sizeof(typename TensorUtils<TensorType>::DataType) * block.x;
  } else {
    if (!getNoncontigReduceGrid(outElements, grid)) {
      return false;
    }
    block = getNoncontigReduceBlock();
  }
  // Resize out to correspond to the reduced size
  THLongStorage* sizes = TensorUtils<TensorType>::newSizeOf(state, in);
  THLongStorage_set(sizes, dim, 1);
  TensorUtils<TensorType>::resize(state, out, sizes, NULL);
  THLongStorage_free(sizes);
  // It is possible that the tensor dimensions are able to be collapsed,
  // and thus we can reduce the actual code complexity of the kernel by
  // exploiting this knowledge statically, since the div/mod is the
  // most expensive part of the operation, more so than memory accesses.
  // For instance, when reducing a non-contiguous tensor into a contiguous
  // one (or vice versa), the contiguous tensor can be collapsed to one
  // dimension, and the loop that translates the linear index into the
  // array index can be similarly collapsed. That is what this unrolling
  // is for.
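  // Collapse example (illustrative): for a 4 x 3 x 5 input reduced over
  // dim 1, the output is resized to 4 x 1 x 5; when it is contiguous it
  // collapses to a single effective dimension, so the -2 (contiguous)
  // IndexToOffset specialization can turn a slice index directly into an
  // offset with no per-dimension div/mod at all.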
#define HANDLE_CASE(TYPE, OUT, IN)                                      \
  if (contigReduction) {                                                \
    kernelReduceContigDim<ModifyOp, ReduceOp,                           \
                          typename TensorUtils<TensorType>::DataType,   \
                          TYPE, OUT, IN>                                \
      <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(    \
        outInfo, inInfo, reductionSize,                                 \
        (TYPE) outElements, init, modifyOp, reduceOp);                  \
  } else {                                                              \
    kernelReduceNoncontigDim<ModifyOp, ReduceOp,                        \
                             typename TensorUtils<TensorType>::DataType, \
                             TYPE, OUT, IN>                             \
      <<<grid, block, 0, THCState_getCurrentStream(state)>>>(           \
        outInfo, inInfo, reductionStride, reductionSize,                \
        (TYPE) outElements, init, modifyOp, reduceOp);                  \
  }
#define HANDLE_IN_CASE(TYPE, OUT, IN)                   \
  {                                                     \
    if (inInfo.isContiguous()) {                        \
      HANDLE_CASE(TYPE, OUT, -2);                       \
    } else {                                            \
      switch (IN) {                                     \
        case 1:                                         \
          HANDLE_CASE(TYPE, OUT, 1);                    \
          break;                                        \
        case 2:                                         \
          HANDLE_CASE(TYPE, OUT, 2);                    \
          break;                                        \
        default:                                        \
          HANDLE_CASE(TYPE, OUT, -1);                   \
          break;                                        \
      }                                                 \
    }                                                   \
  }
#define HANDLE_OUT_CASE(TYPE, OUT, IN)                  \
  {                                                     \
    if (outInfo.isContiguous()) {                       \
      HANDLE_IN_CASE(TYPE, -2, IN);                     \
    } else {                                            \
      switch (OUT) {                                    \
        case 1:                                         \
          HANDLE_IN_CASE(TYPE, 1, IN);                  \
          break;                                        \
        case 2:                                         \
          HANDLE_IN_CASE(TYPE, 2, IN);                  \
          break;                                        \
        default:                                        \
          HANDLE_IN_CASE(TYPE, -1, IN);                 \
          break;                                        \
      }                                                 \
    }                                                   \
  }
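  // Dispatch example (illustrative): HANDLE_OUT_CASE(unsigned int,
  // outInfo.dims, inInfo.dims) with a non-contiguous 2-d outInfo and a
  // contiguous inInfo selects, at runtime, the launch of
  // kernelReduce*Dim<ModifyOp, ReduceOp, DataType, unsigned int, 2, -2>,
  // i.e. a specialization unrolled for exactly two output dims and a fully
  // collapsed input.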
  if (TensorUtils<TensorType>::canUse32BitIndexMath(state, out) &&
      TensorUtils<TensorType>::canUse32BitIndexMath(state, in)) {
    TensorInfo<typename TensorUtils<TensorType>::DataType,
               unsigned int> outInfo =
      getTensorInfo<TensorType, unsigned int>(state, out);
    outInfo.collapseDims();
    TensorInfo<typename TensorUtils<TensorType>::DataType,
               unsigned int> inInfo =
      getTensorInfo<TensorType, unsigned int>(state, in);
    inInfo.reduceDim(dim);
    inInfo.collapseDims();
    HANDLE_OUT_CASE(unsigned int, outInfo.dims, inInfo.dims);
  } else {
    TensorInfo<typename TensorUtils<TensorType>::DataType,
               unsigned long> outInfo =
      getTensorInfo<TensorType, unsigned long>(state, out);
    outInfo.collapseDims();
    TensorInfo<typename TensorUtils<TensorType>::DataType,
               unsigned long> inInfo =
      getTensorInfo<TensorType, unsigned long>(state, in);
    inInfo.reduceDim(dim);
    inInfo.collapseDims();
    // For large tensors, we only compile the completely contiguous
    // version and the completely generic version, to reduce
    // compilation time.
    if (outInfo.isContiguous() && inInfo.isContiguous()) {
      HANDLE_CASE(unsigned long, -2, -2);
    } else {
      HANDLE_CASE(unsigned long, -1, -1);
    }
  }
#undef HANDLE_CASE
#undef HANDLE_IN_CASE
#undef HANDLE_OUT_CASE
  return true;
}
#undef THC_NONCONTIG_REDUCE_BLOCK_SIZE
#endif // THC_REDUCE_INC