#ifndef THC_REDUCE_INC
#define THC_REDUCE_INC
//
// This file contains dimension reduction operation functions and
// kernels that work on both contiguous and non-contiguous tensor
// arguments of arbitrary (up to MAX_CUTORCH_DIMS) dimensioned
// arguments without copying or temporary storage.
//
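//
// Illustrative example (not code from this file): reducing dimension 1 of a
// 4 x 3 tensor produces a 4 x 1 result; each of the 4 output slices is the
// reduction of 3 input elements, visited through the input's strides rather
// than through a temporary contiguous copy.
//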
#include "THCTensorTypeUtils.cuh"
#include "THCReduceApplyUtils.cuh"
// Threads per thread block
#define THC_NONCONTIG_REDUCE_BLOCK_SIZE (32 * 16)
template <typename IndexType>
__device__ __forceinline__ IndexType getReduceNoncontigDimSliceIndex() {
  // Each thread handles one slice
  return getLinearBlockId<IndexType>() * THC_NONCONTIG_REDUCE_BLOCK_SIZE + threadIdx.x;
}
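// For example (illustrative numbers only): with the 512-thread blocks used
// below, the thread at threadIdx.x == 3 in linear block 2 owns slice
// 2 * 512 + 3 == 1027, provided that many slices exist.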
// Kernel that handles an entire reduction of a slice of a tensor per each thread
template <typename ModifyOp,
          typename ReduceOp,
          typename T,
          typename IndexType,
          int ADims, int BDims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
#endif
__global__ void
kernelReduceNoncontigDim(TensorInfo<T, IndexType> out,
                         TensorInfo<T, IndexType> in,
                         IndexType reductionStride,
                         IndexType reductionSize,
                         IndexType totalSlices,
                         T init,
                         ModifyOp modifyOp,
                         ReduceOp reduceOp) {
  const IndexType sliceIndex = getReduceNoncontigDimSliceIndex<IndexType>();
  if (sliceIndex >= totalSlices) {
    return;
  }
  // Each thread picks a point in `out` and `in` for which it is
  // producing the reduction
  const IndexType outOffset =
    IndexToOffset<T, IndexType, ADims>::get(sliceIndex, out);
  const IndexType inBaseOffset =
    IndexToOffset<T, IndexType, BDims>::get(sliceIndex, in);
  // For each point in reductionSize, reduce into `r`
  IndexType inOffset = inBaseOffset;
  T r = init;
  for (IndexType i = 0; i < reductionSize; ++i) {
    r = reduceOp(r, modifyOp(in.data[inOffset]));
    inOffset += reductionStride;
  }
  // Write out reduced value
  out.data[outOffset] = r;
}
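// Worked example (assumed layout, not from this file): reducing dimension 0
// of a contiguous 4 x 3 float tensor gives reductionSize == 4 and
// reductionStride == 3, so a thread assigned slice j starts at
// inBaseOffset == j and visits in.data[j], in.data[j + 3], in.data[j + 6],
// in.data[j + 9] before writing the single result to out.data[outOffset].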
template <typename IndexType>
__device__ __forceinline__ IndexType getReduceContigDimSliceIndex() {
  // Each block handles one slice
  return getLinearBlockId<IndexType>();
}
// Kernel that handles an entire reduction of a slice of a tensor per
// each block
template <typename ModifyOp,
          typename ReduceOp,
          typename T,
          typename IndexType,
          int ADims, int BDims>
__global__ void
kernelReduceContigDim(TensorInfo<T, IndexType> out,
                      TensorInfo<T, IndexType> in,
                      IndexType reductionSize,
                      IndexType totalSlices,
                      T init,
                      ModifyOp modifyOp,
                      ReduceOp reduceOp) {
  const IndexType sliceIndex = getReduceContigDimSliceIndex<IndexType>();
  if (sliceIndex >= totalSlices) {
    return;
  }
  // Get the offset in `out` for the reduction
  const IndexType outOffset =
    IndexToOffset<T, IndexType, ADims>::get(sliceIndex, out);
  // Get the base offset in `in` for this block's reduction
  const IndexType inBaseOffset =
    IndexToOffset<T, IndexType, BDims>::get(sliceIndex, in);
  // Each thread in the block will reduce some subset of elements in
  // the slice. The elements are guaranteed contiguous starting at
  // `inBaseOffset`.
  T r = init;
  for (IndexType i = threadIdx.x; i < reductionSize; i += blockDim.x) {
    r = reduceOp(r, modifyOp(in.data[inBaseOffset + i]));
  }
  // Reduce within the block
  // FIXME: extern name
  extern __shared__ char smemChar[];
  T* smem = (T*) smemChar;
  r = reduceBlock<T, ReduceOp>(smem, blockDim.x, r, reduceOp, init);
  if (threadIdx.x == 0) {
    // Write out reduced value
    out.data[outOffset] = r;
  }
}
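// Launch sketch (illustrative; mirrors what THC_reduceDim below sets up):
// one block per output slice, blockDim.x threads striding over the
// contiguous slice, and blockDim.x * sizeof(T) bytes of dynamic shared
// memory for reduceBlock, e.g.
//
//   kernelReduceContigDim<ModifyOp, ReduceOp, float, unsigned int, -2, -2>
//       <<<grid, block, block.x * sizeof(float), stream>>>(
//           outInfo, inInfo, reductionSize, totalSlices, init,
//           modifyOp, reduceOp);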
inline dim3 getNoncontigReduceBlock() {
  return dim3(THC_NONCONTIG_REDUCE_BLOCK_SIZE);
}
inline dim3 getContigReduceBlock(ptrdiff_t numSlices, long reductionSize) {
  // If the number of slices is low but the reduction dimension size
  // is high, then we should increase block size for greater parallelism.
  // Aim for at least 32 warps per SM (assume 15 SMs; don't bother
  // inquiring the real number for now).
  int maxWarps = 4; // better occupancy if many blocks are around
  // For numSlices > 15 * 8, there are > 32 warps active per SM.
  if (numSlices < 15 * 8) {
    maxWarps = 8;
    if (numSlices < 15 * 4) {
      maxWarps = 16;
      if (numSlices < 15 * 2) {
        maxWarps = 32;
      }
    }
  }
  // Scale up block size based on the reduction dimension size
  long warpsInReductionSize = THCCeilDiv(reductionSize, 32L);
  int numWarps = warpsInReductionSize > (long) maxWarps ?
    maxWarps : (int) warpsInReductionSize;
  return dim3(numWarps * 32);
}
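// Worked example (assumed inputs): numSlices == 20 and reductionSize == 10000
// fall into the numSlices < 15 * 2 branch, so maxWarps == 32; the reduction
// spans THCCeilDiv(10000, 32) == 313 warps, which is clamped to 32, giving a
// 32 * 32 == 1024-thread block.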
inline bool getNoncontigReduceGrid(ptrdiff_t elements, dim3& grid) {
  // One output point per thread
  return THC_getGridFromTiles(THCCeilDiv(elements,
                              (ptrdiff_t) THC_NONCONTIG_REDUCE_BLOCK_SIZE), grid);
}
inline bool getContigReduceGrid(ptrdiff_t elements, dim3& grid) {
  // One output point per block
  return THC_getGridFromTiles(elements, grid);
}
// Performs a reduction out[..., 0, ...] = reduce_i(modify(in[..., i, ...])),
// where i and out's 0 index dimension `dim` of `in` and `out`, respectively.
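//
// Usage sketch (illustrative only; IdentityOp/AddOp are hypothetical functors,
// not part of this header). Assuming the THCudaTensor specialization of
// TensorUtils is visible, a sum over dimension `dim` could look like:
//
//   struct IdentityOp {
//     __device__ __forceinline__ float operator()(float x) const { return x; }
//   };
//   struct AddOp {
//     __device__ __forceinline__ float operator()(float a, float b) const {
//       return a + b;
//     }
//   };
//   ...
//   bool ok = THC_reduceDim<THCudaTensor>(state, out, in,
//                                         IdentityOp(), AddOp(),
//                                         0.0f, dim);
//   if (!ok) { /* dims exceeded MAX_CUTORCH_DIMS; caller must handle */ }
//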
template <typename TensorType, typename ModifyOp, typename ReduceOp>
bool THC_reduceDim(THCState* state,
                   TensorType* out,
                   TensorType* in,
                   const ModifyOp& modifyOp,
                   const ReduceOp& reduceOp,
                   typename TensorUtils<TensorType>::DataType init,
                   int dim) {
  ptrdiff_t inElements = TensorUtils<TensorType>::getNumElements(state, in);
  long reductionSize = TensorUtils<TensorType>::getSize(state, in, dim);
  long reductionStride = TensorUtils<TensorType>::getStride(state, in, dim);
  ptrdiff_t outElements = inElements / reductionSize;
  if (TensorUtils<TensorType>::getDims(state, out) > MAX_CUTORCH_DIMS ||
      TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) {
    return false;
  }
  if (TensorUtils<TensorType>::getDims(state, in) == 0) {
    // Zero-dim tensor; do nothing
    return true;
  }
  // Is the reduction dimension contiguous? If so, then we can use a
  // shared memory reduction kernel to increase performance.
  bool contigReduction = (reductionStride == 1);
  dim3 block;
  dim3 grid;
  int smemSize = 0; // contiguous reduction uses smem
  if (contigReduction) {
    if (!getContigReduceGrid(outElements, grid)) {
      return false;
    }
    block = getContigReduceBlock(outElements, reductionSize);
    smemSize = sizeof(typename TensorUtils<TensorType>::DataType) * block.x;
  } else {
    if (!getNoncontigReduceGrid(outElements, grid)) {
      return false;
    }
    block = getNoncontigReduceBlock();
  }
  // Resize out to correspond to the reduced size
  THLongStorage* sizes = TensorUtils<TensorType>::newSizeOf(state, in);
  THLongStorage_set(sizes, dim, 1);
  TensorUtils<TensorType>::resize(state, out, sizes, NULL);
  THLongStorage_free(sizes);
  // It is possible that the tensor dimensions are able to be collapsed,
  // and thus we can reduce the actual code complexity of the kernel by
  // exploiting this knowledge statically, since the div/mod is the
  // most expensive part of the operation, more so than memory accesses.
  // For instance, when reducing a non-contiguous tensor into a contiguous
  // one (or vice versa), the contiguous tensor can be collapsed to one
  // dimension, and the loop that translates the linear index into the
  // array index can be similarly collapsed. That is what this unrolling
  // is for.
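  // Collapse example (illustrative): for a 4 x 3 x 5 input reduced over
  // dim 1, the output is resized to 4 x 1 x 5; when it is contiguous it
  // collapses to a single effective dimension, so the -2 (contiguous)
  // IndexToOffset specialization can turn a slice index directly into an
  // offset with no per-dimension div/mod at all.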
#define HANDLE_CASE(TYPE, OUT, IN)                                      \
  if (contigReduction) {                                                \
    kernelReduceContigDim<ModifyOp, ReduceOp,                           \
                          typename TensorUtils<TensorType>::DataType,   \
                          TYPE, OUT, IN>                                \
      <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(    \
        outInfo, inInfo, reductionSize,                                 \
        (TYPE) outElements, init, modifyOp, reduceOp);                  \
  } else {                                                              \
    kernelReduceNoncontigDim<ModifyOp, ReduceOp,                        \
                             typename TensorUtils<TensorType>::DataType, \
                             TYPE, OUT, IN>                             \
      <<<grid, block, 0, THCState_getCurrentStream(state)>>>(           \
        outInfo, inInfo, reductionStride, reductionSize,                \
        (TYPE) outElements, init, modifyOp, reduceOp);                  \
  }
#define HANDLE_IN_CASE(TYPE, OUT, IN)                   \
  {                                                     \
    if (inInfo.isContiguous()) {                        \
      HANDLE_CASE(TYPE, OUT, -2);                       \
    } else {                                            \
      switch (IN) {                                     \
        case 1:                                         \
          HANDLE_CASE(TYPE, OUT, 1);                    \
          break;                                        \
        case 2:                                         \
          HANDLE_CASE(TYPE, OUT, 2);                    \
          break;                                        \
        default:                                        \
          HANDLE_CASE(TYPE, OUT, -1);                   \
          break;                                        \
      }                                                 \
    }                                                   \
  }
#define HANDLE_OUT_CASE(TYPE, OUT, IN)                  \
  {                                                     \
    if (outInfo.isContiguous()) {                       \
      HANDLE_IN_CASE(TYPE, -2, IN);                     \
    } else {                                            \
      switch (OUT) {                                    \
        case 1:                                         \
          HANDLE_IN_CASE(TYPE, 1, IN);                  \
          break;                                        \
        case 2:                                         \
          HANDLE_IN_CASE(TYPE, 2, IN);                  \
          break;                                        \
        default:                                        \
          HANDLE_IN_CASE(TYPE, -1, IN);                 \
          break;                                        \
      }                                                 \
    }                                                   \
  }
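  // Dispatch example (illustrative): HANDLE_OUT_CASE(unsigned int,
  // outInfo.dims, inInfo.dims) with a non-contiguous 2-d outInfo and a
  // contiguous inInfo selects, at runtime, the launch of
  // kernelReduce*Dim<ModifyOp, ReduceOp, DataType, unsigned int, 2, -2>,
  // i.e. a specialization unrolled for exactly two output dims and a fully
  // collapsed input.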
  if (TensorUtils<TensorType>::canUse32BitIndexMath(state, out) &&
      TensorUtils<TensorType>::canUse32BitIndexMath(state, in)) {
    TensorInfo<typename TensorUtils<TensorType>::DataType,
               unsigned int> outInfo =
      getTensorInfo<TensorType, unsigned int>(state, out);
    outInfo.collapseDims();
    TensorInfo<typename TensorUtils<TensorType>::DataType,
               unsigned int> inInfo =
      getTensorInfo<TensorType, unsigned int>(state, in);
    inInfo.reduceDim(dim);
    inInfo.collapseDims();
    HANDLE_OUT_CASE(unsigned int, outInfo.dims, inInfo.dims);
  } else {
    TensorInfo<typename TensorUtils<TensorType>::DataType,
               unsigned long> outInfo =
      getTensorInfo<TensorType, unsigned long>(state, out);
    outInfo.collapseDims();
    TensorInfo<typename TensorUtils<TensorType>::DataType,
               unsigned long> inInfo =
      getTensorInfo<TensorType, unsigned long>(state, in);
    inInfo.reduceDim(dim);
    inInfo.collapseDims();
    // For large tensors, we only compile the completely contiguous
    // version and the completely generic version, to reduce
    // compilation time.
    if (outInfo.isContiguous() && inInfo.isContiguous()) {
      HANDLE_CASE(unsigned long, -2, -2);
    } else {
      HANDLE_CASE(unsigned long, -1, -1);
    }
  }
#undef HANDLE_CASE
#undef HANDLE_IN_CASE
#undef HANDLE_OUT_CASE
  return true;
}
#undef THC_NONCONTIG_REDUCE_BLOCK_SIZE
#endif // THC_REDUCE_INC