#ifndef THC_REDUCEALL_INC
#define THC_REDUCEALL_INC

//
// This file contains dimension reduction operation functions and
// kernels that work on both contiguous and non-contiguous tensor
// arguments of arbitrary dimension (up to MAX_CUTORCH_DIMS), without
// copying or temporary storage, for reducing an entire tensor to one
// value.
//

#include "THCReduceApplyUtils.cuh"

// Number of threads per reduction block
#define THC_REDUCE_ALL_BLOCK_SIZE 1024L

// Cutoff element count above which a two-pass reduction is used
#define THC_TWO_PASS_REDUCTION_SIZE 2048L
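
// With the defaults above, a tensor of at most THC_TWO_PASS_REDUCTION_SIZE
// elements is reduced in one pass by a single THC_REDUCE_ALL_BLOCK_SIZE-thread
// block; larger tensors take the two-pass path below, staging one partial
// result per block in the per-stream device scratch space.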

// Kernel that handles an entire reduction of a tensor in one pass
template <typename ModifyOp,
          typename ReduceOp,
          typename ReduceAccOp,
          typename InT,
          typename AccT,
          typename IndexType,
          int ADims>
__global__ void
kernelReduceAll(TensorInfo<InT, IndexType> in,
                IndexType totalElements,
                AccT init,
                ModifyOp modifyOp,
                ReduceOp reduceOp,
                ReduceAccOp reduceAccOp,
                AccT* out) {
  // With a block-wide stride, have each thread perform its own reduction.
  AccT r = init;
  for (IndexType i = threadIdx.x; i < totalElements; i += blockDim.x) {
    const IndexType inOffset = IndexToOffset<InT, IndexType, ADims>::get(i, in);
    r = reduceOp(r, modifyOp(in.data[inOffset]));
  }

  // Reduce within the block
  extern __shared__ char smemChar[];
  AccT* smem = (AccT*) smemChar;
  r = reduceBlock<AccT, ReduceAccOp>(smem, blockDim.x, r, reduceAccOp, init);

  if (threadIdx.x == 0) {
    // Write out reduced value
    *out = r;
  }
}

template <typename IndexType>
__device__ __forceinline__ IndexType getStartIndex(IndexType totalSize) {
  IndexType sizePerBlock = THCCeilDiv(totalSize, (IndexType) gridDim.x);
  return blockIdx.x * sizePerBlock;
}

template <typename IndexType>
__device__ __forceinline__ IndexType getEndIndex(IndexType totalSize) {
  IndexType sizePerBlock = THCCeilDiv(totalSize, (IndexType) gridDim.x);
  return min((IndexType) ((blockIdx.x + 1) * sizePerBlock), totalSize);
}
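
// For example (illustrative numbers only): with totalSize = 10000 and
// gridDim.x = 4, sizePerBlock = THCCeilDiv(10000, 4) = 2500, so block 0
// covers [0, 2500), block 1 covers [2500, 5000), and so on; the last
// block's end index is clamped to totalSize.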

// Kernel that handles an entire reduction of a tensor in two passes
template <typename ModifyOp,
          typename ReduceOp,
          typename ReduceAccOp,
          typename InT,
          typename AccT,
          typename IndexType,
          int ADims>
__global__ void
kernelReduceAllPass1(TensorInfo<InT, IndexType> in,
                     IndexType totalElements,
                     AccT init,
                     ModifyOp modifyOp,
                     ReduceOp reduceOp,
                     ReduceAccOp reduceAccOp,
                     AccT* scratchSpace) {
  const IndexType startIndex = getStartIndex<IndexType>(totalElements);
  const IndexType endIndex = getEndIndex<IndexType>(totalElements);

  // With a block-wide stride, have each thread perform its own reduction.
  AccT r = init;
  for (IndexType i = startIndex + threadIdx.x; i < endIndex; i += blockDim.x) {
    const IndexType inOffset = IndexToOffset<InT, IndexType, ADims>::get(i, in);
    r = reduceOp(r, modifyOp(in.data[inOffset]));
  }

  // Reduce within the block
  extern __shared__ char smemChar[];
  AccT* smem = (AccT*) smemChar;
  r = reduceBlock<AccT, ReduceAccOp>(smem, blockDim.x, r, reduceAccOp, init);

  if (threadIdx.x == 0) {
    // Write out block-wide reduced value
    scratchSpace[blockIdx.x] = r;
  }
}

template <typename ReduceOp, typename T, typename IndexType>
__global__ void
kernelReduceAllPass2(int numPass1Blocks,
                     T init,
                     ReduceOp reduceOp,
                     T* scratchSpace,
                     T* out) {
  T r = init;
  if (threadIdx.x < numPass1Blocks) {
    r = scratchSpace[threadIdx.x];
  }

  // Reduce within the block
  extern __shared__ char smemChar[];
  T* smem = (T*) smemChar;
  r = reduceBlock<T, ReduceOp>(smem, numPass1Blocks, r, reduceOp, init);

  if (threadIdx.x == 0) {
    *out = r;
  }
}
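
// Taken together: pass 1 launches getTwoPassBlocks(...) blocks (see below),
// each of which reduces its [startIndex, endIndex) share of the linear index
// range into scratchSpace[blockIdx.x]; pass 2 then launches a single block in
// which each thread loads one of those partial results and reduceBlock folds
// them into the final value.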

// Perform a two-pass reduction if the tensor is large enough to
// warrant it.
inline bool isTwoPassReductionSize(long elements) {
  return (elements > THC_TWO_PASS_REDUCTION_SIZE);
}

template <typename InT, typename AccT>
inline long getTwoPassBlocks(THCState* state, long elements) {
  long numBlocks = THCCeilDiv(elements, THC_REDUCE_ALL_BLOCK_SIZE);

  // We can only have as many blocks as there are scratch-space slots
  long scratchSpace =
    THCState_getCurrentDeviceScratchSpaceSize(state) / sizeof(AccT);
  THAssert(scratchSpace > 0);
  if (numBlocks > scratchSpace) {
    numBlocks = scratchSpace;
  }

  return numBlocks;
}
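
// For example (illustrative numbers only): reducing 10,000,000 elements gives
// numBlocks = THCCeilDiv(10000000, 1024) = 9766; if the per-stream scratch
// space only holds, say, 4096 AccT values, the grid is clamped to 4096 blocks
// and each block simply covers a proportionally larger slice of the input.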

// Get the block/grid size that we want
template <typename InT, typename AccT>
inline void getPass1ReduceBlockGrid(THCState* state, long elements,
                                    dim3& grid, dim3& block) {
  grid = dim3(getTwoPassBlocks<InT, AccT>(state, elements));
  block = dim3(THC_REDUCE_ALL_BLOCK_SIZE);
}

template <typename InT, typename AccT>
inline void getPass2ReduceBlockGrid(THCState* state, long elements,
                                    dim3& grid, dim3& block) {
  grid = dim3(1);
  // We only need as many threads as there were blocks originally
  block = dim3(getTwoPassBlocks<InT, AccT>(state, elements));
}

template <typename InT, typename AccT>
inline void getSinglePassReduceBlockGrid(long elements,
                                         dim3& grid, dim3& block) {
  grid = dim3(1);
  block = dim3(THC_REDUCE_ALL_BLOCK_SIZE);
}
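
// Summary of the launch configurations chosen above:
//   single pass:     grid = 1,                     block = THC_REDUCE_ALL_BLOCK_SIZE
//   two-pass pass 1: grid = getTwoPassBlocks(...), block = THC_REDUCE_ALL_BLOCK_SIZE
//   two-pass pass 2: grid = 1,                     block = getTwoPassBlocks(...)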

template <typename ModifyOp,
          typename ReduceOp,
          typename ReduceAccOp,
          typename InT,
          typename AccT,
          typename IndexType,
          int ADims>
void callReduceAll(THCState* state,
                   const TensorInfo<InT, IndexType>& in,
                   long totalElements,
                   AccT init,
                   const ModifyOp& modifyOp,
                   const ReduceOp& reduceOp,
                   const ReduceAccOp& reduceAccOp,
                   AccT* devOut) {
  dim3 grid;
  dim3 block;

  if (isTwoPassReductionSize(totalElements)) {
    getPass1ReduceBlockGrid<InT, AccT>(state, totalElements, grid, block);
    size_t smemSize = block.x * sizeof(AccT);

    kernelReduceAllPass1<ModifyOp, ReduceOp, ReduceAccOp, InT, AccT, IndexType, ADims>
      <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(
        in, (IndexType) totalElements, init, modifyOp, reduceOp, reduceAccOp,
        (AccT*) THCState_getCurrentDeviceScratchSpace(state));

    int numPass1Blocks = grid.x;
    getPass2ReduceBlockGrid<InT, AccT>(state, totalElements, grid, block);
    smemSize = block.x * sizeof(AccT);

    kernelReduceAllPass2<ReduceAccOp, AccT, IndexType>
      <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(
        numPass1Blocks, init, reduceAccOp,
        (AccT*) THCState_getCurrentDeviceScratchSpace(state),
        devOut);
  } else {
    getSinglePassReduceBlockGrid<InT, AccT>(totalElements, grid, block);
    size_t smemSize = block.x * sizeof(AccT);

    kernelReduceAll<ModifyOp, ReduceOp, ReduceAccOp, InT, AccT, IndexType, ADims>
      <<<grid, block, smemSize, THCState_getCurrentStream(state)>>>(
        in, (IndexType) totalElements, init, modifyOp, reduceOp, reduceAccOp, devOut);
  }
}
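
// Note that pass 2's block dimension comes from the same getTwoPassBlocks
// computation that sized pass 1's grid, so there is exactly one pass-2 thread
// per pass-1 partial result, and every launch reserves block.x * sizeof(AccT)
// bytes of shared memory (one accumulator slot per thread) for reduceBlock.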

// Reduces the entire tensor to one value. `out` points to host-resident
// memory unless `outOnDevice` is set, in which case it points to
// device-resident memory and no copy back to the host is performed.
template <typename TensorType,
          typename ModifyOp,
          typename ReduceOp,
          typename ReduceAccOp,
          typename AccT>
bool THC_reduceAll(THCState* state,
                   TensorType* in,
                   const ModifyOp& modifyOp,
                   const ReduceOp& reduceOp,
                   const ReduceAccOp& reduceAccOp,
                   AccT init,
                   AccT* out,
                   int outOnDevice) {
  long inElements = TensorUtils<TensorType>::getNumElements(state, in);

  if (TensorUtils<TensorType>::getDims(state, in) > MAX_CUTORCH_DIMS) {
    return false;
  }

  if (TensorUtils<TensorType>::getDims(state, in) == 0) {
    // Zero-dim tensor; the result is just the initial value
    *out = init;
    return true;
  }

  AccT* devOut = out;
  if (!outOnDevice) {
    // Use the stream-specific scratch space for the reduction kernel
    // to write out its value
    devOut = (AccT*) THCState_getCurrentDeviceScratchSpace(state);
  }

  // It is possible that the tensor dimensions are able to be collapsed,
  // and thus we can reduce the actual code complexity of the kernel by
  // exploiting this knowledge statically, since the div/mod is the
  // most expensive part of the operation, more so than memory accesses.
  // For instance, when the tensor is contiguous, or some of its dimensions
  // are contiguous with one another, those dimensions can be collapsed into
  // one, and the loop that translates the linear index into an array index
  // can be similarly collapsed. That is what this unrolling is for.
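  //
  // For example (illustrative sizes only, assuming the usual collapseDims
  // behavior of merging adjacent dimensions that are contiguous with one
  // another): sizes [4][5][6] with strides [30][6][1] are fully contiguous
  // and collapse to a single dimension of size 120 with stride 1, whereas
  // sizes [4][5][6] with strides [60][6][1] (a slice along the outer
  // dimension) collapse only to sizes [4][30] with strides [60][1].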

#define HANDLE_CASE(TYPE, IN)                                \
  callReduceAll<ModifyOp, ReduceOp, ReduceAccOp,             \
                typename TensorUtils<TensorType>::DataType,  \
                AccT,                                        \
                TYPE, IN>(                                   \
                  state, inInfo, inElements, init, modifyOp, \
                  reduceOp, reduceAccOp, devOut);

#define HANDLE_IN_CASE(TYPE, IN)   \
  {                                \
    if (inInfo.isContiguous()) {   \
      HANDLE_CASE(TYPE, -2);       \
    } else {                       \
      switch (IN) {                \
        case 1:                    \
          HANDLE_CASE(TYPE, 1);    \
          break;                   \
        case 2:                    \
          HANDLE_CASE(TYPE, 2);    \
          break;                   \
        default:                   \
          HANDLE_CASE(TYPE, -1);   \
          break;                   \
      }                            \
    }                              \
  }
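
  // The last template argument selects the IndexToOffset specialization used
  // by the kernels: -2 assumes a fully contiguous tensor (the linear index is
  // the offset), a positive value statically unrolls the index-to-offset
  // arithmetic for that many dimensions, and -1 falls back to a runtime loop
  // over all dimensions (this follows the convention used throughout the THC
  // reduce/apply utilities).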

  if (TensorUtils<TensorType>::canUse32BitIndexMath(state, in)) {
    TensorInfo<typename TensorUtils<TensorType>::DataType, unsigned int> inInfo =
      getTensorInfo<TensorType, unsigned int>(state, in);
    inInfo.collapseDims();

    HANDLE_IN_CASE(unsigned int, inInfo.dims);
  } else {
    TensorInfo<typename TensorUtils<TensorType>::DataType,
               unsigned long long> inInfo =
      getTensorInfo<TensorType, unsigned long long>(state, in);
    inInfo.collapseDims();

    // For large tensors, we only compile the completely contiguous
    // version and the completely generic version, to reduce
    // compilation time.
    if (inInfo.isContiguous()) {
      HANDLE_IN_CASE(unsigned long long, -2);
    } else {
      HANDLE_IN_CASE(unsigned long long, -1);
    }
  }
#undef HANDLE_CASE
#undef HANDLE_IN_CASE

  // If our destination is not on the device, copy the value back to
  // the host (synchronous!)
  if (!outOnDevice) {
    cudaMemcpy(out, devOut, sizeof(AccT), cudaMemcpyDeviceToHost);
  }

  return true;
}
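
// Illustrative usage sketch (not part of this header; the functor types and
// the `state`/`tensor` variables are hypothetical stand-ins for the real
// callers in the THC math files):
//
//   struct Identity { __device__ float operator()(float v) const { return v; } };
//   struct Add      { __device__ float operator()(float a, float b) const { return a + b; } };
//
//   float sum = 0.0f;
//   bool ok = THC_reduceAll(state, tensor /* THCudaTensor* */, Identity(),
//                           Add(), Add(), 0.0f, &sum, 0 /* result on host */);
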
#undef THC_REDUCE_ALL_BLOCK_SIZE
#undef THC_TWO_PASS_REDUCTION_SIZE
#endif // THC_REDUCEALL_INC