| #include "THCTensorMath.h" |
| #include "THCGeneral.h" |
| #include "THCNumerics.cuh" |
| #include "THCReduce.cuh" |
| #include "THCReduceAll.cuh" |
| #include <thrust/functional.h> |
| |
| // Reduction operators that support `half`, unlike Thrust |
| template <typename InT, typename AccT> |
| struct ReduceAdd { |
| inline __device__ AccT operator()(AccT a, InT b) const { |
| return a + (AccT) b; |
| } |
| }; |
| |
| #ifdef CUDA_HALF_TENSOR |
| template <> |
| struct ReduceAdd<half, half> { |
| inline __device__ half operator()(half a, half b) const { |
| #ifdef CUDA_HALF_INSTRUCTIONS |
| return __hadd(a, b); |
| #else |
| float fa = __half2float(a); |
| float fb = __half2float(b); |
| return __float2half(fa + fb); |
| #endif |
| } |
| }; |
| |
| template <> |
| struct ReduceAdd<half, float> { |
| inline __device__ float operator()(float a, half b) const { |
| return a + __half2float(b); |
| } |
| }; |
| #endif // CUDA_HALF_TENSOR |
| |
| template <typename InT, typename AccT> |
| struct ReduceMultiply { |
| inline __device__ AccT operator()(AccT a, InT b) const { |
| return a * (AccT) b; |
| } |
| }; |
| |
| #ifdef CUDA_HALF_TENSOR |
| template <> |
| struct ReduceMultiply<half, half> { |
| inline __device__ half operator()(half a, half b) const { |
| #ifdef CUDA_HALF_INSTRUCTIONS |
| return __hmul(a, b); |
| #else |
| float fa = __half2float(a); |
| float fb = __half2float(b); |
| return __float2half(fa * fb); |
| #endif |
| } |
| }; |
| |
| template <> |
| struct ReduceMultiply<half, float> { |
| inline __device__ float operator()(float a, half b) const { |
| return a * __half2float(b); |
| } |
| }; |
| #endif // CUDA_HALF_TENSOR |
| |
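// Min/max reduction operators; THCNumerics supplies the comparisons, so these work for
// every supported element type, including `half`.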
template <typename T>
struct ReduceMin {
  inline __device__ T operator()(T a, T b) const {
    return THCNumerics<T>::lt(a, b) ? a : b;
  }
};

template <typename T>
struct ReduceMax {
  inline __device__ T operator()(T a, T b) const {
    return THCNumerics<T>::gt(a, b) ? a : b;
  }
};

struct LogicalAll {
  inline __device__ unsigned char operator()(unsigned char x,
                                             unsigned char y) const {
    return (x && y);
  }
};

struct LogicalAny {
  inline __device__ unsigned char operator()(unsigned char x,
                                             unsigned char y) const {
    return (x || y);
  }
};


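// Tensor-wide logical AND: returns 1 iff every element of `self` is nonzero.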
THC_API int
THCudaByteTensor_logicalall(THCState *state, THCudaByteTensor *self) {
  THAssert(THCudaByteTensor_checkGPU(state, 1, self));
  unsigned char result;
  if (!THC_reduceAll(state, self,
                     thrust::identity<unsigned char>(),
                     LogicalAll(),
                     LogicalAll(),
                     (unsigned char) 1, &result, 0)) {
    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
  }

  return (int) result;
}

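// Tensor-wide logical OR: returns 1 iff at least one element of `self` is nonzero.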
THC_API int
THCudaByteTensor_logicalany(THCState *state, THCudaByteTensor *self) {
  THAssert(THCudaByteTensor_checkGPU(state, 1, self));
  unsigned char result;
  if (!THC_reduceAll(state, self,
                     thrust::identity<unsigned char>(),
                     LogicalAny(),
                     LogicalAny(),
                     (unsigned char) 0, &result, 0)) {
    THArgCheck(false, 1, CUTORCH_DIM_WARNING);
  }

  return (int) result;
}

/* A set of reduction kernels that take in binary ops on thrust pairs (of value, index).
   These are useful when you not only have to do a reduction, but also need to preserve
   the index at which the reduced value was found (for example, min/max operations).
   The structure of these kernels follows the structure of the plain reduction kernels.
*/
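// The host-side entry point for these kernels is THC_reduceDimIndex below, which picks the
// innermost- or outer-dimension kernel depending on which dimension is being reduced.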
template <typename K, typename Index, class BinaryFunction>
__global__ void
kernelTransformReduceOuterDimIndex(K *tgt1,
                                   Index *tgt2,
                                   K *src_,
                                   unsigned num_orows,
                                   unsigned num_irows,
                                   unsigned row_size,
                                   thrust::pair<K, Index> init,
                                   BinaryFunction binary_op) {
  for (unsigned orow = blockIdx.x; orow < num_orows; orow += gridDim.x) {
    for (unsigned irow = blockIdx.y * blockDim.x + threadIdx.x;
         irow < num_irows;
         irow += gridDim.y * blockDim.x) {
      K *src = src_ + orow * row_size * num_irows + irow;
      thrust::pair<K, Index> acc = init;

      for (unsigned col = 0; col < row_size; ++col) {
        // +1 for Lua index
        acc = binary_op(thrust::make_pair<K, Index>(*src, col+1),
                        acc);
        src += num_irows;
      }

      tgt1[orow * num_irows + irow] = acc.first;
      tgt2[orow * num_irows + irow] = acc.second;
    }
  }
}

template <typename TensorTypeK,
          typename TensorTypeIndex,
          typename BinaryFunction>
__host__ void
THC_transformReduceOuterDimIndex(THCState *state,
                                 TensorTypeK *tgt1,
                                 TensorTypeIndex *tgt2,
                                 TensorTypeK *src,
                                 long rdim,
                                 const thrust::pair<
                                   typename TensorUtils<TensorTypeK>::DataType,
                                   typename TensorUtils<TensorTypeIndex>::DataType>& init,
                                 BinaryFunction binary_op) {
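  // View the contiguous input as num_orows x row_size x num_irows: row_size is the size of
  // the reduced dimension `rdim`, num_orows collapses the dimensions before it, and
  // num_irows collapses the dimensions after it.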
  unsigned ndim = TensorUtils<TensorTypeK>::getDims(state, src);
  unsigned num_orows = 1;
  for (unsigned dim = 0; dim < rdim; dim++) {
    num_orows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
  }
  unsigned row_size = TensorUtils<TensorTypeK>::getSize(state, src, rdim);
  unsigned num_irows = 1;
  for (unsigned dim = rdim + 1; dim < ndim; dim++) {
    num_irows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
  }

  dim3 threads(min(512, num_irows));
  unsigned maxGridDim = 1024;
  dim3 grid(min(maxGridDim, num_orows),
            min(maxGridDim, THCCeilDiv(num_irows, threads.x)));

  kernelTransformReduceOuterDimIndex
    <<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
      TensorUtils<TensorTypeK>::getData(state, tgt1),
      TensorUtils<TensorTypeIndex>::getData(state, tgt2),
      TensorUtils<TensorTypeK>::getData(state, src),
      num_orows, num_irows, row_size, init, binary_op);

  THCudaCheck(cudaGetLastError());
}

/* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index)).
 *
 * The contiguous input is treated as a num_rows x row_size matrix, where row_size is the size
 * of the innermost (last) dimension and num_rows is the product of the remaining sizes:
 *
 * - threadIdx.x strides across the columns of one row (the dimension being reduced);
 * - threadIdx.y selects which of the block's rows a thread works on; and
 * - blockIdx.x strides over groups of blockDim.y rows.
 *
 * Reduction along other dimensions is handled in a separate kernel.
 */
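/* As a concrete example of this mapping with the dim3(16, 32) launch used below (sizes are
 * illustrative): a contiguous 100 x 37 tensor reduced along its last dimension gives
 * num_rows = 100 and row_size = 37; each group of 16 threads strides over one 37-element row,
 * each block covers 32 rows, and the per-thread partial results are combined in shared memory.
 */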
template <typename K, typename Index, class BinaryFunction>
__global__ void
kernelTransformReduceInnermostDimIndex(K *tgt1,
                                       Index* tgt2,
                                       K *src_,
                                       unsigned num_rows,
                                       unsigned row_size,
                                       thrust::pair<K, Index> init,
                                       BinaryFunction binary_op) {
  __shared__ K sbuf[32][16 + 1]; // avoid bank conflict
  __shared__ Index ibuf[32][16 + 1]; // avoid bank conflict

  for (unsigned block_row = blockIdx.x * blockDim.y;
       block_row < num_rows;
       block_row += blockDim.y * gridDim.x) {
    unsigned row = block_row + threadIdx.y;
    thrust::pair<K, Index> acc = init;
    if (row < num_rows) {
      K *src = src_ + row * row_size;
      // Sequential reduction within a thread.
      for (unsigned col = threadIdx.x; col < row_size; col += blockDim.x) {
        acc = binary_op(thrust::make_pair<K, Index>(src[col], col + 1), acc);
      }
    }

    sbuf[threadIdx.y][threadIdx.x] = acc.first;
    ibuf[threadIdx.y][threadIdx.x] = acc.second;

    __syncthreads();

    // Reduce intermediate values to single value.
    K* sline = &sbuf[threadIdx.y][0];
    Index* iline = &ibuf[threadIdx.y][0];
    for (unsigned s = 8; s > 0; s >>= 1) {
      if (row < num_rows && threadIdx.x < s) {
        thrust::pair<K, Index> arg1 =
          thrust::make_pair<K, Index>(sline[threadIdx.x], iline[threadIdx.x]);
        thrust::pair<K, Index> arg2 =
          thrust::make_pair<K, Index>(sline[threadIdx.x + s], iline[threadIdx.x + s]);
        thrust::pair<K, Index> res = binary_op(arg1, arg2);

        sline[threadIdx.x] = res.first;
        iline[threadIdx.x] = res.second;
      }
      __syncthreads();
    }

    if (row < num_rows && threadIdx.x == 0) {
      tgt1[row] = sline[0];
      tgt2[row] = iline[0];
    }
    __syncthreads();
  }
}

template <typename TensorTypeK,
          typename TensorTypeIndex,
          typename BinaryFunction>
__host__ void
THC_transformReduceInnermostDimIndex(THCState *state,
                                     TensorTypeK *tgt1,
                                     TensorTypeIndex *tgt2,
                                     TensorTypeK *src,
                                     const thrust::pair<
                                       typename TensorUtils<TensorTypeK>::DataType,
                                       typename TensorUtils<TensorTypeIndex>::DataType>& init,
                                     BinaryFunction binary_op) {
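  // Collapse all leading dimensions into num_rows; the innermost (last) dimension, of size
  // row_size, is the one being reduced.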
  unsigned ndim = TensorUtils<TensorTypeK>::getDims(state, src);
  unsigned num_rows = 1;
  for (unsigned dim = 0; dim < ndim - 1; dim++) {
    num_rows *= TensorUtils<TensorTypeK>::getSize(state, src, dim);
  }
  unsigned row_size = TensorUtils<TensorTypeK>::getSize(state, src, ndim - 1);

  dim3 threads(16, 32);
  dim3 grid(min(1024, THCCeilDiv(num_rows, threads.y)));

  kernelTransformReduceInnermostDimIndex
    <<<grid, threads, 0, THCState_getCurrentStream(state)>>>(
      TensorUtils<TensorTypeK>::getData(state, tgt1),
      TensorUtils<TensorTypeIndex>::getData(state, tgt2),
      TensorUtils<TensorTypeK>::getData(state, src),
      num_rows, row_size, init, binary_op);

  THCudaCheck(cudaGetLastError());
}

template <typename TensorTypeK,
          typename TensorTypeIndex,
          typename BinaryFunction>
void
THC_reduceDimIndex(THCState *state,
                   TensorTypeK *tgt1_,
                   TensorTypeIndex *tgt2_,
                   TensorTypeK *src,
                   long dimension,
                   const thrust::pair<
                     typename TensorUtils<TensorTypeK>::DataType,
                     typename TensorUtils<TensorTypeIndex>::DataType>& init,
                   BinaryFunction binary_op)
{
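  // Resize the value/index outputs so the reduced dimension has size 1, run the reduction on
  // contiguous versions of the tensors, then copy the results back into the caller's tensors.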
  THArgCheck(dimension >= 0 &&
             dimension < TensorUtils<TensorTypeK>::getDims(state, src),
             3, "dimension out of range");

  THLongStorage *dim = TensorUtils<TensorTypeK>::newSizeOf(state, src);
  THLongStorage_set(dim, dimension, 1);
  TensorUtils<TensorTypeK>::resize(state, tgt1_, dim, NULL);
  TensorUtils<TensorTypeIndex>::resize(state, tgt2_, dim, NULL);
  THLongStorage_free(dim);

  TensorTypeK *tgt1 = TensorUtils<TensorTypeK>::newContiguous(state, tgt1_);
  TensorTypeIndex *tgt2 = TensorUtils<TensorTypeIndex>::newContiguous(state, tgt2_);
  src = TensorUtils<TensorTypeK>::newContiguous(state, src);

  if (dimension == TensorUtils<TensorTypeK>::getDims(state, src) - 1) {
    THC_transformReduceInnermostDimIndex(state, tgt1, tgt2, src, init, binary_op);
  } else {
    THC_transformReduceOuterDimIndex(state, tgt1, tgt2, src, dimension, init, binary_op);
  }

  TensorUtils<TensorTypeK>::free(state, src);
  TensorUtils<TensorTypeK>::freeCopyTo(state, tgt1, tgt1_);
  TensorUtils<TensorTypeIndex>::freeCopyTo(state, tgt2, tgt2_);
}

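// Binary ops over (value, index) pairs used by the kernels above; each returns the pair whose
// value compares greater (MaxValuePair) or smaller (MinValuePair).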
template <typename T, typename Index>
struct MaxValuePair {
  __host__ __device__
  thrust::pair<T, Index> operator()(const thrust::pair<T, Index>& a,
                                    const thrust::pair<T, Index>& b) {
    return THCNumerics<T>::ge(a.first, b.first) ? a : b;
  }
};

template <typename T, typename Index>
struct MinValuePair {
  __host__ __device__
  thrust::pair<T, Index> operator()(const thrust::pair<T, Index>& a,
                                    const thrust::pair<T, Index>& b) {
    return THCNumerics<T>::le(a.first, b.first) ? a : b;
  }
};

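/* A minimal sketch of how a max-along-dimension reduction is expected to be driven through
 * THC_reduceDimIndex (the real per-type dispatch lives in generic/THCTensorMathReduce.cu,
 * included below; the float/long types and the init pair here are illustrative only):
 *
 *   thrust::pair<float, long> init(
 *     THCNumerics<float>::min(),  // any element compares >= this starting value
 *     0);                         // placeholder index
 *   THC_reduceDimIndex(state, values, indices, src, dimension, init,
 *                      MaxValuePair<float, long>());
 */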
| #include "generic/THCTensorMathReduce.cu" |
| #include "THCGenerateAllTypes.h" |