#include "THCReduceApplyUtils.cuh"
#include "THCSortUtils.cuh"
#include "THCTensorCopy.h"
#include "THCTensorTypeUtils.cuh"

#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/for_each.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sort.h>
#if CUDA_VERSION >= 7000
#include <thrust/system/cuda/execution_policy.h>
#endif

// Returns 2^(ceil(lg(n))), from the Stanford bit twiddling hacks
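// For example: nextHighestPowerOf2(1) == 1, nextHighestPowerOf2(5) == 8,
// and nextHighestPowerOf2(2048) == 2048 (powers of two map to themselves).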
unsigned long nextHighestPowerOf2(unsigned long n) {
  n--;
  n |= n >> 1;
  n |= n >> 2;
  n |= n >> 4;
  n |= n >> 8;
  n |= n >> 16;
  n |= n >> 32;
  n++;

  return n;
}

// `base` is the base address of a slice of the output tensor.
// Each block handles one slice, whose elements lie at offsets
// 0 -> (sliceSize - 1) * sliceStride from `base`; the slice is filled
// with the 1-based indices `1` to `sliceSize`.
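// For example, with a contiguous 2 x 3 tensor and dim == 1 (sliceSize == 3,
// sliceStride == 1), each of the two row slices is filled with 1 2 3.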
template <typename IndexType, int Dim>
__global__ void
fillSliceWithIndex(TensorInfo<float, IndexType> out,
                   IndexType totalSlices,
                   IndexType sliceSize,
                   IndexType sliceStride) {
  IndexType slice = getLinearBlockId<IndexType>();

  if (slice >= totalSlices) {
    return;
  }

  const unsigned long offset =
    IndexToOffset<float, IndexType, Dim>::get(slice, out);
  float* base = &out.data[offset];

  for (long i = threadIdx.x; i < sliceSize; i += blockDim.x) {
    // Torch indices are 1-based (hence the +1)
    base[i * sliceStride] = (float) i + 1.0f;
  }
}

void THCudaTensor_fillSliceWithIndex(THCState* state,
                                     THCudaTensor* t,
                                     int dim) {
  THCCheckTensorDims(state, t, 2);

  long inElements = THCudaTensor_nElement(state, t);
  long sliceSize = THCudaTensor_size(state, t, dim);
  long numSlices = inElements / sliceSize;

  dim3 grid;
  if (!THC_getGridFromTiles(numSlices, grid)) {
    THError("Slice to fill with indices is too large");
  }

  long maxThreads =
    THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
  long numThreads = sliceSize;
  if (numThreads > maxThreads) {
    numThreads = maxThreads;
  }

  dim3 block(numThreads);

#define FILL_INDEX(T, DIM)                                        \
  fillSliceWithIndex<T, DIM>                                      \
    <<<grid, block, 0, THCState_getCurrentStream(state)>>>(       \
      info, numSlices, sliceSize, info.strides[collapseDim])

  if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, t)) {
    TensorInfo<float, unsigned int> info =
      getTensorInfo<THCudaTensor, unsigned int>(state, t);
    info.reduceDim(dim);
    int collapseDim = info.collapseDims(dim);

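    // Per the usual THC convention, the Dim template argument picks an
    // IndexToOffset specialization: -2 assumes a contiguous tensor, small
    // positive values are unrolled for that many dims, and -1 is the
    // general fallback.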
    if (info.isContiguous()) {
      FILL_INDEX(unsigned int, -2);
    } else {
      if (info.dims == 1) {
        FILL_INDEX(unsigned int, 1);
      } else if (info.dims == 2) {
        FILL_INDEX(unsigned int, 2);
      } else {
        FILL_INDEX(unsigned int, -1);
      }
    }
  } else {
    TensorInfo<float, unsigned long> info =
      getTensorInfo<THCudaTensor, unsigned long>(state, t);
    info.reduceDim(dim);
    int collapseDim = info.collapseDims(dim);

    // catch-all implementation
    FILL_INDEX(unsigned long, -1);
  }

#undef FILL_INDEX

  THCudaCheck(cudaGetLastError());
}

// As with the default comparator of a C++ std::map, this function permutes
// the key and value tensors identically, such that the `key` tensor ends up
// ordered numerically along `dim`.
THC_API void THCudaTensor_sortKeyValueInplace(THCState* state,
                                              THCudaTensor* key,
                                              THCudaTensor* value,
                                              int dim, bool dir) {
  THArgCheck(THCudaTensor_isSameSizeAs(state, key, value), 2,
             "Key tensor must have same size as value tensor");
  THCCheckTensorDims(state, key, 2);
  THCCheckTensorDims(state, value, 3);

  if (THCudaTensor_nDimension(state, key) == 0) {
    // Zero-dim tensor; do nothing
    return;
  }

  long inElements = THCudaTensor_nElement(state, key);
  long keySliceSize = THCudaTensor_size(state, key, dim);
  long keySlices = inElements / keySliceSize;

  // The amount of shared memory and block size is based on
  // 2^ceil(lg(n)); we choose that sorting implementation for a given
  // size.
  long ceilPowerOf2 = nextHighestPowerOf2(keySliceSize);

  // FIXME: We'd have to find some other trick with Thrust to perform a
  // vectorized (key, value) sort by slice segment
  if (ceilPowerOf2 > 2048) {
    THError("sortKeyValueInplace only works for sizes <= 2048 at present");
  }

  // Each thread of the bitonic sort kernel (THCSortUtils.cuh) handles a
  // pair of elements, hence ceilPowerOf2 / 2 threads per block
  int blockSize = (int) ceilPowerOf2 / 2;
  if (blockSize < 1) {
    blockSize = 1;
  }

  dim3 block(blockSize);

  // The grid is based on the number of independent slices that we
  // have to sort; one block per slice
  dim3 grid;
  if (!THC_getGridFromTiles(keySlices, grid)) {
    THError("Slice to sort is too large");
  }

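// HANDLE_CASE launches bitonicSortKVInPlace for a given index type (TYPE),
// dimension specialization (A), and power-of-2 sort size (SIZE), picking the
// comparator (GTComp or LTComp) according to the requested sort direction.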
#define HANDLE_CASE(TYPE, A, SIZE)                                       \
  if (dir) {                                                             \
    bitonicSortKVInPlace<float, float, A, -1, GTComp<float>, TYPE, SIZE> \
      <<<grid, block, 0, THCState_getCurrentStream(state)>>>(            \
        keyInfo,                                                         \
        keySlices,                                                       \
        (TYPE) keySliceSize,                                             \
        (TYPE) keyInfo.strides[collapseKeyDim],                          \
        valueInfo,                                                       \
        (TYPE) valueInfo.strides[collapseValueDim],                      \
        GTComp<float>());                                                \
  } else {                                                               \
    bitonicSortKVInPlace<float, float, A, -1, LTComp<float>, TYPE, SIZE> \
      <<<grid, block, 0, THCState_getCurrentStream(state)>>>(            \
        keyInfo,                                                         \
        keySlices,                                                       \
        (TYPE) keySliceSize,                                             \
        (TYPE) keyInfo.strides[collapseKeyDim],                          \
        valueInfo,                                                       \
        (TYPE) valueInfo.strides[collapseValueDim],                      \
        LTComp<float>());                                                \
  }

#define HANDLE_SORT_CASE(TYPE, A)                  \
  {                                                \
    switch (ceilPowerOf2) {                        \
      case 2048:                                   \
        HANDLE_CASE(TYPE, A, 2048);                \
        break;                                     \
      case 1024:                                   \
        HANDLE_CASE(TYPE, A, 1024);                \
        break;                                     \
      case 512:                                    \
        HANDLE_CASE(TYPE, A, 512);                 \
        break;                                     \
      case 256:                                    \
        HANDLE_CASE(TYPE, A, 256);                 \
        break;                                     \
      case 128:                                    \
        HANDLE_CASE(TYPE, A, 128);                 \
        break;                                     \
      case 64:                                     \
        HANDLE_CASE(TYPE, A, 64);                  \
        break;                                     \
      case 32:                                     \
        HANDLE_CASE(TYPE, A, 32);                  \
        break;                                     \
      case 16:                                     \
        HANDLE_CASE(TYPE, A, 16);                  \
        break;                                     \
      case 8:                                      \
        HANDLE_CASE(TYPE, A, 8);                   \
        break;                                     \
      case 4:                                      \
        HANDLE_CASE(TYPE, A, 4);                   \
        break;                                     \
      case 2:                                      \
        HANDLE_CASE(TYPE, A, 2);                   \
        break;                                     \
      case 1:                                      \
        /* Nothing to do, data already sorted */   \
        break;                                     \
      default:                                     \
        assert(false);                             \
    }                                              \
  }

  // The constructed key/value tensor info is used to select the slice
  // we are sorting on a per-block basis
  if (TensorUtils<THCudaTensor>::canUse32BitIndexMath(state, key)) {
    TensorInfo<float, unsigned int> keyInfo =
      getTensorInfo<THCudaTensor, unsigned int>(state, key);
    keyInfo.reduceDim(dim);
    int collapseKeyDim = keyInfo.collapseDims(dim);

    TensorInfo<float, unsigned int> valueInfo =
      getTensorInfo<THCudaTensor, unsigned int>(state, value);
    valueInfo.reduceDim(dim);
    int collapseValueDim = valueInfo.collapseDims(dim);

    if (keyInfo.isContiguous()) {
      HANDLE_SORT_CASE(unsigned int, -2);
    } else {
      switch (keyInfo.dims) {
        case 1:
          HANDLE_SORT_CASE(unsigned int, 1);
          break;
        case 2:
          HANDLE_SORT_CASE(unsigned int, 2);
          break;
        default:
          HANDLE_SORT_CASE(unsigned int, -1);
          break;
      }
    }
  } else {
    TensorInfo<float, unsigned long> keyInfo =
      getTensorInfo<THCudaTensor, unsigned long>(state, key);
    keyInfo.reduceDim(dim);
    int collapseKeyDim = keyInfo.collapseDims(dim);

    TensorInfo<float, unsigned long> valueInfo =
      getTensorInfo<THCudaTensor, unsigned long>(state, value);
    valueInfo.reduceDim(dim);
    int collapseValueDim = valueInfo.collapseDims(dim);

    // The unsigned long case is rare; just instantiate these two versions
    if (keyInfo.isContiguous()) {
      HANDLE_SORT_CASE(unsigned long, -2);
    } else {
      HANDLE_SORT_CASE(unsigned long, -1);
    }
  }
#undef HANDLE_CASE
#undef HANDLE_SORT_CASE

  THCudaCheck(cudaGetLastError());
}

// For slice sorting in Thrust; extracts a slice index from a linear
// index and uses that for comparison
struct SliceComp {
  SliceComp(int size) : sliceSize(size) {}

  __device__ bool operator()(const int& a, const int& b) const {
    // Since the slices are guaranteed to be innermost, the segment is
    // recovered by integer division alone
    int segA = a / sliceSize;
    int segB = b / sliceSize;
    return segA < segB;
  }

  const int sliceSize;
};
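// For example, with sliceSize == 3, linear indices 0..2 compare as equal
// (segment 0) and all order before indices 3..5 (segment 1), so a stable
// sort groups each slice together without reordering within it.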

// For sorting in Thrust; extracts a within-slice index from a linear index
struct GlobalIndexToPerSliceIndex {
  GlobalIndexToPerSliceIndex(int size) : sliceSize(size) {}

  __device__ inline void operator()(int& v) const {
    // Thrust is operating on this index array as an array of type
    // int, but to Torch it should be a float array.
    v = __float_as_int((float) (v % sliceSize + 1));
  }

  const int sliceSize;
};
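// For example, with sliceSize == 3, the global index 4 maps to 4 % 3 + 1 == 2
// and is stored as the bit pattern of 2.0f, ready to be read back as a float
// 1-based Torch index.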

void sortViaThrust(THCState* state,
                   THCudaTensor* sorted,
                   THCudaTensor* indices,
                   THCudaTensor* input,
                   int dim, bool dir) {
  long nDims = THCudaTensor_nDimension(state, input);

  long totalElements = THCudaTensor_nElement(state, input);
  long sliceSize = THCudaTensor_size(state, input, dim);
  long sliceStride = THCudaTensor_stride(state, input, dim);

  // We perform a vectorized segmented sort in Thrust.
  // Say we are sorting a (2, 3) tensor in descending order. We have in
  // flattened form:
  // values       0.4 1.2 5.3 6.2 1.3 2.3
  // indices        0   1   2   3   4   5
  // where indices is a global index (across all slices)

  // First we sort by values, globally:
  // values       6.2 5.3 2.3 1.3 1.2 0.4
  // indices        3   2   5   4   1   0

  // Then we stable sort by segment, which is index / 3:
  // values       5.3 1.2 0.4 6.2 2.3 1.3
  // indices        2   1   0   3   5   4

  // Then we translate the global index to a per-slice Lua index
  // (index % 3) + 1:
  // values       5.3 1.2 0.4 6.2 2.3 1.3
  // indices        3   2   1   1   3   2

  // This method can only work if the slice we are sorting (`dim`) is
  // innermost, and both values and indices are contiguous. We do this
  // by re-arranging the input into this form as needed, which will
  // unfortunately allocate memory if the request is not in this form.
  // Vectorized sort is slower than iterated sort if the number of
  // slices is small (since we're sorting twice, instead of invoking a
  // smaller sort `numSlices` times), but the Thrust sort
  // implementation here is a catch-all, so we're not looking for
  // efficiency, but instead correctness.
  THCudaTensor_copy(state, sorted, input);
  THCudaTensor* trKeys = THCudaTensor_newWithTensor(state, sorted);
  THCudaTensor* trIndices = THCudaTensor_newWithTensor(state, indices);

  // Transpose dim to innermost
  if (dim != nDims - 1) {
    THCudaTensor_transpose(state, trKeys, NULL, dim, nDims - 1);
    THCudaTensor_transpose(state, trIndices, NULL, dim, nDims - 1);
  }

  // Thrust must operate on a contiguous layout
  THCudaTensor* trContigKey = THCudaTensor_newContiguous(state, trKeys);
  THCudaTensor* trContigIndices = THCudaTensor_newContiguous(state, trIndices);

  THCudaTensor_free(state, trKeys);
  THCudaTensor_free(state, trIndices);

  thrust::device_ptr<float> keyIter(THCudaTensor_data(state, trContigKey));

  // Since we are composing a global index across all segments rather
  // than a per-segment index, we treat the memory as int so we don't
  // have problems sorting slices < 2^24 but where the entire tensor
  // has more than 2^24 elements
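  // (For example, a tensor with 2^25 elements split into slices of 1024:
  // the largest global index, 2^25 - 1, is not exactly representable as a
  // float, but fits exactly in a 32-bit int.)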
  thrust::device_ptr<int>
    indexIter((int*) THCudaTensor_data(state, trContigIndices));

  // Fill the indices with a global index across all slices
  thrust::counting_iterator<int> countIter(0);

  thrust::copy(
#if CUDA_VERSION >= 7000
    thrust::cuda::par.on(THCState_getCurrentStream(state)),
#endif
    countIter, countIter + totalElements, indexIter);

  // First, we sort globally (across all slices) according to key
  // (the values we're sorting)
  if (dir) {
    thrust::stable_sort_by_key(
#if CUDA_VERSION >= 7000
      thrust::cuda::par.on(THCState_getCurrentStream(state)),
#endif
      keyIter, keyIter + totalElements, indexIter, thrust::greater<float>());
  } else {
    thrust::stable_sort_by_key(
#if CUDA_VERSION >= 7000
      thrust::cuda::par.on(THCState_getCurrentStream(state)),
#endif
      keyIter, keyIter + totalElements, indexIter, thrust::less<float>());
  }

  // Then, re-sort according to the slice that each index is
  // in. This completes the segmented sort in Thrust, since we're
  // stably sorting here, preserving the relative order of values
  // within each slice
  thrust::stable_sort_by_key(
#if CUDA_VERSION >= 7000
    thrust::cuda::par.on(THCState_getCurrentStream(state)),
#endif
    indexIter, indexIter + totalElements, keyIter,
    SliceComp(sliceSize));

  // Translate the global integer 0-based index to a per-slice float
  // Lua index
  thrust::for_each(
#if CUDA_VERSION >= 7000
    thrust::cuda::par.on(THCState_getCurrentStream(state)),
#endif
    indexIter, indexIter + totalElements,
    GlobalIndexToPerSliceIndex(sliceSize));

  // Reverse the transposition as needed
  if (dim != nDims - 1) {
    THCudaTensor_transpose(state, trContigKey, NULL, dim, nDims - 1);
    THCudaTensor_transpose(state, trContigIndices, NULL, dim, nDims - 1);
  }

  // Then copy back to the expected output
  THCudaTensor_freeCopyTo(state, trContigKey, sorted);
  THCudaTensor_freeCopyTo(state, trContigIndices, indices);
}

THC_API void THCudaTensor_sort(THCState* state,
                               THCudaTensor *sorted,
                               THCudaTensor *indices,
                               THCudaTensor *input,
                               int dim, int order) {
  THAssert(THCudaTensor_checkGPU(state, 3, sorted, indices, input));
  THCCheckTensorDims(state, sorted, 2);
  THCCheckTensorDims(state, indices, 3);
  THCCheckTensorDims(state, input, 4);

  // Make sure sufficient output space is allocated
  THCudaTensor_resizeAs(state, sorted, input);
  THCudaTensor_resizeAs(state, indices, input);

  // How large are the slices that we are sorting?
  long sliceSize = THCudaTensor_size(state, input, dim);

  // We're using THCudaTensor to write out indices, so if the slice
  // size that we're sorting has more elements than can be
  // represented in fp32, raise an error
  // FIXME: this isn't a real restriction of either our code or of
  // Thrust, but we have to switch to a CUDA long tensor to support
  // larger slice sizes. Otherwise the indices will contain garbage.
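  // (FLOAT32_MAX_CONSECUTIVE_INT is 2^24 == 16777216, the largest run of
  // consecutive integers that fp32 can represent exactly.)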
  THArgCheck(sliceSize <= (long) FLOAT32_MAX_CONSECUTIVE_INT, 5,
             "The sort dimension exceeds single-precision float "
             "consecutive integer precision size (2^24), since float "
             "is used for indices");

  if (sliceSize <= 2048) {
    // Fill `indices` (the values) with the
    // slice-relative index.
    THCudaTensor_fillSliceWithIndex(state, indices, dim);

    // We sort k/v pairs in-place; copy unsorted input to output
    THCudaTensor_copy(state, sorted, input);

    // Sort using our in-place k/v kernel that supports arbitrary
    // layout
    THCudaTensor_sortKeyValueInplace(state, sorted, indices, dim, order);
  } else {
    // Otherwise, fall back upon Thrust, which handles all other cases
    // (potentially slowly, with extra copies/memory allocations)
    sortViaThrust(state, sorted, indices, input, dim, (bool) order);
  }

  THCudaCheck(cudaGetLastError());
}
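
// Example host-side usage (a sketch; `input` is an existing THCudaTensor and
// error handling is omitted):
//   THCudaTensor* sorted  = THCudaTensor_new(state);
//   THCudaTensor* indices = THCudaTensor_new(state);
//   // sort `input` along dimension 1 (0-based) in descending order
//   THCudaTensor_sort(state, sorted, indices, input, 1, 1);
//   ...
//   THCudaTensor_free(state, sorted);
//   THCudaTensor_free(state, indices);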