// THCTensorSort.cuh - platform/external/pytorch - Git at Google

 #ifndef THC_TENSORSORT_CUH
 #define THC_TENSORSORT_CUH

 #include "THCReduceApplyUtils.cuh"
 #include "THCSortUtils.cuh"
 #include "THCTensorCopy.h"
 #include "THCTensorTypeUtils.cuh"

 #include "THCThrustAllocator.cuh"
 #include <thrust/device_ptr.h>
 #include <thrust/sort.h>
 #if CUDA_VERSION >= 7000
 #include <thrust/system/cuda/execution_policy.h>
 #endif

 // Device-side binary predicate: true when the first argument compares
 // greater than the second. The comparison is delegated to
 // THCNumerics<T>::gt.
 template <typename T>
 struct ThrustGTOp {
   __device__ bool operator()(const T& a, const T& b) const {
     const bool isGreater = THCNumerics<T>::gt(a, b);
     return isGreater;
   }
 };

 // Device-side binary predicate: true when the first argument compares
 // less than the second. The comparison is delegated to
 // THCNumerics<T>::lt.
 template <typename T>
 struct ThrustLTOp {
   __device__ bool operator()(const T& a, const T& b) const {
     const bool isLess = THCNumerics<T>::lt(a, b);
     return isLess;
   }
 };

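 // `out` describes the tensor to fill; inside the kernel, `base` is the
 // address of the first element of the slice being written.
 // For each slice (identified by a linear index into `out`), the elements at
 // offsets 0, sliceStride, ..., (sliceSize - 1) * sliceStride from `base` are
 // filled with 0, 1, ..., sliceSize - 1, each shifted by TH_INDEX_BASE.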
 // Launch layout, as used by the body: the value returned by
 // getLinearBlockId() selects the slice, and the threads of that block
 // cooperate on it, starting at threadIdx.x and advancing by blockDim.x.
 // Blocks whose slice number is >= totalSlices do nothing.
 template <typename IndexType, int Dim>
 __global__ void
 fillSliceWithIndex(TensorInfo<long, IndexType> out,
                    IndexType totalSlices,
                    IndexType sliceSize,
                    IndexType sliceStride) {
   const IndexType sliceId = getLinearBlockId<IndexType>();

   if (sliceId < totalSlices) {
     // Locate the first element of this slice inside `out`.
     const unsigned long sliceOffset =
       IndexToOffset<long, IndexType, Dim>::get(sliceId, out);
     long* sliceStart = &out.data[sliceOffset];

     long pos = threadIdx.x;
     while (pos < sliceSize) {
       // Stored value is the position within the slice, shifted by
       // TH_INDEX_BASE.
       sliceStart[pos * sliceStride] = pos + TH_INDEX_BASE;
       pos += blockDim.x;
     }
   }
 }

 // For slice sorting in Thrust; extracts a slice index from a linear
 // index and uses that for comparison
 // Orders two linear indices by the slice they fall in: an index belongs to
 // slice (index / sliceSize), and the comparison is a strict "less than" on
 // that slice number. Indices within the same slice compare as equivalent.
 struct SliceComp {
   const long sliceSize;

   SliceComp(long size) : sliceSize(size) {}

   __device__ bool operator()(const long& lhs, const long& rhs) const {
     // Slices are innermost, so integer division by the slice length gives
     // the slice number directly.
     return (lhs / sliceSize) < (rhs / sliceSize);
   }
 };

 // For sorting in Thrust; extracts a within-slice index from a linear index
 // In-place transform: replaces a linear index with its position inside its
 // slice (index modulo sliceSize), shifted by TH_INDEX_BASE.
 struct GlobalIndexToPerSliceIndex {
   const long sliceSize;

   GlobalIndexToPerSliceIndex(long size) : sliceSize(size) {}

   __device__ inline void operator()(long& v) const {
     const long withinSlice = v % sliceSize;
     v = withinSlice + TH_INDEX_BASE;
   }
 };

 // Host-side helpers; only the declarations are visible in this header.

 // NOTE(review): going by the name, this presumably returns the smallest
 // power of two that is >= n -- confirm against the definition.
 unsigned long nextHighestPowerOf2(unsigned long n);
 // NOTE(review): presumably fills every slice of `t` along dimension `dim`
 // with its within-slice indices (likely by launching fillSliceWithIndex) --
 // confirm against the definition.
 void THCudaLongTensor_fillSliceWithIndex(THCState* state,
                                          THCudaLongTensor* t,
                                          int dim);
 #endif // THC_TENSORSORT_CUH
	#ifndef THC_TENSORSORT_CUH
	#define THC_TENSORSORT_CUH

	#include "THCReduceApplyUtils.cuh"
	#include "THCSortUtils.cuh"
	#include "THCTensorCopy.h"
	#include "THCTensorTypeUtils.cuh"

	#include "THCThrustAllocator.cuh"
	#include <thrust/device_ptr.h>
	#include <thrust/sort.h>
	#if CUDA_VERSION >= 7000
	#include <thrust/system/cuda/execution_policy.h>
	#endif

	// Device-side binary predicate: true when the first argument compares
	// greater than the second. The comparison is delegated to
	// THCNumerics<T>::gt.
	template <typename T>
	struct ThrustGTOp {
	  __device__ bool operator()(const T& a, const T& b) const {
	    const bool isGreater = THCNumerics<T>::gt(a, b);
	    return isGreater;
	  }
	};

	// Device-side binary predicate: true when the first argument compares
	// less than the second. The comparison is delegated to
	// THCNumerics<T>::lt.
	template <typename T>
	struct ThrustLTOp {
	  __device__ bool operator()(const T& a, const T& b) const {
	    const bool isLess = THCNumerics<T>::lt(a, b);
	    return isLess;
	  }
	};

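	// `out` describes the tensor to fill; inside the kernel, `base` is the
	// address of the first element of the slice being written.
	// For each slice (identified by a linear index into `out`), the elements at
	// offsets 0, sliceStride, ..., (sliceSize - 1) * sliceStride from `base` are
	// filled with 0, 1, ..., sliceSize - 1, each shifted by TH_INDEX_BASE.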
	// Launch layout, as used by the body: the value returned by
	// getLinearBlockId() selects the slice, and the threads of that block
	// cooperate on it, starting at threadIdx.x and advancing by blockDim.x.
	// Blocks whose slice number is >= totalSlices do nothing.
	template <typename IndexType, int Dim>
	__global__ void
	fillSliceWithIndex(TensorInfo<long, IndexType> out,
	                   IndexType totalSlices,
	                   IndexType sliceSize,
	                   IndexType sliceStride) {
	  const IndexType sliceId = getLinearBlockId<IndexType>();

	  if (sliceId < totalSlices) {
	    // Locate the first element of this slice inside `out`.
	    const unsigned long sliceOffset =
	      IndexToOffset<long, IndexType, Dim>::get(sliceId, out);
	    long* sliceStart = &out.data[sliceOffset];

	    long pos = threadIdx.x;
	    while (pos < sliceSize) {
	      // Stored value is the position within the slice, shifted by
	      // TH_INDEX_BASE.
	      sliceStart[pos * sliceStride] = pos + TH_INDEX_BASE;
	      pos += blockDim.x;
	    }
	  }
	}

	// For slice sorting in Thrust; extracts a slice index from a linear
	// index and uses that for comparison
	// Orders two linear indices by the slice they fall in: an index belongs to
	// slice (index / sliceSize), and the comparison is a strict "less than" on
	// that slice number. Indices within the same slice compare as equivalent.
	struct SliceComp {
	  const long sliceSize;

	  SliceComp(long size) : sliceSize(size) {}

	  __device__ bool operator()(const long& lhs, const long& rhs) const {
	    // Slices are innermost, so integer division by the slice length gives
	    // the slice number directly.
	    return (lhs / sliceSize) < (rhs / sliceSize);
	  }
	};

	// For sorting in Thrust; extracts a within-slice index from a linear index
	// In-place transform: replaces a linear index with its position inside its
	// slice (index modulo sliceSize), shifted by TH_INDEX_BASE.
	struct GlobalIndexToPerSliceIndex {
	  const long sliceSize;

	  GlobalIndexToPerSliceIndex(long size) : sliceSize(size) {}

	  __device__ inline void operator()(long& v) const {
	    const long withinSlice = v % sliceSize;
	    v = withinSlice + TH_INDEX_BASE;
	  }
	};

	// Host-side helpers; only the declarations are visible in this header.

	// NOTE(review): going by the name, this presumably returns the smallest
	// power of two that is >= n -- confirm against the definition.
	unsigned long nextHighestPowerOf2(unsigned long n);
	// NOTE(review): presumably fills every slice of `t` along dimension `dim`
	// with its within-slice indices (likely by launching fillSliceWithIndex) --
	// confirm against the definition.
	void THCudaLongTensor_fillSliceWithIndex(THCState* state,
	THCudaLongTensor* t,
	int dim);
	#endif // THC_TENSORSORT_CUH