#ifndef CAFFE2_UTILS_GPU_BITONIC_SORT_H_
#define CAFFE2_UTILS_GPU_BITONIC_SORT_H_

#include "caffe2/utils/math.h"
#include "caffe2/utils/GpuDefs.cuh"

namespace caffe2 {

// Returns true if the given integer type is a power-of-2 (positive only).
// Note(jiayq): the Windows build reported an error per
//     https://github.com/caffe2/caffe2/issues/997
// so on MSVC this is defined as a macro instead of a constexpr template.
#ifdef _MSC_VER
#define integerIsPowerOf2(v) ((v) && !((v) & ((v) - 1)))
#else // _MSC_VER
template <typename T>
constexpr bool integerIsPowerOf2(T v) {
  return (v && !(v & (v - 1)));
}
#endif // _MSC_VER
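
// For example, integerIsPowerOf2(64) is true and integerIsPowerOf2(96) is
// false; the static_asserts below rely on it being usable in a constant
// expression.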

/// The maximum in-block bitonic sort size we support
constexpr int kMaxBitonicSortSize = 4096;

template <typename T>
__device__ inline void swapVars(T& t1, T& t2) {
  T tmp = t1;
  t1 = t2;
  t2 = tmp;
}

// Swaps (kA, vA) and (kB, vB) when comp(kA, vA, kB, vB) == dir.
// The comparator must be device-callable as comp(kA, vA, kB, vB) and
// return bool.
template <typename Comparator, typename K, typename V>
__device__ inline void bitonicSwap(K& kA, V& vA,
                                   K& kB, V& vB,
                                   bool dir,
                                   const Comparator& comp) {
  bool swap = comp(kA, vA, kB, vB);
  if (swap == dir) {
    swapVars(kA, kB);
    swapVars(vA, vB);
  }
}

template <typename Comparator, typename K, typename V,
          int Power2SortSize,
          int ThreadsPerBlock>
__device__ inline void bitonicSort(K* keys,
                                   V* values,
                                   const Comparator& comp) {
  static_assert(Power2SortSize <= kMaxBitonicSortSize,
                "sort size <= 4096 only supported");
  // Assume the sort is taking place in shared memory
  // static_assert(Power2SortSize * (sizeof(K) + sizeof(V)) < 32768,
  //               "sort data too large (>32768 bytes)");
  static_assert(integerIsPowerOf2(Power2SortSize),
                "sort size must be power of 2");
  static_assert(integerIsPowerOf2(ThreadsPerBlock),
                "threads in block must be power of 2");

  // If what we are sorting is too small, then not all threads
  // participate
  constexpr int numThreadsForSort = Power2SortSize / 2;
  constexpr bool allThreads = numThreadsForSort >= ThreadsPerBlock;

  // If what we are sorting is too large, then threads must loop more
  // than once
  constexpr int loopPerThread =
      allThreads ? numThreadsForSort / ThreadsPerBlock : 1;

#pragma unroll
  for (int size = 2; size < Power2SortSize; size *= 2) {

#pragma unroll
    for (int stride = size / 2; stride > 0; stride /= 2) {

#pragma unroll
      for (int loop = 0; loop < loopPerThread; ++loop) {
        int threadId = loop * ThreadsPerBlock + threadIdx.x;
        bool flag = ((threadId & (size / 2)) != 0);

        int pos = 2 * threadId - (threadId & (stride - 1));

        if (allThreads || (threadId < numThreadsForSort)) {
          bitonicSwap<Comparator, K, V>(
            keys[pos], values[pos],
            keys[pos + stride], values[pos + stride],
            flag, comp);
        }

        __syncthreads();
      }
    }
  }

#pragma unroll
  for (int stride = Power2SortSize / 2; stride > 0; stride /= 2) {

#pragma unroll
    for (int loop = 0; loop < loopPerThread; ++loop) {
      int threadId = loop * ThreadsPerBlock + threadIdx.x;

      int pos = 2 * threadId - (threadId & (stride - 1));

      if (allThreads || (threadId < numThreadsForSort)) {
        bitonicSwap<Comparator, K, V>(
          keys[pos], values[pos],
          keys[pos + stride], values[pos + stride],
          false, comp);
      }

      __syncthreads();
    }
  }
}

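// Example usage (a sketch, not part of this header): one block sorting 128
// (key, value) pairs staged in shared memory. The comparator `LessKV`, the
// kernel `exampleSortKernel`, and the launch shape are illustrative
// assumptions only; with a less-than comparator the final merge pass above
// should leave the keys in ascending order.
//
//   struct LessKV {
//     __device__ bool operator()(const int& kA, const float& /*vA*/,
//                                const int& kB, const float& /*vB*/) const {
//       return kA < kB;
//     }
//   };
//
//   template <int SortSize, int ThreadsPerBlock>
//   __global__ void exampleSortKernel(const int* keysIn,
//                                     const float* valuesIn,
//                                     int* keysOut,
//                                     float* valuesOut) {
//     __shared__ int smemKeys[SortSize];
//     __shared__ float smemValues[SortSize];
//
//     // Stage the input into shared memory.
//     for (int i = threadIdx.x; i < SortSize; i += ThreadsPerBlock) {
//       smemKeys[i] = keysIn[i];
//       smemValues[i] = valuesIn[i];
//     }
//     __syncthreads();
//
//     bitonicSort<LessKV, int, float, SortSize, ThreadsPerBlock>(
//       smemKeys, smemValues, LessKV());
//
//     // bitonicSort's final pass ends with __syncthreads(), so the sorted
//     // data can be written out immediately.
//     for (int i = threadIdx.x; i < SortSize; i += ThreadsPerBlock) {
//       keysOut[i] = smemKeys[i];
//       valuesOut[i] = smemValues[i];
//     }
//   }
//
//   // Launched, e.g., as exampleSortKernel<128, 64><<<1, 64>>>(...).
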
template <typename Comparator, typename K, typename V, int Power2SortSize>
__device__ inline void warpBitonicSort(K* keys,
                                       V* values,
                                       const Comparator& comp) {
  // Smaller sorts should use a warp shuffle sort
  static_assert(Power2SortSize > kWarpSize,
                "sort not large enough");
  static_assert(integerIsPowerOf2(Power2SortSize),
                "sort size must be power of 2");
  static_assert(Power2SortSize <= kMaxBitonicSortSize,
                "sort size <= 4096 only supported");

  // If what we are sorting is too large, then lanes must loop more
  // than once
  constexpr int loopPerThread = (Power2SortSize / 2) / kWarpSize;
  int laneId = getLaneId();

#pragma unroll
  for (int size = 2; size < Power2SortSize; size *= 2) {

#pragma unroll
    for (int stride = size / 2; stride > 0; stride /= 2) {

#pragma unroll
      for (int loop = 0; loop < loopPerThread; ++loop) {
        int threadId = loop * kWarpSize + laneId;
        bool flag = ((threadId & (size / 2)) != 0);

        int pos = 2 * threadId - (threadId & (stride - 1));

        bitonicSwap<Comparator, K, V>(
          keys[pos], values[pos],
          keys[pos + stride], values[pos + stride],
          flag, comp);

        __threadfence_block();
      }
    }
  }

#pragma unroll
  for (int stride = Power2SortSize / 2; stride > 0; stride /= 2) {

#pragma unroll
    for (int loop = 0; loop < loopPerThread; ++loop) {
      int threadId = loop * kWarpSize + laneId;

      int pos = 2 * threadId - (threadId & (stride - 1));

      bitonicSwap<Comparator, K, V>(
        keys[pos], values[pos],
        keys[pos + stride], values[pos + stride],
        false, comp);

      __threadfence_block();
    }
  }
}

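// Example usage (a sketch under assumptions, not part of this header): the
// lanes of a single warp sorting 64 (key, value) pairs in shared memory.
// `LessKV` is the same illustrative comparator shown above; the kernel name
// is hypothetical. Unlike bitonicSort, this variant indexes by getLaneId()
// and orders memory with __threadfence_block() rather than __syncthreads(),
// so the data should be touched only by the lanes of one warp; the sketch
// follows the header's warp-synchronous convention.
//
//   __global__ void exampleWarpSortKernel(const int* keysIn,
//                                         const float* valuesIn,
//                                         int* keysOut,
//                                         float* valuesOut) {
//     constexpr int kSortSize = 64;
//     __shared__ int smemKeys[kSortSize];
//     __shared__ float smemValues[kSortSize];
//
//     // A single warp stages, sorts, and writes back the data.
//     if (threadIdx.x < kWarpSize) {
//       for (int i = threadIdx.x; i < kSortSize; i += kWarpSize) {
//         smemKeys[i] = keysIn[i];
//         smemValues[i] = valuesIn[i];
//       }
//
//       warpBitonicSort<LessKV, int, float, kSortSize>(
//         smemKeys, smemValues, LessKV());
//
//       for (int i = threadIdx.x; i < kSortSize; i += kWarpSize) {
//         keysOut[i] = smemKeys[i];
//         valuesOut[i] = smemValues[i];
//       }
//     }
//   }
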
}  // namespace caffe2

#endif  // CAFFE2_UTILS_GPU_BITONIC_SORT_H_