/*************************************************************************
 * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ************************************************************************/


#ifndef COMMON_KERNEL_H_
#define COMMON_KERNEL_H_

#include <cstdio>
#include <cstdint>

#include <cuda_runtime.h>

// BAR macro and helpers
#define WARP_SIZE 32
#define ROUNDUP(x, y) \
    (((((x) + (y) - 1) / (y))) * (y))
#define BAR_EXEC(type, barid, nthreads) \
    asm("bar." #type " " #barid ", " #nthreads ";\n\t")
#define BAR_EXPAND(type, barid, nthreads) \
    BAR_EXEC(type, barid, (nthreads))

// Named barrier macro.
// Expands to asm("bar.type barid, nthreads") where
// nthreads has been rounded up to a multiple of WARP_SIZE.
#define BAR(type, barid, nthreads) \
    BAR_EXPAND(type, barid, ROUNDUP(nthreads, WARP_SIZE))
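
// For illustration (hypothetical values): BAR(sync, 1, 48) synchronizes at
// named barrier 1 with a thread count of 64, since bar.sync requires the
// count to be a multiple of the warp size.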

__device__ unsigned int spinct;

// Spin wait until func evaluates to true
template<typename FUNC>
__device__ inline void Wait(const FUNC& func) {
  while (!func()) {
    // waste time
    atomicInc(&spinct, 10);
  }
}
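
// Minimal usage sketch (FlagAtLeast, flag and step are hypothetical, not part
// of this header): spin until a peer has advanced a flag to at least step.
//
//   struct FlagAtLeast {
//     volatile int* flag;
//     int val;
//     __device__ bool operator()() const { return *flag >= val; }
//   };
//   ...
//   Wait(FlagAtLeast{flag, step});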

typedef uint64_t PackType;

// Unpack x and y into elements of type T, apply FUNC to each pair of
// elements, and repack the results.
template<class FUNC, typename T>
struct MULTI {
  __device__ PackType operator()(const PackType x, const PackType y) const;
};
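
// For illustration (FuncSum is a hypothetical element-wise functor, not
// defined in this header): with T == float a PackType holds two floats, so
//   PackType r = MULTI<FuncSum, float>()(x, y);
// adds the two floats packed in x to the two packed in y and returns the
// sums repacked into a single 64-bit value.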

template<class FUNC>
struct MULTI<FUNC, char> {
  static_assert(sizeof(PackType) == 2 * sizeof(uint32_t),
      "PackType must be twice the size of uint32_t.");
  union converter {
    PackType storage;
    struct {
      uint32_t a, b;
    };
  };

  __device__ PackType operator()(const PackType x, const PackType y) const {
    converter cx, cy, cr;
    cx.storage = x;
    cy.storage = y;

    // for char, FUNC is expected to work on four packed chars per uint32_t
    // word (i.e. as vector ops), so pass whole words through
    cr.a = FUNC()(cx.a, cy.a);
    cr.b = FUNC()(cx.b, cy.b);

    return cr.storage;
  }
};

template<class FUNC>
struct MULTI<FUNC, int> {
  static_assert(sizeof(PackType) == 2 * sizeof(int),
      "PackType must be twice the size of int.");
  union converter {
    PackType storage;
    struct {
      int a, b;
    };
  };

  __device__ PackType operator()(const PackType x, const PackType y) const {
    converter cx, cy, cr;
    cx.storage = x;
    cy.storage = y;

    cr.a = FUNC()(cx.a, cy.a);
    cr.b = FUNC()(cx.b, cy.b);

    return cr.storage;
  }
};

#ifdef CUDA_HAS_HALF
template<class FUNC>
struct MULTI<FUNC, half> {
  static_assert(sizeof(PackType) == 2 * sizeof(half2),
      "PackType must be twice the size of half2.");
  union converter {
    PackType storage;
    struct {
      half2 a, b;
    };
  };

  __device__ PackType operator()(const PackType x, const PackType y) const {
    converter cx, cy, cr;
    cx.storage = x;
    cy.storage = y;

    cr.a = FUNC()(cx.a, cy.a);
    cr.b = FUNC()(cx.b, cy.b);

    return cr.storage;
  }
};
#endif

template<class FUNC>
struct MULTI<FUNC, float> {
  static_assert(sizeof(PackType) == 2 * sizeof(float),
      "PackType must be twice the size of float.");
  union converter {
    PackType storage;
    struct {
      float a, b;
    };
  };

  __device__ PackType operator()(const PackType x, const PackType y) const {
    converter cx, cy, cr;
    cx.storage = x;
    cy.storage = y;

    cr.a = FUNC()(cx.a, cy.a);
    cr.b = FUNC()(cx.b, cy.b);

    return cr.storage;
  }
};

template<class FUNC>
struct MULTI<FUNC, double> {
  static_assert(sizeof(PackType) == sizeof(double),
      "PackType must be the same size as double.");
  __device__ PackType operator()(const PackType x, const PackType y) const {
    double rv = FUNC()(__longlong_as_double(x), __longlong_as_double(y));
    return __double_as_longlong(rv);
  }
};

template<class FUNC>
struct MULTI<FUNC, unsigned long long> {
  static_assert(sizeof(PackType) == sizeof(unsigned long long),
      "PackType must be the same size as unsigned long long.");
  __device__ PackType operator()(const PackType x, const PackType y) const {
    unsigned long long rv = FUNC()(x, y);
    return rv;
  }
};

template<class FUNC>
struct MULTI<FUNC, long long> {
  static_assert(sizeof(PackType) == sizeof(long long),
      "PackType must be the same size as long long.");
  __device__ PackType operator()(const PackType x, const PackType y) const {
    long long rv = FUNC()((long long)x, (long long)y);
    return rv;
  }
};

template<typename T, bool FETCHTWO>
__device__ inline void FetchOneOrTwo64b(PackType& s0,
    const volatile T * __restrict__ const src0, PackType& s1,
    const volatile T * __restrict__ const src1, const int idx) {
  s0 = (reinterpret_cast<const volatile PackType *>(src0))[idx];
  if (FETCHTWO) {
    s1 = (reinterpret_cast<const volatile PackType *>(src1))[idx];
  }
}

template<typename T, bool STORETWO>
__device__ inline void StoreOneOrTwo64b(volatile T * __restrict__ const dest0,
    volatile T * __restrict__ const dest1, PackType val, const int idx) {
  (reinterpret_cast<volatile PackType *>(dest0))[idx] = val;
  if (STORETWO) {
    (reinterpret_cast<volatile PackType *>(dest1))[idx] = val;
  }
}

template<class FUNC, typename T, bool ISREDUCE>
__device__ inline PackType ReduceOrCopy64b(const PackType s0,
    const PackType s1) {
  if (ISREDUCE) {
    return MULTI<FUNC, T>()(s0, s1);
  } else {
    return s0;
  }
}
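
// FETCHTWO, STORETWO and ISREDUCE are compile-time constants, so the untaken
// branches above compile away. The three helpers are meant to be chained per
// 64-bit element, roughly like this (sketch only):
//
//   PackType a, b;
//   FetchOneOrTwo64b<T, HAS_SRC1>(a, src0, b, src1, idx);
//   PackType r = ReduceOrCopy64b<FUNC, T, HAS_SRC1>(a, b);
//   StoreOneOrTwo64b<T, HAS_DEST1>(dest0, dest1, r, idx);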

#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a))
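// e.g. ALIGNUP(13, 8) == 16 and ALIGNUP(16, 8) == 16; a must be a power of two.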

template<typename T>
__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) {
  size_t ptrval = reinterpret_cast<size_t>(ptr);
  return reinterpret_cast<volatile T*>(ALIGNUP(ptrval, align));
}

template<typename T> inline __device__
T vFetch(const volatile T* ptr) {
  return *ptr;
}

#ifdef CUDA_HAS_HALF
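// In the CUDA versions this header targets, half is a struct wrapping a raw
// unsigned short with no volatile copy operations, so the specializations
// below load and store it through the .x field explicitly.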
template<> inline __device__
half vFetch<half>(const volatile half* ptr) {
  half r;
  r.x = ptr->x;
  return r;
}
#endif

template<typename T> inline __device__
void vStore(volatile T* ptr, const T val) {
  *ptr = val;
}

#ifdef CUDA_HAS_HALF
template<> inline __device__
void vStore<half>(volatile half* ptr, const half val) {
  ptr->x = val.x;
}
#endif

// Assumptions:
// - there is exactly 1 block
// - THREADS is the number of producer threads
// - this function is called by all producer threads
template<int UNROLL, int THREADS, class FUNC, typename T, bool HAS_DEST1,
    bool HAS_SRC1>
__device__ inline void ReduceOrCopy(const int tid,
    volatile T * __restrict__ dest0, volatile T * __restrict__ dest1,
    const volatile T * __restrict__ src0, const volatile T * __restrict__ src1,
    int N) {
  if (N == 0) {
    return;
  }

  const int UNROLL2 = (UNROLL >= 2) ? (UNROLL / 2) : 1;
  const bool NOUNROLL2 = ((UNROLL / 2) == 0);

  int Npreamble = (N < alignof(PackType)) ? N :
      AlignUp(dest0, alignof(PackType)) - dest0;

  // stage 0: check if we'll be able to use the fast, 64-bit aligned path.
  // If not, we'll just use the slow preamble path for the whole operation
  bool alignable = (((AlignUp(src0, alignof(PackType)) == src0 + Npreamble)) &&
      (!HAS_DEST1 || (AlignUp(dest1, alignof(PackType)) == dest1 + Npreamble)) &&
      (!HAS_SRC1 || (AlignUp(src1, alignof(PackType)) == src1 + Npreamble)));

  if (!alignable) {
    Npreamble = N;
  }
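
  // Worked example (hypothetical addresses; float elements, 8-byte PackType):
  // if dest0, src0 and src1 all sit at byte addresses equal to 4 mod 8, the
  // preamble below handles one element per buffer and the fast path starts
  // aligned; if their misalignments differ, no common alignment point exists
  // and the whole operation takes the element-wise path (Npreamble == N).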

  /*
  if (threadIdx.x == 0) {
    printf("** alignable: %s", (alignable ? "YES" : " NO"));
    printf(", dest0 = 0x%08X", dest0);
    printf(", src0 = 0x%08X", src0);
    if (HAS_DEST1) printf(", dest1 = 0x%08X", dest1);
    if (HAS_SRC1) printf(", src1 = 0x%08X", src1);
    printf("\n");
  }
  */

  // stage 1: preamble: handle any elements up to the point where all the
  // pointers become PackType-aligned
  for (int idx = tid; idx < Npreamble; idx += THREADS) {
    // each thread should need at most one pass through this loop, except
    // when alignable is false (in which case the preamble covers all of N)
    T val = vFetch(src0+idx);
    if (HAS_SRC1) {
      val = FUNC()(val, vFetch(src1+idx));
    }

    vStore(dest0+idx, val);
    if (HAS_DEST1) {
      vStore(dest1+idx, val);
    }
  }

  // reduce N by however many elements we've handled already
  int Ndone = Npreamble;
  int Nrem = N - Ndone;

  // stage 2: fast path: use 64b loads/stores to do the bulk of the work,
  // assuming the pointers we have are all 64-bit alignable.
  if (alignable) {
    if (Ndone > 0) {
      // align up pointers
      dest0 += Ndone; if (HAS_DEST1) { dest1 += Ndone; }
      src0  += Ndone; if (HAS_SRC1)  { src1  += Ndone; }
    }

    // stage 2a: main loop
    int Nalign = (Nrem / (sizeof(PackType) / sizeof(T)) / (UNROLL * THREADS))
        * (UNROLL * THREADS); // round down
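
    // e.g. (hypothetical sizes) with T == float (2 elements per PackType),
    // UNROLL == 4 and THREADS == 256: Nrem == 5000 elements is 2500 packs,
    // so Nalign == 2048 packs (4096 elements) and the rest go to stage 2b.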

    // when UNROLL >= 2 the loads and stores below are interleaved in two
    // half-UNROLL batches, so that the second batch of loads is issued
    // before the first batch of stores (simple software pipelining)
#pragma unroll 1 // don't unroll this loop
    for (int idx = tid; idx < Nalign; idx += UNROLL * THREADS) {
      PackType t0[UNROLL2];
      PackType t1[UNROLL2];
      PackType t2[UNROLL2];

#pragma unroll
      for (int j = 0; j < UNROLL2; ++j)
        FetchOneOrTwo64b<T, HAS_SRC1>(t0[j], src0, t1[j], src1,
            idx + j * THREADS);

#pragma unroll
      for (int j = 0; j < UNROLL2; ++j)
        t2[j] = ReduceOrCopy64b<FUNC, T, HAS_SRC1>(t0[j], t1[j]);

      if (!NOUNROLL2) {
#pragma unroll
        for (int j = 0; j < UNROLL2; ++j)
          FetchOneOrTwo64b<T, HAS_SRC1>(t0[j], src0, t1[j], src1,
              idx + (UNROLL2 + j) * THREADS);
      }

#pragma unroll
      for (int j = 0; j < UNROLL2; ++j)
        StoreOneOrTwo64b<T, HAS_DEST1>(dest0, dest1, t2[j], idx + j * THREADS);

      if (!NOUNROLL2) {
#pragma unroll
        for (int j = 0; j < UNROLL2; ++j)
          t2[j] = ReduceOrCopy64b<FUNC, T, HAS_SRC1>(t0[j], t1[j]);

#pragma unroll
        for (int j = 0; j < UNROLL2; ++j)
          StoreOneOrTwo64b<T, HAS_DEST1>(dest0, dest1, t2[j],
              idx + (UNROLL2 + j) * THREADS);
      }
    }

    // stage 2b: slightly less optimized section for when we don't have full
    // UNROLLs
    int Ndone2a = Nalign * (sizeof(PackType)/sizeof(T));
    Ndone += Ndone2a;
    Nrem = N - Ndone;

    // TODO: This kind of pointer update arithmetic is expensive. Should
    // probably find a better way.
    if (Nrem > 0) {
      dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; }
      src0  += Ndone2a; if (HAS_SRC1)  { src1  += Ndone2a; }
    }

    Nalign = Nrem / (sizeof(PackType)/sizeof(T));

#pragma unroll 4
    for (int idx = tid; idx < Nalign; idx += THREADS) {
      PackType t0, t1, t2;

      FetchOneOrTwo64b<T, HAS_SRC1>(t0, src0, t1, src1, idx);
      t2 = ReduceOrCopy64b<FUNC, T, HAS_SRC1>(t0, t1);
      StoreOneOrTwo64b<T, HAS_DEST1>(dest0, dest1, t2, idx);
    }

    // stage 2c: tail
    int Ndone2b = Nalign * (sizeof(PackType)/sizeof(T));
    Ndone += Ndone2b;
    Nrem = N - Ndone;

    if (Nrem > 0) {
      dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; }
      src0  += Ndone2b; if (HAS_SRC1)  { src1  += Ndone2b; }
    }

    for (int idx = tid; idx < Nrem; idx += THREADS) {
      // no thread should ever need more than one pass through this loop, and
      // only a few threads participate at all
      T val = vFetch(src0+idx);
      if (HAS_SRC1) {
        val = FUNC()(val, vFetch(src1+idx));
      }

      vStore(dest0+idx, val);
      if (HAS_DEST1) {
        vStore(dest1+idx, val);
      }
    }
  } // done fast path
}
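
// Invocation sketch (FuncSum, dst, srcA, srcB and count are hypothetical):
// one block of 256 producer threads reducing srcA and srcB into dst only:
//
//   ReduceOrCopy<4, 256, FuncSum, float, false, true>(
//       threadIdx.x, dst, nullptr, srcA, srcB, count);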

template<int THREADS, int UNROLL, typename T>
__device__ inline void CalcLastChunk(int * const bigSliceN,
    int * const smallSliceN, int * const lastSliceN, int * const numSlices,
    int * const numBigSlices, int * const numSmallSlices, const int N,
    const int numChunks, const int chunkSize) {
  int Nleft = N - ((numChunks - 1) * chunkSize);
  // split the remaining work into *numSlices roughly equal slices; the split
  // is only roughly equal because we want slice boundaries to land as neatly
  // as possible on alignable (tile) boundaries
  int NperTile = UNROLL * THREADS * (sizeof(PackType)/sizeof(T));
  int numTiles = (Nleft + NperTile - 1) / NperTile;
  int numTilesPerBigSlice = (numTiles + *numSlices - 1)
      / *numSlices;
  int numTilesPerSmallSlice = numTiles / *numSlices;

  *bigSliceN = NperTile * numTilesPerBigSlice;
  *smallSliceN = NperTile * numTilesPerSmallSlice;
  *numBigSlices = numTiles % *numSlices;
  *numSmallSlices = (*smallSliceN > 0) ?
      *numSlices - *numBigSlices : 0;

  // the lastSlice will take the place of one of the small slices unless
  // there are no small slices (because this is a very small reduction), in
  // which case we replace one of the big slices and leave the small slices
  // as 0.
  if (*numSmallSlices > 0) {
    --*numSmallSlices;
    if (*numSmallSlices == 0)
      *smallSliceN = 0;
  }
  else {
    --*numBigSlices;
    if (*numBigSlices == 0)
      *bigSliceN = 0;
  }

  *lastSliceN = Nleft -
      (*numBigSlices * *bigSliceN
       + *numSmallSlices * *smallSliceN);

  // in cases where the leftover for the last slice is pretty small, we'd
  // rather have one slightly bigger last slice than one big slice, a bunch
  // of small slices, and one even smaller last slice
  if ((*numBigSlices == 1) &&
      (*numSmallSlices == *numSlices - 2) &&
      (*lastSliceN < *smallSliceN)) {
    *numBigSlices += *numSmallSlices;
    *numSmallSlices = 0;
    *bigSliceN = *smallSliceN;
    *smallSliceN = 0;
    *lastSliceN = Nleft -
        *numBigSlices * *bigSliceN;
  }

  // done recalculating
  *numSlices = *numBigSlices +
      *numSmallSlices + 1;
}
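
// Worked example (hypothetical sizes): THREADS == 256, UNROLL == 8 and
// T == float give NperTile == 4096. With Nleft == 10000 and *numSlices == 4
// on entry, numTiles == 3, so the function yields 2 big slices of 4096
// elements, no small slices, a last slice of 1808 elements, and sets
// *numSlices to 3.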

#endif // COMMON_KERNEL_H_