blob: 12fbdd132a4034909165937f2bbfb2a2ebd24f62 [file] [log] [blame]
#ifndef THCUNN_COMMON_H
#define THCUNN_COMMON_H
// CUDA: grid stride looping
// Declares the loop index `i` and walks it over [0, n) starting at this
// thread's flat global index, advancing by the total number of launched
// threads (blockDim.x * gridDim.x) each iteration, so any grid size covers
// all n elements. NOTE(review): index and stride math is done in signed int,
// so n must stay below INT_MAX to avoid overflow — confirm callers' ranges.
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
// Asserts that all tensor arguments reside on the same GPU by forwarding
// them to THCudaTensor_checkGPU; on failure, THAssertMsg reports the
// message below.
#define THCUNN_assertSameGPU(...) THAssertMsg(THCudaTensor_checkGPU(__VA_ARGS__), \
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// _generic can be removed once everything is genericized
// Generic-tensor variant of THCUNN_assertSameGPU: identical check and
// message, but dispatches through the type-generic THCTensor_(checkGPU)
// macro instead of the concrete THCudaTensor_checkGPU.
#define THCUNN_assertSameGPU_generic(...) THAssertMsg(THCTensor_(checkGPU)(__VA_ARGS__), \
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
const int CUDA_NUM_THREADS = 1024;

// CUDA: number of blocks for threads.
// Returns ceil(N / CUDA_NUM_THREADS), i.e. the minimum grid size needed to
// give every one of N elements a thread; returns 0 when N == 0.
// Computed as quotient-plus-remainder-flag instead of the usual
// (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS, because that intermediate
// sum signed-overflows (undefined behavior) once N > INT_MAX - 1023.
inline int GET_BLOCKS(const int N)
{
// Overflow-safe ceiling division: add one block iff there is a remainder.
return N / CUDA_NUM_THREADS + (N % CUDA_NUM_THREADS != 0 ? 1 : 0);
}
// Resizes the CUDA long tensor I1 to the shape of tensor I2 (no-op when the
// sizes already match); the temporary THLongStorage holding I2's size is
// always freed. Wrapped in do { ... } while (0) so the macro expands to a
// single statement — safe after an unbraced `if` (previously only the first
// declaration was guarded) — and so the `size2` temporary neither leaks into
// the caller's scope nor collides when the macro is used twice in one block.
#define THCUNN_resizeAs_indices(STATE, I1, I2) \
  do { \
    THLongStorage *size2 = THCTensor_(newSizeOf)(STATE, I2); \
    if (!THCudaLongTensor_isSize(STATE, I1, size2)) \
    { \
      THCudaLongTensor_resize(STATE, I1, size2, NULL); \
    } \
    THLongStorage_free(size2); \
  } while (0)
#endif