// common.h - platform/external/pytorch - Git at Google

 #ifndef THCUNN_COMMON_H
 #define THCUNN_COMMON_H

 // CUDA: grid stride looping
 //
 // Expands to a `for` header that declares `int i`, starts it at
 // blockIdx.x * blockDim.x + threadIdx.x and advances it by
 // blockDim.x * gridDim.x for as long as `i < (n)` holds. Follow the macro
 // with a single statement or a braced body.
 //
 // The step is evaluated in 64 bits. A plain `i += blockDim.x * gridDim.x`
 // can step past INT_MAX when n is large; the wrapped (negative) index would
 // still satisfy `i < (n)` and the body would run with an invalid index.
 // When the next index would not be below n, i is set to n so the loop ends.
 //
 // Notes:
 //   - n is evaluated more than once per iteration, so it should be free of
 //     side effects.
 //   - n must be representable as an int, because i is an int.
 #define CUDA_KERNEL_LOOP(i, n)                                          \
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);          \
        i = (static_cast<long long>(i) +                                 \
                 static_cast<long long>(blockDim.x) * gridDim.x <        \
             static_cast<long long>(n))                                  \
                ? static_cast<int>(i + blockDim.x * gridDim.x)           \
                : static_cast<int>(n))

 // Use 1024 threads per block, which requires cuda sm_2x or above
 const int CUDA_NUM_THREADS = 1024;

 // CUDA: number of blocks for threads.
 //
 // Returns the number of CUDA_NUM_THREADS-sized blocks needed to cover N
 // elements, i.e. N / CUDA_NUM_THREADS rounded up.
 //
 //   N        number of elements to cover.
 //   returns  0 when N <= 0, otherwise ceil(N / CUDA_NUM_THREADS).
 inline int GET_BLOCKS(const int N)
 {
   // Nothing to cover; also keeps the (N - 1) form below away from
   // negative operands.
   if (N <= 0)
   {
     return 0;
   }
   // Round-up division written as (N - 1) / T + 1 rather than
   // (N + T - 1) / T: the latter overflows int once N > INT_MAX - (T - 1).
   return (N - 1) / CUDA_NUM_THREADS + 1;
 }

 #endif
	#ifndef THCUNN_COMMON_H
	#define THCUNN_COMMON_H

	// CUDA: grid stride looping
	//
	// Expands to a `for` header that declares `int i`, starts it at
	// blockIdx.x * blockDim.x + threadIdx.x and advances it by
	// blockDim.x * gridDim.x for as long as `i < (n)` holds. Follow the macro
	// with a single statement or a braced body.
	//
	// The step is evaluated in 64 bits. A plain `i += blockDim.x * gridDim.x`
	// can step past INT_MAX when n is large; the wrapped (negative) index would
	// still satisfy `i < (n)` and the body would run with an invalid index.
	// When the next index would not be below n, i is set to n so the loop ends.
	//
	// Notes:
	//   - n is evaluated more than once per iteration, so it should be free of
	//     side effects.
	//   - n must be representable as an int, because i is an int.
	#define CUDA_KERNEL_LOOP(i, n)                                          \
	  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);          \
	       i = (static_cast<long long>(i) +                                 \
	                static_cast<long long>(blockDim.x) * gridDim.x <        \
	            static_cast<long long>(n))                                  \
	               ? static_cast<int>(i + blockDim.x * gridDim.x)           \
	               : static_cast<int>(n))

	// Use 1024 threads per block, which requires cuda sm_2x or above
	const int CUDA_NUM_THREADS = 1024;

	// CUDA: number of blocks for threads.
	//
	// Returns the number of CUDA_NUM_THREADS-sized blocks needed to cover N
	// elements, i.e. N / CUDA_NUM_THREADS rounded up.
	//
	//   N        number of elements to cover.
	//   returns  0 when N <= 0, otherwise ceil(N / CUDA_NUM_THREADS).
	inline int GET_BLOCKS(const int N)
	{
	  // Nothing to cover; also keeps the (N - 1) form below away from
	  // negative operands.
	  if (N <= 0)
	  {
	    return 0;
	  }
	  // Round-up division written as (N - 1) / T + 1 rather than
	  // (N + T - 1) / T: the latter overflows int once N > INT_MAX - (T - 1).
	  return (N - 1) / CUDA_NUM_THREADS + 1;
	}

	#endif