blob: 12fbdd132a4034909165937f2bbfb2a2ebd24f62 [file] [log] [blame]
#ifndef THCUNN_COMMON_H
#define THCUNN_COMMON_H
// CUDA: grid stride looping
// Declares the loop index `i` and walks it over [0, n) starting at this
// thread's flat global index, advancing by the total number of launched
// threads (blockDim.x * gridDim.x) each iteration, so any grid size covers
// all n elements. NOTE(review): index and stride math is done in signed int,
// so n must stay below INT_MAX to avoid overflow — confirm callers' ranges.
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
// Asserts that all tensor arguments reside on the same GPU by forwarding
// them to THCudaTensor_checkGPU; on failure, THAssertMsg reports the
// message below.
#define THCUNN_assertSameGPU(...) THAssertMsg(THCudaTensor_checkGPU(__VA_ARGS__), \
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// _generic can be removed once everything is genericized
// Generic-tensor variant of THCUNN_assertSameGPU: identical check and
// message, but dispatches through the type-generic THCTensor_(checkGPU)
// macro instead of the concrete THCudaTensor_checkGPU.
#define THCUNN_assertSameGPU_generic(...) THAssertMsg(THCTensor_(checkGPU)(__VA_ARGS__), \
"Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
// Use 1024 threads per block, which requires cuda sm_2x or above
const int CUDA_NUM_THREADS = 1024;

// CUDA: number of blocks for threads.
// Returns ceil(N / CUDA_NUM_THREADS), i.e. the minimum grid size needed to
// give every one of N elements a thread; returns 0 when N == 0.
// Computed as quotient-plus-remainder-flag instead of the usual
// (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS, because that intermediate
// sum signed-overflows (undefined behavior) once N > INT_MAX - 1023.
inline int GET_BLOCKS(const int N)
{
// Overflow-safe ceiling division: add one block iff there is a remainder.
return N / CUDA_NUM_THREADS + (N % CUDA_NUM_THREADS != 0 ? 1 : 0);
}
// Resizes the CUDA long tensor I1 to the shape of tensor I2 (no-op when the
// sizes already match); the temporary THLongStorage holding I2's size is
// always freed. Wrapped in do { ... } while (0) so the macro expands to a
// single statement — safe after an unbraced `if` (previously only the first
// declaration was guarded) — and so the `size2` temporary neither leaks into
// the caller's scope nor collides when the macro is used twice in one block.
#define THCUNN_resizeAs_indices(STATE, I1, I2) \
  do { \
    THLongStorage *size2 = THCTensor_(newSizeOf)(STATE, I2); \
    if (!THCudaLongTensor_isSize(STATE, I1, size2)) \
    { \
      THCudaLongTensor_resize(STATE, I1, size2, NULL); \
    } \
    THLongStorage_free(size2); \
  } while (0)
#endif