/* blob: a88bd7dbe96708d6a31ff11712e50e83e5663136 [file] [log] [blame] */
#ifndef THC_GENERAL_INC
#define THC_GENERAL_INC
#include "THGeneral.h"
#include "THAllocator.h"
#include "THCThreadLocal.h"
#undef log1p
#include "cuda.h"
#include "cuda_runtime.h"
#include "cublas_v2.h"
#cmakedefine USE_MAGMA
/* Linkage prefix: C++ translation units see the THC symbols with C linkage,
   plain C translation units get an ordinary `extern`. */
#if !defined(__cplusplus)
#  define THC_EXTERNC extern
#else
#  define THC_EXTERNC extern "C"
#endif

/* Symbol visibility decorations.
   - Outside Windows: THC_API is just the linkage prefix, THC_CLASS is empty.
   - On Windows with THC_EXPORTS defined: symbols are exported.
   - On Windows otherwise: symbols are imported. */
#if !defined(_WIN32)
#  define THC_API THC_EXTERNC
#  define THC_CLASS
#elif defined(THC_EXPORTS)
#  define THC_API THC_EXTERNC __declspec(dllexport)
#  define THC_CLASS __declspec(dllexport)
#else
#  define THC_API THC_EXTERNC __declspec(dllimport)
#  define THC_CLASS __declspec(dllimport)
#endif
/* THAssert(exp): if `exp` evaluates to false, calls _THError with the current
   file, line and the stringified expression ("assert(<exp>) failed").
   Only defined here when no earlier header already provided THAssert.
   The do { ... } while(0) wrapper makes the macro behave as a single
   statement, so it is safe inside unbraced if/else bodies. */
#ifndef THAssert
#define THAssert(exp) \
do { \
if (!(exp)) { \
_THError(__FILE__, __LINE__, "assert(%s) failed", #exp); \
} \
} while(0)
#endif
/* Forward declarations. THCRNGState and THCStream are left incomplete in this
   header; THCState is fully defined further down in this file. */
struct THCRNGState; /* Random number generator state. */
typedef struct THCStream THCStream;
typedef struct THCState THCState;
/* Table of function pointers describing a pluggable device-memory allocator.
   Every callback returns a cudaError_t. The parameters are unnamed in this
   header, so their roles below are assumptions to be confirmed against an
   implementation. */
typedef struct _THCDeviceAllocator {
/* NOTE(review): presumably (ctx, out pointer, size in bytes, stream) — confirm. */
cudaError_t (*malloc)( void*, void**, size_t, cudaStream_t);
/* NOTE(review): presumably (ctx, in/out pointer, old size, new size, stream) — confirm. */
cudaError_t (*realloc)(void*, void**, size_t, size_t, cudaStream_t);
/* NOTE(review): presumably (ctx, pointer to release) — confirm. */
cudaError_t (*free)(void*, void*);
/* NOTE(review): presumably releases cached blocks held by the allocator — confirm. */
cudaError_t (*emptyCache)(void*);
/* NOTE(review): presumably (ctx, device index, two size_t outputs); the
   meaning of the two outputs is not visible here — confirm. */
cudaError_t (*cacheInfo)(void*, int, size_t*, size_t*);
/* Opaque allocator-owned pointer; presumably passed as the first void*
   argument of each callback above — TODO confirm. */
void* state;
} THCDeviceAllocator;
/* Per-device bundle of resources: streams, cuBLAS handles and the scratch
   buffers associated with each stream. */
typedef struct _THCCudaResourcesPerDevice {
/* Array of stream pointers for this device. */
THCStream** streams;
/* Number of materialized cuBLAS handles */
int numBlasHandles;
/* cuBLAS handles are lazily initialized */
cublasHandle_t* blasHandles;
/* Size of the scratch space available for each stream on this device */
size_t scratchSpacePerStream;
/* Device-resident scratch space per stream, used for global memory
reduction kernels. Lazily initialized. */
void** devScratchSpacePerStream;
} THCCudaResourcesPerDevice;
/* Global state to be held in the cutorch table. */
struct THCState {
/* Random number generator state (type is opaque in this header). */
struct THCRNGState* rngState;
/* CUDA device properties. NOTE(review): presumably an array with one entry
   per device — confirm against the initialization code. */
struct cudaDeviceProp* deviceProperties;
/* Set of all allocated resources. resourcesPerDevice[dev].streams[0] is NULL,
which specifies the per-device default stream. blasHandles do not have a
default and must be explicitly initialized. We always initialize 1
blasHandle but we can use more.
*/
THCCudaResourcesPerDevice* resourcesPerDevice;
/* Captured number of devices upon startup; convenience for bounds checking */
int numDevices;
/* Number of Torch-defined (user) streams and BLAS handles available; user
   resources use indices 1 ... numUserStreams / numUserBlasHandles. */
int numUserStreams;
int numUserBlasHandles;
/* Allocator using cudaMallocHost. */
THAllocator* cudaHostAllocator;
/* NOTE(review): presumably an allocator for unified-virtual-addressing
   memory, judging by the name — confirm. */
THAllocator* cudaUVAAllocator;
/* Device-memory allocator callbacks (see THCDeviceAllocator). */
THCDeviceAllocator* cudaDeviceAllocator;
/* Index of the current selected BLAS handle. The actual BLAS handle used
depends on the current device. */
THCThreadLocal/*<int>*/ currentPerDeviceBlasHandle;
/* Array of thread locals containing the current stream for each device */
THCThreadLocal* currentStreams;
/* Table of enabled peer-to-peer access between directed pairs of GPUs.
If i accessing allocs on j is enabled, p2pAccessEnabled[i][j] is 1; 0 otherwise. */
int** p2pAccessEnabled;
/* Is direct cross-kernel p2p access allowed? Normally, only cross-GPU
copies are allowed via p2p if p2p access is enabled at all for
the pair of GPUs in question, but if this flag is true, then
all cross-GPU access checks are disabled, allowing kernels to
directly access memory on another GPU.
Note that p2p access must exist and be enabled for the pair of
GPUs in question. */
int p2pKernelAccessEnabled;
/* Garbage-collection callback and the opaque pointer handed to it.
   NOTE(review): presumably installed through THCSetGCHandler — confirm. */
void (*cutorchGCFunction)(void *data);
void *cutorchGCData;
/* NOTE(review): heap accounting values, presumably maintained by
   THCHeapUpdate; exact semantics are not visible in this header — confirm. */
ptrdiff_t heapSoftmax;
ptrdiff_t heapDelta;
};
/* State lifecycle.
   THCState_alloc takes no arguments and returns a THCState*; the other three
   operate on an existing state and return nothing.
   NOTE(review): presumably the intended order is alloc -> THCudaInit ->
   THCudaShutdown -> free — confirm against the implementation. */
THC_API THCState* THCState_alloc(void);
THC_API void THCState_free(THCState* state);
THC_API void THCudaInit(THCState* state);
THC_API void THCudaShutdown(THCState* state);
/* Peer-to-peer (p2p) access control. */
/* If device `dev` can access allocations on device `devToAccess`, this will
   return 1; otherwise, 0. */
THC_API int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess);
/* Enables or disables allowed p2p access using cutorch copy. If we are
   attempting to enable access, throws an error if CUDA cannot enable p2p
   access. `enable` is an int flag selecting between enabling and disabling. */
THC_API void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess,
int enable);
/* By default, direct in-kernel access to memory on remote GPUs is
disabled. When set, this allows direct in-kernel access to remote
GPUs where GPU/GPU p2p access is enabled and allowed. */
THC_API int THCState_getKernelPeerToPeerAccessEnabled(THCState* state);
THC_API void THCState_setKernelPeerToPeerAccessEnabled(THCState* state, int val);
/* Accessors for members of THCState, plus allocator / MAGMA setup.
   NOTE(review): the getters presumably return the correspondingly named
   THCState fields (for the current device where the name says "Current") —
   confirm against the implementation. */
THC_API struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* state);
THC_API struct THCRNGState* THCState_getRngState(THCState* state);
THC_API THAllocator* THCState_getCudaHostAllocator(THCState* state);
THC_API THAllocator* THCState_getCudaUVAAllocator(THCState* state);
/* Installs `allocator` as the device allocator for `state`. Ownership of the
   allocator object is not specified in this header. */
THC_API void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator);
/* NOTE(review): declared unconditionally; whether it does anything when
   USE_MAGMA is not defined cannot be determined from this header. */
THC_API void THCMagma_init(THCState *state);
/* State manipulators and accessors */
THC_API int THCState_getNumDevices(THCState* state);
/* Requests `numStreams` streams; `nonBlocking` is an int flag.
   NOTE(review): presumably the flag selects non-blocking CUDA streams and the
   call only ever grows the reservation — confirm. */
THC_API void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking);
THC_API int THCState_getNumStreams(THCState* state);
/* Stream API */
/* Returns the raw cudaStream_t for the given device / for the caller's
   context. NOTE(review): "current" presumably refers to the per-thread
   current stream stored in THCState — confirm. */
THC_API cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device);
THC_API cudaStream_t THCState_getCurrentStream(THCState *state);
/* Get / set the current stream as a THCStream object. */
THC_API struct THCStream* THCState_getStream(THCState *state);
THC_API void THCState_setStream(THCState *state, struct THCStream* stream);
/* deprecated stream API */
/* These address streams by integer index rather than by THCStream pointer. */
THC_API cudaStream_t THCState_getDeviceStream(THCState *state, int device, int stream);
THC_API int THCState_getCurrentStreamIndex(THCState *state);
THC_API void THCState_setCurrentStreamIndex(THCState *state, int stream);
/* cuBLAS handle API: handles are addressed by integer index.
   NOTE(review): it is unclear from this header whether the preceding
   "deprecated stream API" heading was meant to cover these declarations too
   — confirm. */
THC_API void THCState_reserveBlasHandles(THCState* state, int numHandles);
THC_API int THCState_getNumBlasHandles(THCState* state);
THC_API cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle);
THC_API cublasHandle_t THCState_getCurrentBlasHandle(THCState *state);
THC_API int THCState_getCurrentBlasHandleIndex(THCState *state);
THC_API void THCState_setCurrentBlasHandleIndex(THCState *state, int handle);
/* For the current device and stream, returns the allocated scratch space */
THC_API void* THCState_getCurrentDeviceScratchSpace(THCState* state);
/* Same, for an explicitly given device and stream index. */
THC_API void* THCState_getDeviceScratchSpace(THCState* state, int device, int stream);
/* Size of the scratch space, for the current device and for a given device.
   NOTE(review): presumably in bytes and per stream — confirm. */
THC_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state);
THC_API size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device);
/* Error-checking wrappers: each macro forwards the status value together with
   the call site's __FILE__ and __LINE__ to the matching checker function.
   THCudaCheck / THCudaCheckWarn take a cudaError_t, THCublasCheck takes a
   cublasStatus_t.
   NOTE(review): what the checkers do on failure (raise vs. warn) is not
   visible in this header — confirm against the implementation.
   NOTE(review): identifiers starting with a double underscore are reserved
   for the implementation in C and C++; they are kept because callers depend
   on these names. */
#define THCudaCheck(err) __THCudaCheck(err, __FILE__, __LINE__)
#define THCudaCheckWarn(err) __THCudaCheckWarn(err, __FILE__, __LINE__)
#define THCublasCheck(err) __THCublasCheck(err, __FILE__, __LINE__)
THC_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
THC_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
THC_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);
/* Device memory helpers. All three return a cudaError_t that the caller is
   expected to check (e.g. with THCudaCheck).
   NOTE(review): presumably these route through state->cudaDeviceAllocator —
   confirm against the implementation. */
/* Allocates `size` bytes and stores the resulting pointer in *ptr. */
THC_API cudaError_t THCudaMalloc(THCState *state, void **ptr, size_t size);
THC_API cudaError_t THCudaFree(THCState *state, void *ptr);
/* Writes free and total memory amounts to *freeBytes and *totalBytes. */
THC_API cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes);
/* Registers a garbage-collection callback and the opaque `data` pointer to be
   passed to it. NOTE(review): presumably stored in
   state->cutorchGCFunction / state->cutorchGCData — confirm. */
THC_API void THCSetGCHandler(THCState *state,
void (*torchGCHandlerFunction)(void *data),
void *data );
/* Reports a heap size change of `size` (signed, so it can presumably be
   negative for frees — confirm). NOTE(review): the relationship to
   heapSoftmax / heapDelta in THCState is not visible in this header. */
THC_API void THCHeapUpdate(THCState *state, ptrdiff_t size);
#endif