THCGeneral.h.in - platform/external/pytorch - Git at Google

 #ifndef THC_GENERAL_INC
 #define THC_GENERAL_INC

 #include "THGeneral.h"
 #include "THAllocator.h"
 #include "THCThreadLocal.h"
 #undef log1p

 #include "cuda.h"
 #include "cuda_runtime.h"
 #include "cublas_v2.h"

 #cmakedefine USE_MAGMA

 #ifdef __cplusplus
 # define THC_EXTERNC extern "C"
 #else
 # define THC_EXTERNC extern
 #endif

 #ifdef _WIN32
 # ifdef THC_EXPORTS
 #  define THC_API THC_EXTERNC __declspec(dllexport)
 #  define THC_CLASS __declspec(dllexport)
 # else
 #  define THC_API THC_EXTERNC __declspec(dllimport)
 #  define THC_CLASS __declspec(dllimport)
 # endif
 #else
 # define THC_API THC_EXTERNC
 # define THC_CLASS
 #endif

 #ifndef THAssert
 #define THAssert(exp)                                                   \
   do {                                                                  \
     if (!(exp)) {                                                       \
       _THError(__FILE__, __LINE__, "assert(%s) failed", #exp);          \
     }                                                                   \
   } while(0)
 #endif

 struct THCRNGState;  /* Random number generator state. */
 typedef struct THCStream THCStream;
 typedef struct THCState THCState;

 typedef struct _THCDeviceAllocator {
    cudaError_t (*malloc)( void*, void**, size_t,         cudaStream_t);
    cudaError_t (*realloc)(void*, void**, size_t, size_t, cudaStream_t);
    cudaError_t (*free)(void*, void*);
    cudaError_t (*emptyCache)(void*);
    cudaError_t  (*cacheInfo)(void*, int, size_t*, size_t*);
    void* state;
 } THCDeviceAllocator;

 typedef struct _THCCudaResourcesPerDevice {
   THCStream** streams;
   /* Number of materialized cuBLAS handles */
   int numBlasHandles;
   /* cuBLAS handes are lazily initialized */
   cublasHandle_t* blasHandles;
   /* Size of scratch space per each stream on this device available */
   size_t scratchSpacePerStream;
   /* Device-resident scratch space per stream, used for global memory
      reduction kernels. Lazily initialized. */
   void** devScratchSpacePerStream;
 } THCCudaResourcesPerDevice;


 /* Global state to be held in the cutorch table. */
 struct THCState {
   struct THCRNGState* rngState;
   struct cudaDeviceProp* deviceProperties;
   /* Set of all allocated resources. resourcePerDevice[dev]->streams[0] is NULL,
      which specifies the per-device default stream. blasHandles do not have a
      default and must be explicitly initialized. We always initialize 1
      blasHandle but we can use more.
   */
   THCCudaResourcesPerDevice* resourcesPerDevice;
   /* Captured number of devices upon startup; convenience for bounds checking */
   int numDevices;
   /* Number of Torch defined resources available, indices 1 ... numStreams */
   int numUserStreams;
   int numUserBlasHandles;

   /* Allocator using cudaMallocHost. */
   THAllocator* cudaHostAllocator;
   THAllocator* cudaUVAAllocator;
   THCDeviceAllocator* cudaDeviceAllocator;

   /* Index of the current selected BLAS handle. The actual BLAS handle used
      depends on the current device. */
   THCThreadLocal/*<int>*/ currentPerDeviceBlasHandle;
   /* Array of thread locals containing the current stream for each device */
   THCThreadLocal* currentStreams;

   /* Table of enabled peer-to-peer access between directed pairs of GPUs.
      If i accessing allocs on j is enabled, p2pAccess[i][j] is 1; 0 otherwise. */
   int** p2pAccessEnabled;

   /* Is direct cross-kernel p2p access allowed? Normally, only cross-GPU
      copies are allowed via p2p if p2p access is enabled at all for
      the pair of GPUs in question, but if this flag is true, then
      all cross-GPU access checks are disabled, allowing kernels to
      directly access memory on another GPUs.
      Note that p2p access must exist and be enabled for the pair of
      GPUs in question. */
   int p2pKernelAccessEnabled;

   void (*cutorchGCFunction)(void *data);
   void *cutorchGCData;
   ptrdiff_t heapSoftmax;
   ptrdiff_t heapDelta;
 };

 THC_API THCState* THCState_alloc(void);
 THC_API void THCState_free(THCState* state);

 THC_API void THCudaInit(THCState* state);
 THC_API void THCudaShutdown(THCState* state);

 /* If device `dev` can access allocations on device `devToAccess`, this will return */
 /* 1; otherwise, 0. */
 THC_API int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess);
 /* Enables or disables allowed p2p access using cutorch copy. If we are */
 /* attempting to enable access, throws an error if CUDA cannot enable p2p */
 /* access. */
 THC_API void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess,
                                           int enable);

 /* By default, direct in-kernel access to memory on remote GPUs is
    disabled. When set, this allows direct in-kernel access to remote
    GPUs where GPU/GPU p2p access is enabled and allowed. */
 THC_API int THCState_getKernelPeerToPeerAccessEnabled(THCState* state);
 THC_API void THCState_setKernelPeerToPeerAccessEnabled(THCState* state, int val);

 THC_API struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* state);

 THC_API struct THCRNGState* THCState_getRngState(THCState* state);
 THC_API THAllocator* THCState_getCudaHostAllocator(THCState* state);
 THC_API THAllocator* THCState_getCudaUVAAllocator(THCState* state);
 THC_API void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator);

 THC_API void THCMagma_init(THCState *state);

 /* State manipulators and accessors */
 THC_API int THCState_getNumDevices(THCState* state);
 THC_API void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking);
 THC_API int THCState_getNumStreams(THCState* state);

 /* Stream API */
 THC_API cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device);
 THC_API cudaStream_t THCState_getCurrentStream(THCState *state);
 THC_API struct THCStream* THCState_getStream(THCState *state);
 THC_API void THCState_setStream(THCState *state, struct THCStream* stream);
 /* deprecated stream API */
 THC_API cudaStream_t THCState_getDeviceStream(THCState *state, int device, int stream);
 THC_API int THCState_getCurrentStreamIndex(THCState *state);
 THC_API void THCState_setCurrentStreamIndex(THCState *state, int stream);

 THC_API void THCState_reserveBlasHandles(THCState* state, int numHandles);
 THC_API int THCState_getNumBlasHandles(THCState* state);

 THC_API cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle);
 THC_API cublasHandle_t THCState_getCurrentBlasHandle(THCState *state);
 THC_API int THCState_getCurrentBlasHandleIndex(THCState *state);
 THC_API void THCState_setCurrentBlasHandleIndex(THCState *state, int handle);

 /* For the current device and stream, returns the allocated scratch space */
 THC_API void* THCState_getCurrentDeviceScratchSpace(THCState* state);
 THC_API void* THCState_getDeviceScratchSpace(THCState* state, int device, int stream);
 THC_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state);
 THC_API size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device);

 #define THCudaCheck(err)  __THCudaCheck(err, __FILE__, __LINE__)
 #define THCudaCheckWarn(err)  __THCudaCheckWarn(err, __FILE__, __LINE__)
 #define THCublasCheck(err)  __THCublasCheck(err,  __FILE__, __LINE__)

 THC_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
 THC_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
 THC_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);

 THC_API cudaError_t THCudaMalloc(THCState *state, void **ptr, size_t size);
 THC_API cudaError_t THCudaFree(THCState *state, void *ptr);
 THC_API cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes);
 THC_API void THCSetGCHandler(THCState *state,
                              void (*torchGCHandlerFunction)(void *data),
                              void *data );
 THC_API void THCHeapUpdate(THCState *state, ptrdiff_t size);

 #endif
	#ifndef THC_GENERAL_INC
	#define THC_GENERAL_INC

	#include "THGeneral.h"
	#include "THAllocator.h"
	#include "THCThreadLocal.h"
	#undef log1p

	#include "cuda.h"
	#include "cuda_runtime.h"
	#include "cublas_v2.h"

	#cmakedefine USE_MAGMA

	#ifdef __cplusplus
	# define THC_EXTERNC extern "C"
	#else
	# define THC_EXTERNC extern
	#endif

	#ifdef _WIN32
	# ifdef THC_EXPORTS
	# define THC_API THC_EXTERNC __declspec(dllexport)
	# define THC_CLASS __declspec(dllexport)
	# else
	# define THC_API THC_EXTERNC __declspec(dllimport)
	# define THC_CLASS __declspec(dllimport)
	# endif
	#else
	# define THC_API THC_EXTERNC
	# define THC_CLASS
	#endif

	#ifndef THAssert
	#define THAssert(exp) \
	do { \
	if (!(exp)) { \
	_THError(__FILE__, __LINE__, "assert(%s) failed", #exp); \
	} \
	} while(0)
	#endif

	struct THCRNGState; /* Random number generator state. */
	typedef struct THCStream THCStream;
	typedef struct THCState THCState;

	typedef struct _THCDeviceAllocator {
	cudaError_t (malloc)( void, void**, size_t, cudaStream_t);
	cudaError_t (realloc)(void, void**, size_t, size_t, cudaStream_t);
	cudaError_t (free)(void, void*);
	cudaError_t (emptyCache)(void);
	cudaError_t (cacheInfo)(void, int, size_t, size_t);
	void* state;
	} THCDeviceAllocator;

	typedef struct _THCCudaResourcesPerDevice {
	THCStream** streams;
	/* Number of materialized cuBLAS handles */
	int numBlasHandles;
	/* cuBLAS handes are lazily initialized */
	cublasHandle_t* blasHandles;
	/* Size of scratch space per each stream on this device available */
	size_t scratchSpacePerStream;
	/* Device-resident scratch space per stream, used for global memory
	reduction kernels. Lazily initialized. */
	void** devScratchSpacePerStream;
	} THCCudaResourcesPerDevice;


	/* Global state to be held in the cutorch table. */
	struct THCState {
	struct THCRNGState* rngState;
	struct cudaDeviceProp* deviceProperties;
	/* Set of all allocated resources. resourcePerDevice[dev]->streams[0] is NULL,
	which specifies the per-device default stream. blasHandles do not have a
	default and must be explicitly initialized. We always initialize 1
	blasHandle but we can use more.
	*/
	THCCudaResourcesPerDevice* resourcesPerDevice;
	/* Captured number of devices upon startup; convenience for bounds checking */
	int numDevices;
	/* Number of Torch defined resources available, indices 1 ... numStreams */
	int numUserStreams;
	int numUserBlasHandles;

	/* Allocator using cudaMallocHost. */
	THAllocator* cudaHostAllocator;
	THAllocator* cudaUVAAllocator;
	THCDeviceAllocator* cudaDeviceAllocator;

	/* Index of the current selected BLAS handle. The actual BLAS handle used
	depends on the current device. */
	THCThreadLocal/<int>/ currentPerDeviceBlasHandle;
	/* Array of thread locals containing the current stream for each device */
	THCThreadLocal* currentStreams;

	/* Table of enabled peer-to-peer access between directed pairs of GPUs.
	If i accessing allocs on j is enabled, p2pAccess[i][j] is 1; 0 otherwise. */
	int** p2pAccessEnabled;

	/* Is direct cross-kernel p2p access allowed? Normally, only cross-GPU
	copies are allowed via p2p if p2p access is enabled at all for
	the pair of GPUs in question, but if this flag is true, then
	all cross-GPU access checks are disabled, allowing kernels to
	directly access memory on another GPUs.
	Note that p2p access must exist and be enabled for the pair of
	GPUs in question. */
	int p2pKernelAccessEnabled;

	void (cutorchGCFunction)(void data);
	void *cutorchGCData;
	ptrdiff_t heapSoftmax;
	ptrdiff_t heapDelta;
	};

	THC_API THCState* THCState_alloc(void);
	THC_API void THCState_free(THCState* state);

	THC_API void THCudaInit(THCState* state);
	THC_API void THCudaShutdown(THCState* state);

	/* If device `dev` can access allocations on device `devToAccess`, this will return */
	/* 1; otherwise, 0. */
	THC_API int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess);
	/* Enables or disables allowed p2p access using cutorch copy. If we are */
	/* attempting to enable access, throws an error if CUDA cannot enable p2p */
	/* access. */
	THC_API void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess,
	int enable);

	/* By default, direct in-kernel access to memory on remote GPUs is
	disabled. When set, this allows direct in-kernel access to remote
	GPUs where GPU/GPU p2p access is enabled and allowed. */
	THC_API int THCState_getKernelPeerToPeerAccessEnabled(THCState* state);
	THC_API void THCState_setKernelPeerToPeerAccessEnabled(THCState* state, int val);

	THC_API struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* state);

	THC_API struct THCRNGState* THCState_getRngState(THCState* state);
	THC_API THAllocator* THCState_getCudaHostAllocator(THCState* state);
	THC_API THAllocator* THCState_getCudaUVAAllocator(THCState* state);
	THC_API void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator);

	THC_API void THCMagma_init(THCState *state);

	/* State manipulators and accessors */
	THC_API int THCState_getNumDevices(THCState* state);
	THC_API void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking);
	THC_API int THCState_getNumStreams(THCState* state);

	/* Stream API */
	THC_API cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device);
	THC_API cudaStream_t THCState_getCurrentStream(THCState *state);
	THC_API struct THCStream* THCState_getStream(THCState *state);
	THC_API void THCState_setStream(THCState state, struct THCStream stream);
	/* deprecated stream API */
	THC_API cudaStream_t THCState_getDeviceStream(THCState *state, int device, int stream);
	THC_API int THCState_getCurrentStreamIndex(THCState *state);
	THC_API void THCState_setCurrentStreamIndex(THCState *state, int stream);

	THC_API void THCState_reserveBlasHandles(THCState* state, int numHandles);
	THC_API int THCState_getNumBlasHandles(THCState* state);

	THC_API cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle);
	THC_API cublasHandle_t THCState_getCurrentBlasHandle(THCState *state);
	THC_API int THCState_getCurrentBlasHandleIndex(THCState *state);
	THC_API void THCState_setCurrentBlasHandleIndex(THCState *state, int handle);

	/* For the current device and stream, returns the allocated scratch space */
	THC_API void* THCState_getCurrentDeviceScratchSpace(THCState* state);
	THC_API void* THCState_getDeviceScratchSpace(THCState* state, int device, int stream);
	THC_API size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state);
	THC_API size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device);

	#define THCudaCheck(err) __THCudaCheck(err, __FILE__, __LINE__)
	#define THCudaCheckWarn(err) __THCudaCheckWarn(err, __FILE__, __LINE__)
	#define THCublasCheck(err) __THCublasCheck(err, __FILE__, __LINE__)

	THC_API void __THCudaCheck(cudaError_t err, const char *file, const int line);
	THC_API void __THCudaCheckWarn(cudaError_t err, const char *file, const int line);
	THC_API void __THCublasCheck(cublasStatus_t status, const char *file, const int line);

	THC_API cudaError_t THCudaMalloc(THCState state, void *ptr, size_t size);
	THC_API cudaError_t THCudaFree(THCState state, void ptr);
	THC_API cudaError_t THCudaMemGetInfo(THCState state, size_t freeBytes, size_t* totalBytes);
	THC_API void THCSetGCHandler(THCState *state,
	void (torchGCHandlerFunction)(void data),
	void *data );
	THC_API void THCHeapUpdate(THCState *state, ptrdiff_t size);

	#endif