#include "THCGeneral.h"
#include "TH.h"
#include "THCTensorRandom.h"
#include "THCBlas.h"
#include "THCAllocator.h"
#include <stdlib.h>
/* Size of scratch space available in global memory, per SM per stream */
#define GLOBAL_SCRATCH_SPACE_PER_SM_STREAM (4 * sizeof(float))
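/* Illustrative arithmetic (not from this file): on a hypothetical device with
   24 SMs, each stream gets 24 * 4 * sizeof(float) = 384 bytes of device
   scratch space, allocated in THCudaInit and THCState_reserveStreams below. */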
THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
THCState *state, int device);
void THCudaInit(THCState* state)
{
state->cutorchGCFunction = NULL;
state->cutorchGCData = NULL;
int count = 0;
THCudaCheck(cudaGetDeviceCount(&count));
int device = 0;
THCudaCheck(cudaGetDevice(&device));
state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState));
THCRandom_init(state, count, device);
THCAllocator_init(state);
state->numDevices = count;
state->deviceProperties =
(struct cudaDeviceProp*)malloc(count * sizeof(struct cudaDeviceProp));
state->numUserStreams = 0;
state->numUserBlasHandles = 0;
/* Enable P2P access between all pairs, if possible */
THCudaEnablePeerToPeerAccess(state);
state->resourcesPerDevice = (THCCudaResourcesPerDevice*)
malloc(count * sizeof(THCCudaResourcesPerDevice));
for (int i = 0; i < count; ++i) {
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, i);
THCudaCheck(cudaSetDevice(i));
THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[i], i));
/* Stream index 0 will be the default stream for convenience; by
default no user streams are reserved */
res->streams = NULL;
res->blasHandles = NULL;
    /* The scratch space we make available on each device is sized according
       to the number of SMs on that device */
int numSM = state->deviceProperties[i].multiProcessorCount;
size_t sizePerStream = numSM * GLOBAL_SCRATCH_SPACE_PER_SM_STREAM;
res->scratchSpacePerStream = sizePerStream;
/* Allocate scratch space for each stream */
res->devScratchSpacePerStream = (void**) malloc(sizeof(void*));
THCudaCheck(THCudaMalloc(state, &res->devScratchSpacePerStream[0],
sizePerStream));
}
/* Restore to previous device */
THCudaCheck(cudaSetDevice(device));
/* Start in the default stream on the current device */
state->currentPerDeviceStream = 0;
state->currentStream = NULL;
  /* There is no such thing as a default cublas handle.
     To maintain consistency with the streams API, handle 0 is always NULL
     and we start counting at 1. */
THCState_reserveBlasHandles(state, 1);
state->currentPerDeviceBlasHandle = 1;
state->currentBlasHandle = THCState_getDeviceBlasHandle(state, device, 1);
state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically
state->heapDelta = 0;
}
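/* Illustrative usage sketch (not part of the library): a host program would
   typically allocate a THCState, initialize it once, and tear it down at
   exit. The variable names below are hypothetical.

     THCState* state = (THCState*) malloc(sizeof(THCState));
     memset(state, 0, sizeof(THCState));   // requires <string.h>
     THCudaInit(state);
     // ... use cutorch ...
     THCudaShutdown(state);
     free(state);
*/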
void THCudaShutdown(THCState* state)
{
THCRandom_shutdown(state);
THCAllocator_shutdown(state);
free(state->rngState);
free(state->deviceProperties);
int deviceCount = 0;
int prevDev = -1;
THCudaCheck(cudaGetDevice(&prevDev));
THCudaCheck(cudaGetDeviceCount(&deviceCount));
/* cleanup p2p access state */
for (int dev = 0; dev < deviceCount; ++dev) {
free(state->p2pAccessEnabled[dev]);
}
free(state->p2pAccessEnabled);
/* cleanup per-device state */
for (int dev = 0; dev < deviceCount; ++dev) {
THCudaCheck(cudaSetDevice(dev));
/* Free Torch-defined streams (0 is the default stream) */
for (int stream = 1; stream <= state->numUserStreams; ++stream) {
THCudaCheck(cudaStreamDestroy(
THCState_getDeviceStream(state, dev, stream)));
}
/* Free Torch-defined handles (0 is NULL for consistency with streams API) */
for (int handle = 1; handle <= state->numUserBlasHandles; ++handle) {
THCublasCheck(cublasDestroy(
THCState_getDeviceBlasHandle(state, dev, handle)));
}
    /* Free per-stream scratch space; the loop starts at 0 because there is
       scratch space for the default stream as well */
for (int stream = 0; stream <= state->numUserStreams; ++stream) {
THCudaCheck(THCudaFree(state, THCState_getDeviceScratchSpace(state, dev, stream)));
}
free(state->resourcesPerDevice[dev].streams);
free(state->resourcesPerDevice[dev].blasHandles);
free(state->resourcesPerDevice[dev].devScratchSpacePerStream);
}
free(state->resourcesPerDevice);
THCudaCheck(cudaSetDevice(prevDev));
}
void THCudaEnablePeerToPeerAccess(THCState* state)
{
/* By default, all direct p2p kernel access (besides copy) is disallowed, */
/* since direct access without knowing whether or not a certain operation */
/* should be cross-GPU leads to synchronization errors. The user can choose */
/* to disable this functionality, however. */
state->p2pKernelAccessEnabled = 0;
int prevDev = -1;
THCudaCheck(cudaGetDevice(&prevDev));
int numDevices = -1;
THCudaCheck(cudaGetDeviceCount(&numDevices));
state->p2pAccessEnabled = (int**) malloc(sizeof(int*) * numDevices);
for (int i = 0; i < numDevices; ++i) {
state->p2pAccessEnabled[i] = (int*) malloc(sizeof(int) * numDevices);
}
/* Build a table of all allowed p2p accesses, to avoid checking the p2p
status at runtime. */
for (int i = 0; i < numDevices; ++i) {
THCudaCheck(cudaSetDevice(i));
for (int j = 0; j < numDevices; ++j) {
/* Presume no access by default */
state->p2pAccessEnabled[i][j] = 0;
if (i == j) {
/* A GPU can access itself */
state->p2pAccessEnabled[i][j] = 1;
} else {
int access = 0;
THCudaCheck(cudaDeviceCanAccessPeer(&access, i, j));
if (access) {
cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
/* Any future call to cudaGetLastError will now return an error, */
/* even though we've already dealt with this specific error here. */
/* Call cudaGetLastError once to reset the last error state. */
cudaGetLastError();
continue;
}
/* In case there are unknown errors returned from the above */
THCudaCheck(err);
        /* Access was successfully enabled */
state->p2pAccessEnabled[i][j] = 1;
}
}
}
}
/* Restore previous device before continuing */
THCudaCheck(cudaSetDevice(prevDev));
}
int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess)
{
int numDevices = 0;
THCudaCheck(cudaGetDeviceCount(&numDevices));
if (dev < 0 || dev >= numDevices) {
THError("%d is not a device", dev);
}
  if (devToAccess < 0 || devToAccess >= numDevices) {
THError("%d is not a device", devToAccess);
}
return state->p2pAccessEnabled[dev][devToAccess];
}
void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess,
int enable)
{
/* This will perform device bounds checking for us */
int prevEnabled = THCState_getPeerToPeerAccess(state, dev, devToAccess);
if (enable != prevEnabled) {
/* If we're attempting to enable p2p access but p2p access isn't */
/* supported, throw an error */
if (enable) {
int access = 0;
THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));
if (!access) {
THError("p2p access not supported for %d accessing %d",
dev, devToAccess);
}
}
state->p2pAccessEnabled[dev][devToAccess] = enable;
int prevDev = 0;
THCudaCheck(cudaGetDevice(&prevDev));
THCudaCheck(cudaSetDevice(dev));
/* This should be in sync with the current access state */
if (enable) {
THCudaCheck(cudaDeviceEnablePeerAccess(devToAccess, 0));
} else {
THCudaCheck(cudaDeviceDisablePeerAccess(devToAccess));
}
THCudaCheck(cudaSetDevice(prevDev));
}
}
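/* Illustrative sketch (not part of the library): the cached table lets callers
   query p2p status without touching the CUDA runtime. The device indices
   below are hypothetical.

     if (!THCState_getPeerToPeerAccess(state, 0, 1)) {
       // explicitly request access; this throws if the hardware cannot do p2p
       THCState_setPeerToPeerAccess(state, 0, 1, 1);
     }
*/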
int THCState_getKernelPeerToPeerAccessEnabled(THCState* state) {
return state->p2pKernelAccessEnabled;
}
void THCState_setKernelPeerToPeerAccessEnabled(THCState* state, int val) {
state->p2pKernelAccessEnabled = val;
}
struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* state)
{
int curDev = -1;
THCudaCheck(cudaGetDevice(&curDev));
return &(state->deviceProperties[curDev]);
}
int THCState_getNumDevices(THCState *state)
{
return state->numDevices;
}
void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking)
{
if (numStreams <= state->numUserStreams)
{
return;
}
int prevDev = -1;
THCudaCheck(cudaGetDevice(&prevDev));
/* Otherwise, we have to allocate a new set of streams and stream data */
for (int dev = 0; dev < state->numDevices; ++dev) {
THCudaCheck(cudaSetDevice(dev));
/* +1 for the default stream as well */
cudaStream_t* newStreams =
(cudaStream_t*) malloc((numStreams + 1) * sizeof(cudaStream_t));
void** newScratchSpace =
(void**) malloc((numStreams + 1) * sizeof(void*));
/* Copy over old stream data
(0 is default stream, 1 ... numUserStreams are rest) */
for (int stream = 0; stream <= state->numUserStreams; ++stream) {
newStreams[stream] =
THCState_getDeviceStream(state, dev, stream);
newScratchSpace[stream] =
THCState_getDeviceScratchSpace(state, dev, stream);
}
/* Allocate new stream resources */
size_t scratchSpaceSize = THCState_getDeviceScratchSpaceSize(state, dev);
unsigned int flags =
nonBlocking ? cudaStreamNonBlocking : cudaStreamDefault;
for (int stream = state->numUserStreams + 1; stream <= numStreams; ++stream) {
newStreams[stream] = NULL;
THCudaCheck(cudaStreamCreateWithFlags(newStreams + stream, flags));
newScratchSpace[stream] = NULL;
THCudaCheck(THCudaMalloc(state, &newScratchSpace[stream], scratchSpaceSize));
}
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
free(res->streams);
res->streams = newStreams;
free(res->devScratchSpacePerStream);
res->devScratchSpacePerStream = newScratchSpace;
}
state->numUserStreams = numStreams;
THCudaCheck(cudaSetDevice(prevDev));
}
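/* Illustrative sketch (not part of the library): reserving and selecting user
   streams. The counts below are arbitrary.

     THCState_reserveStreams(state, 4, 0);          // blocking streams 1..4 on every device
     THCState_setStreamForCurrentDevice(state, 2);  // subsequent work goes to stream 2
     // ...
     THCState_setStreamForCurrentDevice(state, 0);  // back to the default stream
*/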
void THCState_reserveBlasHandles(THCState* state, int numBlasHandles)
{
if (numBlasHandles <= state->numUserBlasHandles)
{
return;
}
int prevDev = -1;
THCudaCheck(cudaGetDevice(&prevDev));
/* Otherwise, we have to allocate a new set of blasHandles */
for (int dev = 0; dev < state->numDevices; ++dev) {
THCudaCheck(cudaSetDevice(dev));
/* +1 to be consistent with stream API, blas handle 0 is NULL and unused */
cublasHandle_t* newBlasHandles =
(cublasHandle_t*) malloc((numBlasHandles + 1) * sizeof(cublasHandle_t));
/* Copy over old blasHandles
(0 is NULL, 1 ... numUserBlasHandles are rest) */
newBlasHandles[0] = NULL;
for (int hndl = 1; hndl <= state->numUserBlasHandles; ++hndl) {
newBlasHandles[hndl] = THCState_getDeviceBlasHandle(state, dev, hndl);
}
/* Allocate new handles */
for (int hndl = state->numUserBlasHandles + 1; hndl <= numBlasHandles; ++hndl) {
newBlasHandles[hndl] = NULL;
THCublasCheck(cublasCreate(newBlasHandles + hndl));
}
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
free(res->blasHandles);
res->blasHandles = newBlasHandles;
}
state->numUserBlasHandles = numBlasHandles;
THCudaCheck(cudaSetDevice(prevDev));
}
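/* Illustrative sketch (not part of the library): handle 0 is always NULL, so
   callers reserve and then select handles starting at index 1.

     THCState_reserveBlasHandles(state, 2);
     THCState_setBlasHandleForCurrentDevice(state, 2);
     cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
*/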
int THCState_getNumStreams(THCState* state)
{
return state->numUserStreams;
}
int THCState_getNumBlasHandles(THCState* state)
{
return state->numUserBlasHandles;
}
THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
THCState *state, int device)
{
/* `device` is a CUDA index */
if (device >= state->numDevices || device < 0)
{
THError("%d is not a device", device + 1 /* back to Torch index */);
}
return &(state->resourcesPerDevice[device]);
}
cudaStream_t THCState_getDeviceStream(THCState *state, int device, int stream)
{
if (stream > state->numUserStreams || stream < 0)
{
THError("%d is not a stream", stream);
}
  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
  /* If no user streams have been reserved, only the default (NULL) stream exists */
  return (res->streams == NULL) ? NULL : res->streams[stream];
}
cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle)
{
if (handle <= 0 || handle > state->numUserBlasHandles)
{
THError("%d is not a valid handle, valid range is: (1, %d)",
handle, state->numUserBlasHandles);
}
return THCState_getDeviceResourcePtr(state, device)->blasHandles[handle];
}
cudaStream_t THCState_getCurrentStream(THCState *state)
{
/* This is called at the point of kernel execution.
For some debugging code or improperly instrumented kernels,
`state` is null */
if (state) {
return state->currentStream;
} else {
/* assume default stream */
return NULL;
}
}
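/* Illustrative sketch (not part of the library): asynchronous work should be
   queued on whatever stream cutorch currently has selected. Buffer names and
   sizes below are hypothetical.

     cudaStream_t s = THCState_getCurrentStream(state);
     THCudaCheck(cudaMemcpyAsync(devDst, hostSrc, nbytes,
                                 cudaMemcpyHostToDevice, s));
*/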
cublasHandle_t THCState_getCurrentBlasHandle(THCState *state)
{
/* This is called at the point of kernel execution.
For some debugging code or improperly instrumented kernels,
`state` is null */
if (state) {
    if (state->currentBlasHandle == NULL) {
      THError("the current cublas handle is not valid, valid range is: (1, %d)",
              state->numUserBlasHandles);
}
return state->currentBlasHandle;
}
THError("THCState and blasHandles must be set as there is no default blasHandle");
return NULL;
}
int THCState_getCurrentStreamIndex(THCState *state)
{
return state->currentPerDeviceStream;
}
int THCState_getCurrentBlasHandleIndex(THCState *state)
{
if (state->currentPerDeviceBlasHandle <= 0)
{
THError("%d is not a valid handle, valid range is: (1, %d)",
state->currentPerDeviceBlasHandle, state->numUserBlasHandles);
}
return state->currentPerDeviceBlasHandle;
}
void THCState_setStream(THCState *state, int device, int stream)
{
/* `device` is a CUDA index */
if (device >= state->numDevices || device < 0)
{
THError("%d is not a device", device + 1 /* back to Torch index */);
}
if (stream > state->numUserStreams || stream < 0)
{
THError("%d is not a stream", stream);
}
state->currentStream =
THCState_getDeviceStream(state, device, stream);
state->currentPerDeviceStream = stream;
THCublasCheck(cublasSetStream(state->currentBlasHandle,
state->currentStream));
}
void THCState_setBlasHandle(THCState *state, int device, int handle)
{ /* `device` is a CUDA index */
if (device >= state->numDevices || device < 0)
{
THError("%d is not a device", device + 1 /* back to Torch index */);
}
if (handle > state->numUserBlasHandles || handle <= 0)
{
THError("%d is not a valid handle, valid range is: (1, %d)",
handle, state->numUserBlasHandles);
}
state->currentBlasHandle =
THCState_getDeviceBlasHandle(state, device, handle);
state->currentPerDeviceBlasHandle = handle;
THCublasCheck(cublasSetStream(state->currentBlasHandle, state->currentStream));
}
void THCState_setStreamForCurrentDevice(THCState *state, int stream)
{
if (state->currentPerDeviceStream != stream)
{
int device = -1;
THCudaCheck(cudaGetDevice(&device));
THCState_setStream(state, device, stream);
}
}
void THCState_setBlasHandleForCurrentDevice(THCState *state, int handle)
{
if (state->currentPerDeviceBlasHandle != handle)
{
int device = -1;
THCudaCheck(cudaGetDevice(&device));
THCState_setBlasHandle(state, device, handle);
}
}
void* THCState_getCurrentDeviceScratchSpace(THCState* state)
{
int device = -1;
THCudaCheck(cudaGetDevice(&device));
int stream = THCState_getCurrentStreamIndex(state);
return THCState_getDeviceScratchSpace(state, device, stream);
}
void* THCState_getDeviceScratchSpace(THCState* state, int device, int stream)
{
THCCudaResourcesPerDevice* res =
THCState_getDeviceResourcePtr(state, device);
if (stream > state->numUserStreams || stream < 0)
{
THError("%d is not a stream", stream);
}
return res->devScratchSpacePerStream[stream];
}
size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state)
{
int device = -1;
THCudaCheck(cudaGetDevice(&device));
return THCState_getDeviceScratchSpaceSize(state, device);
}
size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device)
{
THCCudaResourcesPerDevice* res =
THCState_getDeviceResourcePtr(state, device);
return res->scratchSpacePerStream;
}
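/* Illustrative sketch (not part of the library): kernels needing a small
   amount of device-side temporary storage (e.g. per-block partial results for
   a reduction) can reuse the per-stream scratch buffer instead of allocating
   their own. The required size below is hypothetical; checking it is the
   caller's responsibility.

     size_t needed = numBlocks * sizeof(float);
     if (needed <= THCState_getCurrentDeviceScratchSpaceSize(state)) {
       void* scratch = THCState_getCurrentDeviceScratchSpace(state);
       // ... pass `scratch` to a kernel launched on the current stream ...
     }
*/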
void __THCudaCheck(cudaError_t err, const char *file, const int line)
{
if(err != cudaSuccess)
{
static int alreadyFailed = 0;
if(!alreadyFailed) {
fprintf(stderr, "THCudaCheck FAIL file=%s line=%i error=%i : %s\n", file, line, err, cudaGetErrorString(err));
alreadyFailed = 1;
}
_THError(file, line, "cuda runtime error (%d) : %s", err,
cudaGetErrorString(err));
}
}
void __THCublasCheck(cublasStatus_t status, const char *file, const int line)
{
if(status != CUBLAS_STATUS_SUCCESS)
{
const char* errmsg = NULL;
switch(status)
{
case CUBLAS_STATUS_NOT_INITIALIZED:
errmsg = "library not initialized";
break;
case CUBLAS_STATUS_ALLOC_FAILED:
errmsg = "resource allocation failed";
break;
case CUBLAS_STATUS_INVALID_VALUE:
errmsg = "an invalid numeric value was used as an argument";
break;
case CUBLAS_STATUS_ARCH_MISMATCH:
errmsg = "an absent device architectural feature is required";
break;
case CUBLAS_STATUS_MAPPING_ERROR:
errmsg = "an access to GPU memory space failed";
break;
case CUBLAS_STATUS_EXECUTION_FAILED:
errmsg = "the GPU program failed to execute";
break;
case CUBLAS_STATUS_INTERNAL_ERROR:
errmsg = "an internal operation failed";
break;
default:
errmsg = "unknown error";
break;
}
_THError(file, line, "cublas runtime error : %s", errmsg);
}
}
static long heapSize = 0; // not thread-local
static const long heapMaxDelta = 1e6;
static const double heapSoftmaxGrowthThresh = 0.8; // grow softmax if >80% max after GC
static const double heapSoftmaxGrowthFactor = 1.4; // grow softmax by 40%
void THCSetGCHandler(THCState *state, void (*cutorchGCFunction_)(void *data), void *data )
{
state->cutorchGCFunction = cutorchGCFunction_;
state->cutorchGCData = data;
}
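/* Illustrative sketch (not part of the library): the binding layer can
   register a callback that releases cached or unreferenced storages;
   THCudaMalloc below invokes it once and retries when cudaMalloc reports
   out-of-memory. The callback name is hypothetical.

     static void myCollectGarbage(void* data) {
       // e.g. trigger a collection in the host language runtime
     }
     // ...
     THCSetGCHandler(state, &myCollectGarbage, NULL);
*/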
cudaError_t THCudaMalloc(THCState *state, void** ptr, size_t size)
{
  /* Surface any pending error from earlier CUDA calls before allocating */
  THCudaCheck(cudaGetLastError());
cudaError_t err = cudaMalloc(ptr, size);
if (state->cutorchGCFunction != NULL && err != cudaSuccess) {
cudaGetLastError(); // reset OOM error
(state->cutorchGCFunction)(state->cutorchGCData);
err = cudaMalloc(ptr, size);
}
return err;
}
cudaError_t THCudaFree(THCState *state, void *ptr)
{
cudaError_t err = cudaFree(ptr);
return err;
}
static long applyHeapDelta(THCState *state) {
  /* THAtomicAddLong returns the value of heapSize from before the addition,
     so add heapDelta once more to obtain the updated global total */
  long newHeapSize = THAtomicAddLong(&heapSize, state->heapDelta) + state->heapDelta;
  state->heapDelta = 0;
  return newHeapSize;
}
// Here we maintain a dynamic softmax threshold for THC-allocated storages.
// When THC heap size goes above this softmax, the GC hook is triggered.
// If heap size is above 80% of the softmax after GC, then the softmax is
// increased.
static void maybeTriggerGC(THCState *state, long curHeapSize) {
if (state->cutorchGCFunction != NULL && curHeapSize > state->heapSoftmax) {
(state->cutorchGCFunction)(state->cutorchGCData);
// ensure heapSize is accurate before updating heapSoftmax
long newHeapSize = applyHeapDelta(state);
if (newHeapSize > state->heapSoftmax * heapSoftmaxGrowthThresh) {
state->heapSoftmax = state->heapSoftmax * heapSoftmaxGrowthFactor;
}
}
}
void THCHeapUpdate(THCState *state, long size) {
state->heapDelta += size;
// batch updates to global heapSize to minimize thread contention
if (labs(state->heapDelta) < heapMaxDelta) {
return;
}
long newHeapSize = applyHeapDelta(state);
if (size > 0) {
maybeTriggerGC(state, newHeapSize);
}
}
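/* Illustrative sketch (not part of the library): storage allocation code is
   expected to pair THCudaMalloc/THCudaFree with matching THCHeapUpdate calls
   so the soft limit and the GC trigger see an accurate running total. The
   size below is hypothetical.

     long nbytes = 4096;
     void* devPtr = NULL;
     THCudaCheck(THCudaMalloc(state, &devPtr, nbytes));
     THCHeapUpdate(state, nbytes);
     // ... use devPtr ...
     THCudaCheck(THCudaFree(state, devPtr));
     THCHeapUpdate(state, -nbytes);
*/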
#undef GLOBAL_SCRATCH_SPACE_PER_SM_STREAM