| #include "THCGeneral.h" |
| #include "TH.h" |
| #include "THCAllocator.h" |
| #include "THCCachingHostAllocator.h" |
| #include "THCStream.h" |
| #include "THCThreadLocal.h" |
| #include "THCTensorRandom.h" |
| #include <stdlib.h> |
| #include <stdint.h> |
| |
| /* Size of scratch space available in global memory per each SM + stream */ |
| #define MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM 4 * sizeof(float) |
| |
| /* Minimum amount of scratch space per device. Total scratch memory per |
| * device is either this amount, or the # of SMs * the space per SM defined |
| * above, whichever is greater.*/ |
| #define MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE 32768 * sizeof(float) |
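
/* Illustrative arithmetic (hypothetical SM count): with 4 * sizeof(float) =
 * 16 bytes per SM and 32768 * sizeof(float) = 128 KiB per device, a device
 * with, say, 80 SMs only needs 80 * 16 = 1280 bytes by the per-SM rule, so
 * the 128 KiB per-device minimum wins. The per-SM term only dominates once a
 * device has more than 32768 / 4 = 8192 SMs. */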

THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
  THCState *state, int device);

THCState* THCState_alloc(void)
{
  THCState* state = (THCState*) malloc(sizeof(THCState));
  memset(state, 0, sizeof(THCState));
  return state;
}

void THCState_free(THCState* state)
{
  free(state);
}

static cudaError_t cudaMallocWrapper(void* ctx, void** devPtr, size_t size, cudaStream_t stream)
{
  return cudaMalloc(devPtr, size);
}

static cudaError_t cudaFreeWrapper(void* ctx, void* devPtr)
{
  return cudaFree(devPtr);
}

static THCDeviceAllocator defaultDeviceAllocator = {
  &cudaMallocWrapper,
  NULL,
  &cudaFreeWrapper,
  NULL,
  NULL,
  NULL
};
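
/* The default allocator above simply forwards to cudaMalloc/cudaFree. A
 * caller can substitute its own allocator (e.g. a caching allocator) before
 * THCudaInit runs, since THCudaInit only installs the default when the field
 * is still NULL. Illustrative sketch, assuming a user-provided
 * myCachingAllocator object that fills in the THCDeviceAllocator callbacks:
 *
 *   THCState* state = THCState_alloc();
 *   THCState_setDeviceAllocator(state, &myCachingAllocator);
 *   THCudaInit(state);
 */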

void THCudaInit(THCState* state)
{
  if (!state->cudaDeviceAllocator) {
    state->cudaDeviceAllocator = &defaultDeviceAllocator;
  }
  if (!state->cudaHostAllocator) {
    state->cudaHostAllocator = &THCudaHostAllocator;
  }
  if (!state->cudaUVAAllocator) {
    state->cudaUVAAllocator = &THCUVAAllocator;
  }

  int numDevices = 0;
  THCudaCheck(cudaGetDeviceCount(&numDevices));
  state->numDevices = numDevices;

  int device = 0;
  THCudaCheck(cudaGetDevice(&device));

  /* Start in the default stream on the current device */
  state->currentStreams = (THCThreadLocal*) malloc(numDevices * sizeof(THCThreadLocal));
  for (int i = 0; i < numDevices; ++i) {
    state->currentStreams[i] = THCThreadLocal_alloc();
  }
  state->currentPerDeviceBlasHandle = THCThreadLocal_alloc();

  state->resourcesPerDevice = (THCCudaResourcesPerDevice*)
    malloc(numDevices * sizeof(THCCudaResourcesPerDevice));
  memset(state->resourcesPerDevice, 0, numDevices * sizeof(THCCudaResourcesPerDevice));

  state->deviceProperties =
    (struct cudaDeviceProp*) malloc(numDevices * sizeof(struct cudaDeviceProp));

  state->rngState = (THCRNGState*) malloc(sizeof(THCRNGState));
  THCRandom_init(state, numDevices, device);

  // By default, all direct p2p kernel access (besides copy) is disallowed,
  // since direct access without knowing whether or not a certain operation
  // should be cross-GPU leads to synchronization errors. The user can choose
  // to enable this functionality, however.
  state->p2pKernelAccessEnabled = 0;

  // p2pAccessEnabled records if p2p copies are allowed between pairs of
  // devices. Values include "1" (copy allowed), "0" (copy not allowed), and
  // "-1" (unknown).
  state->p2pAccessEnabled = (int**) malloc(sizeof(int*) * numDevices);
  for (int i = 0; i < numDevices; ++i) {
    state->p2pAccessEnabled[i] = (int*) malloc(sizeof(int) * numDevices);
    memset(state->p2pAccessEnabled[i], -1, sizeof(int) * numDevices);
    state->p2pAccessEnabled[i][i] = 1;
  }

  for (int i = 0; i < numDevices; ++i) {
    THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, i);
    THCudaCheck(cudaSetDevice(i));
    THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[i], i));

    // Allocate space for the NULL stream
    res->streams = (THCStream**) malloc(sizeof(THCStream*));
    res->streams[0] = NULL;

    /* The scratch space that we want to have available per device is based
       on the number of SMs available on that device. We guarantee a minimum
       of 128 KB of space per device, but to future-proof against
       architectures that may have huge numbers of SMs, we also guarantee at
       least 16 bytes per SM. */
    int numSM = state->deviceProperties[i].multiProcessorCount;
    size_t sizePerStream =
      MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE >= numSM * MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM ?
      MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE :
      numSM * MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM;
    res->scratchSpacePerStream = sizePerStream;
  }

  /* Restore to previous device */
  THCudaCheck(cudaSetDevice(device));

  // Unlike CUDA streams, there is no NULL cuBLAS handle. The default THC
  // cuBLAS handle is the first user BLAS handle. Note that the actual BLAS
  // handles are created lazily.
  state->numUserBlasHandles = 1;

  state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically
  state->heapDelta = 0;
}
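
/* Typical lifecycle, as a sketch: a THCState is allocated, initialized, used
 * by the library, then torn down in the reverse order.
 *
 *   THCState* state = THCState_alloc();
 *   THCudaInit(state);
 *   ... run CUDA tensor work through `state` ...
 *   THCudaShutdown(state);
 *   THCState_free(state);
 */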

void THCudaShutdown(THCState* state)
{
  THCRandom_shutdown(state);

  free(state->rngState);
  free(state->deviceProperties);

  int deviceCount = 0;
  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));
  THCudaCheck(cudaGetDeviceCount(&deviceCount));

  /* cleanup p2p access state */
  for (int dev = 0; dev < deviceCount; ++dev) {
    free(state->p2pAccessEnabled[dev]);
  }
  free(state->p2pAccessEnabled);

  /* cleanup per-device state */
  for (int dev = 0; dev < deviceCount; ++dev) {
    THCudaCheck(cudaSetDevice(dev));
    THCCudaResourcesPerDevice* res = &(state->resourcesPerDevice[dev]);
    /* Free user reserved streams (0 is the default stream) */
    for (int i = 1; i <= state->numUserStreams; ++i) {
      THCStream_free(res->streams[i]);
    }
    /* Free user defined BLAS handles */
    for (int i = 0; i < res->numBlasHandles; ++i) {
      THCublasCheck(cublasDestroy(res->blasHandles[i]));
    }
    /* Free per-stream scratch space; starts at 0 because there is space for
       the default stream as well */
    if (res->devScratchSpacePerStream) {
      for (int stream = 0; stream <= state->numUserStreams; ++stream) {
        THCudaCheck(THCudaFree(state, res->devScratchSpacePerStream[stream]));
      }
    }

    free(res->streams);
    free(res->blasHandles);
    free(res->devScratchSpacePerStream);
    THCStream_free((THCStream*) THCThreadLocal_get(state->currentStreams[dev]));
    THCThreadLocal_free(state->currentStreams[dev]);
  }
  free(state->resourcesPerDevice);
  if (state->cudaDeviceAllocator->emptyCache) {
    state->cudaDeviceAllocator->emptyCache(state->cudaDeviceAllocator->state);
  }
  if (state->cudaHostAllocator == &THCCachingHostAllocator) {
    THCCachingHostAllocator_emptyCache();
  }
  free(state->currentStreams);
  THCThreadLocal_free(state->currentPerDeviceBlasHandle);

  THCudaCheck(cudaSetDevice(prevDev));
}

int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess)
{
  if (dev < 0 || dev >= state->numDevices) {
    THError("%d is not a device", dev);
  }
  if (devToAccess < 0 || devToAccess >= state->numDevices) {
    THError("%d is not a device", devToAccess);
  }
  if (state->p2pAccessEnabled[dev][devToAccess] == -1) {
    int prevDev = 0;
    THCudaCheck(cudaGetDevice(&prevDev));
    THCudaCheck(cudaSetDevice(dev));

    int access = 0;
    THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));
    if (access) {
      cudaError_t err = cudaDeviceEnablePeerAccess(devToAccess, 0);
      if (err == cudaErrorPeerAccessAlreadyEnabled) {
        // ignore and clear the error if access was already enabled
        cudaGetLastError();
      } else {
        THCudaCheck(err);
      }
      state->p2pAccessEnabled[dev][devToAccess] = 1;
    } else {
      state->p2pAccessEnabled[dev][devToAccess] = 0;
    }

    THCudaCheck(cudaSetDevice(prevDev));
  }
  return state->p2pAccessEnabled[dev][devToAccess];
}

void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess,
                                  int enable)
{
  /* This will perform device bounds checking for us */
  int prevEnabled = THCState_getPeerToPeerAccess(state, dev, devToAccess);

  if (enable != prevEnabled) {
    /* If we're attempting to enable p2p access but p2p access isn't
       supported, throw an error */
    if (enable) {
      int access = 0;
      THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));

      if (!access) {
        THError("p2p access not supported for %d accessing %d",
                dev, devToAccess);
      }
    }

    state->p2pAccessEnabled[dev][devToAccess] = enable;

    int prevDev = 0;
    THCudaCheck(cudaGetDevice(&prevDev));
    THCudaCheck(cudaSetDevice(dev));

    /* This should be in sync with the current access state */
    if (enable) {
      THCudaCheck(cudaDeviceEnablePeerAccess(devToAccess, 0));
    } else {
      THCudaCheck(cudaDeviceDisablePeerAccess(devToAccess));
    }

    THCudaCheck(cudaSetDevice(prevDev));
  }
}
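
/* Usage sketch, with hypothetical device indices 0 and 1: query the cached
 * access matrix (the first query probes the driver and enables access where
 * possible), or force a particular setting explicitly.
 *
 *   if (THCState_getPeerToPeerAccess(state, 0, 1)) {
 *     // device 0 may issue p2p copies into device 1's memory
 *   }
 *   THCState_setPeerToPeerAccess(state, 0, 1, 0);  // explicitly disallow
 */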

int THCState_getKernelPeerToPeerAccessEnabled(THCState* state) {
  return state->p2pKernelAccessEnabled;
}

void THCState_setKernelPeerToPeerAccessEnabled(THCState* state, int val) {
  state->p2pKernelAccessEnabled = val;
}

struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* state)
{
  int curDev = -1;
  THCudaCheck(cudaGetDevice(&curDev));

  return &(state->deviceProperties[curDev]);
}

struct THCRNGState* THCState_getRngState(THCState *state)
{
  return state->rngState;
}

THAllocator* THCState_getCudaHostAllocator(THCState* state)
{
  return state->cudaHostAllocator;
}

THAllocator* THCState_getCudaUVAAllocator(THCState* state)
{
  return state->cudaUVAAllocator;
}

THC_API THCDeviceAllocator* THCState_getDeviceAllocator(THCState* state)
{
  return state->cudaDeviceAllocator;
}

void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator)
{
  state->cudaDeviceAllocator = allocator;
}

int THCState_getNumDevices(THCState *state)
{
  return state->numDevices;
}

static void THCState_initializeScratchSpace(THCState* state, int dev)
{
  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
  if (res->devScratchSpacePerStream) {
    return;
  }
  size_t size = (state->numUserStreams + 1) * sizeof(void*);
  void** scratch = (void**) malloc(size);
  for (int i = 0; i <= state->numUserStreams; ++i) {
    THCudaCheck(THCudaMalloc(state, &scratch[i], res->scratchSpacePerStream));
  }
  res->devScratchSpacePerStream = scratch;
}

void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking)
{
  if (numStreams <= state->numUserStreams)
  {
    return;
  }

  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  /* Otherwise, we have to allocate a new set of streams and stream data */
  for (int dev = 0; dev < state->numDevices; ++dev) {
    THCudaCheck(cudaSetDevice(dev));
    THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);

    /* +1 for the default stream as well */
    THCStream** newStreams = (THCStream**) realloc(res->streams, (numStreams + 1) * sizeof(THCStream*));
    THAssert(newStreams);

    THCState_initializeScratchSpace(state, dev);
    void** newScratchSpace = (void**) realloc(res->devScratchSpacePerStream, (numStreams + 1) * sizeof(void*));
    THAssert(newScratchSpace);

    /* Allocate new stream resources */
    size_t scratchSpaceSize = THCState_getDeviceScratchSpaceSize(state, dev);
    unsigned int flags =
      nonBlocking ? cudaStreamNonBlocking : cudaStreamDefault;

    for (int stream = state->numUserStreams + 1; stream <= numStreams; ++stream) {
      newStreams[stream] = THCStream_new(flags);
      newScratchSpace[stream] = NULL;
      THCudaCheck(THCudaMalloc(state, &newScratchSpace[stream], scratchSpaceSize));
    }

    res->streams = newStreams;
    res->devScratchSpacePerStream = newScratchSpace;
  }

  state->numUserStreams = numStreams;

  THCudaCheck(cudaSetDevice(prevDev));
}
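
/* Usage sketch: reserve a pool of non-blocking user streams on every device,
 * then route subsequent work to one of them by index (index 0 remains the
 * default stream). The count 4 below is just an example value.
 *
 *   THCState_reserveStreams(state, 4, 1);
 *   THCState_setCurrentStreamIndex(state, 2);
 */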

void THCState_reserveDeviceBlasHandles(THCState* state, int device, int numBlasHandles)
{
  int prevDev = -1;
  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
  if (numBlasHandles <= res->numBlasHandles) {
    return;
  }

  THCudaCheck(cudaGetDevice(&prevDev));
  THCudaCheck(cudaSetDevice(device));

  size_t size = numBlasHandles * sizeof(cublasHandle_t);
  cublasHandle_t* handles = (cublasHandle_t*) realloc(res->blasHandles, size);
  for (int i = res->numBlasHandles; i < numBlasHandles; ++i) {
    handles[i] = NULL;
    THCublasCheck(cublasCreate(&handles[i]));
  }
  res->blasHandles = handles;
  res->numBlasHandles = numBlasHandles;

  THCudaCheck(cudaSetDevice(prevDev));
}

void THCState_reserveBlasHandles(THCState* state, int numBlasHandles)
{
  // cuBLAS handles are created lazily from THCState_getDeviceBlasHandle
  // to avoid initializing unused devices
  if (numBlasHandles > state->numUserBlasHandles)
  {
    state->numUserBlasHandles = numBlasHandles;
  }
}

int THCState_getNumStreams(THCState* state)
{
  return state->numUserStreams;
}

int THCState_getNumBlasHandles(THCState* state)
{
  return state->numUserBlasHandles;
}

THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
  THCState *state, int device)
{
  /* `device` is a CUDA index */
  if (device >= state->numDevices || device < 0)
  {
    THError("%d is not a device", device + 1 /* back to Torch index */);
  }

  return &(state->resourcesPerDevice[device]);
}

cudaStream_t THCState_getDeviceStream(THCState *state, int device, int streamIndex)
{
  if (streamIndex > state->numUserStreams || streamIndex < 0)
  {
    THError("%d is not a stream", streamIndex);
  }
  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
  THCStream* stream = res->streams[streamIndex];
  return stream ? stream->stream : NULL;
}

cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle)
{
  if (handle <= 0 || handle > state->numUserBlasHandles) {
    THError("%d is not a valid handle, valid range is: (1, %d)",
            handle, state->numUserBlasHandles);
  }
  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
  THCState_reserveDeviceBlasHandles(state, device, handle);
  return res->blasHandles[handle - 1];
}
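
/* Usage sketch: BLAS handle indices are 1-based. Reserving handles only bumps
 * a count; the cublasHandle_t objects themselves are created on first use by
 * THCState_getDeviceBlasHandle. The count 2 below is just an example value.
 *
 *   THCState_reserveBlasHandles(state, 2);
 *   THCState_setCurrentBlasHandleIndex(state, 2);
 *   cublasHandle_t h = THCState_getCurrentBlasHandle(state);
 */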

static THCStream* THCState_getStreamOnDevice(THCState* state, int device)
{
  return (THCStream*) THCThreadLocal_get(state->currentStreams[device]);
}

static void THCState_setStreamOnDevice(THCState *state, int device, THCStream *stream)
{
  if (stream) {
    if (stream->device != device) {
      THError("invalid stream; expected stream for device %d, but was on %d",
              device, stream->device);
    }
    THCStream_retain(stream);
  }
  THCThreadLocal local = state->currentStreams[device];
  THCStream_free((THCStream*) THCThreadLocal_get(local));
  THCThreadLocal_set(local, stream);
}

cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device)
{
  THCStream* stream = THCState_getStreamOnDevice(state, device);
  return stream ? stream->stream : NULL;
}

cudaStream_t THCState_getCurrentStream(THCState *state)
{
  /* This is called at the point of kernel execution.
     For some debugging code or improperly instrumented kernels,
     `state` is null */
  if (state) {
    int device;
    THCudaCheck(cudaGetDevice(&device));
    return THCState_getCurrentStreamOnDevice(state, device);
  } else {
    /* assume default stream */
    return NULL;
  }
}

cublasHandle_t THCState_getCurrentBlasHandle(THCState *state)
{
  /* This is called at the point of kernel execution.
     For some debugging code or improperly instrumented kernels,
     `state` is null */
  if (state) {
    int device;
    THCudaCheck(cudaGetDevice(&device));

    int handle = THCState_getCurrentBlasHandleIndex(state);
    return THCState_getDeviceBlasHandle(state, device, handle);
  }
  THError("THCState and blasHandles must be set as there is no default blasHandle");
  return NULL;
}

int THCState_getCurrentStreamIndex(THCState *state)
{
  THCStream* stream = THCState_getStream(state);
  if (!stream) {
    return 0;
  }

  int device;
  THCudaCheck(cudaGetDevice(&device));
  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
  for (int i = 0; i <= state->numUserStreams; ++i) {
    if (res->streams[i] == stream) {
      return i;
    }
  }

  return -1;
}

int THCState_getCurrentBlasHandleIndex(THCState *state)
{
  void* value = THCThreadLocal_get(state->currentPerDeviceBlasHandle);
  if (value == NULL) {
    return 1;
  }
  return (int) (intptr_t) value;
}

THCStream* THCState_getStream(THCState *state)
{
  int device;
  THCudaCheck(cudaGetDevice(&device));
  return THCState_getStreamOnDevice(state, device);
}

void THCState_setStream(THCState *state, THCStream *stream)
{
  int device;
  THCudaCheck(cudaGetDevice(&device));
  THCState_setStreamOnDevice(state, device, stream);
}
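
/* Usage sketch: install a freshly created stream as the current stream on the
 * active device. THCState_setStream retains the stream, so the caller can
 * drop its own reference afterwards.
 *
 *   THCStream* s = THCStream_new(cudaStreamNonBlocking);
 *   THCState_setStream(state, s);
 *   THCStream_free(s);
 */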

void THCState_setCurrentStreamIndex(THCState *state, int streamIndex)
{
  if (streamIndex < 0 || streamIndex > state->numUserStreams) {
    THError("%d is not a valid stream, valid range is: (0, %d)", streamIndex,
            state->numUserStreams);
  }

  int device;
  for (device = 0; device < state->numDevices; ++device) {
    THCStream* stream = NULL;
    if (streamIndex != 0) {
      THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
      stream = res->streams[streamIndex];
    }

    THCState_setStreamOnDevice(state, device, stream);
  }
}

void THCState_setCurrentBlasHandleIndex(THCState *state, int handle)
{
  if (handle > state->numUserBlasHandles || handle <= 0)
  {
    THError("%d is not a valid handle, valid range is: (1, %d)",
            handle, state->numUserBlasHandles);
  }
  THCThreadLocal_set(state->currentPerDeviceBlasHandle, (void*)(intptr_t)handle);
}

void* THCState_getCurrentDeviceScratchSpace(THCState* state)
{
  int device = -1;
  THCudaCheck(cudaGetDevice(&device));
  int stream = THCState_getCurrentStreamIndex(state);
  if (stream < 0) {
    // new stream API
    return NULL;
  }
  return THCState_getDeviceScratchSpace(state, device, stream);
}

void* THCState_getDeviceScratchSpace(THCState* state, int dev, int stream)
{
  THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
  if (stream > state->numUserStreams || stream < 0) {
    THError("%d is not a stream", stream);
  }
  THCState_initializeScratchSpace(state, dev);
  return res->devScratchSpacePerStream[stream];
}

size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state)
{
  int device = -1;
  THCudaCheck(cudaGetDevice(&device));
  return THCState_getDeviceScratchSpaceSize(state, device);
}

size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device)
{
  THCCudaResourcesPerDevice* res =
    THCState_getDeviceResourcePtr(state, device);

  return res->scratchSpacePerStream;
}
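
/* Usage sketch: kernels that need a small per-stream staging buffer (for
 * example, intermediate results of a multi-block reduction) can grab the
 * scratch region bound to the current device and stream instead of
 * allocating their own.
 *
 *   size_t bytes = THCState_getCurrentDeviceScratchSpaceSize(state);
 *   void* scratch = THCState_getCurrentDeviceScratchSpace(state);
 *   // launch a kernel that writes at most `bytes` bytes into `scratch`
 */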

void __THCudaCheck(cudaError_t err, const char *file, const int line)
{
  if (err != cudaSuccess)
  {
    static int alreadyFailed = 0;
    if (!alreadyFailed) {
      fprintf(stderr, "THCudaCheck FAIL file=%s line=%i error=%i : %s\n", file, line, err, cudaGetErrorString(err));
      alreadyFailed = 1;
    }
    _THError(file, line, "cuda runtime error (%d) : %s", err,
             cudaGetErrorString(err));
  }
}

void __THCudaCheckWarn(cudaError_t err, const char *file, const int line)
{
  if (err != cudaSuccess)
  {
    fprintf(stderr, "THCudaCheckWarn FAIL file=%s line=%i error=%i : %s\n", file, line, err, cudaGetErrorString(err));
  }
}

void __THCublasCheck(cublasStatus_t status, const char *file, const int line)
{
  if (status != CUBLAS_STATUS_SUCCESS)
  {
    const char* errmsg = NULL;

    switch (status)
    {
      case CUBLAS_STATUS_NOT_INITIALIZED:
        errmsg = "library not initialized";
        break;

      case CUBLAS_STATUS_ALLOC_FAILED:
        errmsg = "resource allocation failed";
        break;

      case CUBLAS_STATUS_INVALID_VALUE:
        errmsg = "an invalid numeric value was used as an argument";
        break;

      case CUBLAS_STATUS_ARCH_MISMATCH:
        errmsg = "an absent device architectural feature is required";
        break;

      case CUBLAS_STATUS_MAPPING_ERROR:
        errmsg = "an access to GPU memory space failed";
        break;

      case CUBLAS_STATUS_EXECUTION_FAILED:
        errmsg = "the GPU program failed to execute";
        break;

      case CUBLAS_STATUS_INTERNAL_ERROR:
        errmsg = "an internal operation failed";
        break;

      default:
        errmsg = "unknown error";
        break;
    }

    _THError(file, line, "cublas runtime error : %s", errmsg);
  }
}

static ptrdiff_t heapSize = 0; // not thread-local
static const ptrdiff_t heapMaxDelta = (ptrdiff_t)1e6;
static const ptrdiff_t heapMinDelta = (ptrdiff_t)-1e6;
static const double heapSoftmaxGrowthThresh = 0.8; // grow softmax if >80% max after GC
static const double heapSoftmaxGrowthFactor = 1.4; // grow softmax by 40%

void THCSetGCHandler(THCState *state, void (*cutorchGCFunction_)(void *data), void *data)
{
  state->cutorchGCFunction = cutorchGCFunction_;
  state->cutorchGCData = data;
}

cudaError_t THCudaMalloc(THCState *state, void** ptr, size_t size)
{
  THCudaCheck(cudaGetLastError());
  cudaStream_t stream = THCState_getCurrentStream(state);
  THCDeviceAllocator* allocator = state->cudaDeviceAllocator;
  cudaError_t err = allocator->malloc(allocator->state, ptr, size, stream);
  if (state->cutorchGCFunction != NULL && err != cudaSuccess) {
    cudaGetLastError(); // reset OOM error
    (state->cutorchGCFunction)(state->cutorchGCData);
    err = allocator->malloc(allocator->state, ptr, size, stream);
  }
  return err;
}
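
/* Behavior sketch: when a GC hook has been registered, a failed allocation is
 * retried once after invoking the hook, which gives the caller a chance to
 * release cached or unreferenced storages. `myCollectGarbage`,
 * `myCallbackData`, and `nbytes` below are hypothetical names.
 *
 *   THCSetGCHandler(state, &myCollectGarbage, myCallbackData);
 *   void* devPtr = NULL;
 *   THCudaCheck(THCudaMalloc(state, &devPtr, nbytes));
 */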

cudaError_t THCudaFree(THCState *state, void *ptr)
{
  THCDeviceAllocator* allocator = state->cudaDeviceAllocator;
  return allocator->free(allocator->state, ptr);
}

cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes)
{
  size_t cachedBytes = 0;
  size_t largestBlock = 0;
  THCDeviceAllocator* allocator = state->cudaDeviceAllocator;

  /* get info from CUDA first */
  cudaError_t ret = cudaMemGetInfo(freeBytes, totalBytes);
  if (ret != cudaSuccess)
    return ret;

  int device;
  ret = cudaGetDevice(&device);
  if (ret != cudaSuccess)
    return ret;

  /* not always true - our optimistic guess here */
  largestBlock = *freeBytes;

  if (allocator->cacheInfo != NULL)
    allocator->cacheInfo(allocator->state, device, &cachedBytes, &largestBlock);

  /* Adjust resulting free bytes number. largestBlock is unused for now */
  *freeBytes += cachedBytes;
  return cudaSuccess;
}

static ptrdiff_t applyHeapDelta(THCState *state) {
  ptrdiff_t newHeapSize = THAtomicAddPtrdiff(&heapSize, state->heapDelta) + state->heapDelta;
  state->heapDelta = 0;
  return newHeapSize;
}

// Here we maintain a dynamic softmax threshold for THC-allocated storages.
// When THC heap size goes above this softmax, the GC hook is triggered.
// If heap size is above 80% of the softmax after GC, then the softmax is
// increased.
static void maybeTriggerGC(THCState *state, ptrdiff_t curHeapSize) {
  if (state->cutorchGCFunction != NULL && curHeapSize > state->heapSoftmax) {
    (state->cutorchGCFunction)(state->cutorchGCData);

    // ensure heapSize is accurate before updating heapSoftmax
    ptrdiff_t newHeapSize = applyHeapDelta(state);

    if (newHeapSize > state->heapSoftmax * heapSoftmaxGrowthThresh) {
      state->heapSoftmax = (ptrdiff_t)(state->heapSoftmax * heapSoftmaxGrowthFactor);
    }
  }
}

void THCHeapUpdate(THCState *state, ptrdiff_t size) {
  state->heapDelta += size;
  // batch updates to global heapSize to minimize thread contention
  if (state->heapDelta < heapMaxDelta && state->heapDelta > heapMinDelta) {
    return;
  }

  ptrdiff_t newHeapSize = applyHeapDelta(state);
  if (size > 0) {
    maybeTriggerGC(state, newHeapSize);
  }
}
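
/* Worked example of the thresholds above (numbers are just the constants
 * defined in this file): a thread accumulates heap deltas locally and only
 * folds them into the global counter once the local delta leaves the
 * (-1e6, 1e6) byte window, keeping atomic traffic low. The softmax starts at
 * 3e8 bytes (~300 MB); if, after the GC hook runs, the heap is still above
 * 80% of the softmax, the softmax grows by 40% (e.g. 3e8 -> 4.2e8 bytes). */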

#undef MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM
#undef MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE

#include "THCStorage.c"
#include "THCAllocator.c"