#include "THCGeneral.h"
#include "TH.h"
#include "THCAllocator.h"
#include "THCCachingHostAllocator.h"
#include "THCStream.h"
#include "THCThreadLocal.h"
#include "THCTensorRandom.h"
#include <stdlib.h>
#include <stdint.h>
/* Size of scratch space available in global memory per SM per stream */
#define MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM (4 * sizeof(float))
/* Minimum amount of scratch space per device. Total scratch memory per
 * device is either this amount or the number of SMs times the per-SM
 * space defined above, whichever is greater. */
#define MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE (32768 * sizeof(float))
THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
THCState *state, int device);
THCState* THCState_alloc(void)
{
THCState* state = (THCState*) malloc(sizeof(THCState));
memset(state, 0, sizeof(THCState));
return state;
}
void THCState_free(THCState* state)
{
free(state);
}
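/* Usage sketch (illustrative; assumes a linked cutorch build, not part of
   this file): pair THCState_alloc/THCudaInit with
   THCudaShutdown/THCState_free.

     THCState* state = THCState_alloc();
     THCudaInit(state);
     // ... run work through the THCState_* accessors below ...
     THCudaShutdown(state);
     THCState_free(state);
*/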
static cudaError_t cudaMallocWrapper(void* ctx, void** devPtr, size_t size, cudaStream_t stream)
{
return cudaMalloc(devPtr, size);
}
static cudaError_t cudaFreeWrapper(void* ctx, void* devPtr)
{
return cudaFree(devPtr);
}
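/* Default device allocator: thin pass-throughs to cudaMalloc/cudaFree.
   Field order is assumed from THCDeviceAllocator in THCGeneral.h:
   malloc, realloc, free, emptyCache, cacheInfo, opaque state pointer. */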
static THCDeviceAllocator defaultDeviceAllocator = {
&cudaMallocWrapper,
NULL,
&cudaFreeWrapper,
NULL,
NULL,
NULL
};
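/* Process-wide initialization: installs default allocators, enumerates
   devices, allocates the per-device resource and property tables, seeds
   the RNG, builds the p2p access cache, and sizes per-stream scratch
   space for each device. */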
void THCudaInit(THCState* state)
{
if (!state->cudaDeviceAllocator) {
state->cudaDeviceAllocator = &defaultDeviceAllocator;
}
if (!state->cudaHostAllocator) {
state->cudaHostAllocator = &THCudaHostAllocator;
}
if (!state->cudaUVAAllocator) {
state->cudaUVAAllocator = &THCUVAAllocator;
}
int numDevices = 0;
THCudaCheck(cudaGetDeviceCount(&numDevices));
state->numDevices = numDevices;
int device = 0;
THCudaCheck(cudaGetDevice(&device));
/* Start in the default stream on the current device */
state->currentStreams = (THCThreadLocal*) malloc(numDevices * sizeof(THCThreadLocal));
for (int i = 0; i < numDevices; ++i) {
state->currentStreams[i] = THCThreadLocal_alloc();
}
state->currentPerDeviceBlasHandle = THCThreadLocal_alloc();
state->resourcesPerDevice = (THCCudaResourcesPerDevice*)
malloc(numDevices * sizeof(THCCudaResourcesPerDevice));
memset(state->resourcesPerDevice, 0, numDevices * sizeof(THCCudaResourcesPerDevice));
state->deviceProperties =
(struct cudaDeviceProp*)malloc(numDevices * sizeof(struct cudaDeviceProp));
state->rngState = (THCRNGState*)malloc(sizeof(THCRNGState));
THCRandom_init(state, numDevices, device);
// By default, all direct p2p kernel access (besides copy) is disallowed,
// since direct access without knowing whether or not a certain operation
// should be cross-GPU leads to synchronization errors. The user can,
// however, choose to enable this functionality.
state->p2pKernelAccessEnabled = 0;
// p2pAccessEnabled records if p2p copies are allowed between pairs of
// devices. Values include "1" (copy allowed), "0" (copy not allowed), and
// "-1" (unknown).
state->p2pAccessEnabled = (int**) malloc(sizeof(int*) * numDevices);
for (int i = 0; i < numDevices; ++i) {
state->p2pAccessEnabled[i] = (int*) malloc(sizeof(int) * numDevices);
memset(state->p2pAccessEnabled[i], -1, sizeof(int) * numDevices);
state->p2pAccessEnabled[i][i] = 1;
}
for (int i = 0; i < numDevices; ++i) {
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, i);
THCudaCheck(cudaSetDevice(i));
THCudaCheck(cudaGetDeviceProperties(&state->deviceProperties[i], i));
// Allocate space for the NULL stream
res->streams = (THCStream**) malloc(sizeof(THCStream*));
res->streams[0] = NULL;
/* The scratch space that we want to have available per device is based
   on the number of SMs on that device. We guarantee a minimum of 128 KB
   of space per device, but to future-proof against architectures that
   may have huge numbers of SMs, we also guarantee at least 16 bytes of
   scratch space per SM. */
int numSM = state->deviceProperties[i].multiProcessorCount;
size_t sizePerStream =
MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE >= numSM * MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM ?
MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE :
numSM * MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM;
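/* Example (illustrative numbers): a device with 56 SMs needs
   56 * 16 B = 896 B, far below the 128 KB floor, so sizePerStream is
   131072 bytes; the per-SM term only dominates past 8192 SMs. */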
res->scratchSpacePerStream = sizePerStream;
}
/* Restore to previous device */
THCudaCheck(cudaSetDevice(device));
// Unlike CUDA streams, there is no NULL cuBLAS handle. The default THC
// cuBLAS handle is the first user BLAS handle. Note that the actual BLAS
// handles are created lazily.
state->numUserBlasHandles = 1;
state->heapSoftmax = 3e8; // 300MB, adjusted upward dynamically
state->heapDelta = 0;
}
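/* Reverses THCudaInit and the reserve* calls: frees RNG state, p2p
   tables, user streams, cuBLAS handles, scratch space, and empties the
   device/host allocator caches. */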
void THCudaShutdown(THCState* state)
{
THCRandom_shutdown(state);
free(state->rngState);
free(state->deviceProperties);
int deviceCount = 0;
int prevDev = -1;
THCudaCheck(cudaGetDevice(&prevDev));
THCudaCheck(cudaGetDeviceCount(&deviceCount));
/* cleanup p2p access state */
for (int dev = 0; dev < deviceCount; ++dev) {
free(state->p2pAccessEnabled[dev]);
}
free(state->p2pAccessEnabled);
/* cleanup per-device state */
for (int dev = 0; dev < deviceCount; ++dev) {
THCudaCheck(cudaSetDevice(dev));
THCCudaResourcesPerDevice* res = &(state->resourcesPerDevice[dev]);
/* Free user reserved streams (0 is the default stream) */
for (int i = 1; i <= state->numUserStreams; ++i) {
THCStream_free(res->streams[i]);
}
/* Free user defined BLAS handles */
for (int i = 0; i < res->numBlasHandles; ++i) {
THCublasCheck(cublasDestroy(res->blasHandles[i]));
}
/* Free per-stream scratch space; the loop starts at 0 because the
   default stream has scratch space as well */
if (res->devScratchSpacePerStream) {
for (int stream = 0; stream <= state->numUserStreams; ++stream) {
THCudaCheck(THCudaFree(state, res->devScratchSpacePerStream[stream]));
}
}
free(res->streams);
free(res->blasHandles);
free(res->devScratchSpacePerStream);
THCStream_free((THCStream*)THCThreadLocal_get(state->currentStreams[dev]));
THCThreadLocal_free(state->currentStreams[dev]);
}
free(state->resourcesPerDevice);
if (state->cudaDeviceAllocator->emptyCache) {
state->cudaDeviceAllocator->emptyCache(state->cudaDeviceAllocator->state);
}
if (state->cudaHostAllocator == &THCCachingHostAllocator) {
THCCachingHostAllocator_emptyCache();
}
free(state->currentStreams);
THCThreadLocal_free(state->currentPerDeviceBlasHandle);
THCudaCheck(cudaSetDevice(prevDev));
}
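/* Returns whether `dev` can copy directly to `devToAccess`, querying and
   caching the answer on first use; a successful query also enables the
   CUDA peer mapping for that pair. */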
int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess)
{
if (dev < 0 || dev >= state->numDevices) {
THError("%d is not a device", dev);
}
if (devToAccess < 0 || devToAccess >= state->numDevices) {
THError("%d is not a device", devToAccess);
}
if (state->p2pAccessEnabled[dev][devToAccess] == -1) {
int prevDev = 0;
THCudaCheck(cudaGetDevice(&prevDev));
THCudaCheck(cudaSetDevice(dev));
int access = 0;
THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));
if (access) {
cudaError_t err = cudaDeviceEnablePeerAccess(devToAccess, 0);
if (err == cudaErrorPeerAccessAlreadyEnabled) {
// ignore and clear the error if access was already enabled
cudaGetLastError();
} else {
THCudaCheck(err);
}
state->p2pAccessEnabled[dev][devToAccess] = 1;
} else {
state->p2pAccessEnabled[dev][devToAccess] = 0;
}
THCudaCheck(cudaSetDevice(prevDev));
}
return state->p2pAccessEnabled[dev][devToAccess];
}
void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess,
int enable)
{
/* This will perform device bounds checking for us */
int prevEnabled = THCState_getPeerToPeerAccess(state, dev, devToAccess);
if (enable != prevEnabled) {
/* If we're attempting to enable p2p access but it isn't supported,
   throw an error */
if (enable) {
int access = 0;
THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));
if (!access) {
THError("p2p access not supported for %d accessing %d",
dev, devToAccess);
}
}
state->p2pAccessEnabled[dev][devToAccess] = enable;
int prevDev = 0;
THCudaCheck(cudaGetDevice(&prevDev));
THCudaCheck(cudaSetDevice(dev));
/* This should be in sync with the current access state */
if (enable) {
THCudaCheck(cudaDeviceEnablePeerAccess(devToAccess, 0));
} else {
THCudaCheck(cudaDeviceDisablePeerAccess(devToAccess));
}
THCudaCheck(cudaSetDevice(prevDev));
}
}
int THCState_getKernelPeerToPeerAccessEnabled(THCState* state) {
return state->p2pKernelAccessEnabled;
}
void THCState_setKernelPeerToPeerAccessEnabled(THCState* state, int val) {
state->p2pKernelAccessEnabled = val;
}
struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* state)
{
int curDev = -1;
THCudaCheck(cudaGetDevice(&curDev));
return &(state->deviceProperties[curDev]);
}
struct THCRNGState* THCState_getRngState(THCState *state)
{
return state->rngState;
}
THAllocator* THCState_getCudaHostAllocator(THCState* state)
{
return state->cudaHostAllocator;
}
THAllocator* THCState_getCudaUVAAllocator(THCState* state)
{
return state->cudaUVAAllocator;
}
THC_API THCDeviceAllocator* THCState_getDeviceAllocator(THCState* state)
{
return state->cudaDeviceAllocator;
}
void THCState_setDeviceAllocator(THCState* state, THCDeviceAllocator* allocator)
{
state->cudaDeviceAllocator = allocator;
}
int THCState_getNumDevices(THCState *state)
{
return state->numDevices;
}
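/* Lazily allocates one scratch buffer per stream (index 0 covers the
   default stream) on device `dev`; no-op if already initialized. */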
static void THCState_initializeScratchSpace(THCState* state, int dev)
{
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
if (res->devScratchSpacePerStream) {
return;
}
size_t size = (state->numUserStreams + 1) * sizeof(void*);
void** scratch = (void**)malloc(size);
for (int i = 0; i <= state->numUserStreams; ++i) {
THCudaCheck(THCudaMalloc(state, &scratch[i], res->scratchSpacePerStream));
}
res->devScratchSpacePerStream = scratch;
}
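/* Grows every device's stream table to numStreams user streams (plus the
   default stream at index 0), creating the new CUDA streams and their
   scratch buffers. The count never shrinks. */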
void THCState_reserveStreams(THCState* state, int numStreams, int nonBlocking)
{
if (numStreams <= state->numUserStreams)
{
return;
}
int prevDev = -1;
THCudaCheck(cudaGetDevice(&prevDev));
/* Otherwise, we have to allocate a new set of streams and stream data */
for (int dev = 0; dev < state->numDevices; ++dev) {
THCudaCheck(cudaSetDevice(dev));
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
/* +1 for the default stream as well */
THCStream** newStreams = realloc(res->streams, (numStreams + 1) * sizeof(THCStream*));
THAssert(newStreams);
THCState_initializeScratchSpace(state, dev);
void** newScratchSpace = realloc(res->devScratchSpacePerStream, (numStreams + 1) * sizeof(void*));
THAssert(newScratchSpace);
/* Allocate new stream resources */
size_t scratchSpaceSize = THCState_getDeviceScratchSpaceSize(state, dev);
unsigned int flags =
nonBlocking ? cudaStreamNonBlocking : cudaStreamDefault;
for (int stream = state->numUserStreams + 1; stream <= numStreams; ++stream) {
newStreams[stream] = THCStream_new(flags);
newScratchSpace[stream] = NULL;
THCudaCheck(THCudaMalloc(state, &newScratchSpace[stream], scratchSpaceSize));
}
res->streams = newStreams;
res->devScratchSpacePerStream = newScratchSpace;
}
state->numUserStreams = numStreams;
THCudaCheck(cudaSetDevice(prevDev));
}
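/* Usage sketch (illustrative): reserve four non-blocking user streams on
   every device, then route subsequent work through stream 2.

     THCState_reserveStreams(state, 4, 1);
     THCState_setCurrentStreamIndex(state, 2);
*/

/* Eagerly creates cuBLAS handles on `device` up to numBlasHandles;
   called lazily from THCState_getDeviceBlasHandle. Never shrinks. */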
void THCState_reserveDeviceBlasHandles(THCState* state, int device, int numBlasHandles)
{
int prevDev = -1;
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
if (numBlasHandles <= res->numBlasHandles) {
return;
}
THCudaCheck(cudaGetDevice(&prevDev));
THCudaCheck(cudaSetDevice(device));
size_t size = numBlasHandles * sizeof(cublasHandle_t);
cublasHandle_t* handles = (cublasHandle_t*) realloc(res->blasHandles, size);
for (int i = res->numBlasHandles; i < numBlasHandles; ++i) {
handles[i] = NULL;
THCublasCheck(cublasCreate(&handles[i]));
}
res->blasHandles = handles;
res->numBlasHandles = numBlasHandles;
THCudaCheck(cudaSetDevice(prevDev));
}
void THCState_reserveBlasHandles(THCState* state, int numBlasHandles)
{
// cuBLAS handles are created lazily from THCState_getDeviceBlasHandle
// to avoid initializing unused devices
if (numBlasHandles > state->numUserBlasHandles)
{
state->numUserBlasHandles = numBlasHandles;
}
}
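/* Usage sketch (illustrative): reserve two handles up front; the actual
   cublasCreate calls still happen lazily, per device, on first use via
   THCState_getDeviceBlasHandle.

     THCState_reserveBlasHandles(state, 2);
     THCState_setCurrentBlasHandleIndex(state, 2);
*/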
int THCState_getNumStreams(THCState* state)
{
return state->numUserStreams;
}
int THCState_getNumBlasHandles(THCState* state)
{
return state->numUserBlasHandles;
}
THCCudaResourcesPerDevice* THCState_getDeviceResourcePtr(
THCState *state, int device)
{
/* `device` is a CUDA index */
if (device >= state->numDevices || device < 0)
{
THError("%d is not a device", device + 1 /* back to Torch index */);
}
return &(state->resourcesPerDevice[device]);
}
cudaStream_t THCState_getDeviceStream(THCState *state, int device, int streamIndex)
{
if (streamIndex > state->numUserStreams || streamIndex < 0)
{
THError("%d is not a stream", streamIndex);
}
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
THCStream* stream = res->streams[streamIndex];
return stream ? stream->stream : NULL;
}
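/* Handle indices are 1-based (index 1 is the default THC handle); the
   underlying cublasCreate happens lazily here, so devices that are never
   used never initialize cuBLAS. */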
cublasHandle_t THCState_getDeviceBlasHandle(THCState *state, int device, int handle)
{
if (handle <= 0 || handle > state->numUserBlasHandles) {
THError("%d is not a valid handle, valid range is: (1, %d)",
handle, state->numUserBlasHandles);
}
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
THCState_reserveDeviceBlasHandles(state, device, handle);
return res->blasHandles[handle - 1];
}
static THCStream* THCState_getStreamOnDevice(THCState* state, int device)
{
return (THCStream*) THCThreadLocal_get(state->currentStreams[device]);
}
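/* Swaps the thread-local current stream for `device`: retains the new
   stream (if any), releases the old one, and stores the new pointer.
   Passing NULL selects the default stream. */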
static void THCState_setStreamOnDevice(THCState *state, int device, THCStream *stream)
{
if (stream) {
if (stream->device != device) {
THError("invalid stream; expected stream for device %d, but was on %d",
device, stream->device);
}
THCStream_retain(stream);
}
THCThreadLocal local = state->currentStreams[device];
THCStream_free((THCStream*)THCThreadLocal_get(local));
THCThreadLocal_set(local, stream);
}
cudaStream_t THCState_getCurrentStreamOnDevice(THCState *state, int device)
{
THCStream* stream = THCState_getStreamOnDevice(state, device);
return stream ? stream->stream : NULL;
}
cudaStream_t THCState_getCurrentStream(THCState *state)
{
/* This is called at the point of kernel execution.
For some debugging code or improperly instrumented kernels,
`state` is null */
if (state) {
int device;
THCudaCheck(cudaGetDevice(&device));
return THCState_getCurrentStreamOnDevice(state, device);
} else {
/* assume default stream */
return NULL;
}
}
cublasHandle_t THCState_getCurrentBlasHandle(THCState *state)
{
/* This is called at the point of kernel execution.
For some debugging code or improperly instrumented kernels,
`state` is null */
if (state) {
int device;
THCudaCheck(cudaGetDevice(&device));
int handle = THCState_getCurrentBlasHandleIndex(state);
return THCState_getDeviceBlasHandle(state, device, handle);
}
THError("THCState and blasHandles must be set as there is no default blasHandle");
return NULL;
}
int THCState_getCurrentStreamIndex(THCState *state)
{
THCStream* stream = THCState_getStream(state);
if (!stream) {
return 0;
}
int device;
THCudaCheck(cudaGetDevice(&device));
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
for (int i = 0; i <= state->numUserStreams; ++i) {
if (res->streams[i] == stream) {
return i;
}
}
return -1;
}
int THCState_getCurrentBlasHandleIndex(THCState *state)
{
void* value = THCThreadLocal_get(state->currentPerDeviceBlasHandle);
if (value == NULL) {
return 1;
}
return (int) (intptr_t) value;
}
THCStream* THCState_getStream(THCState *state)
{
int device;
THCudaCheck(cudaGetDevice(&device));
return THCState_getStreamOnDevice(state, device);
}
void THCState_setStream(THCState *state, THCStream *stream)
{
int device;
THCudaCheck(cudaGetDevice(&device));
THCState_setStreamOnDevice(state, device, stream);
}
void THCState_setCurrentStreamIndex(THCState *state, int streamIndex)
{
if (streamIndex < 0 || streamIndex > state->numUserStreams) {
THError("%d is not a valid stream, valid range is: (0, %d)", streamIndex,
state->numUserStreams);
}
int device;
for (device = 0; device < state->numDevices; ++device) {
THCStream* stream = NULL;
if (streamIndex != 0) {
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, device);
stream = res->streams[streamIndex];
}
THCState_setStreamOnDevice(state, device, stream);
}
}
void THCState_setCurrentBlasHandleIndex(THCState *state, int handle)
{
if (handle > state->numUserBlasHandles || handle <= 0)
{
THError("%d is not a valid handle, valid range is: (1, %d)",
handle, state->numUserBlasHandles);
}
THCThreadLocal_set(state->currentPerDeviceBlasHandle, (void*)(intptr_t)handle);
}
void* THCState_getCurrentDeviceScratchSpace(THCState* state)
{
int device = -1;
THCudaCheck(cudaGetDevice(&device));
int stream = THCState_getCurrentStreamIndex(state);
if (stream < 0) {
// new stream API
return NULL;
}
return THCState_getDeviceScratchSpace(state, device, stream);
}
void* THCState_getDeviceScratchSpace(THCState* state, int dev, int stream)
{
THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, dev);
if (stream > state->numUserStreams || stream < 0) {
THError("%d is not a stream", stream);
}
THCState_initializeScratchSpace(state, dev);
return res->devScratchSpacePerStream[stream];
}
size_t THCState_getCurrentDeviceScratchSpaceSize(THCState* state)
{
int device = -1;
THCudaCheck(cudaGetDevice(&device));
return THCState_getDeviceScratchSpaceSize(state, device);
}
size_t THCState_getDeviceScratchSpaceSize(THCState* state, int device)
{
THCCudaResourcesPerDevice* res =
THCState_getDeviceResourcePtr(state, device);
return res->scratchSpacePerStream;
}
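/* Helpers behind the THCudaCheck / THCudaCheckWarn / THCublasCheck
   macros: report failures on stderr and, except for the warn variant,
   raise a TH error. */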
void __THCudaCheck(cudaError_t err, const char *file, const int line)
{
if(err != cudaSuccess)
{
static int alreadyFailed = 0;
if(!alreadyFailed) {
fprintf(stderr, "THCudaCheck FAIL file=%s line=%i error=%i : %s\n", file, line, err, cudaGetErrorString(err));
alreadyFailed = 1;
}
_THError(file, line, "cuda runtime error (%d) : %s", err,
cudaGetErrorString(err));
}
}
void __THCudaCheckWarn(cudaError_t err, const char *file, const int line)
{
if(err != cudaSuccess)
{
fprintf(stderr, "THCudaCheckWarn FAIL file=%s line=%i error=%i : %s\n", file, line, err, cudaGetErrorString(err));
}
}
void __THCublasCheck(cublasStatus_t status, const char *file, const int line)
{
if(status != CUBLAS_STATUS_SUCCESS)
{
const char* errmsg = NULL;
switch(status)
{
case CUBLAS_STATUS_NOT_INITIALIZED:
errmsg = "library not initialized";
break;
case CUBLAS_STATUS_ALLOC_FAILED:
errmsg = "resource allocation failed";
break;
case CUBLAS_STATUS_INVALID_VALUE:
errmsg = "an invalid numeric value was used as an argument";
break;
case CUBLAS_STATUS_ARCH_MISMATCH:
errmsg = "an absent device architectural feature is required";
break;
case CUBLAS_STATUS_MAPPING_ERROR:
errmsg = "an access to GPU memory space failed";
break;
case CUBLAS_STATUS_EXECUTION_FAILED:
errmsg = "the GPU program failed to execute";
break;
case CUBLAS_STATUS_INTERNAL_ERROR:
errmsg = "an internal operation failed";
break;
default:
errmsg = "unknown error";
break;
}
_THError(file, line, "cublas runtime error : %s", errmsg);
}
}
static ptrdiff_t heapSize = 0; // not thread-local
static const ptrdiff_t heapMaxDelta = (ptrdiff_t)1e6;
static const ptrdiff_t heapMinDelta = (ptrdiff_t)-1e6;
static const double heapSoftmaxGrowthThresh = 0.8; // grow softmax if >80% max after GC
static const double heapSoftmaxGrowthFactor = 1.4; // grow softmax by 40%
void THCSetGCHandler(THCState *state, void (*cutorchGCFunction_)(void *data), void *data )
{
state->cutorchGCFunction = cutorchGCFunction_;
state->cutorchGCData = data;
}
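/* Device malloc with a one-shot GC retry: if the allocator fails
   (typically OOM) and a GC hook is installed, clear the sticky CUDA
   error, run the hook to release cached storages, and retry once. */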
cudaError_t THCudaMalloc(THCState *state, void** ptr, size_t size)
{
THCudaCheck(cudaGetLastError());
cudaStream_t stream = THCState_getCurrentStream(state);
THCDeviceAllocator* allocator = state->cudaDeviceAllocator;
cudaError_t err = allocator->malloc(allocator->state, ptr, size, stream);
if (state->cutorchGCFunction != NULL && err != cudaSuccess) {
cudaGetLastError(); // reset OOM error
(state->cutorchGCFunction)(state->cutorchGCData);
err = allocator->malloc(allocator->state, ptr, size, stream);
}
return err;
}
cudaError_t THCudaFree(THCState *state, void *ptr)
{
THCDeviceAllocator* allocator = state->cudaDeviceAllocator;
return allocator->free(allocator->state, ptr);
}
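/* Like cudaMemGetInfo, but counts bytes cached by the device allocator
   as free, since THC can reuse them even though CUDA reports them as
   allocated. */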
cudaError_t THCudaMemGetInfo(THCState *state, size_t* freeBytes, size_t* totalBytes)
{
size_t cachedBytes = 0;
size_t largestBlock = 0;
THCDeviceAllocator* allocator = state->cudaDeviceAllocator;
/* get info from CUDA first */
cudaError_t ret = cudaMemGetInfo(freeBytes, totalBytes);
if (ret != cudaSuccess)
return ret;
int device;
ret = cudaGetDevice(&device);
if (ret != cudaSuccess)
return ret;
/* not always true - our optimistic guess here */
largestBlock = *freeBytes;
if (allocator->cacheInfo != NULL)
allocator->cacheInfo(allocator->state, device, &cachedBytes, &largestBlock);
/* Adjust the resulting free-bytes count. largestBlock is unused for now */
*freeBytes += cachedBytes;
return cudaSuccess;
}
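/* Folds this state's pending heapDelta into the global heapSize and
   returns the new total; THAtomicAddPtrdiff returns the value before the
   add (fetch-and-add), hence the explicit + state->heapDelta. */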
static ptrdiff_t applyHeapDelta(THCState *state) {
ptrdiff_t newHeapSize = THAtomicAddPtrdiff(&heapSize, state->heapDelta) + state->heapDelta;
state->heapDelta = 0;
return newHeapSize;
}
// Here we maintain a dynamic softmax threshold for THC-allocated storages.
// When THC heap size goes above this softmax, the GC hook is triggered.
// If heap size is above 80% of the softmax after GC, then the softmax is
// increased.
static void maybeTriggerGC(THCState *state, ptrdiff_t curHeapSize) {
if (state->cutorchGCFunction != NULL && curHeapSize > state->heapSoftmax) {
(state->cutorchGCFunction)(state->cutorchGCData);
// ensure heapSize is accurate before updating heapSoftmax
ptrdiff_t newHeapSize = applyHeapDelta(state);
if (newHeapSize > state->heapSoftmax * heapSoftmaxGrowthThresh) {
state->heapSoftmax = (ptrdiff_t)(state->heapSoftmax * heapSoftmaxGrowthFactor);
}
}
}
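/* Worked example (illustrative): with the initial heapSoftmax of 3e8,
   GC fires once the tracked heap exceeds 300 MB; if more than 240 MB
   (80%) survives collection, the threshold grows to 4.2e8. */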
void THCHeapUpdate(THCState *state, ptrdiff_t size) {
state->heapDelta += size;
// batch updates to global heapSize to minimize thread contention
if (state->heapDelta < heapMaxDelta && state->heapDelta > heapMinDelta) {
return;
}
ptrdiff_t newHeapSize = applyHeapDelta(state);
if (size > 0) {
maybeTriggerGC(state, newHeapSize);
}
}
#undef MIN_GLOBAL_SCRATCH_SPACE_PER_SM_STREAM
#undef MIN_GLOBAL_SCRATCH_SPACE_PER_DEVICE
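/* Textual inclusion: these .c files are compiled as part of this
   translation unit rather than as standalone objects. */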
#include "THCStorage.c"
#include "THCAllocator.c"