| #ifndef THC_CACHING_HOST_ALLOCATOR_INC |
| #define THC_CACHING_HOST_ALLOCATOR_INC |
| |
| #include <THC/THCGeneral.h> |
| |
| |
| #include <c10/cuda/CUDAStream.h> |
| |
| // |
| // A caching allocator for CUDA host allocations (pinned memory). |
| // |
| // This is a drop-in replacement for THCudaHostAllocator that re-uses freed |
| // pinned (page-locked) memory allocations. Re-using them avoids the device |
| // synchronizations caused by cudaFreeHost calls. |
| // |
| // To ensure correct behavior, THCCachingHostAllocator_recordEvent must be |
| // called any time a pointer from this allocator is used in a cudaMemcpyAsync |
| // call between host and device. This is implemented for storages and tensors |
| // in copy_from_cpu_async_ and copy_to_cpu_async_. |
| // |
| // Note that, unlike the caching device allocator, this allocator does not |
| // split larger allocations into smaller blocks. |
| // getTHCCachingHostAllocator returns this allocator as a c10::Allocator*. |
| TORCH_CUDA_CPP_API c10::Allocator* getTHCCachingHostAllocator(void); |
| |
| // Records an event in the stream 'stream'. The allocation 'ptr' will not be |
| // re-used until the event has occurred. Returns a cudaError_t status code. |
| TORCH_CUDA_CPP_API cudaError_t |
| THCCachingHostAllocator_recordEvent(void* ptr, at::cuda::CUDAStream stream); |
| |
| // Releases cached pinned memory allocations via cudaFreeHost. |
| TORCH_CUDA_CPP_API void THCCachingHostAllocator_emptyCache(void); |
| |
| #endif |