| #ifndef THC_DEVICE_ALLOCATOR_INC |
| #define THC_DEVICE_ALLOCATOR_INC |
| #include <c10/core/Allocator.h> |
| #include <c10/cuda/CUDAGraphsC10Utils.h> |
| #include <c10/cuda/CUDAMacros.h> |
| #include <c10/cuda/CUDAStream.h> |
| #include <c10/util/Registry.h> |
| |
| #include <array> |
| #include <mutex> |
| |
| namespace c10 { |
| |
| // Caching allocator will execute every registered callback if it unable to find |
| // block inside of already allocated area. |
| class C10_CUDA_API FreeMemoryCallback { |
| public: |
| virtual ~FreeMemoryCallback() = default; |
| virtual bool Execute() = 0; |
| }; |
| |
| C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback); |
| #define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \ |
| C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__); |
| |
| namespace cuda { |
| |
| // TODO: Turn this into an honest to goodness class. I briefly attempted to do |
| // this, but it was a bit irritating to figure out how to also correctly |
| // apply pimpl pattern so I didn't have to leak any internal implementation |
| // details in the header (CUDACachingAllocator could be made a pimpl, but |
| // you also need to appropriately define a class which is a subclass |
| // of Allocator. Not impossible, but required a bit more surgery than |
| // I wanted to do at the time.) |
| // |
| // Why is this using a namespace rather than old-style THCCachingAllocator_ |
| // prefix? Mostly because it made the HIPify rules easier to write; _ is |
| // not counted as a word boundary, so you would otherwise have to list each |
| // of these functions. |
| |
| namespace CUDACachingAllocator { |
| |
| struct Stat { |
| int64_t current = 0; |
| int64_t peak = 0; |
| int64_t allocated = 0; |
| int64_t freed = 0; |
| }; |
| |
| enum struct StatType : uint64_t { |
| AGGREGATE = 0, |
| SMALL_POOL = 1, |
| LARGE_POOL = 2, |
| NUM_TYPES = 3 // remember to update this whenever a new stat type is added |
| }; |
| |
| typedef std::array<Stat, static_cast<size_t>(StatType::NUM_TYPES)> StatArray; |
| |
| // Struct containing memory allocator summary statistics for a device. |
| struct DeviceStats { |
| // COUNT: allocations requested by client code |
| StatArray allocation; |
| // COUNT: number of allocated segments from cudaMalloc(). |
| StatArray segment; |
| // COUNT: number of active memory blocks (allocated or used by stream) |
| StatArray active; |
| // COUNT: number of inactive, split memory blocks (unallocated but can't be |
| // released via cudaFree) |
| StatArray inactive_split; |
| |
| // SUM: bytes requested by client code |
| StatArray allocated_bytes; |
| // SUM: bytes reserved by this memory allocator (both free and used) |
| StatArray reserved_bytes; |
| // SUM: bytes within active memory blocks |
| StatArray active_bytes; |
| // SUM: bytes within inactive, split memory blocks |
| StatArray inactive_split_bytes; |
| |
| // COUNT: total number of failed calls to CUDA malloc necessitating cache |
| // flushes. |
| int64_t num_alloc_retries = 0; |
| |
| // COUNT: total number of OOMs (i.e. failed calls to CUDA after cache flush) |
| int64_t num_ooms = 0; |
| |
| // COUNT: total number of oversize blocks allocated from pool |
| Stat oversize_allocations; |
| |
| // COUNT: total number of oversize blocks requiring malloc |
| Stat oversize_segments; |
| |
| // SIZE: maximum block size that is allowed to be split. |
| int64_t max_split_size = 0; |
| }; |
| |
| struct Context { |
| virtual ~Context() {} |
| }; |
| |
| typedef std::unique_ptr<Context> (*CreateContextFn)(void); |
| |
| struct History { |
| void* addr; |
| size_t real_size; // unrounded, actually requested size |
| std::unique_ptr<Context> context; // per-watcher context |
| std::unique_ptr<History> next; // when blocks are merged we keep records of |
| // what used to be in the block |
| }; |
| |
| // Struct containing info of an allocation block (i.e. a fractional part of a |
| // cudaMalloc).. |
| struct BlockInfo { |
| int64_t size = 0; |
| int32_t gc_counter = 0; |
| bool allocated = false; |
| bool active = false; |
| History* history = |
| nullptr; // borrowed reference because it is owned by the allocator |
| }; |
| |
| // Struct containing info of a memory segment (i.e. one contiguous cudaMalloc). |
| struct SegmentInfo { |
| int64_t device = 0; |
| int64_t address = 0; |
| int64_t total_size = 0; |
| int64_t allocated_size = 0; |
| int64_t active_size = 0; |
| cudaStream_t stream = 0; |
| bool is_large = false; |
| std::vector<BlockInfo> blocks; |
| }; |
| |
| C10_CUDA_API void* raw_alloc(size_t nbytes); |
| C10_CUDA_API void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream); |
| C10_CUDA_API void raw_delete(void* ptr); |
| |
| C10_CUDA_API Allocator* get(); |
| C10_CUDA_API void init(int device_count); |
| C10_CUDA_API void setMemoryFraction(double fraction, int device); |
| C10_CUDA_API void setAllocatorSettings(const std::string& env); |
| C10_CUDA_API void emptyCache(); |
| C10_CUDA_API void cacheInfo( |
| int dev_id, |
| size_t* cachedAndFree, |
| size_t* largestBlock); |
| C10_CUDA_API void* getBaseAllocation(void* ptr, size_t* size); |
| C10_CUDA_API void recordStream(const DataPtr&, CUDAStream stream); |
| C10_CUDA_API DeviceStats getDeviceStats(int device); |
| C10_CUDA_API void resetAccumulatedStats(int device); |
| C10_CUDA_API void resetPeakStats(int device); |
| C10_CUDA_API std::vector<SegmentInfo> snapshot(); |
| |
| // CUDAGraph interactions |
| C10_CUDA_API void notifyCaptureBegin( |
| int device, |
| CaptureId_t graph_id, |
| MempoolId_t mempool_id); |
| C10_CUDA_API void notifyCaptureEnd(int device, CaptureId_t graph_id); |
| C10_CUDA_API void notifyCaptureDestroy(int device, MempoolId_t mempool_id); |
| |
| C10_CUDA_API std::mutex* getFreeMutex(); |
| |
| C10_CUDA_API void setContextRecorder(CreateContextFn recorder); |
| |
| C10_CUDA_API std::shared_ptr<void> getIpcDevPtr(std::string handle); |
| } // namespace CUDACachingAllocator |
| |
| } // namespace cuda |
| } // namespace c10 |
| |
| #endif |