c10/cuda/CUDACachingAllocator.h - platform/external/pytorch - Git at Google

 #pragma once

 #include <c10/core/Allocator.h>
 #include <c10/core/StorageImpl.h>
 #include <c10/cuda/CUDAGraphsC10Utils.h>
 #include <c10/cuda/CUDAMacros.h>
 #include <c10/cuda/CUDAStream.h>
 #include <c10/util/Registry.h>

 #include <array>
 #include <mutex>
 #include <set>

 namespace c10 {

 // Caching allocator will execute every registered callback if it unable to find
 // block inside of already allocated area.
 class C10_CUDA_API FreeMemoryCallback {
  public:
   virtual ~FreeMemoryCallback() = default;
   virtual bool Execute() = 0;
 };

 C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
 #define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \
   C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__);

 namespace cuda {

 // TODO: Turn this into an honest to goodness class. I briefly attempted to do
 // this, but it was a bit irritating to figure out how to also correctly
 // apply pimpl pattern so I didn't have to leak any internal implementation
 // details in the header (CUDACachingAllocator could be made a pimpl, but
 // you also need to appropriately define a class which is a subclass
 // of Allocator. Not impossible, but required a bit more surgery than
 // I wanted to do at the time.)
 //
 // Why is this using a namespace rather than old-style THCCachingAllocator_
 // prefix?  Mostly because it made the HIPify rules easier to write; _ is
 // not counted as a word boundary, so you would otherwise have to list each
 // of these functions.

 namespace CUDACachingAllocator {

 struct Stat {
   int64_t current = 0;
   int64_t peak = 0;
   int64_t allocated = 0;
   int64_t freed = 0;
 };

 enum struct StatType : uint64_t {
   AGGREGATE = 0,
   SMALL_POOL = 1,
   LARGE_POOL = 2,
   NUM_TYPES = 3 // remember to update this whenever a new stat type is added
 };

 typedef std::array<Stat, static_cast<size_t>(StatType::NUM_TYPES)> StatArray;

 // Struct containing memory allocator summary statistics for a device.
 struct DeviceStats {
   // COUNT: allocations requested by client code
   StatArray allocation;
   // COUNT: number of allocated segments from cudaMalloc().
   StatArray segment;
   // COUNT: number of active memory blocks (allocated or used by stream)
   StatArray active;
   // COUNT: number of inactive, split memory blocks (unallocated but can't be
   // released via cudaFree)
   StatArray inactive_split;

   // SUM: bytes allocated by this memory alocator
   StatArray allocated_bytes;
   // SUM: bytes reserved by this memory allocator (both free and used)
   StatArray reserved_bytes;
   // SUM: bytes within active memory blocks
   StatArray active_bytes;
   // SUM: bytes within inactive, split memory blocks
   StatArray inactive_split_bytes;
   // SUM: bytes requested by client code
   StatArray requested_bytes;

   // COUNT: total number of failed calls to CUDA malloc necessitating cache
   // flushes.
   int64_t num_alloc_retries = 0;

   // COUNT: total number of OOMs (i.e. failed calls to CUDA after cache flush)
   int64_t num_ooms = 0;

   // COUNT: total number of oversize blocks allocated from pool
   Stat oversize_allocations;

   // COUNT: total number of oversize blocks requiring malloc
   Stat oversize_segments;

   // SIZE: maximum block size that is allowed to be split.
   int64_t max_split_size = 0;
 };

 typedef std::shared_ptr<GatheredContext> (*CreateContextFn)(void);

 struct History {
   void* addr;
   size_t real_size; // unrounded, actually requested size
   std::shared_ptr<GatheredContext> context; // per-watcher context
 };

 // Struct containing info of an allocation block (i.e. a fractional part of a
 // cudaMalloc)..
 struct BlockInfo {
   int64_t size = 0;
   int64_t requested_size = 0;
   int32_t gc_counter = 0;
   bool allocated = false;
   bool active = false;
   std::vector<History> history;
 };

 // Struct containing info of a memory segment (i.e. one contiguous cudaMalloc).
 struct SegmentInfo {
   int64_t device = 0;
   int64_t address = 0;
   int64_t total_size = 0;
   int64_t requested_size = 0;
   int64_t allocated_size = 0;
   int64_t active_size = 0;
   cudaStream_t stream = 0;
   bool is_large = false;
   MempoolId_t owner_private_pool_id = {0, 0};
   std::vector<BlockInfo> blocks;
 };

 struct AllocatorState {
   virtual ~AllocatorState() = default;
 };

 struct TraceEntry {
   enum Action {
     ALLOC, // API made to the caching allocator for new memory
     FREE_REQUESTED, // API call made to the caching allocator to free memory
     FREE_COMPLETED, // The allocator might have to delay a free because
                     // it is still in use on another stream via record_stream
                     // This event is generated when a free actually completes.
     SEGMENT_ALLOC, // a call to cudaMalloc to get more memory from the OS
     SEGMENT_FREE, // a call to cudaFree to return memory to the OS (e.g. to
                   // defragement or empty_caches)
     SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to trace
               // events
     OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of free
         // bytes reported by cuda)
   };
   TraceEntry(
       Action action,
       int64_t addr,
       size_t size,
       cudaStream_t stream,
       std::shared_ptr<GatheredContext> context = nullptr)
       : action_(action),
         addr_(addr),
         context_(std::move(context)),
         stream_(stream),
         size_(size) {}
   Action action_;
   int64_t addr_; // for OOM, this is the amount of free bytes reported by cuda
   std::shared_ptr<GatheredContext> context_;
   cudaStream_t stream_;
   int64_t size_;
 };

 struct SnapshotInfo {
   std::vector<SegmentInfo> segments;
   std::vector<std::vector<TraceEntry>> device_traces;
 };

 // returns the pointers freed in the pool
 // and the pointers allocated. Note: a pointer
 // may appear in both freed and allocated
 struct CheckpointDelta {
   std::vector<void*> ptrs_freed;
   std::vector<at::DataPtr> dataptrs_allocd;
 };

 C10_CUDA_API void setAllocatorSettings(const std::string& env);

 // Size pretty-printer
 std::string format_size(uint64_t size);

 using OutOfMemoryObserver = std::function<void(
     int64_t device,
     int64_t allocated,
     int64_t device_total,
     int64_t device_free)>;

 class CUDAAllocator : public Allocator {
  public:
   virtual void* raw_alloc(size_t nbytes) = 0;
   virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
   virtual void raw_delete(void* ptr) = 0;
   virtual void init(int device_count) = 0;
   virtual bool initialized() = 0;
   virtual void setMemoryFraction(double fraction, int device) = 0;
   virtual void emptyCache() = 0;
   virtual void cacheInfo(int dev_id, size_t* largestBlock) = 0;
   virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
   virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
   virtual DeviceStats getDeviceStats(int device) = 0;
   virtual void resetAccumulatedStats(int device) = 0;
   virtual void resetPeakStats(int device) = 0;
   virtual SnapshotInfo snapshot() = 0;
   virtual void beginAllocateStreamToPool(
       int device,
       cudaStream_t stream,
       MempoolId_t mempool_id) = 0;
   virtual void endAllocateStreamToPool(int device, cudaStream_t stream) = 0;
   virtual void releasePool(int device, MempoolId_t mempool_id) = 0;
   virtual std::shared_ptr<void> getIpcDevPtr(std::string handle) = 0;
   virtual void recordHistory(
       bool enabled,
       CreateContextFn context_recorder,
       size_t alloc_trace_max_entries,
       bool alloc_trace_record_context) = 0;
   virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
   virtual bool needsPoolSpecificPeerAccess() = 0;
   virtual std::shared_ptr<AllocatorState> getCheckpointState(
       int device,
       MempoolId_t id) = 0;
   virtual CheckpointDelta setCheckpointPoolState(
       int device,
       std::shared_ptr<AllocatorState> pps) = 0;
   virtual std::string name() = 0;
 };

 // Allocator object, statically initialized
 // See BackendInitializer in CUDACachingAllocator.cpp.
 // Atomic loads on x86 are just normal loads,
 // (atomic stores are different), so reading this value
 // is no different than loading a pointer.
 C10_CUDA_API extern std::atomic<CUDAAllocator*> allocator;

 inline CUDAAllocator* get() {
   return allocator.load();
 }

 // Called directly by clients.
 inline void* raw_alloc(size_t nbytes) {
   return get()->raw_alloc(nbytes);
 }

 inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) {
   return get()->raw_alloc_with_stream(nbytes, stream);
 }

 inline void raw_delete(void* ptr) {
   return get()->raw_delete(ptr);
 }

 inline void init(int device_count) {
   return get()->init(device_count);
 }

 inline void setMemoryFraction(double fraction, int device) {
   return get()->setMemoryFraction(fraction, device);
 }

 inline void emptyCache() {
   return get()->emptyCache();
 }

 inline void cacheInfo(int dev_id, size_t* largestBlock) {
   return get()->cacheInfo(dev_id, largestBlock);
 }

 inline void* getBaseAllocation(void* ptr, size_t* size) {
   return get()->getBaseAllocation(ptr, size);
 }

 inline void recordStream(const DataPtr& dataPtr, CUDAStream stream) {
   return get()->recordStream(dataPtr, stream);
 }

 inline DeviceStats getDeviceStats(int device) {
   return get()->getDeviceStats(device);
 }

 inline void resetAccumulatedStats(int device) {
   return get()->resetAccumulatedStats(device);
 }

 inline void resetPeakStats(int device) {
   return get()->resetPeakStats(device);
 }

 inline SnapshotInfo snapshot() {
   return get()->snapshot();
 }

 inline std::shared_ptr<AllocatorState> getCheckpointState(
     int device,
     MempoolId_t id) {
   return get()->getCheckpointState(device, id);
 }

 inline CheckpointDelta setCheckpointPoolState(
     int device,
     std::shared_ptr<AllocatorState> pps) {
   return get()->setCheckpointPoolState(device, pps);
 }

 // CUDAGraph interactions
 inline void beginAllocateStreamToPool(
     int device,
     cudaStream_t stream,
     MempoolId_t mempool_id) {
   return get()->beginAllocateStreamToPool(device, stream, mempool_id);
 }

 inline void endAllocateStreamToPool(int device, cudaStream_t stream) {
   return get()->endAllocateStreamToPool(device, stream);
 }

 inline void recordHistory(
     bool enabled,
     CreateContextFn context_recorder,
     size_t alloc_trace_max_entries,
     bool alloc_trace_record_context) {
   return get()->recordHistory(
       enabled,
       context_recorder,
       alloc_trace_max_entries,
       alloc_trace_record_context);
 }

 inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
   return get()->attachOutOfMemoryObserver(observer);
 }

 inline void releasePool(int device, MempoolId_t mempool_id) {
   return get()->releasePool(device, mempool_id);
 }
 // Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE
 inline std::shared_ptr<void> getIpcDevPtr(std::string handle) {
   return get()->getIpcDevPtr(handle);
 }

 inline std::string name() {
   return get()->name();
 }

 } // namespace CUDACachingAllocator
 } // namespace cuda
 } // namespace c10
	#pragma once

	#include <c10/core/Allocator.h>
	#include <c10/core/StorageImpl.h>
	#include <c10/cuda/CUDAGraphsC10Utils.h>
	#include <c10/cuda/CUDAMacros.h>
	#include <c10/cuda/CUDAStream.h>
	#include <c10/util/Registry.h>

	#include <array>
	#include <mutex>
	#include <set>

	namespace c10 {

	// Caching allocator will execute every registered callback if it unable to find
	// block inside of already allocated area.
	class C10_CUDA_API FreeMemoryCallback {
	public:
	virtual ~FreeMemoryCallback() = default;
	virtual bool Execute() = 0;
	};

	C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
	#define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \
	C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__);

	namespace cuda {

	// TODO: Turn this into an honest to goodness class. I briefly attempted to do
	// this, but it was a bit irritating to figure out how to also correctly
	// apply pimpl pattern so I didn't have to leak any internal implementation
	// details in the header (CUDACachingAllocator could be made a pimpl, but
	// you also need to appropriately define a class which is a subclass
	// of Allocator. Not impossible, but required a bit more surgery than
	// I wanted to do at the time.)
	//
	// Why is this using a namespace rather than old-style THCCachingAllocator_
	// prefix? Mostly because it made the HIPify rules easier to write; _ is
	// not counted as a word boundary, so you would otherwise have to list each
	// of these functions.

	namespace CUDACachingAllocator {

	struct Stat {
	int64_t current = 0;
	int64_t peak = 0;
	int64_t allocated = 0;
	int64_t freed = 0;
	};

	enum struct StatType : uint64_t {
	AGGREGATE = 0,
	SMALL_POOL = 1,
	LARGE_POOL = 2,
	NUM_TYPES = 3 // remember to update this whenever a new stat type is added
	};

	typedef std::array<Stat, static_cast<size_t>(StatType::NUM_TYPES)> StatArray;

	// Struct containing memory allocator summary statistics for a device.
	struct DeviceStats {
	// COUNT: allocations requested by client code
	StatArray allocation;
	// COUNT: number of allocated segments from cudaMalloc().
	StatArray segment;
	// COUNT: number of active memory blocks (allocated or used by stream)
	StatArray active;
	// COUNT: number of inactive, split memory blocks (unallocated but can't be
	// released via cudaFree)
	StatArray inactive_split;

	// SUM: bytes allocated by this memory alocator
	StatArray allocated_bytes;
	// SUM: bytes reserved by this memory allocator (both free and used)
	StatArray reserved_bytes;
	// SUM: bytes within active memory blocks
	StatArray active_bytes;
	// SUM: bytes within inactive, split memory blocks
	StatArray inactive_split_bytes;
	// SUM: bytes requested by client code
	StatArray requested_bytes;

	// COUNT: total number of failed calls to CUDA malloc necessitating cache
	// flushes.
	int64_t num_alloc_retries = 0;

	// COUNT: total number of OOMs (i.e. failed calls to CUDA after cache flush)
	int64_t num_ooms = 0;

	// COUNT: total number of oversize blocks allocated from pool
	Stat oversize_allocations;

	// COUNT: total number of oversize blocks requiring malloc
	Stat oversize_segments;

	// SIZE: maximum block size that is allowed to be split.
	int64_t max_split_size = 0;
	};

	typedef std::shared_ptr<GatheredContext> (*CreateContextFn)(void);

	struct History {
	void* addr;
	size_t real_size; // unrounded, actually requested size
	std::shared_ptr<GatheredContext> context; // per-watcher context
	};

	// Struct containing info of an allocation block (i.e. a fractional part of a
	// cudaMalloc)..
	struct BlockInfo {
	int64_t size = 0;
	int64_t requested_size = 0;
	int32_t gc_counter = 0;
	bool allocated = false;
	bool active = false;
	std::vector<History> history;
	};

	// Struct containing info of a memory segment (i.e. one contiguous cudaMalloc).
	struct SegmentInfo {
	int64_t device = 0;
	int64_t address = 0;
	int64_t total_size = 0;
	int64_t requested_size = 0;
	int64_t allocated_size = 0;
	int64_t active_size = 0;
	cudaStream_t stream = 0;
	bool is_large = false;
	MempoolId_t owner_private_pool_id = {0, 0};
	std::vector<BlockInfo> blocks;
	};

	struct AllocatorState {
	virtual ~AllocatorState() = default;
	};

	struct TraceEntry {
	enum Action {
	ALLOC, // API made to the caching allocator for new memory
	FREE_REQUESTED, // API call made to the caching allocator to free memory
	FREE_COMPLETED, // The allocator might have to delay a free because
	// it is still in use on another stream via record_stream
	// This event is generated when a free actually completes.
	SEGMENT_ALLOC, // a call to cudaMalloc to get more memory from the OS
	SEGMENT_FREE, // a call to cudaFree to return memory to the OS (e.g. to
	// defragement or empty_caches)
	SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to trace
	// events
	OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of free
	// bytes reported by cuda)
	};
	TraceEntry(
	Action action,
	int64_t addr,
	size_t size,
	cudaStream_t stream,
	std::shared_ptr<GatheredContext> context = nullptr)
	: action_(action),
	addr_(addr),
	context_(std::move(context)),
	stream_(stream),
	size_(size) {}
	Action action_;
	int64_t addr_; // for OOM, this is the amount of free bytes reported by cuda
	std::shared_ptr<GatheredContext> context_;
	cudaStream_t stream_;
	int64_t size_;
	};

	struct SnapshotInfo {
	std::vector<SegmentInfo> segments;
	std::vector<std::vector<TraceEntry>> device_traces;
	};

	// returns the pointers freed in the pool
	// and the pointers allocated. Note: a pointer
	// may appear in both freed and allocated
	struct CheckpointDelta {
	std::vector<void*> ptrs_freed;
	std::vector<at::DataPtr> dataptrs_allocd;
	};

	C10_CUDA_API void setAllocatorSettings(const std::string& env);

	// Size pretty-printer
	std::string format_size(uint64_t size);

	using OutOfMemoryObserver = std::function<void(
	int64_t device,
	int64_t allocated,
	int64_t device_total,
	int64_t device_free)>;

	class CUDAAllocator : public Allocator {
	public:
	virtual void* raw_alloc(size_t nbytes) = 0;
	virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
	virtual void raw_delete(void* ptr) = 0;
	virtual void init(int device_count) = 0;
	virtual bool initialized() = 0;
	virtual void setMemoryFraction(double fraction, int device) = 0;
	virtual void emptyCache() = 0;
	virtual void cacheInfo(int dev_id, size_t* largestBlock) = 0;
	virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
	virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
	virtual DeviceStats getDeviceStats(int device) = 0;
	virtual void resetAccumulatedStats(int device) = 0;
	virtual void resetPeakStats(int device) = 0;
	virtual SnapshotInfo snapshot() = 0;
	virtual void beginAllocateStreamToPool(
	int device,
	cudaStream_t stream,
	MempoolId_t mempool_id) = 0;
	virtual void endAllocateStreamToPool(int device, cudaStream_t stream) = 0;
	virtual void releasePool(int device, MempoolId_t mempool_id) = 0;
	virtual std::shared_ptr<void> getIpcDevPtr(std::string handle) = 0;
	virtual void recordHistory(
	bool enabled,
	CreateContextFn context_recorder,
	size_t alloc_trace_max_entries,
	bool alloc_trace_record_context) = 0;
	virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
	virtual bool needsPoolSpecificPeerAccess() = 0;
	virtual std::shared_ptr<AllocatorState> getCheckpointState(
	int device,
	MempoolId_t id) = 0;
	virtual CheckpointDelta setCheckpointPoolState(
	int device,
	std::shared_ptr<AllocatorState> pps) = 0;
	virtual std::string name() = 0;
	};

	// Allocator object, statically initialized
	// See BackendInitializer in CUDACachingAllocator.cpp.
	// Atomic loads on x86 are just normal loads,
	// (atomic stores are different), so reading this value
	// is no different than loading a pointer.
	C10_CUDA_API extern std::atomic<CUDAAllocator*> allocator;

	inline CUDAAllocator* get() {
	return allocator.load();
	}

	// Called directly by clients.
	inline void* raw_alloc(size_t nbytes) {
	return get()->raw_alloc(nbytes);
	}

	inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) {
	return get()->raw_alloc_with_stream(nbytes, stream);
	}

	inline void raw_delete(void* ptr) {
	return get()->raw_delete(ptr);
	}

	inline void init(int device_count) {
	return get()->init(device_count);
	}

	inline void setMemoryFraction(double fraction, int device) {
	return get()->setMemoryFraction(fraction, device);
	}

	inline void emptyCache() {
	return get()->emptyCache();
	}

	inline void cacheInfo(int dev_id, size_t* largestBlock) {
	return get()->cacheInfo(dev_id, largestBlock);
	}

	inline void* getBaseAllocation(void* ptr, size_t* size) {
	return get()->getBaseAllocation(ptr, size);
	}

	inline void recordStream(const DataPtr& dataPtr, CUDAStream stream) {
	return get()->recordStream(dataPtr, stream);
	}

	inline DeviceStats getDeviceStats(int device) {
	return get()->getDeviceStats(device);
	}

	inline void resetAccumulatedStats(int device) {
	return get()->resetAccumulatedStats(device);
	}

	inline void resetPeakStats(int device) {
	return get()->resetPeakStats(device);
	}

	inline SnapshotInfo snapshot() {
	return get()->snapshot();
	}

	inline std::shared_ptr<AllocatorState> getCheckpointState(
	int device,
	MempoolId_t id) {
	return get()->getCheckpointState(device, id);
	}

	inline CheckpointDelta setCheckpointPoolState(
	int device,
	std::shared_ptr<AllocatorState> pps) {
	return get()->setCheckpointPoolState(device, pps);
	}

	// CUDAGraph interactions
	inline void beginAllocateStreamToPool(
	int device,
	cudaStream_t stream,
	MempoolId_t mempool_id) {
	return get()->beginAllocateStreamToPool(device, stream, mempool_id);
	}

	inline void endAllocateStreamToPool(int device, cudaStream_t stream) {
	return get()->endAllocateStreamToPool(device, stream);
	}

	inline void recordHistory(
	bool enabled,
	CreateContextFn context_recorder,
	size_t alloc_trace_max_entries,
	bool alloc_trace_record_context) {
	return get()->recordHistory(
	enabled,
	context_recorder,
	alloc_trace_max_entries,
	alloc_trace_record_context);
	}

	inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
	return get()->attachOutOfMemoryObserver(observer);
	}

	inline void releasePool(int device, MempoolId_t mempool_id) {
	return get()->releasePool(device, mempool_id);
	}
	// Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE
	inline std::shared_ptr<void> getIpcDevPtr(std::string handle) {
	return get()->getIpcDevPtr(handle);
	}

	inline std::string name() {
	return get()->name();
	}

	} // namespace CUDACachingAllocator
	} // namespace cuda
	} // namespace c10