#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/util/UniqueVoidPtr.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>
#include <unordered_set>
#include <vector>
namespace c10 {
namespace cuda {
namespace CUDACachingAllocator {
namespace CudaMallocAsync {
#if CUDA_VERSION >= 11040
// CUDA device allocator that uses cudaMallocAsync to implement
// the same interface as CUDACachingAllocator.cpp.
// Designed to be safe for CUDA graph capture.
// Interactions with CUDA graph capture are mediated by
// beginAllocateStreamToPool
// endAllocateStreamToPool
// releasePool
// Implementation details, not declared in CUDACachingAllocator.h
namespace {
// General helpers
struct UsageStream {
cudaStream_t stream;
int device;
UsageStream() = default;
UsageStream(cudaStream_t s, int d) : stream(s), device(d) {}
UsageStream(const UsageStream& us) = default;
UsageStream(UsageStream&& us) noexcept : stream(us.stream), device(us.device) {}
UsageStream& operator=(UsageStream other) {
stream = other.stream;
device = other.device;
return *this;
}
};
bool operator==(const UsageStream& lhs, const UsageStream& rhs) {
return (lhs.stream == rhs.stream) && (lhs.device == rhs.device);
}
struct UsageStreamHash {
size_t operator()(const UsageStream& us) const noexcept {
return std::hash<void*>{}(us.stream) + size_t(us.device);
}
};
struct PtrUsage {
// recorded_streams holds side usage streams added by record_stream calls.
// In other words, it does NOT include the original creation stream.
ska::flat_hash_set<UsageStream, UsageStreamHash> recorded_streams;
UsageStream creation_stream;
uint64_t size;
bool captured;
PtrUsage(uint64_t s, bool c) : size(s), captured(c) {}
};
int device_count = 0;
// these don't need to be c10::once_flags as in CUDAGeneratorImpl.cpp
// because they'll only be flipped by functions that have locked the mutex.
std::vector<bool> devs_initialized_flags;
std::vector<UsageStream> dummy_unifying_free_streams;
// Possible micro-optimization:
// Some accesses to ptr_info are read-only.
// We could let those be concurrent with a shared_mutex and
// have concurrent calls take a shared_lock.
// Keeping it simple with an ordinary mutex for now.
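// For the record, a minimal commented sketch of that shared_mutex variant
// (an alternative, not what this file does; it would also need
// #include <shared_mutex>):
//   std::shared_mutex general_mutex;
//   // read-only paths:  std::shared_lock<std::shared_mutex> lk(general_mutex);
//   // mutating paths:   std::unique_lock<std::shared_mutex> lk(general_mutex);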
std::mutex general_mutex;
/**
* Note [Avoid freeing uncaptured ptrs during CUDA graph capture]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* During CUDA graph capture, it's illegal to call cudaFreeAsync
* on a pointer that came from a non-captured cudaMallocAsync.
* Unfortunately, Python being what it is, it's impossible to be
* sure no uncaptured tensor will ever have its destructor called
* in a capturing region.
* We avoid errors by
* 1. remembering if allocated pointers were captured or uncaptured
* 2. during capture, if we detect an attempt to free an uncaptured
* allocation on a capturing stream, don't free it immediately,
* just remember it and defer its cudaFreeAsync call to after
* the end of capture (specifically, to notifyCaptureEnded).
*/
using PtrInfo = ska::flat_hash_map<void*, PtrUsage>;
PtrInfo ptr_info;
std::vector<void*> ungraphed_ptrs_defer_free_until_no_capture;
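// A minimal commented sketch of how the Note above plays out with these two
// structures (the real logic lives in mallocAsync and freeAsync below):
//   // at allocation time, tag the pointer with the current capture state:
//   ptr_info.emplace(p, PtrUsage(size, /*captured=*/capture_underway));
//   // at free time, if capture is underway but p was NOT captured,
//   // remember p instead of calling cudaFreeAsync:
//   ungraphed_ptrs_defer_free_until_no_capture.push_back(p);
//   // in the first mallocAsync after capture ends, drain the deferred list:
//   for (void* q : ungraphed_ptrs_defer_free_until_no_capture) {
//     auto it = ptr_info.find(q);
//     free_impl(it);
//   }
//   ungraphed_ptrs_defer_free_until_no_capture.clear();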
// These two help setMemoryFraction limit the amount of memory
// used by PyTorch in particular (as opposed to other libraries
// in the same process that might be sharing the same cudaMemPool_t).
std::vector<size_t> pytorch_used_bytes;
std::vector<size_t> pytorch_memory_limits;
// Graph-specific helpers
/**
* Note [Avoid dangling free streams during CUDA graph capture]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* During capture, all stream dependencies must branch out from
* the stream on which capture began and rejoin this initial stream
* before capture ends.
* The user rigs desired forking and joining with event waits.
* But it's hard to be sure when tensor destructors get called relative
* to the final joins.
* For example, suppose a user
* forks work stream B from initial capture stream A
* creates a tensor T in B
* joins by syncing A with B
* ends capture.
* All well and good, right? Maybe not: maybe T went out of scope
* and its destructor got called AFTER the rejoin, leaving the graph with
* "unjoined work": a dangling cudaFreeAsync node in stream B.
* Ensuring that all tensor destructors for all side stream tensors
* are called before side streams rejoin the main stream is
* difficult. The user might have to add a bunch of explicit
* "del"s at the right spots in code that was fine for ordinary
* eager execution.
* Fortunately, we can spare the user this burden:
* during capture, we remember _all_ free streams,
* and manually rejoin them with the capture stream during
* notifyCaptureAboutToEnd.
* This approach is heavy-handed, but hopefully capture only needs to
* happen once, so we don't mind being heavy-handed.
*
* TODO: If, someday, we augment the graph bindings to support recapture
* https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#whole-graph-update
* (eg, as a way to accommodate dynamic params) we should think more
* carefully about the CPU overhead of remembering and rejoining
* all free streams during capture. Maybe it's not a big deal.
*/
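// A minimal raw-CUDA sketch of the scenario in the Note above (hypothetical
// stream/event/pointer names; not code this allocator runs):
//   cudaStreamBeginCapture(streamA, cudaStreamCaptureModeGlobal);
//   cudaEventRecord(fork_ev, streamA);
//   cudaStreamWaitEvent(streamB, fork_ev);  // fork B from capture stream A
//   cudaMallocAsync(&t, bytes, streamB);    // "tensor T" allocated in B
//   cudaEventRecord(join_ev, streamB);
//   cudaStreamWaitEvent(streamA, join_ev);  // B rejoins A
//   cudaFreeAsync(t, streamB);              // T's destructor fires late:
//                                           // a dangling node in stream B
//   cudaStreamEndCapture(streamA, &graph);  // capture ends with unjoined work
// The event-record/wait loop in endAllocateStreamToPool below rejoins every
// remembered free stream with the capture stream so this cannot happen.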
std::unordered_set<UsageStream, UsageStreamHash> capture_free_streams;
bool capture_underway = false;
// Implementation functions
// Assumes the caller holds general_mutex
inline void lazy_init_device(int device) {
if (!devs_initialized_flags[device]) {
CUDAGuard g(device);
// See "Retaining memory in the pool" here:
// https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-1/
cudaMemPool_t mempool;
C10_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, device));
uint64_t threshold = UINT64_MAX;
C10_CUDA_CHECK(cudaMemPoolSetAttribute(
mempool, cudaMemPoolAttrReleaseThreshold, &threshold));
// I think all these are on by default, but I want to enable them
// explicitly to ensure awareness.
int enable = 1;
C10_CUDA_CHECK(cudaMemPoolSetAttribute(
mempool, cudaMemPoolReuseFollowEventDependencies, &enable));
C10_CUDA_CHECK(cudaMemPoolSetAttribute(
mempool, cudaMemPoolReuseAllowOpportunistic, &enable));
C10_CUDA_CHECK(cudaMemPoolSetAttribute(
mempool, cudaMemPoolReuseAllowInternalDependencies, &enable));
// Grabs a stream from the current device to use as the "unifier" free
// stream for allocations that end up used on multiple streams.
const auto dufs = getStreamFromPool();
dummy_unifying_free_streams[device] =
UsageStream(dufs.stream(), dufs.device_index());
pytorch_used_bytes[device] = 0;
pytorch_memory_limits[device] = UINT64_MAX;
devs_initialized_flags[device] = true;
}
}
inline void sync_raw(cudaStream_t dependency, cudaStream_t dependent) {
// CUDACachingAllocator.cpp uses raw cuda events, as do we.
cudaEvent_t event;
C10_CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
C10_CUDA_CHECK(cudaEventRecord(event, dependency));
C10_CUDA_CHECK(cudaStreamWaitEvent(dependent, event));
C10_CUDA_CHECK(cudaEventDestroy(event));
}
// Assumes the caller holds general_mutex
inline void free_impl(PtrInfo::iterator& it) {
// Possible micro-optimization: If we did a value-copy here, we could move
// ptr_info.erase(it) up here and drop the lock immediately.
const auto& recorded_streams = it->second.recorded_streams;
const auto& creation_stream = it->second.creation_stream;
// If the usage stream is a null (default) stream,
// cudaFreeAsync infers the device from the ambient context,
// so we need to set the right ambient context.
CUDAGuard g(creation_stream.device);
if (recorded_streams.empty()) {
// ptr was only used on one stream, which must have been
// the original allocation stream.
// Frees ptr in the original allocation stream.
C10_CUDA_CHECK(cudaFreeAsync(it->first, creation_stream.stream));
if (C10_UNLIKELY(capture_underway)) {
// See Note [Avoid dangling free streams during CUDA graph capture]
capture_free_streams.insert(creation_stream);
}
} else {
// ptr was used on many streams. We don't know which was the most recent.
// There could even have been multiple most recent usage streams acting
// on different regions of the memory.
// But cudaFreeAsync only accepts a single most recent usage stream.
// We can still safely free ptr with a trick:
// Use a dummy "unifying stream", sync the unifying stream with all of
// ptr's usage streams, and pass the dummy stream to cudaFreeAsync.
// Retrieves the dummy "unifier" stream from the device
// on which the pointer was originally allocated.
auto dummy_unifying_free_stream =
dummy_unifying_free_streams[creation_stream.device];
TORCH_INTERNAL_ASSERT(
dummy_unifying_free_stream.device == creation_stream.device);
// we're already on creation_stream.device, no need to re-guard
sync_raw(creation_stream.stream, dummy_unifying_free_stream.stream);
// The number of usage streams is typically small (low single digits)
for (const auto& recorded_stream : recorded_streams) {
// Logic here accommodates the chance some of the usage streams were on
// other devices, which is possible if some usage kernels accessed the
// memory via p2p.
// cudaEventRecord requires that the input event and stream are on the
// same device.
CUDAGuard g_usage(recorded_stream.device);
sync_raw(recorded_stream.stream, dummy_unifying_free_stream.stream);
}
// Frees ptr in the dummy "unifier" stream.
C10_CUDA_CHECK(cudaFreeAsync(it->first, dummy_unifying_free_stream.stream));
// At this point, unless dummy_unifying_free_stream happens to alias some
// future user stream, the allocation is only available for "opportunistic"
// reuse, ie, if the CPU sees dummy_unifying_free_stream has reached the
// point that all events recorded on all usage streams have resolved from
// the CPU's perspective. In theory, we could remove the need for the driver
// to do this tracking by e.g. replacing
// cudaStreamWaitEvent(dummy_unifying_free_stream.stream, event);
// with
// cudaStreamWaitEvent(creation_stream.stream, event);
// then cudaFreeAsyncing straight back into creation_stream.stream,
// but this forces a potentially false dependency of creation_stream.stream
// on all the recorded_streams.
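// For reference, that alternative would look roughly like (not used here):
//   for (const auto& rs : recorded_streams) {
//     CUDAGuard g_usage(rs.device);
//     sync_raw(rs.stream, creation_stream.stream);
//   }
//   C10_CUDA_CHECK(cudaFreeAsync(it->first, creation_stream.stream));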
if (C10_UNLIKELY(capture_underway)) {
// See Note [Avoid dangling free streams during CUDA graph capture]
capture_free_streams.emplace(
dummy_unifying_free_stream.stream, dummy_unifying_free_stream.device);
}
}
pytorch_used_bytes[creation_stream.device] -= it->second.size;
ptr_info.erase(it);
}
void freeAsync(void* ptr) {
std::lock_guard<std::mutex> lk(general_mutex);
auto err = cudaGetLastError();
C10_CUDA_CHECK(err);
auto it = ptr_info.find(ptr);
TORCH_INTERNAL_ASSERT(it != ptr_info.end(), "ptr not found in ptr_info");
if (C10_UNLIKELY(capture_underway)) {
if (!it->second.captured) {
TORCH_WARN_ONCE(
"freeAsync() was called on an uncaptured allocation during graph capture "
"(address = ",
ptr,
"). This may be benign, for example, a Python tensor in the capture "
"might happen to shadow (use the same name as) an unrelated temporary "
"tensor from somewhere before capture, pushing the earlier tensor "
"out of scope. "
"However, if the tensor we're freeing here IS used by the capture, "
"freeing it is an error, and may cause illegal memory accesses or "
"memory corruption during graph replay.");
// See Note [Avoid freeing uncaptured ptrs during CUDA graph capture]
// Remembers the raw pointer, not the iterator.
// This forces notifyCaptureEnded to do another lookup,
// but avoids the risk the iterator might be invalidated
// between now and then.
ungraphed_ptrs_defer_free_until_no_capture.push_back(ptr);
return;
}
} else if (C10_UNLIKELY(it->second.captured)) {
TORCH_WARN(
"Attempting uncaptured free of a captured allocation with address ",
ptr,
"\nThis is technically allowed, but may indicate you are losing "
"the last user-visible tensor through which the allocation can "
"be accessed, so you'll have no way to view the data after "
"future replays of the owning graph.");
}
free_impl(it);
}
// Symmetric with NativeCachingAllocator::malloc for now,
// although I don't think we absolutely need the symmetry.
void mallocAsync(void** devPtr, int device, size_t size, cudaStream_t stream) {
TORCH_INTERNAL_ASSERT(
0 <= device && device < device_count,
"Invalid device index ",
device,
": did you call init?");
// If stream is a null (default) stream,
// cudaMallocAsync infers the device from the ambient context,
// so we need to set the right ambient context.
CUDAGuard g(device);
std::lock_guard<std::mutex> lk(general_mutex);
if (!capture_underway &&
ungraphed_ptrs_defer_free_until_no_capture.size() > 0) {
// See Note [Avoid freeing uncaptured ptrs during CUDA graph capture]
for (const auto ptr : ungraphed_ptrs_defer_free_until_no_capture) {
auto it = ptr_info.find(ptr);
TORCH_INTERNAL_ASSERT(it != ptr_info.end(), "ptr not found in ptr_info");
free_impl(it);
}
ungraphed_ptrs_defer_free_until_no_capture.clear();
}
lazy_init_device(device);
// Defensively checks for preexisting CUDA error state.
auto err = cudaGetLastError();
C10_CUDA_CHECK(err);
// TODO: Could we avoid calling cudaMallocAsync while holding general_mutex,
// perhaps by letting lazy_init_device use separate once_flags or an internal
// static initializer?
if (pytorch_used_bytes[device] + size > pytorch_memory_limits[device]) {
err = cudaErrorMemoryAllocation;
} else {
err = cudaMallocAsync(devPtr, size, stream);
}
if (err == cudaErrorMemoryAllocation) {
// Clears CUDA's internal error state so the user, if desired, can catch the
// OOM exception, free some stuff on the script side, and retry the
// allocation. This aligns with the behavior of alloc_block in
// CUDACachingAllocator.cpp.
(void)cudaGetLastError(); // clear CUDA error
size_t device_free;
size_t device_total;
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
TORCH_CHECK_WITH(
OutOfMemoryError,
false,
"Allocation on device ",
device,
" would exceed allowed memory. (out of memory)",
"\nCurrently allocated : ",
format_size(pytorch_used_bytes[device]),
"\nRequested : ",
format_size(size),
"\nDevice limit : ",
format_size(device_total),
"\nFree (according to CUDA): ",
format_size(device_free),
"\nPyTorch limit (set by user-supplied memory fraction)"
"\n : ",
format_size(pytorch_memory_limits[device]));
} else {
C10_CUDA_CHECK(err);
}
auto inserted = ptr_info.emplace(*devPtr, PtrUsage(size, capture_underway));
TORCH_INTERNAL_ASSERT(
inserted.second,
"address returned by cudaMallocAsync already exists "
"in ptr_info");
inserted.first->second.creation_stream = {stream, device};
pytorch_used_bytes[device] += size;
}
} // anonymous namespace
void local_raw_delete(void* ptr);
// Same pattern as CUDACachingAllocator.cpp.
struct CudaMallocAsyncAllocator : public CUDAAllocator {
DataPtr allocate(size_t size) const override {
constexpr size_t one_exa_bytes = 1152921504606846976ULL;
TORCH_CHECK_WITH(
OutOfMemoryError,
size < one_exa_bytes,
"CUDA out of memory. Tried to allocate more than 1EB memory.");
int device;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
void* r = nullptr;
if (size != 0) {
mallocAsync(&r, device, size, cuda::getCurrentCUDAStream(device));
}
return {r, r, &local_raw_delete, Device(DeviceType::CUDA, device)};
}
DeleterFnPtr raw_deleter() const override {
return &local_raw_delete;
}
// This function should not issue any context-creating calls,
// just set up for later calls to init per-device pools based
// on the current device each later call sees.
void init(int dev_count) override {
static bool called = [](int dev_count) {
// Are there external guarantees init will be called before
// any of the allocator's other functions?
// std::lock_guard<std::mutex> lk(general_mutex);
device_count = dev_count;
devs_initialized_flags.resize(dev_count, false);
dummy_unifying_free_streams.resize(dev_count);
pytorch_used_bytes.resize(dev_count);
pytorch_memory_limits.resize(dev_count);
return true;
}(dev_count);
(void)called;
}
bool initialized() override {
return devs_initialized_flags.size() > 0;
}
static inline void assertValidDevice(int device) {
TORCH_CHECK(
0 <= device && device < device_count, "Invalid device argument.");
}
void setMemoryFraction(double fraction, int device) override {
TORCH_INTERNAL_ASSERT(
0 <= fraction && fraction <= 1,
"invalid fraction:",
fraction,
". Please set within [0, 1].");
std::lock_guard<std::mutex> lk(general_mutex);
assertValidDevice(device);
CUDAGuard g(device);
// Should setMemoryFraction be allowed to trigger a full device context and
// pool-creating lazy_init_device, or should we simply assert this device is
// already initialized, ie
// TORCH_CHECK(devs_initialized_flags[device], ...)?
lazy_init_device(device);
size_t device_free;
size_t device_total;
C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
pytorch_memory_limits[device] =
static_cast<uint64_t>(fraction * device_total);
// Alternative: Instead of a manual hard limit, we could use
// cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold,
// &threshold); This is a soft hint: The driver allows the pool's reserved
// memory to spike above threshold in regions of high cudaMallocAsync
// demand, but opportunistically trims reserved memory back to threshold
// when the memory in use is < threshold. I don't like this because it
// introduces performance nondeterminism.
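// For reference, a commented sketch of that soft-threshold alternative
// (not what this function does):
//   cudaMemPool_t mempool;
//   C10_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, device));
//   uint64_t threshold = static_cast<uint64_t>(fraction * device_total);
//   C10_CUDA_CHECK(cudaMemPoolSetAttribute(
//       mempool, cudaMemPoolAttrReleaseThreshold, &threshold));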
}
void emptyCache(void) override {
std::lock_guard<std::mutex> lk(general_mutex);
for (int dev = 0; dev < device_count; dev++) {
if (devs_initialized_flags[dev]) {
CUDAGuard g(dev);
cudaMemPool_t mempool;
cudaDeviceGetDefaultMemPool(&mempool, dev);
cudaDeviceSynchronize();
cudaMemPoolTrimTo(mempool, 0);
}
}
}
void cacheInfo(int device, size_t* maxWorkspaceGuess) override {
// The only consumer of cacheInfo is getMaxWorkspaceSize in Conv_v7.cpp.
// Afaict, the role of cacheInfo is to give getMaxWorkspaceSize a reasonable
// maximum workspace size to use for an upcoming cudnnFind call.
//
// The native allocator's cacheInfo chooses to return the size of its
// largest unused block (which is the largest allocation the native
// allocator can service immediately and asynchronously without a
// cudaMalloc.
//
// Here, we use a different heuristic: figure out the max usable workspace
// size with a bit of educated trial and error. It's ok to be
// perf-inefficient because cacheInfo is a prelude to cudnnFind.
//
// The algo cache then stores the best-performing algo with workspace <=
// maxWorkspaceGuess. Later calls with the same param set hit in cache and
// try to allocate the same workspace. If, in one of those future calls,
// workspace allocation fails (ie because less ambient memory is available),
// the bindings rerun cudnnFind, including calling cacheInfo again
// beforehand to estimate a new (smaller) largest-available workspace. Over
// a few such calls, the cache should settle to the algo with a workspace
// size that's small enough to succeed every time (for that param set).
//
// So the strategy here is to return a rough, largeish guess and let the
// bindings retry to trim as needed over time.
//
// The only caveat is, even if a workspace is allocated without OOM errors
// now and in future calls, it's hard to be sure those later error-free
// cudaMallocAsyncs are fast and come straight from the pool (ie,
// cudaMallocAsync didn't need to reserve more memory from the system).
// Hopefully, after repeated workspace requests, the pool's reserved memory
// also stabilizes to a point where they all come straight from the pool.
std::lock_guard<std::mutex> lk(general_mutex);
assertValidDevice(device);
CUDAGuard g(device);
lazy_init_device(device);
size_t free_upper_bound;
size_t device_total;
C10_CUDA_CHECK(cudaMemGetInfo(&free_upper_bound, &device_total));
TORCH_INTERNAL_ASSERT(
free_upper_bound + pytorch_used_bytes[device] <= device_total);
size_t guess = std::min(
free_upper_bound,
pytorch_memory_limits[device] - pytorch_used_bytes[device]);
auto stream = c10::cuda::getCurrentCUDAStream();
void* dummy;
// Defensively checks for preexisting CUDA error state.
auto err = cudaGetLastError();
C10_CUDA_CHECK(err);
while (true) {
// Duplicates some logic from mallocAsync to work with the error state
// directly instead of repeatedly catching an exception thrown by
// mallocAsync.
if (pytorch_used_bytes[device] + guess > pytorch_memory_limits[device]) {
err = cudaErrorMemoryAllocation;
} else {
err = cudaMallocAsync(&dummy, guess, stream);
}
if (err == cudaSuccess) {
cudaFreeAsync(dummy, stream);
*maxWorkspaceGuess = guess;
return;
} else if (err == cudaErrorMemoryAllocation) {
(void)cudaGetLastError(); // clear CUDA error
guess >>= 1; // quick and dirty: try half the size next iteration
} else {
C10_CUDA_CHECK(err);
}
}
}
void* getBaseAllocation(void* ptr, size_t* size) override {
std::lock_guard<std::mutex> lk(general_mutex);
auto it = ptr_info.find(ptr);
TORCH_INTERNAL_ASSERT(it != ptr_info.end(), "ptr not found in ptr_info");
if (size) {
*size = it->second.size;
}
return ptr;
}
void recordStream(const DataPtr& ptr, cuda::CUDAStream stream) override {
std::lock_guard<std::mutex> lk(general_mutex);
auto ptr_val = ptr.get();
// An empty tensor's storage().data() might be a null ptr. As there are no
// blocks associated with those tensors, it is fine to do nothing here.
if (!ptr_val) {
return;
}
// The pointer should exist in the map already.
auto it = ptr_info.find(ptr_val);
TORCH_INTERNAL_ASSERT(it != ptr_info.end(), "ptr not found in ptr_info");
UsageStream to_record{stream.stream(), stream.device_index()};
if (to_record == it->second.creation_stream) {
TORCH_WARN(
"Called record_stream on tensor whose original creation stream "
"matches the recorded stream. This is unnecessary and has no effect.");
} else {
it->second.recorded_streams.insert(to_record);
}
}
std::shared_ptr<void> getIpcDevPtr(std::string handle) override {
TORCH_CHECK(
false,
"cudaMallocAsync does not yet support getIpcDevPtr. "
"If you need it, please file an issue describing your use case.");
}
void recordHistory(
bool enabled,
CreateContextFn context_recorder,
size_t alloc_trace_max_entries,
bool alloc_trace_record_context) override {
TORCH_CHECK(
false,
"cudaMallocAsync does not yet support recordHistory. "
"If you need it, please file an issue describing your use case.");
}
void attachOutOfMemoryObserver(OutOfMemoryObserver observer) override {
TORCH_CHECK(
false,
"cudaMallocAsync does not yet support attachOutOfMemoryObserver. "
"If you need it, please file an issue describing your use case.");
}
std::shared_ptr<AllocatorState> getCheckpointState(int device, MempoolId_t id)
override {
TORCH_CHECK(
false,
"cudaMallocAsync does not yet support getCheckpointState. "
"If you need it, please file an issue describing your use case.");
}
CheckpointDelta setCheckpointPoolState(
int device,
std::shared_ptr<AllocatorState> pps) override {
TORCH_CHECK(
false,
"cudaMallocAsync does not yet support setCheckpointPoolState. "
"If you need it, please file an issue describing your use case.");
}
// Collects stats for device.
// If device hasn't been used yet, returns 0s without creating a context.
DeviceStats getDeviceStats(int device) override {
assertValidDevice(device);
// Memory currently reserved by the mempool
uint64_t reserved_mem_current = 0;
// High-water mark of memory reserved by the mempool since last reset
uint64_t reserved_mem_peak = 0;
// Memory currently in use by the mempool
uint64_t used_mem_current = 0;
// High-water mark of memory used by the mempool since last reset
uint64_t used_mem_peak = 0;
std::lock_guard<std::mutex> lk(general_mutex);
if (devs_initialized_flags[device]) {
CUDAGuard g(device);
cudaMemPool_t mempool;
C10_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, device));
C10_CUDA_CHECK(cudaMemPoolGetAttribute(
mempool, cudaMemPoolAttrReservedMemCurrent, &reserved_mem_current));
C10_CUDA_CHECK(cudaMemPoolGetAttribute(
mempool, cudaMemPoolAttrReservedMemHigh, &reserved_mem_peak));
C10_CUDA_CHECK(cudaMemPoolGetAttribute(
mempool, cudaMemPoolAttrUsedMemCurrent, &used_mem_current));
C10_CUDA_CHECK(cudaMemPoolGetAttribute(
mempool, cudaMemPoolAttrUsedMemHigh, &used_mem_peak));
}
// Many stat types are specific to the native allocator. We leave these
// untouched. Their "struct Stat"s will contain zeroed values.
DeviceStats stats;
// In the native allocator:
// allocated_bytes is the total bytes of blocks that have been malloc()ed
// and not yet free()d.
// active_bytes is the total bytes of blocks that have been malloc()ed but
// not yet released back into a free pool. In other words, it includes all
// allocated_bytes, as well as the bytes of "limbo state" blocks that have
// already been free()d but not yet free_block()ed back into a pool due to
// outstanding stream_uses.
//
// Here, in the cudaMallocAsync allocator:
// We simply ask the driver's opinion about active memory.
// We don't bother distinguishing between allocated_bytes and active_bytes.
stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)].current =
used_mem_current;
stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)].peak =
used_mem_peak;
stats.active_bytes[static_cast<size_t>(StatType::AGGREGATE)].current =
used_mem_current;
stats.active_bytes[static_cast<size_t>(StatType::AGGREGATE)].peak =
used_mem_peak;
stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current =
reserved_mem_current;
stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].peak =
reserved_mem_peak;
return stats;
}
void resetAccumulatedStats(int device) override {
assertValidDevice(device);
TORCH_WARN_ONCE(
"For backend:cudaMallocAsync, resetAccumulatedStats has no effect.");
}
void resetPeakStats(int device) override {
assertValidDevice(device);
CUDAGuard g(device);
cudaMemPool_t mempool;
C10_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, device));
// Using zero as the reset value is the method recommended by the CUDA
// driver team. Vivek Kini says:
// "Resetting to zero (which is the only valid value when setting
// ReservedMemHigh) resets it to ReservedMemCurrent inside the driver
// (same goes for UsedMemHigh/UsedMemCurrent)"
uint64_t zero = 0;
C10_CUDA_CHECK(cudaMemPoolSetAttribute(
mempool, cudaMemPoolAttrReservedMemHigh, &zero));
C10_CUDA_CHECK(
cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrUsedMemHigh, &zero));
}
SnapshotInfo snapshot() override {
TORCH_CHECK(
false,
"Calling snapshot with backend:cudaMallocAsync is not meaningful. "
"(For backend:native, snapshot returns a detailed summary of all "
"blocks tracked by the allocator, but the cudaMallocAsync backend "
"does not track individual blocks.)");
// Alternative: TORCH_WARN
return {};
}
// CUDAGraph interactions
void beginAllocateStreamToPool(
int device,
cudaStream_t stream,
MempoolId_t mempool_id) override {
std::lock_guard<std::mutex> lk(general_mutex);
TORCH_INTERNAL_ASSERT(capture_free_streams.empty());
TORCH_CHECK(
!capture_underway,
"Only one capture at a time is allowed in a process.")
capture_underway = true;
}
void endAllocateStreamToPool(int device, cudaStream_t) override {
assertValidDevice(device);
std::lock_guard<std::mutex> lk(general_mutex);
TORCH_CHECK(
capture_underway,
"CudaMallocAsync::notifyCaptureAboutToEnd called, "
"but CudaMallocAsync::capture_underway is false.");
auto capture_stream = cuda::getCurrentCUDAStream(device);
// See Note [Avoid dangling free streams during CUDA graph capture]
for (const auto& free_stream : capture_free_streams) {
// cudaEventRecord requires that the input event and stream are on the
// same device.
CUDAGuard g(free_stream.device);
// CUDACachingAllocator.cpp uses raw cuda events, as do we.
cudaEvent_t event;
C10_CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
C10_CUDA_CHECK(cudaEventRecord(event, free_stream.stream));
C10_CUDA_CHECK(cudaStreamWaitEvent(capture_stream.stream(), event));
C10_CUDA_CHECK(cudaEventDestroy(event));
}
capture_free_streams.clear();
capture_underway = false;
}
void releasePool(int device, MempoolId_t mempool_id) override {
// Q: Do we need to do anything special here, like clear long-lived
// pointers created during the original capture (for example,
// tensors intended as the graph's I/O surface) that might still
// be resident in ptr_info?
// A: I don't think so.
// Those allocations survived capture because the user held
// explicit tensor references to them. Those tensors' destructors
// will call freeAsync() on each pointer
// when the user is done with them.
// The freeAsync()s will probably incur
// TORCH_WARN("Attempting uncaptured free of a captured allocation..."
// but stale ptrs will not permanently leak into ptr_info.
}
void* raw_alloc(size_t nbytes) override {
if (nbytes == 0) {
return nullptr;
}
int device;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
void* r = nullptr;
mallocAsync(&r, device, nbytes, cuda::getCurrentCUDAStream(device));
return r;
}
void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) override {
if (nbytes == 0) {
return nullptr;
}
int device;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
void* r = nullptr;
mallocAsync(&r, device, nbytes, stream);
return r;
}
void raw_delete(void* ptr) override {
freeAsync(ptr);
}
void enablePeerAccess(int dev, int dev_to_access) override {
// Double-checks allocator backend hasn't changed, which would definitely be
// an error. cudaMallocAsync pools are unaffected by
// cudaDeviceEnablePeerAccess. We need pool-specific enablement. See
// https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-2/
c10::cuda::CUDAGuard device_guard(dev);
cudaMemPool_t mempool;
C10_CUDA_CHECK(cudaDeviceGetDefaultMemPool(&mempool, dev_to_access));
cudaMemAccessDesc desc = {};
desc.location.type = cudaMemLocationTypeDevice;
desc.location.id = dev;
desc.flags = cudaMemAccessFlagsProtReadWrite;
C10_CUDA_CHECK(cudaMemPoolSetAccess(mempool, &desc, 1 /* numDescs */));
}
cudaError_t memcpyAsync(
void* dst,
int dstDevice,
const void* src,
int srcDevice,
size_t count,
cudaStream_t stream,
bool p2p_enabled) override {
if (p2p_enabled || dstDevice == srcDevice) {
return cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, stream);
} else {
return cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream);
}
}
std::string name() override {
return "cudaMallocAsync";
}
};
CudaMallocAsyncAllocator device_allocator;
void local_raw_delete(void* ptr) {
freeAsync(ptr);
}
CUDAAllocator* allocator() {
return &device_allocator;
}
#else
CUDAAllocator* allocator() {
TORCH_CHECK(false, "Cannot use cudaMallocAsyncAllocator with cuda < 11.4.");
return nullptr;
}
#endif
} // namespace CudaMallocAsync
} // namespace CUDACachingAllocator
} // namespace cuda
} // namespace c10