[PyTorch GPU Allocator] Better use of blocks with rounding of allocation sizes (#74213)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/74213
In the current CUDACachingAllocator, sizes are rounded up in multiples of the block size of 512, which works well for smaller sizes. However, for large sizes we can end up with many differently sized blocks in the large pool. This is problematic with variable batch sizes such as 1001, 1021, 1023 -> each maps to a different rounded size and creates a block of a different size. This produces lots of unused blocks and wastes GPU memory capacity.
This diff adds rounding of allocation sizes. It rounds a requested size up to the nearest power-of-2 division, and the number of divisions per power of 2 can be changed via an environment variable.
For example, if we need to round up a size of 1200 and the number of divisions is 4,
the size 1200 lies between 1024 and 2048; dividing that interval into 4 parts
gives 1024, 1280, 1536, and 1792. So the function will
return 1280 as the nearest ceiling of a power-2 division.
env setting:
export PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
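As a minimal standalone sketch (not the allocator code itself, and assuming divisions is a power of 2, as the config parsing below enforces), the same rounding can be reproduced with C++20 <bit> utilities in place of the c10/llvm helpers used in the diff:

  #include <bit>       // std::bit_floor, std::has_single_bit (C++20)
  #include <cstddef>
  #include <iostream>

  // Mirrors roundup_power2_next_division from the diff below, with
  // llvm::PowerOf2Floor / llvm::isPowerOf2_64 swapped for std::bit utilities.
  static std::size_t roundup_power2_next_division(std::size_t size, std::size_t divisions) {
    if (size <= 4 || divisions <= 1 || std::has_single_bit(size)) {
      return size;  // tiny sizes and exact powers of 2 are returned unchanged
    }
    std::size_t power2_floor = std::bit_floor(size);  // largest power of 2 <= size
    std::size_t step = power2_floor / divisions;      // width of one division
    if (step == 0) {
      return power2_floor << 1;  // more divisions than fit: round to the power-of-2 ceiling
    }
    std::size_t round_size_floor = size & ~(step - 1);  // round down to a division boundary
    return (round_size_floor == size) ? size : round_size_floor + step;
  }

  int main() {
    std::cout << roundup_power2_next_division(1200, 4) << "\n";  // prints 1280
  }

Compiled with -std=c++20, this prints 1280 for the 1200/4 example above.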
ghstack-source-id: 151446017
Reviewed By: ezyang
Differential Revision: D34868036
fbshipit-source-id: 494785add16e6b37c920dcb5a2b81d4c637b554a
(cherry picked from commit 548454ccacbd8700e7ffd2d762e40b4ba37abbae)
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index c1ac4bd..9c203da 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -7,6 +7,7 @@
#include <c10/util/UniqueVoidPtr.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>
+#include <c10/util/llvmMathExtras.h>
#include <cuda_runtime_api.h>
#include <algorithm>
@@ -331,6 +332,14 @@
return instance().m_max_split_size;
}
+ // This is used to round up the allocation size to the nearest power-of-2 division.
+ // More description below in the function roundup_power2_next_division.
+ // As an example, if we want 4 divisions between powers of 2, this can be done
+ // using the env variable: PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4
+ static size_t roundup_power2_divisions() {
+ return instance().m_roundup_power2_divisions;
+ }
+
private:
static CachingAllocatorConfig& instance() {
static CachingAllocatorConfig* s_instance = ([]() {
@@ -342,8 +351,10 @@
}
CachingAllocatorConfig()
- : m_max_split_size(std::numeric_limits<size_t>::max()) {}
+ : m_max_split_size(std::numeric_limits<size_t>::max()),
+ m_roundup_power2_divisions(0) {}
size_t m_max_split_size;
+ size_t m_roundup_power2_divisions;
void parseArgs() {
const char* val = getenv("PYTORCH_CUDA_ALLOC_CONF");
@@ -373,6 +384,13 @@
val2 = std::min(
val2, (std::numeric_limits<size_t>::max() / (1024 * 1024)));
m_max_split_size = val2 * 1024 * 1024;
+ } else if (kv[0].compare("roundup_power2_divisions") == 0) {
+ size_t val2 = stoi(kv[1]);
+ TORCH_CHECK(
+ llvm::isPowerOf2_64(val2),
+ "For roundups, the divisons has to be power of 2 ",
+ "");
+ m_roundup_power2_divisions = val2;
} else {
TORCH_CHECK(false, "Unrecognized CachingAllocator option: ", kv[0]);
}
@@ -808,11 +826,43 @@
return result;
}
+ // This function takes the size and the number of divisions and rounds the
+ // size up to the nearest power-of-2 division.
+ // For example, if we need to round up 1200 and the number of divisions is 4,
+ // the size 1200 lies between 1024 and 2048; dividing that interval into 4
+ // parts gives 1024, 1280, 1536, and 1792. So the function will
+ // return 1280 as the nearest ceiling of a power-2 division.
+ static size_t roundup_power2_next_division(size_t size, size_t divisions) {
+ if (C10_UNLIKELY(size <= 4 || divisions <= 1)) {
+ return size;
+ }
+ if (llvm::isPowerOf2_64(size)) {
+ return size;
+ }
+
+ // Divide the space between these two powers of 2 into equal divisions.
+ // If the division size is zero, return the power-of-2 ceiling.
+ size_t power2_floor = llvm::PowerOf2Floor(size);
+ size_t power2_division =
+ power2_floor >> (63 - llvm::countLeadingZeros(divisions));
+ if (C10_UNLIKELY(power2_division == 0)) {
+ return (power2_floor << 1);
+ }
+ size_t round_size_floor = size & (~(power2_division - 1));
+ return (round_size_floor == size) ? size
+ : round_size_floor + power2_division;
+ }
+
static size_t round_size(size_t size) {
if (size < kMinBlockSize) {
return kMinBlockSize;
} else {
- return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
+ auto divisions = CachingAllocatorConfig::roundup_power2_divisions();
+ if (divisions > 0 && size > (kMinBlockSize * divisions)) {
+ return roundup_power2_next_division(size, divisions);
+ } else {
+ return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
+ }
}
}
diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst
index b2901a6..c0de532 100644
--- a/docs/source/notes/cuda.rst
+++ b/docs/source/notes/cuda.rst
@@ -364,6 +364,20 @@
:meth:`~torch.cuda.memory_summary` methods are useful for tuning. This
option should be used as a last resort for a workload that is aborting
due to 'out of memory' and showing a large amount of inactive split blocks.
+* ``roundup_power2_divisions`` helps with rounding the requested allocation
+ size to the nearest power-2 division and making better use of the blocks.
+ In the current CUDACachingAllocator, sizes are rounded up in multiples of
+ the block size of 512, which works fine for smaller sizes. However, this
+ can be inefficient for large nearby allocations, as each goes to a
+ different block size and re-use of those blocks is minimized. This can
+ create lots of unused blocks and waste GPU memory capacity. This option
+ rounds the allocation size up to the nearest power-2 division. For
+ example, if we need to round up a size of 1200 and the number of divisions
+ is 4, the size 1200 lies between 1024 and 2048, and dividing that interval
+ into 4 parts gives 1024, 1280, 1536, and 1792. So an allocation size of
+ 1200 will be rounded to 1280, the nearest power-2 division ceiling.
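+ A setting such as ``PYTORCH_CUDA_ALLOC_CONF=roundup_power2_divisions:4``
+ selects 4 divisions between consecutive powers of 2.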
.. _cufft-plan-cache: