expandable_segments <-> other allocator options (#134338)

Previously, setting garbage_collection_threshold or max_split_size_mb along with expandable_segments:True could cause the allocator to hit assert failures when running nearly out of memory. This PR ensures that garbage collection and max_split freeing do not accidentally try to release expandable segments.
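
For context, a minimal sketch of the kind of configuration that previously could hit the assert, adapted from the new tests added below; the 120 MiB cap and tensor sizes are illustrative, not part of the fix itself:

```python
# Sketch only: the per-process cap and allocation sizes are illustrative,
# mirroring the new tests added in this PR.
import torch

mb = 1024 * 1024
_, total = torch.cuda.memory.mem_get_info()
# Cap usable memory so the allocator runs nearly out of memory quickly.
torch.cuda.memory.set_per_process_memory_fraction(120 * mb / total)

# Mix non-expandable and expandable segments in the same pool while
# max_split_size_mb (or garbage_collection_threshold) is set.
torch.cuda.memory._set_allocator_settings("expandable_segments:False,max_split_size_mb:40")
a = torch.ones(40 * mb, dtype=torch.int8, device="cuda")
torch.cuda.memory._set_allocator_settings("expandable_segments:True,max_split_size_mb:40")
b = torch.ones(40 * mb, dtype=torch.int8, device="cuda")
del a, b

# This allocation forces release_cached_blocks()/garbage collection to walk a
# free list that now contains expandable segments; before this fix that path
# could try to release them and trip the assert.
c = torch.ones(100 * mb, dtype=torch.int8, device="cuda")
```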

Pull Request resolved: https://github.com/pytorch/pytorch/pull/134338
Approved by: https://github.com/ezyang
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index bfab450..fa26a9f 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -2611,7 +2611,7 @@
       while (it != large_blocks.blocks.end()) {
         Block* block = *it;
         ++it;
-        if (!block->is_split() &&
+        if (!block->is_split() && !block->expandable_segment_ &&
             static_cast<double>(block->gc_count()) >= age_threshold) {
           block_freed = true;
           gc_reclaimed += block->size;
@@ -2754,7 +2754,8 @@
         ? CUDAAllocatorConfig::max_split_size()
         : key.size;
     auto it = pool.blocks.lower_bound(&key);
-    if (it == pool.blocks.end() || (*it)->stream != p.stream()) {
+    if (it == pool.blocks.end() || (*it)->stream != p.stream() ||
+        (*it)->expandable_segment_) {
       // No single block is large enough; free multiple oversize blocks,
       // starting with the largest
       if (it == pool.blocks.begin())
@@ -2766,12 +2767,15 @@
              ((*it)->size >= CUDAAllocatorConfig::max_split_size()) &&
              ((*it)->stream == p.stream())) {
         auto cur = it;
-        totalReleased += (*it)->size;
-        if (it != pool.blocks.begin()) {
+        bool is_first = cur == pool.blocks.begin();
+        if (!is_first) {
           --it;
+        }
+        if (!(*cur)->expandable_segment_) {
           release_block(*cur, context);
-        } else {
-          release_block(*cur, context);
+          totalReleased += (*cur)->size;
+        }
+        if (is_first) {
           break;
         }
       }
diff --git a/test/test_cuda.py b/test/test_cuda.py
index b2f1ef3..f53ae5f 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -4098,6 +4098,62 @@
         finally:
             torch.cuda.memory._record_memory_history(None)
 
+    def test_max_split_expandable(self):
+        torch.cuda.memory.empty_cache()
+        mb = 1024 * 1024
+        _, all_memory = torch.cuda.memory.mem_get_info()
+        total_allowed = 120 * mb
+        fraction_allowed = total_allowed / all_memory
+        assert int(fraction_allowed * all_memory) == total_allowed
+        torch.cuda.memory.set_per_process_memory_fraction(fraction_allowed)
+
+        def alloc(n):
+            return torch.ones(n * mb, dtype=torch.int8, device="cuda")
+
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:False,max_split_size_mb:40"
+        )
+        a = alloc(40)
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:True,max_split_size_mb:40"
+        )
+        b = alloc(40)
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:False,max_split_size_mb:40"
+        )
+        c = alloc(40)
+        with self.assertRaises(torch.OutOfMemoryError):
+            alloc(40)
+        del a, b, c
+        # force release_cached_blocks to run with some expandable segments in the free list
+        alloc(120)
+
+    def test_garbage_collect_expandable(self):
+        torch.cuda.memory.empty_cache()
+        mb = 1024 * 1024
+        _, all_memory = torch.cuda.memory.mem_get_info()
+        total_allowed = 120 * mb
+        fraction_allowed = total_allowed / all_memory
+        assert int(fraction_allowed * all_memory) == total_allowed
+        torch.cuda.memory.set_per_process_memory_fraction(fraction_allowed)
+
+        def alloc(n):
+            return torch.ones(n * mb, dtype=torch.int8, device="cuda")
+
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:False,garbage_collection_threshold:0.5"
+        )
+        a = alloc(40)
+        torch.cuda.memory._set_allocator_settings(
+            "expandable_segments:True,garbage_collection_threshold:0.5"
+        )
+        b = alloc(40)
+        del a, b
+        # causes GC to run. The expandable segment block will be split
+        # so GC would not attempt to free it anyway, but this at least makes sure
+        # expandable_segment blocks can be in the free list when this is called.
+        alloc(80)
+
     def test_allocator_settings(self):
         def power2_div(size, div_factor):
             pow2 = 1