Fix a crash in the cudaMallocAsync allocator when TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC is set.
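
The preallocation requested through TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC
used to run in the GpuCudaMallocAsyncAllocator constructor, before
SetStream() had initialized cuda_stream_. The first AllocateRaw() call
therefore used an uninitialized stream and crashed. This change defers the
preallocation to SetStream(), where a valid CUstream is available, stores
the reserve_memory constructor argument in a new reserve_memory_ member so
the deferred code can still honor it, and reads the pool size back from
CU_MEMPOOL_ATTR_RELEASE_THRESHOLD since the constructor's pool_size
argument is no longer in scope. A regression test creates a GPU device
with the env var set and allocates through the resulting allocator.

The reservation follows the usual cudaMallocAsync warm-up pattern: one
allocation of the target size that is freed immediately stays cached in
the pool, because the pool only shrinks back to its release threshold. A
minimal standalone sketch of that pattern with the CUDA driver API
(illustrative only, not TF code; error checking omitted):

    #include <cstdio>
    #include <cuda.h>

    int main() {
      cuInit(0);
      CUdevice dev;
      cuDeviceGet(&dev, 0);
      CUcontext ctx;
      cuCtxCreate(&ctx, 0, dev);
      CUstream stream;
      cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);

      // The allocator reads this same attribute in SetStream() to learn
      // the pool size.
      CUmemoryPool pool;
      cuDeviceGetDefaultMemPool(&pool, dev);
      cuuint64_t threshold = 1ull << 30;  // Target pool size: 1 GiB.
      cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
                            &threshold);

      // Reserve the pool: allocate the full size once and free it right
      // away. The memory stays cached in the pool because it does not
      // exceed the release threshold at synchronization.
      CUdeviceptr ptr;
      cuMemAllocFromPoolAsync(&ptr, threshold, pool, stream);
      cuMemFreeAsync(ptr, stream);
      cuStreamSynchronize(stream);
      printf("pool reserved: %llu bytes\n", (unsigned long long)threshold);
      return 0;
    }
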
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc
index acc40fb..6d26a76 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc
@@ -100,7 +100,8 @@
 GpuCudaMallocAsyncAllocator::GpuCudaMallocAsyncAllocator(
     PlatformDeviceId platform_device_id, size_t pool_size, bool reserve_memory,
     bool compute_stats)
-    : name_(absl::StrCat("gpu_async_", platform_device_id.value())) {
+    : name_(absl::StrCat("gpu_async_", platform_device_id.value())),
+      reserve_memory_(reserve_memory) {
   ++number_instantiated_;
 #if TF_CUDA_MALLOC_ASYNC_SUPPORTED
   stream_exec_ = DeviceIdUtil::ExecutorForPlatformDeviceId(GPUMachineManager(),
@@ -255,25 +256,6 @@
   all_ids_->push_back(platform_device_id);
 
   VLOG(2) << Name() << " GpuCudaMallocAsyncAllocator PoolSize " << pool_size;
-  int64 prealloc_size = 0;
-  // TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC=-1 is a special value that
-  // preallocates the total pool size.
-  TF_CHECK_OK(ReadInt64FromEnvVar("TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC", 0,
-                                  &prealloc_size));
-  if (prealloc_size == -1) {
-    prealloc_size = pool_size;
-  } else if (reserve_memory) {
-    prealloc_size = pool_size;
-  }
-
-  if (prealloc_size != 0) {
-    void* ptr = AllocateRaw(0, prealloc_size);
-    DeallocateRaw(ptr);
-    VLOG(2) << Name() << " GpuCudaMallocAsyncAllocator reserved the pool for "
-            << prealloc_size << " bytes"
-            << ". First ptr: " << ptr;
-    ClearStats();
-  }
 #else   // TF_CUDA_MALLOC_ASYNC_SUPPORTED
   LOG(FATAL) << "GpuCudaMallocAsyncAllocator requires CUDA 11.2+";  // Crash OK.
 #endif  // TF_CUDA_MALLOC_ASYNC_SUPPORTED
@@ -397,4 +379,35 @@
   return true;
 }
 
+void GpuCudaMallocAsyncAllocator::SetStream(void* stream) {
+#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
+  uint64_t pool_size_64 = 0;
+  if (auto status = cuMemPoolGetAttribute(
+          pool_, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &pool_size_64)) {
+    LOG(FATAL) <<  // Crash OK.
+        "Failed to get CUDA pool attribute: " << GetCudaErrorMessage(status);
+  }
+  cuda_stream_ = *(static_cast<CUstream*>(stream));
+  int64 prealloc_size = 0;
+  // TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC=-1 is a special value that
+  // preallocates the total pool size.
+  TF_CHECK_OK(ReadInt64FromEnvVar("TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC", 0,
+                                  &prealloc_size));
+  if (prealloc_size == -1) {
+    prealloc_size = pool_size_64;
+  } else if (reserve_memory_) {
+    prealloc_size = pool_size_64;
+  }
+
+  if (prealloc_size != 0) {
+    void* ptr = AllocateRaw(0, prealloc_size);
+    DeallocateRaw(ptr);
+    VLOG(2) << Name() << " GpuCudaMallocAsyncAllocator reserved the pool for "
+            << prealloc_size << " bytes"
+            << ". First ptr: " << ptr;
+    ClearStats();
+  }
+#endif  // TF_CUDA_MALLOC_ASYNC_SUPPORTED
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h
index af61ad5..1d6ef9d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.h
@@ -83,11 +83,7 @@
 
   bool ClearStats() override;
 
-  void SetStream(void* stream) override {
-#if TF_CUDA_MALLOC_ASYNC_SUPPORTED
-    cuda_stream_ = *(static_cast<CUstream*>(stream));
-#endif
-  }
+  void SetStream(void* stream) override;
 
   // With the right VLOG set, it prints:
   // - the number of ptr currently allocated per size (histogram).
@@ -120,6 +116,8 @@
 
   string name_;
 
+  bool reserve_memory_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(GpuCudaMallocAsyncAllocator);
 
   // Stats.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index 560d211..c2a8318 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -25,6 +25,7 @@
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -158,6 +159,40 @@
   EXPECT_EQ(status.code(), error::OK);
 }
 
+TEST_F(GPUDeviceTest, CudaMallocAsyncPreallocate) {
+  SessionOptions opts = MakeSessionOptions("0", 0, 1, {}, {},
+                                           /*use_cuda_malloc_async=*/true);
+  setenv("TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC", "2048", 1);
+  std::vector<std::unique_ptr<Device>> devices;
+  Status status;
+
+  int number_instantiated =
+      GpuCudaMallocAsyncAllocator::GetInstantiatedCountTestOnly();
+  {  // The new scope is to trigger the destruction of the object.
+    status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+        opts, kDeviceNamePrefix, &devices);
+    EXPECT_EQ(devices.size(), 1);
+    Device* device = devices[0].get();
+    auto* device_info = device->tensorflow_gpu_device_info();
+    CHECK(device_info);
+    DeviceContext* device_context = device_info->default_context;
+
+    AllocatorAttributes allocator_attributes = AllocatorAttributes();
+    allocator_attributes.set_gpu_compatible(true);
+    Allocator* allocator = devices[0]->GetAllocator(allocator_attributes);
+    void* ptr = allocator->AllocateRaw(Allocator::kAllocatorAlignment, 1024);
+    EXPECT_NE(ptr, nullptr);
+    allocator->DeallocateRaw(ptr);
+  }
+
+  unsetenv("TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC");
+
+  EXPECT_EQ(number_instantiated + 1,
+            GpuCudaMallocAsyncAllocator::GetInstantiatedCountTestOnly());
+
+  EXPECT_EQ(status.code(), error::OK);
+}
+
 TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
   SessionOptions opts = MakeSessionOptions("0,abc");
   std::vector<std::unique_ptr<Device>> devices;
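
--
Usage note (a sketch, with assumptions flagged): outside the new test,
this path is exercised by setting the preallocation variable before any
GPU device is created; -1 reserves the full pool and any positive value
reserves that many bytes. With VLOG(2) enabled for the allocator, the
reservation logs "GpuCudaMallocAsyncAllocator reserved the pool for N
bytes". The TF_GPU_ALLOCATOR=cuda_malloc_async switch below is the usual
way this allocator is selected; treat it as an assumption if your build
predates it.

    #include <cstdlib>

    int main() {
      // Both variables must be set before TensorFlow creates its GPU
      // devices (assumed selection switch for this allocator).
      setenv("TF_GPU_ALLOCATOR", "cuda_malloc_async", 1);
      // -1 preallocates the full pool; a positive value such as "2048"
      // (as in the new test) reserves that many bytes.
      setenv("TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC", "-1", 1);
      // ... then create devices / run the model as usual ...
      return 0;
    }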