[caffe2][cuda] Trace `allocate` and `local_raw_delete` events with PyTorch USDTs (#107322)
Summary: Adds static tracepoints (USDTs) to the CUDA caching allocator so that allocation and deallocation events can be traced.
Test Plan: This change only adds static tracepoints to the CUDA allocator code and does not otherwise change any logic, so no additional testing is required.
Reviewed By: chaekit
Differential Revision: D48229150
Pull Request resolved: https://github.com/pytorch/pytorch/pull/107322
Approved by: https://github.com/chaekit
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index e86cd24..25bf35b 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -9,6 +9,7 @@
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>
#include <c10/util/llvmMathExtras.h>
+#include <c10/util/static_tracepoint.h>
#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
#include <c10/cuda/driver_api.h>
@@ -33,6 +34,9 @@
#include <utility>
#include <vector>
+TORCH_SDT_DEFINE_SEMAPHORE(malloc)
+TORCH_SDT_DEFINE_SEMAPHORE(free)
+
namespace c10 {
C10_DEFINE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
@@ -3306,6 +3310,10 @@
return {r, r, &uncached_delete, Device(DeviceType::CUDA, device)};
}
if (size != 0) {
+ if (TORCH_SDT_IS_ENABLED(malloc)) {
+ TORCH_SDT_WITH_SEMAPHORE(malloc, &r, device, size, 0);
+ }
+
// Allocator declars allocate const!?
const_cast<NativeCachingAllocator*>(this)->malloc(
&r, device, size, cuda::getCurrentCUDAStream(device));
@@ -3483,6 +3491,10 @@
NativeCachingAllocator allocator;
void local_raw_delete(void* ptr) {
+ if (TORCH_SDT_IS_ENABLED(free)) {
+ TORCH_SDT_WITH_SEMAPHORE(free, ptr);
+ }
+
allocator.free(ptr);
}