[caffe2][cuda] Trace `allocate` and `local_raw_delete` events with PyTorch USDTs (#107322)

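Summary: Adds static tracepoints (USDTs) to the CUDA caching allocator so that allocation and deallocation events can be traced. The `malloc` tracepoint fires in `allocate` immediately before the allocator's `malloc` call, and the `free` tracepoint fires in `local_raw_delete` immediately before `allocator.free(ptr)`; each is guarded by its semaphore check, so its arguments are only passed to the probe while that tracepoint is enabled.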

Test Plan: This change simply adds static tracepoints to CUDA allocator code, and does not otherwise change any logic. Testing is not required.

Reviewed By: chaekit

Differential Revision: D48229150

Pull Request resolved: https://github.com/pytorch/pytorch/pull/107322
Approved by: https://github.com/chaekit
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index e86cd24..25bf35b 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -9,6 +9,7 @@
 #include <c10/util/flat_hash_map.h>
 #include <c10/util/irange.h>
 #include <c10/util/llvmMathExtras.h>
+#include <c10/util/static_tracepoint.h>
 
 #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
 #include <c10/cuda/driver_api.h>
@@ -33,6 +34,9 @@
 #include <utility>
 #include <vector>
 
+TORCH_SDT_DEFINE_SEMAPHORE(malloc)
+TORCH_SDT_DEFINE_SEMAPHORE(free)
+
 namespace c10 {
 
 C10_DEFINE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
@@ -3306,6 +3310,10 @@
       return {r, r, &uncached_delete, Device(DeviceType::CUDA, device)};
     }
     if (size != 0) {
+      if (TORCH_SDT_IS_ENABLED(malloc)) {
+        TORCH_SDT_WITH_SEMAPHORE(malloc, &r, device, size, 0);
+      }
+
       // Allocator declars allocate const!?
       const_cast<NativeCachingAllocator*>(this)->malloc(
           &r, device, size, cuda::getCurrentCUDAStream(device));
@@ -3483,6 +3491,10 @@
 NativeCachingAllocator allocator;
 
 void local_raw_delete(void* ptr) {
+  if (TORCH_SDT_IS_ENABLED(free)) {
+    TORCH_SDT_WITH_SEMAPHORE(free, ptr);
+  }
+
   allocator.free(ptr);
 }