Fix CUDA error not getting captured by handler (#92227)

Fixes #91758. The check macros now capture the error code returned by the wrapped CUDA API call and pass it explicitly into `c10_cuda_check_implementation`, rather than discarding it and re-querying the error state with `cudaGetLastError()` inside the handler. This still leaves function calls on the hot path.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/92227
Approved by: https://github.com/ngimel, https://github.com/malfet
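
To illustrate the pattern this patch moves to, here is a minimal standalone sketch (not the c10 implementation; the handler name `my_check_impl` and macro name `MY_CUDA_CHECK` are hypothetical): the macro captures the error code returned by the CUDA call itself and forwards it to the handler, so the handler no longer depends on the sticky error state reported by `cudaGetLastError()`.

```cpp
// Minimal sketch of forwarding the call's own error code to the handler.
// Names here are illustrative, not the c10 API.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

void my_check_impl(
    int32_t err,
    const char* file,
    const char* func,
    uint32_t line) {
  // Reconstruct the CUDA error from the forwarded integer code.
  const auto cuda_error = static_cast<cudaError_t>(err);
  if (cuda_error != cudaSuccess) {
    std::fprintf(
        stderr,
        "CUDA error \"%s\" at %s:%u (%s)\n",
        cudaGetErrorString(cuda_error),
        file,
        line,
        func);
  }
}

#define MY_CUDA_CHECK(EXPR)                     \
  do {                                          \
    /* Capture the error returned by the call   \
       itself instead of querying               \
       cudaGetLastError() later. */             \
    const cudaError_t __err = (EXPR);           \
    my_check_impl(                              \
        static_cast<int32_t>(__err),            \
        __FILE__,                               \
        __func__,                               \
        static_cast<uint32_t>(__LINE__));       \
  } while (0)

int main() {
  int device_count = -1;
  // The error code of this specific call is what gets checked.
  MY_CUDA_CHECK(cudaGetDeviceCount(&device_count));
  std::printf("device count: %d\n", device_count);
  return 0;
}
```
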
diff --git a/c10/cuda/CUDADeviceAssertionHost.cpp b/c10/cuda/CUDADeviceAssertionHost.cpp
index 7c898e2..68cb63c 100644
--- a/c10/cuda/CUDADeviceAssertionHost.cpp
+++ b/c10/cuda/CUDADeviceAssertionHost.cpp
@@ -13,8 +13,17 @@
 #include <string>
 #include <thread>
 
-#define CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS() \
-  c10_cuda_check_implementation(__FILE__, __FUNCTION__, __LINE__, false)
+#define C10_CUDA_CHECK_WO_DSA(EXPR)                                 \
+  do {                                                              \
+    const cudaError_t __err = EXPR;                                 \
+    c10::cuda::c10_cuda_check_implementation(                       \
+        static_cast<int32_t>(__err),                                \
+        __FILE__,                                                   \
+        __func__, /* Line number data type not well-defined between \
+                      compilers, so we perform an explicit cast */  \
+        static_cast<uint32_t>(__LINE__),                            \
+        false);                                                     \
+  } while (0)
 
 namespace c10 {
 namespace cuda {
@@ -27,8 +36,7 @@
 /// an infinite initialization loop for CUDAKernelLaunchRegistry
 int dsa_get_device_id() {
   int device = -1;
-  C10_CUDA_ERROR_HANDLED(cudaGetDevice(&device));
-  CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
+  C10_CUDA_CHECK_WO_DSA(cudaGetDevice(&device));
   return device;
 }
 
@@ -41,9 +49,8 @@
 /// an infinite initialization loop for CUDAKernelLaunchRegistry
 int dsa_get_device_compute_capability(const int device_num) {
   int compute_capability = -1;
-  C10_CUDA_ERROR_HANDLED(cudaDeviceGetAttribute(
+  C10_CUDA_CHECK_WO_DSA(cudaDeviceGetAttribute(
       &compute_capability, cudaDevAttrComputeCapabilityMajor, device_num));
-  CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
   return compute_capability;
 }
 #endif
@@ -53,8 +60,7 @@
 /// an infinite initialization loop for CUDAKernelLaunchRegistry
 int dsa_get_device_count() {
   int device_count = -1;
-  C10_CUDA_ERROR_HANDLED(cudaGetDeviceCount(&device_count));
-  CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
+  C10_CUDA_CHECK_WO_DSA(cudaGetDeviceCount(&device_count));
   return device_count;
 }
 
@@ -288,25 +294,22 @@
   // system
   DeviceAssertionsData* uvm_assertions_ptr = nullptr;
 
-  C10_CUDA_ERROR_HANDLED(
+  C10_CUDA_CHECK_WO_DSA(
       cudaMallocManaged(&uvm_assertions_ptr, sizeof(DeviceAssertionsData)));
-  CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
 
-  C10_CUDA_ERROR_HANDLED(cudaMemAdvise(
+  C10_CUDA_CHECK_WO_DSA(cudaMemAdvise(
       uvm_assertions_ptr,
       sizeof(DeviceAssertionsData),
       cudaMemAdviseSetPreferredLocation,
       cudaCpuDeviceId));
-  CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
 
   // GPU will establish direct mapping of data in CPU memory, no page faults
   // will be generated
-  C10_CUDA_ERROR_HANDLED(cudaMemAdvise(
+  C10_CUDA_CHECK_WO_DSA(cudaMemAdvise(
       uvm_assertions_ptr,
       sizeof(DeviceAssertionsData),
       cudaMemAdviseSetAccessedBy,
       cudaCpuDeviceId));
-  CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
 
   // Initialize the memory from the CPU; otherwise, pages may have to be created
   // on demand. We think that UVM documentation indicates that first access may
diff --git a/c10/cuda/CUDAException.cpp b/c10/cuda/CUDAException.cpp
index b6e9b9e..3be77dd 100644
--- a/c10/cuda/CUDAException.cpp
+++ b/c10/cuda/CUDAException.cpp
@@ -10,11 +10,12 @@
 namespace cuda {
 
 void c10_cuda_check_implementation(
+    const int32_t err,
     const char* filename,
     const char* function_name,
     const int line_number,
     const bool include_device_assertions) {
-  const auto cuda_error = cudaGetLastError();
+  const auto cuda_error = static_cast<cudaError_t>(err);
   const auto cuda_kernel_failure = include_device_assertions
       ? c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().has_failed()
       : false;
diff --git a/c10/cuda/CUDAException.h b/c10/cuda/CUDAException.h
index 101036c..c23fc50 100644
--- a/c10/cuda/CUDAException.h
+++ b/c10/cuda/CUDAException.h
@@ -24,17 +24,16 @@
 };
 } // namespace c10
 
-#define C10_CUDA_CHECK(EXPR)                                             \
-  do {                                                                   \
-    /* We get & disarm the error inside of */                            \
-    /* `c10_cuda_check_implementation` */                                \
-    C10_UNUSED const cudaError_t __err = EXPR;                           \
-    c10::cuda::c10_cuda_check_implementation(                            \
-        __FILE__,                                                        \
-        __func__, /* Line number's data type is not well-defined between \
-                      compilers, so we perform an explicit cast */       \
-        static_cast<uint32_t>(__LINE__),                                 \
-        true);                                                           \
+#define C10_CUDA_CHECK(EXPR)                                        \
+  do {                                                              \
+    const cudaError_t __err = EXPR;                                 \
+    c10::cuda::c10_cuda_check_implementation(                       \
+        static_cast<int32_t>(__err),                                \
+        __FILE__,                                                   \
+        __func__, /* Line number data type not well-defined between \
+                      compilers, so we perform an explicit cast */  \
+        static_cast<uint32_t>(__LINE__),                            \
+        true);                                                      \
   } while (0)
 
 #define C10_CUDA_CHECK_WARN(EXPR)                              \
@@ -93,6 +92,7 @@
 /// In the event of a CUDA failure, formats a nice error message about that
 /// failure and also checks for device-side assertion failures
 C10_CUDA_API void c10_cuda_check_implementation(
+    const int32_t err,
     const char* filename,
     const char* function_name,
     const int line_number,