Fix CUDA error not getting captured by handler (#92227)
Fixes #91758. This still leaves the error-check functions on the hot path.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/92227
Approved by: https://github.com/ngimel, https://github.com/malfet
diff --git a/c10/cuda/CUDADeviceAssertionHost.cpp b/c10/cuda/CUDADeviceAssertionHost.cpp
index 7c898e2..68cb63c 100644
--- a/c10/cuda/CUDADeviceAssertionHost.cpp
+++ b/c10/cuda/CUDADeviceAssertionHost.cpp
@@ -13,8 +13,17 @@
#include <string>
#include <thread>
-#define CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS() \
- c10_cuda_check_implementation(__FILE__, __FUNCTION__, __LINE__, false)
+#define C10_CUDA_CHECK_WO_DSA(EXPR) \
+ do { \
+ const cudaError_t __err = EXPR; \
+ c10::cuda::c10_cuda_check_implementation( \
+ static_cast<int32_t>(__err), \
+ __FILE__, \
+ __func__, /* Line number data type not well-defined between \
+ compilers, so we perform an explicit cast */ \
+ static_cast<uint32_t>(__LINE__), \
+ false); \
+ } while (0)
namespace c10 {
namespace cuda {
@@ -27,8 +36,7 @@
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_id() {
int device = -1;
- C10_CUDA_ERROR_HANDLED(cudaGetDevice(&device));
- CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
+ C10_CUDA_CHECK_WO_DSA(cudaGetDevice(&device));
return device;
}
@@ -41,9 +49,8 @@
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_compute_capability(const int device_num) {
int compute_capability = -1;
- C10_CUDA_ERROR_HANDLED(cudaDeviceGetAttribute(
+ C10_CUDA_CHECK_WO_DSA(cudaDeviceGetAttribute(
&compute_capability, cudaDevAttrComputeCapabilityMajor, device_num));
- CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
return compute_capability;
}
#endif
@@ -53,8 +60,7 @@
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_count() {
int device_count = -1;
- C10_CUDA_ERROR_HANDLED(cudaGetDeviceCount(&device_count));
- CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
+ C10_CUDA_CHECK_WO_DSA(cudaGetDeviceCount(&device_count));
return device_count;
}
@@ -288,25 +294,22 @@
// system
DeviceAssertionsData* uvm_assertions_ptr = nullptr;
- C10_CUDA_ERROR_HANDLED(
+ C10_CUDA_CHECK_WO_DSA(
cudaMallocManaged(&uvm_assertions_ptr, sizeof(DeviceAssertionsData)));
- CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
- C10_CUDA_ERROR_HANDLED(cudaMemAdvise(
+ C10_CUDA_CHECK_WO_DSA(cudaMemAdvise(
uvm_assertions_ptr,
sizeof(DeviceAssertionsData),
cudaMemAdviseSetPreferredLocation,
cudaCpuDeviceId));
- CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
// GPU will establish direct mapping of data in CPU memory, no page faults
// will be generated
- C10_CUDA_ERROR_HANDLED(cudaMemAdvise(
+ C10_CUDA_CHECK_WO_DSA(cudaMemAdvise(
uvm_assertions_ptr,
sizeof(DeviceAssertionsData),
cudaMemAdviseSetAccessedBy,
cudaCpuDeviceId));
- CHECK_CUDA_API_CALL_WITHOUT_CHECKING_DEVICE_ASSERTS();
// Initialize the memory from the CPU; otherwise, pages may have to be created
// on demand. We think that UVM documentation indicates that first access may
diff --git a/c10/cuda/CUDAException.cpp b/c10/cuda/CUDAException.cpp
index b6e9b9e..3be77dd 100644
--- a/c10/cuda/CUDAException.cpp
+++ b/c10/cuda/CUDAException.cpp
@@ -10,11 +10,12 @@
namespace cuda {
void c10_cuda_check_implementation(
+ const int32_t err,
const char* filename,
const char* function_name,
const int line_number,
const bool include_device_assertions) {
- const auto cuda_error = cudaGetLastError();
+ const auto cuda_error = static_cast<cudaError_t>(err);
const auto cuda_kernel_failure = include_device_assertions
? c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref().has_failed()
: false;
diff --git a/c10/cuda/CUDAException.h b/c10/cuda/CUDAException.h
index 101036c..c23fc50 100644
--- a/c10/cuda/CUDAException.h
+++ b/c10/cuda/CUDAException.h
@@ -24,17 +24,16 @@
};
} // namespace c10
-#define C10_CUDA_CHECK(EXPR) \
- do { \
- /* We get & disarm the error inside of */ \
- /* `c10_cuda_check_implementation` */ \
- C10_UNUSED const cudaError_t __err = EXPR; \
- c10::cuda::c10_cuda_check_implementation( \
- __FILE__, \
- __func__, /* Line number's data type is not well-defined between \
- compilers, so we perform an explicit cast */ \
- static_cast<uint32_t>(__LINE__), \
- true); \
+#define C10_CUDA_CHECK(EXPR) \
+ do { \
+ const cudaError_t __err = EXPR; \
+ c10::cuda::c10_cuda_check_implementation( \
+ static_cast<int32_t>(__err), \
+ __FILE__, \
+ __func__, /* Line number data type not well-defined between \
+ compilers, so we perform an explicit cast */ \
+ static_cast<uint32_t>(__LINE__), \
+ true); \
} while (0)
#define C10_CUDA_CHECK_WARN(EXPR) \
@@ -93,6 +92,7 @@
/// In the event of a CUDA failure, formats a nice error message about that
/// failure and also checks for device-side assertion failures
C10_CUDA_API void c10_cuda_check_implementation(
+ const int32_t err,
const char* filename,
const char* function_name,
const int line_number,