#include <c10/cuda/CUDADeviceAssertionHost.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Backtrace.h>
#include <c10/util/Exception.h>
#include <c10/util/irange.h>
#include <cuda_runtime.h>

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <sstream>
#include <string>
#ifdef TORCH_USE_CUDA_DSA
#include <chrono>
#include <thread>
#endif

#define C10_CUDA_CHECK_WO_DSA(EXPR)                                 \
  do {                                                              \
    const cudaError_t __err = EXPR;                                 \
    c10::cuda::c10_cuda_check_implementation(                       \
        static_cast<int32_t>(__err),                                \
        __FILE__,                                                   \
        __func__, /* Line number data type not well-defined between \
                     compilers, so we perform an explicit cast */   \
        static_cast<uint32_t>(__LINE__),                            \
        false);                                                     \
  } while (0)
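
// Illustrative use of the macro above (a sketch, not a call site from this
// file): checking a CUDA runtime call without consulting device-side
// assertion state, which is what the bootstrap helpers below require.
//
//   int device_count = -1;
//   C10_CUDA_CHECK_WO_DSA(cudaGetDeviceCount(&device_count));
//
// The trailing `false` asks `c10_cuda_check_implementation` not to gather
// device-side assertion info, which avoids re-entering
// CUDAKernelLaunchRegistry while the registry itself is being initialized.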

namespace c10::cuda {

namespace {

#ifdef TORCH_USE_CUDA_DSA
/// Get current device id
/// We need our own implementation of this function to prevent
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_id() {
  c10::DeviceIndex device = -1;
  C10_CUDA_CHECK_WO_DSA(c10::cuda::GetDevice(&device));
  return device;
}
/// Get a device's compute capability - note that this dangerously assumes
/// that if one CUDA GPU supports device-side assertions they all do. This is
/// probably fine since the latest CUDA GPU that doesn't support UVM is the
/// K80, released 2014-11-17. Mixing that GPU with a newer one is likely to be
/// rare enough that the defensive programming needed to handle it isn't
/// worth the complexity.
/// We need our own implementation of this function to prevent
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_compute_capability(const int device_num) {
  int compute_capability = -1;
  C10_CUDA_CHECK_WO_DSA(cudaDeviceGetAttribute(
      &compute_capability, cudaDevAttrComputeCapabilityMajor, device_num));
  return compute_capability;
}
#endif

/// Get the number of CUDA devices
/// We need our own implementation of this function to prevent
/// an infinite initialization loop for CUDAKernelLaunchRegistry
int dsa_get_device_count() {
  int device_count = -1;
  C10_CUDA_CHECK_WO_DSA(c10::cuda::GetDeviceCount(&device_count));
  return device_count;
}

bool dsa_check_if_all_devices_support_managed_memory() {
  // It looks as though this'll work best on CUDA GPUs with Pascal
  // architectures or newer, per
  // https://developer.nvidia.com/blog/unified-memory-cuda-beginners/
#ifdef TORCH_USE_CUDA_DSA
  for (const auto i : c10::irange(dsa_get_device_count())) {
    if (dsa_get_device_compute_capability(i) < 6) {
      return false;
    }
  }
  return true;
#else
  return false;
#endif
}
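
// For example, a P100 (compute capability 6.0, Pascal) passes the check
// above, while a K80 (compute capability 3.7, Kepler) does not, so any
// process that sees a K80 runs with device-side assertions disabled.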

bool env_flag_set(const char* env_var_name) {
  const char* const env_string = std::getenv(env_var_name);
  return (env_string == nullptr) ? false : std::strcmp(env_string, "0") != 0;
}
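
// Truth table for the helper above: unset -> false, "0" -> false, anything
// else (e.g. "1" or "true") -> true.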

/// Deleter for UVM/managed memory pointers
void uvm_deleter(DeviceAssertionsData* uvm_assertions_ptr) {
  // Ignore error in destructor
  if (uvm_assertions_ptr) {
    C10_CUDA_IGNORE_ERROR(cudaFree(uvm_assertions_ptr));
  }
}

} // namespace

/// Check that kernels ran correctly by checking the message buffer. BLOCKING.
std::string c10_retrieve_device_side_assertion_info() {
#ifdef TORCH_USE_CUDA_DSA
  const auto& launch_registry = CUDAKernelLaunchRegistry::get_singleton_ref();
  if (!launch_registry.enabled_at_runtime) {
    return "Device-side assertion tracking was not enabled by user.";
  } else if (!launch_registry.do_all_devices_support_managed_memory) {
    return "Device-side assertions disabled because not all devices support managed memory.";
  }

  // Hack that saves a lot of challenging sync logic.
  // The GPU increments the number of errors it's observed and the CPU can see
  // that happening immediately, which means we can make it here before the GPU
  // is done writing information about those errors to memory.
  // A short pause gives it time to finish. Since something's gone wrong, this
  // pause shouldn't affect perf.
  std::this_thread::sleep_for(std::chrono::seconds(1));

  // The snapshot causes a brief block. That's okay because this function only
  // executes if something's gone wrong such that speed is no longer a priority.
  const auto launch_data = launch_registry.snapshot();
  const auto& assertion_data = launch_data.first;
  const auto& launch_infos = launch_data.second;

  std::stringstream oss;

  oss << "Looking for device-side assertion failure information...\n";

  // Loop over each device that could be managed by the process
  for (const auto device_num : c10::irange(assertion_data.size())) {
    const auto& assertion_data_for_device = assertion_data.at(device_num);

    // Did anything fail?
    const auto failures_found = std::min(
        assertion_data_for_device.assertion_count,
        C10_CUDA_DSA_ASSERTION_COUNT);
    if (failures_found == 0) {
      continue;
    }

    // Something failed, let's talk about that
    oss << failures_found
        << " CUDA device-side assertion failures were found on GPU #"
        << device_num << "!" << std::endl;
    if (assertion_data_for_device.assertion_count >
        C10_CUDA_DSA_ASSERTION_COUNT) {
      oss << "But at least " << assertion_data_for_device.assertion_count
          << " assertion failures occurred on the device" << std::endl;
      oss << "Adjust `C10_CUDA_DSA_ASSERTION_COUNT` if you need more assertion failure info"
          << std::endl;
    }

    for (const auto i : c10::irange(failures_found)) {
      const auto& self = assertion_data_for_device.assertions[i];
      const auto& launch_info = launch_infos[self.caller % launch_infos.size()];
      oss << "Assertion failure " << i << std::endl;
      oss << "  GPU assertion failure message = " << self.assertion_msg
          << std::endl;
      oss << "  File containing assertion = " << self.filename << ":"
          << self.line_number << std::endl;
      oss << "  Device function containing assertion = " << self.function_name
          << std::endl;
      oss << "  Thread ID that failed assertion = [" << self.thread_id[0] << ","
          << self.thread_id[1] << "," << self.thread_id[2] << "]" << std::endl;
      oss << "  Block ID that failed assertion = [" << self.block_id[0] << ","
          << self.block_id[1] << "," << self.block_id[2] << "]" << std::endl;
      if (launch_info.generation_number == self.caller) {
        oss << "  File containing kernel launch = "
            << launch_info.launch_filename << ":" << launch_info.launch_linenum
            << std::endl;
        oss << "  Function containing kernel launch = "
            << launch_info.launch_function << std::endl;
        oss << "  Name of kernel launched that led to failure = "
            << launch_info.kernel_name << std::endl;
        oss << "  Device that launched kernel = " << launch_info.device
            << std::endl;
        oss << "  Stream kernel was launched on = " << launch_info.stream
            << std::endl;
        oss << "  Backtrace of kernel launch site = ";
        if (launch_registry.gather_launch_stacktrace) {
          oss << "\n" << launch_info.launch_stacktrace << std::endl;
        } else {
          oss << "Launch stacktracing disabled." << std::endl;
        }
      } else {
        oss << "  CPU launch site info: Unavailable, the circular queue wrapped around. Increase `CUDAKernelLaunchRegistry::max_kernel_launches`."
            << std::endl;
      }
    }
  }
  return oss.str();
#else
  return "Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n";
#endif
}
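
// Typical consumer of the function above (a sketch; the real error path
// lives in `c10_cuda_check_implementation`):
//
//   const auto& registry = CUDAKernelLaunchRegistry::get_singleton_ref();
//   if (registry.has_failed()) {
//     TORCH_CHECK(false, c10_retrieve_device_side_assertion_info());
//   }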

CUDAKernelLaunchRegistry::CUDAKernelLaunchRegistry()
    : do_all_devices_support_managed_memory(
          dsa_check_if_all_devices_support_managed_memory()),
      gather_launch_stacktrace(check_env_for_enable_launch_stacktracing()),
      enabled_at_runtime(check_env_for_dsa_enabled()) {
  for (C10_UNUSED const auto _ : c10::irange(dsa_get_device_count())) {
    uvm_assertions.emplace_back(nullptr, uvm_deleter);
  }

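  // Pre-size the circular launch buffer; insert() indexes it with
  // `generation_number % max_kernel_launches`.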
  kernel_launches.resize(max_kernel_launches);
}

bool CUDAKernelLaunchRegistry::check_env_for_enable_launch_stacktracing()
    const {
  return env_flag_set("PYTORCH_CUDA_DSA_STACKTRACING");
}

bool CUDAKernelLaunchRegistry::check_env_for_dsa_enabled() const {
  return env_flag_set("PYTORCH_USE_CUDA_DSA");
}
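
// Both knobs are plain environment variables, e.g. (shell syntax):
//
//   export PYTORCH_USE_CUDA_DSA=1           # enable assertion tracking
//   export PYTORCH_CUDA_DSA_STACKTRACING=1  # also record launch backtraces
//
// Per env_flag_set above, any value other than "0" counts as enabled.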

uint32_t CUDAKernelLaunchRegistry::insert(
    const char* launch_filename,
    const char* launch_function,
    const uint32_t launch_linenum,
    const char* kernel_name,
    const int32_t stream_id) {
#ifdef TORCH_USE_CUDA_DSA
  if (!enabled_at_runtime) {
    return 0;
  }

  const auto backtrace = gather_launch_stacktrace ? c10::get_backtrace() : "";

  const std::lock_guard<std::mutex> lock(read_write_mutex);

  const auto my_gen_number = generation_number++;
  // Write the record into the circular buffer; once the buffer wraps, older
  // launch records are overwritten and can no longer be matched to failures.
  kernel_launches[my_gen_number % max_kernel_launches] = {
      launch_filename,
      launch_function,
      launch_linenum,
      backtrace,
      kernel_name,
      dsa_get_device_id(),
      stream_id,
      my_gen_number};
  return my_gen_number;
#else
  return 0;
#endif
}
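
// Wraparound example for the buffer written by insert(): if
// `max_kernel_launches` were 1024 (see the header for the actual value),
// generation numbers 5 and 1029 would share slot 5, so a device-side record
// whose `caller` is 5 could no longer be matched once launch 1029 lands. The
// `generation_number == self.caller` comparison in
// c10_retrieve_device_side_assertion_info detects exactly this case.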

std::pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
CUDAKernelLaunchRegistry::snapshot() const {
  // This is likely to be the longest-lasting hold on the mutex, but
  // we only expect it to be called in cases where we're already failing
  // and speed is no longer important
  const std::lock_guard<std::mutex> lock(read_write_mutex);

  std::vector<DeviceAssertionsData> device_assertions_data;
  for (const auto& x : uvm_assertions) {
    if (x) {
      device_assertions_data.push_back(*x);
    } else {
      device_assertions_data.emplace_back();
    }
  }

  return std::make_pair(device_assertions_data, kernel_launches);
}

DeviceAssertionsData* CUDAKernelLaunchRegistry::
    get_uvm_assertions_ptr_for_current_device() {
#ifdef TORCH_USE_CUDA_DSA
  if (!enabled_at_runtime) {
    return nullptr;
  }

  const auto device_num = dsa_get_device_id();

  // If we've already set up this GPU with managed memory, return a pointer to
  // the managed memory. This is a lock-free quick-return path.
  if (uvm_assertions.at(device_num)) {
    return uvm_assertions.at(device_num).get();
  }

  // We need a lock here so there's no race condition on creating the new
  // device assertions buffer.
  const std::lock_guard<std::mutex> lock(gpu_alloc_mutex);

  // If we've already set up this GPU with managed memory, return a pointer to
  // the managed memory. This locked path ensures that the device memory is
  // allocated only once
  if (uvm_assertions.at(device_num)) {
    return uvm_assertions.at(device_num).get();
  }

  // Otherwise, set up the GPU to be able to use the device-side assertion
  // system
  DeviceAssertionsData* uvm_assertions_ptr = nullptr;

  C10_CUDA_CHECK_WO_DSA(
      cudaMallocManaged(&uvm_assertions_ptr, sizeof(DeviceAssertionsData)));

  C10_CUDA_CHECK_WO_DSA(cudaMemAdvise(
      uvm_assertions_ptr,
      sizeof(DeviceAssertionsData),
      cudaMemAdviseSetPreferredLocation,
      cudaCpuDeviceId));

  // The GPU will establish a direct mapping of the data in CPU memory; no
  // page faults will be generated.
  C10_CUDA_CHECK_WO_DSA(cudaMemAdvise(
      uvm_assertions_ptr,
      sizeof(DeviceAssertionsData),
      cudaMemAdviseSetAccessedBy,
      cudaCpuDeviceId));

  // Initialize the memory from the CPU; otherwise, pages may have to be created
  // on demand. We think that UVM documentation indicates that first access may
  // not honor preferred location, which would be bad, if true, because we want
  // this memory on the host so we can access it post-assertion. Initializing
  // this on the CPU helps ensure that that's where the memory will live.
  *uvm_assertions_ptr = DeviceAssertionsData();

  // Ownership and lifetime management of `uvm_assertions_ptr` now passes to the
  // uvm_assertions unique_ptr vector
  uvm_assertions.at(device_num).reset(uvm_assertions_ptr);

  return uvm_assertions_ptr;
#else
  return nullptr;
#endif
}
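
// Illustrative launch sequence tying the registry together (a sketch; in
// PyTorch the TORCH_DSA_KERNEL_LAUNCH macro performs this plumbing, and the
// kernel, stream, and argument names here are hypothetical):
//
//   auto& registry = CUDAKernelLaunchRegistry::get_singleton_ref();
//   auto* assertions_data =
//       registry.get_uvm_assertions_ptr_for_current_device();
//   const auto launch_id = registry.insert(
//       __FILE__, __func__, __LINE__, "my_kernel", my_stream_id);
//   my_kernel<<<blocks, threads, 0, stream>>>(
//       kernel_args..., assertions_data, launch_id);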

CUDAKernelLaunchRegistry& CUDAKernelLaunchRegistry::get_singleton_ref() {
  static CUDAKernelLaunchRegistry launch_registry;
  return launch_registry;
}

bool CUDAKernelLaunchRegistry::has_failed() const {
  for (const auto& x : uvm_assertions) {
    if (x && x->assertion_count > 0) {
      return true;
    }
  }
  return false;
}
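
// Note that has_failed() deliberately takes no lock: it only reads the
// assertion counters in managed memory, so error-checking code can poll it
// after CUDA calls without contending on `read_write_mutex`.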

} // namespace c10::cuda