// c10/cuda/CUDADeviceAssertionHost.h - platform/external/pytorch - Git at Google

 #pragma once

 #include <c10/cuda/CUDAMacros.h>

 #include <cstdint>
 #include <memory>
 #include <mutex>
 #include <string>
 #include <utility>
 #include <vector>

 // Define TORCH_USE_CUDA_DSA whenever the build defines USE_CUDA. Within this
 // header the macro gates the registry's `generation_number` member and
 // selects the value of `enabled_at_compile_time`.
 #ifdef USE_CUDA
 #define TORCH_USE_CUDA_DSA
 #endif

 /// Number of assertion failure messages we can store; this is the capacity of
 /// the `assertions` array in `DeviceAssertionsData`. If this is too small,
 /// threads will fail silently.
 constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10;
 /// Size, in chars, of each fixed-length string buffer (`assertion_msg`,
 /// `filename`, `function_name`) stored in a `DeviceAssertionData` record.
 constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512;

 namespace c10 {
 namespace cuda {

 /// Holds information about a single device-side assertion that failed.
 /// Held in managed memory and accessed by both the CPU and the GPU.
 struct DeviceAssertionData {
   /// Stringification of the assertion
   char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN];
   /// File the assertion was in
   char filename[C10_CUDA_DSA_MAX_STR_LEN];
   /// Name of the function the assertion was in
   char function_name[C10_CUDA_DSA_MAX_STR_LEN];
   /// Line number the assertion was at
   int line_number;
   /// Number uniquely identifying the kernel launch that triggered the assertion.
   /// NOTE(review): this is 32-bit while
   /// `CUDAKernelLaunchInfo::generation_number` is 64-bit; presumably it holds
   /// the value returned by `CUDAKernelLaunchRegistry::insert` - confirm.
   uint32_t caller;
   /// block_id of the thread that failed the assertion (three components)
   int32_t block_id[3];
   /// thread_id of the thread that failed the assertion (three components)
   int32_t thread_id[3];
 };

 /// Used to hold assertions generated by the device.
 /// Held in managed memory and accessed by both the CPU and the GPU.
 struct DeviceAssertionsData {
   /// Total number of assertions found; a subset of these will be recorded
   /// in `assertions` (which holds at most C10_CUDA_DSA_ASSERTION_COUNT entries)
   int32_t assertion_count;
   /// An array of assertions that will be written to in a race-free manner
   DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT];
 };

 /// Used to hold info about kernel launches so that we can run kernels
 /// asynchronously and still associate launches with device-side
 /// assertion failures.
 struct CUDAKernelLaunchInfo {
   /// Filename of the code where the kernel was launched from.
   /// Stored as a raw pointer, not a copy of the string.
   const char* launch_filename;
   /// Function from which the kernel was launched (raw pointer, not a copy)
   const char* launch_function;
   /// Line number of where the code was launched from
   uint32_t launch_linenum;
   /// Backtrace of where the kernel was launched from, only populated if
   /// CUDAKernelLaunchRegistry::gather_launch_stacktrace is true
   std::string launch_stacktrace;
   /// Kernel that was launched (raw pointer, not a copy)
   const char* kernel_name;
   /// Device the kernel was launched on
   int device;
   /// Stream the kernel was launched on
   int32_t stream;
   /// A number that uniquely identifies the kernel launch
   uint64_t generation_number;
 };

 /// Circular buffer used to hold information about kernel launches. This is
 /// later used to reconstruct how a device-side kernel assertion failure
 /// occurred. CUDAKernelLaunchRegistry is used as a singleton (see
 /// `get_singleton_ref`).
 class C10_CUDA_API CUDAKernelLaunchRegistry {
  private:
   /// Assume that this is the max number of kernel launches that might ever be
   /// enqueued across all streams on a single device
   static constexpr int max_kernel_launches = 1024;
   /// How many kernel launch infos we've inserted. Used to ensure that circular
   /// queue doesn't provide false information by always increasing, but also to
   /// mark where we are inserting into the queue.
   /// This member only exists when TORCH_USE_CUDA_DSA is defined.
 #ifdef TORCH_USE_CUDA_DSA
   uint64_t generation_number = 0;
 #endif
   /// Shared mutex between writer and accessor to ensure multi-threaded safety.
   /// `mutable` so that it can be locked from const member functions.
   mutable std::mutex read_write_mutex;
   /// Used to prevent race conditions in GPU memory allocation
   mutable std::mutex gpu_alloc_mutex;
   /// Pointer to managed memory keeping track of device-side assertions. There
   /// is one entry for each possible device the process might work with. Unused
   /// entries are nullptrs. We could also use an unordered_set here, but this
   /// vector design will be faster and the wasted memory is small since we
   /// expect the number of GPUs per node will always be small.
   /// Each entry carries a function-pointer deleter that receives the raw
   /// `DeviceAssertionsData*`.
   std::vector<
       std::unique_ptr<DeviceAssertionsData, void (*)(DeviceAssertionsData*)>>
       uvm_assertions;
   /// A single circular buffer holds information about every kernel launch the
   /// process makes across all devices.
   std::vector<CUDAKernelLaunchInfo> kernel_launches;
   /// Presumably reads the environment to decide whether launch stack traces
   /// are gathered (cf. `gather_launch_stacktrace`); defined outside this
   /// header - confirm which variable is consulted.
   bool check_env_for_enable_launch_stacktracing() const;
   /// Presumably reads the environment to decide whether DSA is enabled at
   /// run-time (cf. `enabled_at_runtime`); defined outside this header -
   /// confirm which variable is consulted.
   bool check_env_for_dsa_enabled() const;

  public:
   CUDAKernelLaunchRegistry();
   /// Register a new kernel launch and obtain a generation number back to be
   /// passed to the kernel.
   /// NOTE(review): the return type is 32-bit while `generation_number` is
   /// 64-bit; presumably the counter is narrowed on return - confirm in the
   /// implementation.
   uint32_t insert(
       const char* launch_filename,
       const char* launch_function,
       const uint32_t launch_linenum,
       const char* kernel_name,
       const int32_t stream_id);
   /// Get copies of the kernel launch registry and each device's assertion
   /// failure buffer so they can be inspected without raising race conditions.
   /// The pair holds the assertion buffers first and the launch infos second.
   std::
       pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
       snapshot() const;
   /// Get a pointer to the current device's assertion failure buffer. If no such
   /// buffer exists then one is created. This means that the first kernel launch
   /// made on each device will be slightly slower because memory allocations are
   /// required
   DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device();
   /// Gets the global singleton of the registry
   static CUDAKernelLaunchRegistry& get_singleton_ref();
   /// If not all devices support DSA, we disable it.
   /// The in-class default is false; because the member is const, any other
   /// value has to come from a constructor initializer.
   const bool do_all_devices_support_managed_memory = false;
   /// Whether or not to gather stack traces when launching kernels
   bool gather_launch_stacktrace = false;
   /// Whether or not host-side DSA is enabled or disabled at run-time
   /// Note: Device-side code cannot be enabled/disabled at run-time
   bool enabled_at_runtime = false;
   /// Whether or not a device has indicated a failure
   bool has_failed() const;
   /// True exactly when this header was compiled with TORCH_USE_CUDA_DSA defined
 #ifdef TORCH_USE_CUDA_DSA
   const bool enabled_at_compile_time = true;
 #else
   const bool enabled_at_compile_time = false;
 #endif
 };

 /// Returns device-side assertion information as a string. The definition is
 /// not in this header; presumably the text is a human-readable report built
 /// from the registry's recorded failures and kernel launches - confirm.
 std::string c10_retrieve_device_side_assertion_info();

 } // namespace cuda
 } // namespace c10

 // Each kernel launched with TORCH_DSA_KERNEL_LAUNCH
 // requires the same input arguments. We introduce the following macro to
 // standardize these. It expands to two parameter declarations:
 // `assertions_data` (a const pointer to the assertion buffer) and
 // `assertion_caller_id`. Both are marked [[maybe_unused]] so that a kernel
 // which never references them compiles without unused-parameter warnings.
 #define TORCH_DSA_KERNEL_ARGS                                              \
   [[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data, \
       [[maybe_unused]] uint32_t assertion_caller_id

 // This macro can be used to pass the DSA arguments onward to another
 // function. It expands to the two parameter names declared by
 // TORCH_DSA_KERNEL_ARGS, so it is only usable where those names are in scope.
 #define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id
	#pragma once

	#include <c10/cuda/CUDAMacros.h>

	#include <memory>
	#include <mutex>
	#include <string>
	#include <vector>

	// Define TORCH_USE_CUDA_DSA whenever the build defines USE_CUDA. Within this
	// header the macro gates the registry's `generation_number` member and
	// selects the value of `enabled_at_compile_time`.
	#ifdef USE_CUDA
	#define TORCH_USE_CUDA_DSA
	#endif

	/// Number of assertion failure messages we can store; this is the capacity of
	/// the `assertions` array in `DeviceAssertionsData`. If this is too small,
	/// threads will fail silently.
	constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10;
	/// Size, in chars, of each fixed-length string buffer (`assertion_msg`,
	/// `filename`, `function_name`) stored in a `DeviceAssertionData` record.
	constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512;

	namespace c10 {
	namespace cuda {

	/// Holds information about a single device-side assertion that failed.
	/// Held in managed memory and accessed by both the CPU and the GPU.
	struct DeviceAssertionData {
	/// Stringification of the assertion
	char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN];
	/// File the assertion was in
	char filename[C10_CUDA_DSA_MAX_STR_LEN];
	/// Name of the function the assertion was in
	char function_name[C10_CUDA_DSA_MAX_STR_LEN];
	/// Line number the assertion was at
	int line_number;
	/// Number uniquely identifying the kernel launch that triggered the assertion.
	/// NOTE(review): this is 32-bit while
	/// `CUDAKernelLaunchInfo::generation_number` is 64-bit; presumably it holds
	/// the value returned by `CUDAKernelLaunchRegistry::insert` - confirm.
	uint32_t caller;
	/// block_id of the thread that failed the assertion (three components)
	int32_t block_id[3];
	/// thread_id of the thread that failed the assertion (three components)
	int32_t thread_id[3];
	};

	/// Used to hold assertions generated by the device.
	/// Held in managed memory and accessed by both the CPU and the GPU.
	struct DeviceAssertionsData {
	/// Total number of assertions found; a subset of these will be recorded
	/// in `assertions` (which holds at most C10_CUDA_DSA_ASSERTION_COUNT entries)
	int32_t assertion_count;
	/// An array of assertions that will be written to in a race-free manner
	DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT];
	};

	/// Used to hold info about kernel launches so that we can run kernels
	/// asynchronously and still associate launches with device-side
	/// assertion failures.
	struct CUDAKernelLaunchInfo {
	/// Filename of the code where the kernel was launched from.
	/// Stored as a raw pointer, not a copy of the string.
	const char* launch_filename;
	/// Function from which the kernel was launched (raw pointer, not a copy)
	const char* launch_function;
	/// Line number of where the code was launched from
	uint32_t launch_linenum;
	/// Backtrace of where the kernel was launched from, only populated if
	/// CUDAKernelLaunchRegistry::gather_launch_stacktrace is true
	std::string launch_stacktrace;
	/// Kernel that was launched (raw pointer, not a copy)
	const char* kernel_name;
	/// Device the kernel was launched on
	int device;
	/// Stream the kernel was launched on
	int32_t stream;
	/// A number that uniquely identifies the kernel launch
	uint64_t generation_number;
	};

	/// Circular buffer used to hold information about kernel launches. This is
	/// later used to reconstruct how a device-side kernel assertion failure
	/// occurred. CUDAKernelLaunchRegistry is used as a singleton (see
	/// `get_singleton_ref`).
	class C10_CUDA_API CUDAKernelLaunchRegistry {
	 private:
	  /// Assume that this is the max number of kernel launches that might ever be
	  /// enqueued across all streams on a single device
	  static constexpr int max_kernel_launches = 1024;
	  /// How many kernel launch infos we've inserted. Used to ensure that circular
	  /// queue doesn't provide false information by always increasing, but also to
	  /// mark where we are inserting into the queue.
	  /// This member only exists when TORCH_USE_CUDA_DSA is defined.
	#ifdef TORCH_USE_CUDA_DSA
	  uint64_t generation_number = 0;
	#endif
	  /// Shared mutex between writer and accessor to ensure multi-threaded safety.
	  /// `mutable` so that it can be locked from const member functions.
	  mutable std::mutex read_write_mutex;
	  /// Used to prevent race conditions in GPU memory allocation
	  mutable std::mutex gpu_alloc_mutex;
	  /// Pointer to managed memory keeping track of device-side assertions. There
	  /// is one entry for each possible device the process might work with. Unused
	  /// entries are nullptrs. We could also use an unordered_set here, but this
	  /// vector design will be faster and the wasted memory is small since we
	  /// expect the number of GPUs per node will always be small.
	  /// The deleter is a pointer to a function taking the raw
	  /// `DeviceAssertionsData*`; a bare function type is not a valid deleter.
	  std::vector<
	      std::unique_ptr<DeviceAssertionsData, void (*)(DeviceAssertionsData*)>>
	      uvm_assertions;
	  /// A single circular buffer holds information about every kernel launch the
	  /// process makes across all devices.
	  std::vector<CUDAKernelLaunchInfo> kernel_launches;
	  /// Presumably reads the environment to decide whether launch stack traces
	  /// are gathered (cf. `gather_launch_stacktrace`); defined outside this
	  /// header - confirm which variable is consulted.
	  bool check_env_for_enable_launch_stacktracing() const;
	  /// Presumably reads the environment to decide whether DSA is enabled at
	  /// run-time (cf. `enabled_at_runtime`); defined outside this header -
	  /// confirm which variable is consulted.
	  bool check_env_for_dsa_enabled() const;

	 public:
	  CUDAKernelLaunchRegistry();
	  /// Register a new kernel launch and obtain a generation number back to be
	  /// passed to the kernel.
	  /// NOTE(review): the return type is 32-bit while `generation_number` is
	  /// 64-bit; presumably the counter is narrowed on return - confirm in the
	  /// implementation.
	  uint32_t insert(
	      const char* launch_filename,
	      const char* launch_function,
	      const uint32_t launch_linenum,
	      const char* kernel_name,
	      const int32_t stream_id);
	  /// Get copies of the kernel launch registry and each device's assertion
	  /// failure buffer so they can be inspected without raising race conditions.
	  /// The pair holds the assertion buffers first and the launch infos second.
	  std::
	      pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>>
	      snapshot() const;
	  /// Get a pointer to the current device's assertion failure buffer. If no such
	  /// buffer exists then one is created. This means that the first kernel launch
	  /// made on each device will be slightly slower because memory allocations are
	  /// required
	  DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device();
	  /// Gets the global singleton of the registry
	  static CUDAKernelLaunchRegistry& get_singleton_ref();
	  /// If not all devices support DSA, we disable it.
	  /// The in-class default is false; because the member is const, any other
	  /// value has to come from a constructor initializer.
	  const bool do_all_devices_support_managed_memory = false;
	  /// Whether or not to gather stack traces when launching kernels
	  bool gather_launch_stacktrace = false;
	  /// Whether or not host-side DSA is enabled or disabled at run-time
	  /// Note: Device-side code cannot be enabled/disabled at run-time
	  bool enabled_at_runtime = false;
	  /// Whether or not a device has indicated a failure
	  bool has_failed() const;
	  /// True exactly when this header was compiled with TORCH_USE_CUDA_DSA defined
	#ifdef TORCH_USE_CUDA_DSA
	  const bool enabled_at_compile_time = true;
	#else
	  const bool enabled_at_compile_time = false;
	#endif
	};

	/// Returns device-side assertion information as a string. The definition is
	/// not in this header; presumably the text is a human-readable report built
	/// from the registry's recorded failures and kernel launches - confirm.
	std::string c10_retrieve_device_side_assertion_info();

	} // namespace cuda
	} // namespace c10

	// Each kernel launched with TORCH_DSA_KERNEL_LAUNCH
	// requires the same input arguments. We introduce the following macro to
	// standardize these. It expands to two parameter declarations:
	// `assertions_data` (a const pointer to the assertion buffer) and
	// `assertion_caller_id`. Both are marked [[maybe_unused]] so that a kernel
	// which never references them compiles without unused-parameter warnings.
	#define TORCH_DSA_KERNEL_ARGS \
	[[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data, \
	[[maybe_unused]] uint32_t assertion_caller_id

	// This macro can be used to pass the DSA arguments onward to another
	// function. It expands to the two parameter names declared by
	// TORCH_DSA_KERNEL_ARGS, so it is only usable where those names are in scope.
	#define TORCH_DSA_KERNEL_ARGS_PASS assertions_data, assertion_caller_id