#pragma once

#include <c10/cuda/CUDAException.h>
#include <c10/macros/Macros.h>

namespace c10::cuda {

#ifdef TORCH_USE_CUDA_DSA
// Copy string from `src` to `dst`
static __device__ void dstrcpy(char* dst, const char* src) {
  int i = 0;
  // Copy string from source to destination, ensuring that it
  // isn't longer than `C10_CUDA_DSA_MAX_STR_LEN-1`
  while (*src != '\0' && i++ < C10_CUDA_DSA_MAX_STR_LEN - 1) {
    *dst++ = *src++;
  }
  *dst = '\0';
}
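// For example (hypothetical value): if `C10_CUDA_DSA_MAX_STR_LEN` were 8,
// `dstrcpy` above would copy at most 7 characters plus the terminating '\0',
// so "assertion_failed" would be stored truncated as "asserti".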

static __device__ void dsa_add_new_assertion_failure(
    DeviceAssertionsData* assertions_data,
    const char* assertion_msg,
    const char* filename,
    const char* function_name,
    const int line_number,
    const uint32_t caller,
    const dim3 block_id,
    const dim3 thread_id) {
  // `assertions_data` may be nullptr if device-side assertion checking
  // is disabled at run-time. If it is disabled at compile time this
  // function will never be called
  if (!assertions_data) {
    return;
  }

  // Atomically increment so other threads can fail at the same time
  // Note that incrementing this means that the CPU can observe that
  // a failure has happened and can begin to respond before we've
  // written information about that failure out to the buffer.
  const auto nid = atomicAdd(&(assertions_data->assertion_count), 1);

  if (nid >= C10_CUDA_DSA_ASSERTION_COUNT) {
    // At this point we've run out of assertion buffer space.
    // We could print a message about this, but that'd get
    // spammy if a lot of threads did it, so we just silently
    // ignore any other assertion failures. In most cases the
    // failures will all probably be analogous anyway.
    return;
  }

  // Write information about the assertion failure to memory.
  // Note that this occurs only after the `assertion_count`
  // increment broadcasts that there's been a problem.
  auto& self = assertions_data->assertions[nid];
  dstrcpy(self.assertion_msg, assertion_msg);
  dstrcpy(self.filename, filename);
  dstrcpy(self.function_name, function_name);
  self.line_number = line_number;
  self.caller = caller;
  self.block_id[0] = block_id.x;
  self.block_id[1] = block_id.y;
  self.block_id[2] = block_id.z;
  self.thread_id[0] = thread_id.x;
  self.thread_id[1] = thread_id.y;
  self.thread_id[2] = thread_id.z;
}

// Emulates a kernel assertion. The assertion won't stop the kernel's progress,
// so you should assume everything the kernel produces is garbage if there's an
// assertion failure.
// NOTE: This assumes that `assertions_data` and `assertion_caller_id` are
//       arguments of the kernel and therefore accessible.
#define CUDA_KERNEL_ASSERT2(condition)                                   \
  do {                                                                   \
    if (C10_UNLIKELY(!(condition))) {                                    \
      /* Has an atomic element so threads can fail at the same time */   \
      c10::cuda::dsa_add_new_assertion_failure(                          \
          assertions_data,                                               \
          C10_STRINGIZE(condition),                                      \
          __FILE__,                                                      \
          __FUNCTION__,                                                  \
          __LINE__,                                                      \
          assertion_caller_id,                                           \
          blockIdx,                                                      \
          threadIdx);                                                    \
      /* Now that the kernel has failed we early exit the kernel, but */ \
      /* otherwise keep going and rely on the host to check UVM and */   \
      /* determine we've had a problem */                                \
      return;                                                            \
    }                                                                    \
  } while (false)
#else
#define CUDA_KERNEL_ASSERT2(condition) assert(condition)
#endif
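
// Illustrative usage sketch (not part of this header): per the NOTE above,
// the kernel is expected to receive `assertions_data` and
// `assertion_caller_id` as parameters so the macro can reference them.
// The kernel name and the checked condition below are hypothetical.
//
//   __global__ void example_kernel(
//       int* data,
//       int n,
//       c10::cuda::DeviceAssertionsData* assertions_data,
//       uint32_t assertion_caller_id) {
//     const auto i = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
//     if (i >= n) {
//       return;
//     }
//     // Record a failure (and early-exit this thread) if the invariant
//     // does not hold; the host later inspects `assertions_data`.
//     CUDA_KERNEL_ASSERT2(data[i] >= 0);
//     data[i] += 1;
//   }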

} // namespace c10::cuda