| #include <c10d/NCCLUtils.hpp> |
| |
| #ifdef USE_C10D_NCCL |
| |
| #include <mutex> |
| |
| namespace c10d { |
| |
| |
| ncclComm_t NCCLComm::getNcclComm() { |
| std::unique_lock<std::mutex> lock(mutex_); |
| if (aborted_) { |
| auto commFailureMsg = commFailureReason_ != c10::nullopt |
| ? c10::str(" Original reason for failure was: ", *commFailureReason_) |
| : ""; |
| TORCH_CHECK( |
| false, |
| c10::str( |
| "NCCL communicator was aborted on rank ", |
| rank_, |
| ". ", |
| commFailureMsg)); |
| } |
| return ncclComm_; |
| } |
| |
| std::string getNcclVersion() { |
| static std::once_flag ncclGetVersionFlag; |
| static std::string versionString; |
| |
| std::call_once(ncclGetVersionFlag, []() { |
| int version; |
| ncclResult_t status = ncclGetVersion(&version); |
| // can't compute the version if call did not return successfully or version |
| // code < 100 (corresponding to 0.1.0) |
| if (status != ncclSuccess || version < 100) { |
| versionString = "Unknown NCCL version"; |
| } else { |
| auto ncclMajor = version / 1000; |
| auto ncclMinor = (version % 1000) / 100; |
| auto ncclPatch = version % (ncclMajor * 1000 + ncclMinor * 100); |
| versionString = std::to_string(ncclMajor) + "." + |
| std::to_string(ncclMinor) + "." + std::to_string(ncclPatch); |
| } |
| }); |
| |
| return versionString; |
| } |
| |
| std::string ncclGetErrorWithVersion(ncclResult_t error) { |
| return std::string(ncclGetErrorString(error)) + ", NCCL version " + |
| getNcclVersion(); |
| } |
| |
| } // namespace c10d |
| |
| #endif // USE_C10D_NCCL |