Enable TensorPipe's CUDA GDR channel (#50763)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/50763
ghstack-source-id: 120561489
Test Plan: Exported to GitHub
Reviewed By: mrshenli
Differential Revision: D25959672
fbshipit-source-id: b70f4b130806bf430869170bf4412697a6910275
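This enables TensorPipe's cuda_gdr channel in the TensorPipe RPC agent and
registers it with priority 200, slotting it between the same-machine cuda_ipc
channel (300) and the fallback cuda_basic channel (100), while the same-process
cuda_xth channel keeps the highest priority (400). Higher values appear to mark
more preferred channels when the agent picks one for a pair of endpoints. The
registration follows the same creator-registry pattern as the other channels;
below is a minimal self-contained sketch of that pattern, where the names and
the registry implementation are hypothetical stand-ins rather than c10's
actual C10_REGISTER_CREATOR machinery:

    #include <cstdint>
    #include <functional>
    #include <map>
    #include <memory>
    #include <string>

    // Sketch of a registration record: the real CudaChannelRegistration
    // bundles a tensorpipe channel context with its priority.
    struct ChannelRegistration {
      std::string name;
      int64_t priority;
    };

    using Creator = std::function<std::unique_ptr<ChannelRegistration>()>;

    // Registry keyed by channel name (hypothetical stand-in for
    // TensorPipeCudaChannelRegistry).
    std::map<std::string, Creator>& channelRegistry() {
      static std::map<std::string, Creator> registry;
      return registry;
    }

    // A static registrar runs at load time, which is how a macro like
    // C10_REGISTER_CREATOR lets each channel self-register its creator.
    struct Registrar {
      Registrar(std::string key, Creator creator) {
        channelRegistry().emplace(std::move(key), std::move(creator));
      }
    };

    static Registrar cudaGdrRegistrar("cuda_gdr", [] {
      return std::make_unique<ChannelRegistration>(
          ChannelRegistration{"cuda_gdr", /*priority=*/200});
    });

The agent can then walk such a registry, instantiate whichever channels are
available at runtime, and hand their contexts to TensorPipe in priority order.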
diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
index 0074c50..8a06296 100644
--- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
+++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
@@ -130,6 +130,10 @@
constexpr int64_t kCudaIpcChannelPriority = 300;
#endif
+#if TENSORPIPE_HAS_CUDA_GDR_CHANNEL && defined(USE_CUDA_NOT_ROCM)
+constexpr int64_t kCudaGdrChannelPriority = 200;
+#endif
+
#ifdef USE_CUDA_NOT_ROCM
constexpr int64_t kCudaXthChannelPriority = 400;
constexpr int64_t kCudaBasicChannelPriority = 100;
@@ -293,6 +297,30 @@
#endif
+#if TENSORPIPE_HAS_CUDA_GDR_CHANNEL && defined(USE_CUDA_NOT_ROCM)
+
+std::unique_ptr<CudaChannelRegistration> makeCudaGdrChannel() {
+ auto context = std::make_shared<tensorpipe::channel::cuda_gdr::Context>();
+ return std::make_unique<CudaChannelRegistration>(
+ CudaChannelRegistration{std::move(context), kCudaGdrChannelPriority});
+}
+
+// The cuda_gdr channel sends CUDA memory over InfiniBand using GPUDirect RDMA.
+// It registers the user-provided tensor directly with libibverbs; this is
+// expensive the first time, but the registration is cached to amortize that
+// cost and achieve low latency on subsequent transfers. A ready-to-send/
+// ready-to-receive handshake is still needed before each transfer, both to
+// ensure readiness and to agree on the device indices, and thus on the queue
+// pair to use. When multiple NICs are present, each GPU is automatically
+// paired with the "closest" one (closest = longest prefix match in PCI tree).
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+C10_REGISTER_CREATOR(
+ TensorPipeCudaChannelRegistry,
+ cuda_gdr,
+ makeCudaGdrChannel);
+
+#endif
+
#ifdef USE_CUDA_NOT_ROCM
std::unique_ptr<CudaChannelRegistration> makeCudaXthChannel() {
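
The GPU-to-NIC pairing described in the comment above ("closest = longest
prefix match in PCI tree") can be illustrated with a short sketch. It assumes
PCI topology paths flattened into strings of bus segments, an illustrative
simplification: the actual channel works off the real PCI hierarchy rather
than string paths.

    #include <cstddef>
    #include <string>
    #include <vector>

    // Length of the common prefix of two PCI path strings. The real
    // matching happens on PCI-tree nodes; characters are a stand-in.
    size_t commonPrefixLen(const std::string& a, const std::string& b) {
      size_t n = 0;
      while (n < a.size() && n < b.size() && a[n] == b[n]) {
        ++n;
      }
      return n;
    }

    // Index of the NIC whose PCI path shares the longest common prefix
    // with the GPU's PCI path, i.e. the "closest" NIC.
    size_t closestNic(
        const std::string& gpuPciPath,
        const std::vector<std::string>& nicPciPaths) {
      size_t best = 0;
      size_t bestLen = 0;
      for (size_t i = 0; i < nicPciPaths.size(); ++i) {
        size_t len = commonPrefixLen(gpuPciPath, nicPciPaths[i]);
        if (len > bestLen) {
          bestLen = len;
          best = i;
        }
      }
      return best;
    }

For example, a GPU at "0000:00/0000:02/0000:03" would be paired with a NIC at
"0000:00/0000:02/0000:04" rather than one at "0000:80/0000:81/0000:82", since
the former shares the deeper PCI ancestor.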