Enable TensorPipe's CUDA GDR channel (#50763)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/50763

ghstack-source-id: 120561489

Test Plan: Exported to GitHub

Reviewed By: mrshenli

Differential Revision: D25959672

fbshipit-source-id: b70f4b130806bf430869170bf4412697a6910275

commit: 4288f08d302a665b663753e988dd95caf2d4cdc8 [log] [tgz]
author: Luca Wehrstedt <lcw@fb.com> Thu Jan 28 10:10:04 2021 -0800
committer: Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com> Thu Jan 28 10:12:28 2021 -0800
tree: bdac340ec3222fe26bc65efd605e24edeb3f413a
parent: cc211bb43e41b06b0fea3d1ed33f99f7686033c2 [diff]
diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
index 0074c50..8a06296 100644
--- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
+++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp

@@ -130,6 +130,10 @@
 constexpr int64_t kCudaIpcChannelPriority = 300;
 #endif
 
+#if TENSORPIPE_HAS_CUDA_GDR_CHANNEL && defined(USE_CUDA_NOT_ROCM)
+constexpr int64_t kCudaGdrChannelPriority = 200;
+#endif
+
 #ifdef USE_CUDA_NOT_ROCM
 constexpr int64_t kCudaXthChannelPriority = 400;
 constexpr int64_t kCudaBasicChannelPriority = 100;
@@ -293,6 +297,30 @@
 
 #endif
 
+#if TENSORPIPE_HAS_CUDA_GDR_CHANNEL && defined(USE_CUDA_NOT_ROCM)
+
+std::unique_ptr<CudaChannelRegistration> makeCudaGdrChannel() {
+  auto context = std::make_shared<tensorpipe::channel::cuda_gdr::Context>();
+  return std::make_unique<CudaChannelRegistration>(
+      CudaChannelRegistration{std::move(context), kCudaGdrChannelPriority});
+}
+
+// The cuda_gdr channel sends CUDA memory over InfiniBand using GPUDirect RDMA.
+// It directly registers the user-provided tensor with libibverbs, an operation
+// which is expensive the first time, but it then caches the registration in
+// order to amortize the cost and get low latency for subsequent transfers. A
+// ready-to-send/ready-to-receive handshake is still needed before the transfer
+// in order to ensure readiness and to agree on the device indices and thus the
+// queue pair to use. It automatically pairs each GPU to the "closest" NIC if
+// there are multiple of them (closest = longest prefix match in PCI tree).
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+C10_REGISTER_CREATOR(
+    TensorPipeCudaChannelRegistry,
+    cuda_gdr,
+    makeCudaGdrChannel);
+
+#endif
+
 #ifdef USE_CUDA_NOT_ROCM
 
 std::unique_ptr<CudaChannelRegistration> makeCudaXthChannel() {
commit	4288f08d302a665b663753e988dd95caf2d4cdc8	[log] [tgz]
author	Luca Wehrstedt <lcw@fb.com>	Thu Jan 28 10:10:04 2021 -0800
committer	Facebook GitHub Bot <facebook-github-bot@users.noreply.github.com>	Thu Jan 28 10:12:28 2021 -0800
tree	bdac340ec3222fe26bc65efd605e24edeb3f413a
parent	cc211bb43e41b06b0fea3d1ed33f99f7686033c2 [diff]