Enable TensorPipe's CUDA GDR channel (#50763)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/50763

ghstack-source-id: 120561489

Test Plan: Exported to GitHub

Reviewed By: mrshenli

Differential Revision: D25959672

fbshipit-source-id: b70f4b130806bf430869170bf4412697a6910275
diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
index 0074c50..8a06296 100644
--- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
+++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
@@ -130,6 +130,10 @@
 constexpr int64_t kCudaIpcChannelPriority = 300;
 #endif
 
+#if TENSORPIPE_HAS_CUDA_GDR_CHANNEL && defined(USE_CUDA_NOT_ROCM)
+constexpr int64_t kCudaGdrChannelPriority = 200; // GPUDirect RDMA channel
+#endif
+
 #ifdef USE_CUDA_NOT_ROCM
 constexpr int64_t kCudaXthChannelPriority = 400;
 constexpr int64_t kCudaBasicChannelPriority = 100;
@@ -293,6 +297,30 @@
 
 #endif
 
+#if TENSORPIPE_HAS_CUDA_GDR_CHANNEL && defined(USE_CUDA_NOT_ROCM)
+
+std::unique_ptr<CudaChannelRegistration> makeCudaGdrChannel() {
+  using GdrContext = tensorpipe::channel::cuda_gdr::Context; // GPUDirect RDMA
+  return std::make_unique<CudaChannelRegistration>(CudaChannelRegistration{
+      std::make_shared<GdrContext>(), kCudaGdrChannelPriority});
+}
+
+// The cuda_gdr channel sends CUDA memory over InfiniBand using GPUDirect RDMA.
+// It registers the user-provided tensor directly with libibverbs, which is
+// expensive the first time; the registration is then cached to amortize that
+// cost and give low latency on subsequent transfers. A
+// ready-to-send/ready-to-receive handshake is still needed before the transfer
+// to ensure readiness and to agree on the device indices, and thus the queue
+// pair to use. With multiple NICs, the channel automatically pairs each GPU
+// with the "closest" one (closest = longest prefix match in the PCI tree).
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
+C10_REGISTER_CREATOR(
+    TensorPipeCudaChannelRegistry,
+    cuda_gdr,
+    makeCudaGdrChannel);
+
+#endif
+
 #ifdef USE_CUDA_NOT_ROCM
 
 std::unique_ptr<CudaChannelRegistration> makeCudaXthChannel() {