Fix ibverbs completion queue capacity

Summary:
The header already contained an analysis of required completion queue
depth but the queue pair was still initialized with a maximum queue
depth of kMaxBuffers. This change fixes that and updates the analysis
to talk separately about receive and send completion queues.

Reviewed By: andrewwdye

Differential Revision: D4785786

fbshipit-source-id: 4dc302d523a3b7162dc261d14cfcc755681febf8
diff --git a/gloo/transport/ibverbs/pair.cc b/gloo/transport/ibverbs/pair.cc
index e0563da..9bac8a5 100644
--- a/gloo/transport/ibverbs/pair.cc
+++ b/gloo/transport/ibverbs/pair.cc
@@ -59,8 +59,8 @@
     memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
     attr.send_cq = cq_;
     attr.recv_cq = cq_;
-    attr.cap.max_send_wr = Pair::kMaxBuffers;
-    attr.cap.max_recv_wr = Pair::kMaxBuffers;
+    attr.cap.max_send_wr = Pair::kSendCompletionQueueCapacity;
+    attr.cap.max_recv_wr = Pair::kRecvCompletionQueueCapacity;
     attr.cap.max_send_sge = 1;
     attr.cap.max_recv_sge = 1;
     attr.qp_type = IBV_QPT_RC;
@@ -214,7 +214,7 @@
 
   // The work request is serialized and sent to the driver so it
   // doesn't need to be valid after the ibv_post_recv call.
-  struct ibv_recv_wr* bad_wr;
+  struct ibv_recv_wr* bad_wr = nullptr;
   int rv = ibv_post_recv(qp_, &wr, &bad_wr);
   if (rv != 0) {
     signalIoFailure(GLOO_ERROR_MSG("ibv_post_recv: ", rv));
@@ -243,7 +243,7 @@
 
   // The work request is serialized and sent to the driver so it
   // doesn't need to be valid after the ibv_post_send call.
-  struct ibv_send_wr* bad_wr;
+  struct ibv_send_wr* bad_wr = nullptr;
   int rv = ibv_post_send(qp_, &wr, &bad_wr);
   if (rv != 0) {
     signalIoFailure(GLOO_ERROR_MSG("ibv_post_send: ", rv));
@@ -279,7 +279,7 @@
 
   struct ibv_recv_wr wr;
   memset(&wr, 0, sizeof(wr));
-  struct ibv_recv_wr* bad_wr;
+  struct ibv_recv_wr* bad_wr = nullptr;
   rv = ibv_post_recv(qp_, &wr, &bad_wr);
   if (rv != 0) {
     signalIoFailure(GLOO_ERROR_MSG("ibv_post_recv: ", rv));
diff --git a/gloo/transport/ibverbs/pair.h b/gloo/transport/ibverbs/pair.h
index 2d1ce3a..4f84fde 100644
--- a/gloo/transport/ibverbs/pair.h
+++ b/gloo/transport/ibverbs/pair.h
@@ -37,16 +37,30 @@
   // Use 3x the maximum number of buffers as the capacity
   // for entries in this pair's completion queue.
   //
-  // There are a maximum of:
+  // For receive completions, there are a maximum of:
   //   - MAX_BUFFERS posted receive work requests to receive memory
-  //     regions from the other side of the pair.
-  //   - MAX_BUFFERS posted send work requests to send memory
-  //     regions to the other side of the pair.
+  //     regions from the other side of the pair (for send buffers).
   //   - MAX_BUFFERS posted receive work requests for RDMA writes
-  //     from the other side of the pair. These requests are posted
-  //     at the same time a buffer's local memory region is sent to
-  //     the other side of the pair.
-  static constexpr auto kCompletionQueueCapacity = 3 * kMaxBuffers;
+  //     from the other side of the pair.
+  //
+  // For send completions, there are a maximum of:
+  //   - MAX_BUFFERS posted send work requests to send memory
+  //     regions to the other side of the pair (for receive buffers).
+  //   - MAX_BUFFERS posted send work requests for RDMA writes
+  //     to the other side of the pair.
+  //
+  // While this sums up to 4x kMaxBuffers work requests, send work
+  // requests can only be posted after receiving the corresponding
+  // memory region from the other side of the pair. This leads to a
+  // maximum of of 3x kMaxBuffers posted work requests at any given
+  // time. However, since the majority can be made up by either
+  // receive work requests or send work requests, we keep the capacity
+  // at 4x kMaxBuffers and allocate half to each type.
+  //
+  static constexpr auto kRecvCompletionQueueCapacity = 2 * kMaxBuffers;
+  static constexpr auto kSendCompletionQueueCapacity = 2 * kMaxBuffers;
+  static constexpr auto kCompletionQueueCapacity =
+    kRecvCompletionQueueCapacity + kSendCompletionQueueCapacity;
 
   // The ibv_req_notify(3) function takes an argument called
   // 'solicited_only' which makes it only trigger a notification for