src/core.cu - platform/external/pytorch - Git at Google

 /*************************************************************************
  * Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *  * Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *  * Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *  * Neither the name of NVIDIA CORPORATION nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  ************************************************************************/

 #include <stdio.h>
 #include <stdlib.h>
 #include "core.h"
 #include "libwrap.h"
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sched.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <string.h>
 #include <errno.h>

 DebugLevel ncclDebugLevel;

 extern "C" DSOGLOBAL
 ncclResult_t ncclGetUniqueId(ncclUniqueId* out) {
   pid_t pid = getpid();
   static int count = 0;
   int commId = __sync_fetch_and_add(&count, 1);
   int len = snprintf(out->internal, NCCL_UNIQUE_ID_BYTES, "nccl-%d-%d", pid, commId);
   if(strlen(out->internal) < len) {
     WARN("ncclUniqueId truncated");
     return ncclInternalError;
   }
   return ncclSuccess;
 }


 static ncclResult_t shmOpen(const char* shmname, size_t bytes, void** ptr) {
   int fd = shm_open(shmname, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);
   if (fd == -1) {
     WARN("shm_open failed to open %s", shmname);
     return ncclSystemError;
   }

   if (ftruncate(fd, bytes) == -1) {
     WARN("ftruncate failed to allocate %ld bytes", bytes);
     shm_unlink(shmname);
     close(fd);
     return ncclSystemError;
   }

   *ptr = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
   if (*ptr == MAP_FAILED) {
     WARN("failure in mmap");
     shm_unlink(shmname);
     close(fd);
     return ncclSystemError;
   }

   close(fd);
   return ncclSuccess;
 }

 static ncclResult_t shmUnlink(const char* shmname) {
   if(shm_unlink(shmname) == -1) {
     WARN("smh_unlink failed");
     return ncclSystemError;
   } else {
     return ncclSuccess;
   }
 }

 static ncclResult_t shmUnmap(void* ptr, size_t bytes) {
   if(munmap(ptr, bytes) == -1) {
     WARN("munmap failed");
     return ncclSystemError;
   } else {
     return ncclSuccess;
   }
 }


 typedef struct {
   int rank;
   int ndev;
   int cudaDev;
   int ncclId;
   pid_t pid;
   ncclMem* hostptr;
   ncclMem* devptr;
   CUipcMemHandle devipc;
   size_t buffSize;
 } RankEntry;

 static int compRanks(const void* a, const void* b) {
   const RankEntry* A = (const RankEntry*)a;
   const RankEntry* B = (const RankEntry*)b;
   if (A->ncclId < B->ncclId) return -1;
   if (A->ncclId > B->ncclId) return  1;
   return 0;
 }

 static void orderRanks(RankEntry* ranks, int count) {
   qsort(ranks, count, sizeof(RankEntry), compRanks);
   for(int i=0; i<count; ++i)
     ranks[i].ncclId = i;
 }


 typedef struct {
   union {
     struct {
       volatile int bar;
       int ringDirectFail;
     };
     char pad[16];
    };
    RankEntry ranks[1];
 } RankGather;

 static ncclResult_t initGather(RankGather** gather, ncclUniqueId commId,
     int ndev, int rank, RankEntry myInfo) {
   size_t bytes = offsetof(RankGather, ranks) + ndev*sizeof(RankEntry);
   RankGather* tmp = NULL;
   int bar_tmp;

   ncclResult_t res = shmOpen(commId.internal, bytes, (void**)&tmp);
   if (res != ncclSuccess) {
     WARN("rank %d failed to open shm segment for gather", rank);
     return res;
   }

   tmp->ranks[rank] = myInfo;

   bar_tmp = tmp->bar - 1;
   bool swapped;
   do {
     bar_tmp += 1;
     if (bar_tmp == ndev-1) { // everyone is done
       ncclResult_t res = shmUnlink(commId.internal);
       if (res != ncclSuccess) {
         WARN("rank %d failed to unlink shm segment for gather", rank);
         shmUnmap(tmp, bytes);
         return res;
       }

       orderRanks(tmp->ranks, ndev);
     }
     swapped = __sync_bool_compare_and_swap(&tmp->bar, bar_tmp, bar_tmp+1);
   } while(!swapped);

   while (tmp->bar < ndev)
     sched_yield();
   __sync_synchronize();

   *gather = tmp;
   return ncclSuccess;
 }

 static void syncRingDirect(RankGather* gather, int* ringDirectOk) {
   int bar_tmp = gather->bar - 1;
   int ndev = gather->ranks[0].ndev;
   bool swapped;
   do {
     bar_tmp += 1;
     swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
   } while(!swapped);

   while (gather->bar < 2*ndev) // Wait for all ranks to arrive at this second barrier
     sched_yield();
   __sync_synchronize();

   *ringDirectOk = gather->ringDirectFail ? 0 : 1;
 }

 static ncclResult_t closeGather(RankGather* gather, int ndev) {
   int bar_tmp = gather->bar - 1;
   bool swapped;
   do {
     bar_tmp += 1;
     swapped = __sync_bool_compare_and_swap(&gather->bar, bar_tmp, bar_tmp+1);
   } while(!swapped);

   while (gather->bar < 3*ndev) // Wait for all ranks to arrive at this third barrier
     sched_yield();
   __sync_synchronize();

   size_t bytes = offsetof(RankGather, ranks) + ndev*sizeof(RankEntry);
   ncclResult_t res = shmUnmap(gather, bytes);
   if (res != ncclSuccess) {
     WARN("failed to unmap %ld bytes of gather", bytes);
     return res;
   }

   return ncclSuccess;
 }


 static ncclResult_t allocDevMem(ncclMem** ptr, size_t buffSize) {
   size_t size = offsetof(struct ncclMem, buff) + buffSize;
   cudaError_t res = cudaMalloc((void**)ptr, size);
   if (res != cudaSuccess) {
     *ptr = NULL;
     WARN("failed to allocate %lu byte device buffer", size);
     return ncclCudaMallocFailed;
   }
   if (cudaMemset(*ptr, 0, size) != cudaSuccess) {
     WARN("failed to memset device buffer.");
     cudaFree(*ptr);
     *ptr = NULL;
     return ncclUnhandledCudaError;
   }
   return ncclSuccess;
 }

 static const int ShmMapped = 1;
 static const int ShmLinked = 2;

 static ncclResult_t allocHostMem(ncclMem** ptr, size_t buffSize) {
   size_t size = offsetof(struct ncclMem, buff) + buffSize;
   cudaError_t res = cudaMallocHost((void**)ptr, size);
   if (res != cudaSuccess) {
     *ptr = NULL;
     WARN("failed to allocate %lu byte host buffer", size);
     return ncclSystemError;
   }
   memset(*ptr, 0, size);
   return ncclSuccess;
 }

 static ncclResult_t openHostMemShm(const char* shmname, ncclMem** ptr, size_t buffSize) {
   size_t size = offsetof(struct ncclMem, buff) + buffSize;
   ncclResult_t res = shmOpen(shmname, size, (void**)ptr);
   if (res != ncclSuccess) {
     WARN("failed to allocate %lu byte shm buffer", size);
     *ptr = NULL;
     return res;
   }

   if(cudaHostRegister(*ptr, size, cudaHostRegisterMapped) != cudaSuccess) {
     WARN("failed to register host buffer");
     shmUnlink(shmname);
     shmUnmap(*ptr, size);
     *ptr = NULL;
     return ncclUnhandledCudaError;
   }
   return ncclSuccess;
 }

 static ncclResult_t populateRankInfo(RankEntry* info, int rank, ncclComm_t comm) {
   char busId[13];
   nvmlDevice_t nvmlHandle;
   cudaError_t res = cudaDeviceGetPCIBusId(busId, 13, comm->cudaDev);
   if (res == cudaErrorInvalidDevice) {
     WARN("rank %d attempted to access an invalid cuda device %d", rank, comm->cudaDev);
     return ncclInvalidDeviceIndex;
   } else if (res != cudaSuccess) {
     WARN("rank %d failed to get PCI Bus Id for device %d", rank, comm->cudaDev);
     return ncclUnhandledCudaError;
   }
   INFO("rank %d using device %d (%s)", rank, comm->cudaDev, busId);

   if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlHandle) != ncclSuccess) {
     WARN("rank %d failed to get nvml handle for device %s", rank, busId);
     return ncclUnhandledCudaError;
   }
   // Order by nvml index
   if (wrapNvmlDeviceGetIndex(nvmlHandle, (unsigned*)&info->ncclId) != ncclSuccess) {
     WARN("rank %d failed to get nvml device index for device %d", rank, comm->cudaDev);
     return ncclUnhandledCudaError;
   }

   info->rank = rank;
   info->ndev = comm->nDev;
   info->cudaDev = comm->cudaDev;
   info->pid = getpid();
   info->buffSize = comm->buffSize;
   info->hostptr = comm->hostMem;
   info->devptr = comm->devMem;
   if (wrapCuIpcGetMemHandle(&info->devipc, (CUdeviceptr)comm->devMem) != ncclSuccess) {
     WARN("rank %d failed to open CUDA IPC handle", rank);
     return ncclUnhandledCudaError;
   }

   return ncclSuccess;
 }


 static const int CLEANUP_NONE  = 0;
 static const int CLEANUP_CUIPC = 1;
 static const int CLEANUP_UNMAP = 2;

 static ncclResult_t commClearMaps(ncclComm_t comm) {
   ncclResult_t res, retval = ncclSuccess;
   cudaError_t cures;

   for(int d=0; d<comm->nDev; ++d) {
     switch(comm->ptrs[d].remoteCleanup) {
       case CLEANUP_NONE:
         break;
       case CLEANUP_CUIPC:
         res = wrapCuIpcCloseMemHandle((CUdeviceptr)comm->ptrs[d].cleanupHandle);
         if (res != ncclSuccess) {
           WARN("rank %d failed to close IPC handle to rank %d",
             comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
           retval = (retval == ncclSuccess) ? res : retval;
         }
         break;
       case CLEANUP_UNMAP:
         cures = cudaHostUnregister(comm->ptrs[d].cleanupHandle);
         if (cures != cudaSuccess) {
           WARN("rank %d failed to unregister handle to rank %d",
             comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
             retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
         }
         res = shmUnmap(comm->ptrs[d].cleanupHandle, offsetof(ncclMem, buff) + comm->buffSize);
         if (res != ncclSuccess) {
           WARN("rank %d failed to unmap handle to rank %d",
             comm->userFromRing[comm->ncclId], comm->userFromRing[d]);
             retval = (retval == ncclSuccess) ? res : retval;
         }
         break;
       default:
         WARN("Unknown cleanup type %d", comm->ptrs[d].remoteCleanup);
     }
     comm->ptrs[d].remoteCleanup = CLEANUP_NONE;
     comm->ptrs[d].cleanupHandle = NULL;
   }

   if (comm->userFromRing != NULL)
     memset(comm->userFromRing, 0, sizeof(int)*comm->nDev);
   if (comm->ringFromUser != NULL)
     memset(comm->ringFromUser, 0, sizeof(int)*comm->nDev);

   if (comm->devUserFromRing != NULL) {
     cudaError_t err = cudaMemset(comm->devUserFromRing, 0, sizeof(int)*comm->nDev);
     if (err != cudaSuccess) {
       WARN("Faild to clear dev map: %s", cudaGetErrorString(err));
       retval = (retval == ncclSuccess) ? ncclUnhandledCudaError : retval;
     }
   }
   return retval;
 }

 static ncclResult_t commBuildMaps(ncclComm_t comm, ncclUniqueId* commId, int rank, RankEntry* ranks, int* ringDirectFailed) {
   int ndev = comm->nDev;
   for(int i=0; i<ndev; ++i) {
     // Check for inconsistencies between ranks
     // If two ranks use the same rank, then one slot of
     // ranks[] will be left unset with zero ndev/buffSize.
     if (ranks[i].buffSize != comm->buffSize
         || ranks[i].ndev != comm->nDev) {
       commClearMaps(comm);
       return ncclRankMismatch;
     }

     // Create rank<->nccl maps
     int iRank = ranks[i].rank;
     comm->userFromRing[i] = iRank;
     comm->ringFromUser[iRank] = i;
   }

   if (cudaMemcpy(comm->devUserFromRing, comm->userFromRing, ndev*sizeof(int),
       cudaMemcpyHostToDevice) != cudaSuccess) {
     WARN("rank %d failed to copy maps to device", rank);
     commClearMaps(comm);
     return ncclUnhandledCudaError;
   }

   int myId = -1;
   for (int i=0; i<ndev; ++i) {
     if(ranks[i].rank == rank) {
       myId = i;
       break;
     }
   }

   if (myId == -1) {
     WARN("rank %d not found in communicator", rank);
     return ncclInvalidRank;
   }
   comm->ncclId = myId;

   int myDev = ranks[myId].cudaDev;
   pid_t myPid = ranks[myId].pid;
   comm->useRemoteRecv = 1; // Assume we directly write to result ptrs.

   // The order that we link with peers must ensure that
   // P2P slots are used for high-priority links first.
   for (int j=0; j<ndev; ++j) {
     int i = (myId - 1 + ndev + j) % ndev;
     int iRank = ranks[i].rank;
     int iDev = ranks[i].cudaDev;
     pid_t iPid = ranks[i].pid;
     int canpeer = 0;

     if (cudaDeviceCanAccessPeer(&canpeer, myDev, iDev) != cudaSuccess) {
       INFO("peer query failed between rank %d (dev %d) and rank %d (dev %d)",
         rank, myDev, iRank, iDev);
       canpeer = 0;
     }

     if (canpeer) {
       cudaError_t err;
       err = cudaDeviceEnablePeerAccess(iDev, 0);
       if (err == cudaErrorPeerAccessAlreadyEnabled) {
         cudaGetLastError();
       } else if (err != cudaSuccess) {
         INFO("peer access failed between rank %d (dev %d) and rank %d (dev %d)\n",
           rank, myDev, iRank, iDev);

         canpeer = 0;
       }
     }

     if (iPid == myPid) {
       if (canpeer || myDev == iDev) {
         INFO("rank access %d -> %d via P2P device mem", rank, iRank);
         comm->ptrs[i].local  = ranks[myId].devptr;
         comm->ptrs[i].remote = ranks[i].devptr;
         comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
       } else { // go through hostmem
         INFO("rank access %d -> %d via zero-copy host mem", rank, iRank);
         if (j <= 2)
           *ringDirectFailed = 1;
         if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
           WARN("rank %d failed to map zero copy buffer to device", rank);
           commClearMaps(comm);
           return ncclUnhandledCudaError;
         }
         if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, ranks[i].hostptr, 0) != cudaSuccess) {
           WARN("rank %d failed to map %d's zero copy buffer to device", rank, iRank);
           commClearMaps(comm);
           return ncclUnhandledCudaError;
         }
         comm->ptrs[i].remoteCleanup = CLEANUP_NONE;
       }
     } else { // multi-process!
       *ringDirectFailed = 1;
       if (canpeer || myDev == iDev) {
         INFO("rank access %d -> %d via Ipc P2P device mem", rank, iRank);
         comm->ptrs[i].local = ranks[myId].devptr;
         if (wrapCuIpcOpenMemHandle((CUdeviceptr*)(&comm->ptrs[i].remote),
             ranks[i].devipc, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS) != ncclSuccess) {
           WARN("rank %d failed to open Ipc handle to rank %d", rank, iRank);
           commClearMaps(comm);
           return ncclUnhandledCudaError;
         }
         comm->ptrs[i].remoteCleanup = CLEANUP_CUIPC;
         comm->ptrs[i].cleanupHandle = comm->ptrs[i].remote;
       } else { // go through hostmem
         INFO("rank access %d -> %d via zero copy host shm", rank, iRank);
         if (cudaHostGetDevicePointer(&comm->ptrs[i].local, ranks[myId].hostptr, 0) != cudaSuccess) {
           WARN("rank %d failed to obtain dev ptr to sysmem buffer", rank);
           commClearMaps(comm);
           return ncclUnhandledCudaError;
         }
         char rankname[1024];
         sprintf(rankname, "%s-%d", commId->internal, ranks[i].rank);
         if (openHostMemShm(rankname, (ncclMem**)&comm->ptrs[i].cleanupHandle, ranks[i].buffSize)
             != ncclSuccess) {
           WARN("rank %d failed to open sysmem buffer of rank %d", rank, iRank);
           commClearMaps(comm);
           return ncclUnhandledCudaError;
         }
         if (cudaHostGetDevicePointer(&comm->ptrs[i].remote, comm->ptrs[i].cleanupHandle, 0) != cudaSuccess) {
           WARN("rank %d failed to obtain dev ptr for rank %d", rank, iRank);
           commClearMaps(comm);
           return ncclUnhandledCudaError;
         }
         comm->ptrs[i].remoteCleanup = CLEANUP_UNMAP;
       }
     }
   }
   return ncclSuccess;
 }

 static void initDebug() {
   const char* nccl_debug = getenv("NCCL_DEBUG");
   if (nccl_debug == NULL) {
     ncclDebugLevel = NONE;
   } else if (strcmp(nccl_debug, "WARN") == 0) {
     ncclDebugLevel = WARN;
   } else if (strcmp(nccl_debug, "INFO") == 0) {
     ncclDebugLevel = INFO;
     INFO("NCCL debug level set to INFO");
   } else if (strcmp(nccl_debug, "ABORT") == 0) {
     ncclDebugLevel = ABORT;
     INFO("NCCL debug level set to ABORT");
   }

 }

 static void commFree(ncclComm_t comm) {
   if (comm == NULL)
     return;

   for(int i=0; i<MAXQUEUE; ++i) {
     if (comm->events.isDone[i] != NULL)
       if (cudaEventDestroy(comm->events.isDone[i]) != cudaSuccess)
         INFO("failed to destroy cuda event %d", i);
   }

   ncclResult_t res = commClearMaps(comm);
   if (res != ncclSuccess)
     INFO("failed to cleanup comm maps");

   if (comm->userFromRing != NULL)
     free(comm->userFromRing);

   if (comm->devUserFromRing != NULL)
     if (cudaFree(comm->devUserFromRing) != cudaSuccess)
       INFO("commFree failed to free dev maps");

   if (comm->ringFromUser != NULL)
     free(comm->ringFromUser);

   if (comm->devMem != NULL && cudaFree(comm->devMem) != cudaSuccess)
     INFO("Failed to free devMap");

   if (comm->hostMem != NULL) {
     if (comm->hostMemState & ShmMapped) {
       if (cudaHostUnregister(comm->hostMem) != cudaSuccess)
         INFO("Failed to unregister hostMem");
       size_t size = offsetof(ncclMem, buff) + comm->buffSize;
       if (shmUnmap(comm->hostMem, size) != ncclSuccess)
         INFO("Failed to unmap hostMem");
       comm->hostMemState ^= ShmMapped;
     } else {
       cudaFreeHost(comm->hostMem);
     }
   }
   free(comm);
 }

 static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, const ncclUniqueId* commId, int rank) {
   if (ndev < 1) {
     WARN("invalid device count (%d) requested", ndev);
     return ncclUnsupportedDeviceCount;
   }
   if (rank >= ndev || rank < 0) {
     WARN("rank %d exceeds ndev=%d", rank, ndev);
     return ncclInvalidRank;
   }

   size_t commBytes = offsetof(ncclComm, ptrs) + ndev*sizeof(ncclNodeRef);
   struct ncclComm* comm = (struct ncclComm*)malloc(commBytes);
   if (comm == NULL) {
     WARN("comm allocation failed");
     return ncclSystemError;
   }
   memset(comm, 0, commBytes);

   comm->nDev = ndev;
   cudaGetDevice(&comm->cudaDev);

   const char* str = getenv("NCCL_BUFFSIZE");
   if (str != NULL) {
     errno = 0;
     comm->buffSize = strtol(str, NULL, 10);
     if (errno == ERANGE || comm->buffSize == 0) {
       INFO("rank %d invalid NCCL_BUFFSIZE: %s, using default %lu",
           rank, str, DEFAULT_BUFFER_SIZE_BYTES);
       comm->buffSize = DEFAULT_BUFFER_SIZE_BYTES;
     }
   } else {
     comm->buffSize = DEFAULT_BUFFER_SIZE_BYTES;
   }
   INFO("rank %d using buffSize = %lu", rank, comm->buffSize);


   ncclResult_t res;
   res = allocDevMem(&comm->devMem, comm->buffSize);
   if (res != ncclSuccess) {
     WARN("rank %d failed to allocate device buffer", rank);
     commFree(comm);
     return res;
   }
   if (cudaMalloc(&comm->devUserFromRing, ndev*sizeof(int)) != cudaSuccess) {
     WARN("rank %d failed to allocated device maps", rank);
     commFree(comm);
     return ncclCudaMallocFailed;
   }

   comm->userFromRing = (int*)malloc(ndev*sizeof(int));
   if (comm->userFromRing == NULL) {
     WARN("rank %d failed to allocate host maps", rank);
     commFree(comm);
     return ncclSystemError;
   }

   comm->ringFromUser = (int*)malloc(ndev*sizeof(int));
   if (comm->ringFromUser == NULL) {
     WARN("rank %d failed to allocate host maps", rank);
     commFree(comm);
     return ncclSystemError;
   }

   EventQueue* eq = &comm->events;
   for(int i=0; i<MAXQUEUE; ++i) {
     if (cudaEventCreateWithFlags(eq->isDone+i, cudaEventDisableTiming) != cudaSuccess) {
       WARN("rank %d failed to create nccl event %d", rank, i);
       commFree(comm);
       return ncclUnhandledCudaError;
     }
   }

   if(commId == NULL) {
     comm->hostMemState = 0;
     res = allocHostMem(&comm->hostMem, comm->buffSize);
   } else {
     char rankname[1024];
     sprintf(rankname, "%s-%d", commId->internal, rank);
     res = openHostMemShm(rankname, &comm->hostMem, comm->buffSize);
     if (res != ncclSuccess) {
       WARN("rank %d failed to allocate host buffer", rank);
       commFree(comm);
       return res;
     }
     comm->hostMemState = ShmMapped | ShmLinked;
   }

   *comret = comm;
   return ncclSuccess;
 }

 static ncclResult_t commUnlinkHostMem(ncclComm_t comm, ncclUniqueId commId, int rank) {
   char rankname[1024];
   sprintf(rankname, "%s-%d", commId.internal, rank);
   if (comm->hostMemState & ShmLinked)
     comm->hostMemState ^= ShmLinked;
   return shmUnlink(rankname);
 }

 extern "C" DSOGLOBAL
 ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank) {
   if (strlen(commId.internal) < 1 ||
       strlen(commId.internal) >= NCCL_UNIQUE_ID_BYTES) {
     WARN("rank %d invalid commId", myrank);
     return ncclInvalidArgument;
   }

   initDebug();
   ncclResult_t res;
   RankEntry myStuff;
   RankGather* gath = NULL;

   res = wrapSymbols();
   if (res != ncclSuccess) {
     WARN("NCCL failed to initialize client libs");
     return res;
   }

   res = wrapNvmlInit();
   if (res != ncclSuccess) {
     WARN("rank %d failed to initialize nvml", myrank);
     return res;
   }

   res = commAlloc(newcomm, ndev, &commId, myrank);
   if (res != ncclSuccess) {
     WARN("rank %d failed to allocate communicator", myrank);
     return res;
   }

   res = populateRankInfo(&myStuff, myrank, *newcomm);
   if (res != ncclSuccess) {
     WARN("rank %d failed to obtain rank info", myrank);
     goto cleanup;
   }

   res = initGather(&gath, commId, ndev, myrank, myStuff);
   if (res != ncclSuccess) {
     WARN("rank %d failed to gather rank info", myrank);
     goto cleanup;
   }

   res = commBuildMaps(*newcomm, &commId, myrank, gath->ranks, &gath->ringDirectFail);
   if (res != ncclSuccess) {
     WARN("rank %d failed to build comm maps", myrank);
     goto cleanup;
   }

   syncRingDirect(gath, &((*newcomm)->useRemoteRecv));
   INFO("PushToRecv algos are %s\n", (*newcomm)->useRemoteRecv ? "enabled" : "disabled");

   res = closeGather(gath, ndev); // includes a barrier
   gath = NULL;
   if (res != ncclSuccess) {
     WARN("rank %d failed to close gather", myrank);
     goto cleanup;
   }

   goto final;

   cleanup:
   if (gath != NULL)
     closeGather(gath, ndev);
   commFree(*newcomm);

   final:
   if ((*newcomm)->hostMemState & ShmLinked) {
     if (commUnlinkHostMem(*newcomm, commId, myrank) != ncclSuccess)
       INFO("rank %d failed to unlink host mem shm segment", myrank);
   }

   if (wrapNvmlShutdown() != ncclSuccess)
     INFO("rank %d did not shutdown nvml properly", myrank);
   return res;
 }

 extern "C" DSOGLOBAL
 ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, int* devlist) {
   initDebug();

   ncclResult_t res;
   int savedDevice;
   RankEntry* ranks = NULL;
   int rank, cudaDev;
   ncclComm_t comm = NULL;
   char busId[13];
   nvmlDevice_t nvmlHandle;
   int affinity_set = 0;
   int ringDirectFail = 0; // Assume direct access to recv ptr OK

   res = wrapSymbols();
   if (res != ncclSuccess) {
     WARN("NCCL failed to initialize client libs");
     return res;
   }

   cudaGetDevice(&savedDevice);
   ranks = (RankEntry*)malloc(ndev*sizeof(RankEntry));
   if (ranks == NULL) {
     WARN("NCCL allocation failed");
     return ncclSystemError;
   }
   memset(ranks, 0, ndev*sizeof(RankEntry));

   res = wrapNvmlInit();
   if (res != ncclSuccess) {
     WARN("nccl failed to initialize nvml");
     return res;
   }

   for(rank=0; rank<ndev; ++rank)
     comms[rank] = NULL;

   for (rank=0; rank<ndev; ++rank) {
     cudaDev = (devlist == NULL) ? rank : devlist[rank];
     if (cudaSetDevice(cudaDev) != cudaSuccess) {
       WARN("rank %d failed to set cuda device %d", rank, cudaDev);
       res = ncclInvalidDeviceIndex;
       goto cleanup;
     }

     // Set CPU affinity
     affinity_set = 0;
     if (cudaDeviceGetPCIBusId(busId, 13, cudaDev) != cudaSuccess) {
       INFO("rank %d failed to get PCI Bus Id for device %d", rank, cudaDev);
       goto skipaffinity;
     }
     if (wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlHandle) != ncclSuccess) {
       INFO("rank %d failed to get nvml handle for device %s", rank, busId);
       goto skipaffinity;
     }
     if (wrapNvmlDeviceSetCpuAffinity(nvmlHandle) != ncclSuccess) {
       INFO("rank %d failed to set affinity", rank);
       goto skipaffinity;
     }
     affinity_set = 1;
     skipaffinity:

     res = commAlloc(&comm, ndev, NULL, rank);
     if (res != ncclSuccess) {
       WARN("rank %d failed to allocate communicator", rank);
       goto cleanup;
     }
     comms[rank] = comm;

     if (affinity_set && wrapNvmlDeviceClearCpuAffinity(nvmlHandle) != ncclSuccess) {
       INFO("rank %d set but failed to clear cpu affinity", rank);
     }
     res = populateRankInfo(ranks+rank, rank, comm);
     if (res != ncclSuccess) {
       WARN("rank %d failed to obtain rank info", rank);
       goto cleanup;
     }
   }

   orderRanks(ranks, ndev);
   for(rank=0; rank<ndev; ++rank) {
     comm = comms[rank];
     cudaSetDevice(comm->cudaDev);
     res = commBuildMaps(comm, NULL, rank, ranks, &ringDirectFail);
     if (res != ncclSuccess) {
       WARN("rank %d failed to build comm maps", rank);
       goto cleanup;
     }
   }

   INFO("PushToRecv algos are %s\n", (ringDirectFail) ? "disabled" : "enabled");
   for(rank=0; rank<ndev; ++rank) {
     comms[rank]->useRemoteRecv = ringDirectFail ? 0 : 1;
   }

   free(ranks);
   ranks = NULL;
   res = ncclSuccess;
   goto final;

   cleanup:
   if (ranks != NULL)
     free(ranks);
   for(rank=0; rank<ndev; ++rank) {
     if(comms[rank] != NULL) {
       commFree(comms[rank]);
     }
   }

   final:
   if(wrapNvmlShutdown() != ncclSuccess)
     INFO("NCCL did not shutdown nvml properly");
   cudaSetDevice(savedDevice);
   return res;
 }


 extern "C" DSOGLOBAL
 void ncclCommDestroy(ncclComm_t comm) {
   if (comm == NULL)
     return;

   int savedDevice;
   cudaGetDevice(&savedDevice);
   int commDevice = comm->cudaDev;

   if (savedDevice != commDevice) {
     CUDACHECK(cudaSetDevice(commDevice));
   }

   commFree(comm);

   if (savedDevice != commDevice)
     cudaSetDevice(savedDevice);
 }

 extern "C" DSOGLOBAL
 const char* ncclGetErrorString(ncclResult_t code) {
   switch (code) {
   case ncclSuccess                : return "no error";
   case ncclUnhandledCudaError     : return "unhandled cuda error";
   case ncclSystemError            : return "system error";
   case ncclInternalError          : return "internal error";
   case ncclInvalidDevicePointer   : return "invalid device pointer";
   case ncclInvalidRank            : return "invalid rank";
   case ncclUnsupportedDeviceCount : return "unsupported device count";
   case ncclDeviceNotFound         : return "device not found";
   case ncclInvalidDeviceIndex     : return "invalid device index";
   case ncclLibWrapperNotSet       : return "lib wrapper not initialized";
   case ncclCudaMallocFailed       : return "cuda malloc failed";
   case ncclRankMismatch           : return "parameter mismatch between ranks";
   case ncclInvalidArgument        : return "invalid argument";
   case ncclInvalidType            : return "invalid data type";
   case ncclInvalidOperation       : return "invalid reduction operations";
   }
   return "unknown result code";
 }

 extern "C" DSOGLOBAL
 ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) {
   *count = comm->nDev;
   return ncclSuccess;
 }

 extern "C" DSOGLOBAL
 ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) {
   *devid = comm->cudaDev;
   return ncclSuccess;
 }

 extern "C" DSOGLOBAL
 ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) {
   *rank = comm->userFromRing[comm->ncclId];
   return ncclSuccess;
 }