#include "caffe2/contrib/nccl/cuda_nccl_gpu.h"

namespace caffe2 {
namespace nccl {
namespace {

std::vector<int> getDevices(const NCCLExecution& ex) {
  std::vector<int> result;
  result.reserve(ex.elements.size());
  for (const auto& el : ex.elements) {
    result.push_back(el.device);
  }
  return result;
}

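// NCCLContext owns the per-device NCCL state for one fixed set of GPUs:
// one communicator, one high-priority non-blocking stream, and one event per
// participating device, plus a "master" event recorded on the caller's
// stream so the per-device streams can be ordered against it.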
class NCCLContext {
 public:
  explicit NCCLContext(const NCCLExecution& ex)
      : devices_(getDevices(ex)), master_gpu_id_(ex.stream_gpu_id) {
    comms_.resize(devices_.size());
    CAFFE_NCCL_CHECK(
        ncclCommInitAll(comms_.data(), devices_.size(), devices_.data()));

    streams_.resize(devices_.size());
    events_.resize(devices_.size());
    for (auto i = 0U; i < devices_.size(); ++i) {
      CUDAGuard g(devices_[i]);
      // Query the device's stream priority range and create a high-priority,
      // non-blocking stream (plus a timing-disabled event) for NCCL launches.
      int lo_pri, hi_pri;
      CUDA_ENFORCE(cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri));
      CUDA_ENFORCE(cudaStreamCreateWithPriority(
          &streams_[i], cudaStreamNonBlocking, hi_pri));
      CUDA_ENFORCE(cudaEventCreateWithFlags(
          &events_[i], cudaEventDefault | cudaEventDisableTiming));
    }
    CUDAGuard g(master_gpu_id_);
    CUDA_ENFORCE(cudaEventCreateWithFlags(
        &master_event_, cudaEventDefault | cudaEventDisableTiming));
  }

  ~NCCLContext() {
    for (auto i = 0U; i < devices_.size(); ++i) {
      CUDAGuard g(devices_[i]);
      CUDA_ENFORCE(cudaStreamDestroy(streams_[i]));
      CUDA_ENFORCE(cudaEventDestroy(events_[i]));
    }
    CUDAGuard g(master_gpu_id_);
    CUDA_ENFORCE(cudaEventDestroy(master_event_));

    for (auto& comm : comms_) {
      ncclCommDestroy(comm);
    }
  }

  std::vector<int> devices_;
  std::vector<ncclComm_t> comms_;
  std::vector<cudaStream_t> streams_;
  int master_gpu_id_;
  cudaEvent_t master_event_;
  std::vector<cudaEvent_t> events_;

  C10_DISABLE_COPY_AND_ASSIGN(NCCLContext);
};

// We share the contexts across multiple operators, hence the cache.
static std::mutex& gContextsMutex() {
  static std::mutex m;
  return m;
}

std::unordered_map<std::string, std::unique_ptr<NCCLContext>>& gContexts() {
  static std::unordered_map<std::string, std::unique_ptr<NCCLContext>> m;
  return m;
}

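// The cache key encodes the current (master) device plus the ordered list of
// participating devices, so a different device set or master device gets its
// own cached NCCLContext.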
std::string ncclKey(const NCCLExecution& ex) {
  std::string result;
  int curr_device;
  CUDA_ENFORCE(cudaGetDevice(&curr_device));
  result += to_string(curr_device) + ":";
  for (const auto& el : ex.elements) {
    result += to_string(el.device) + ",";
  }
  return result;
}

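// Looks up (or lazily creates) the cached NCCLContext for this execution.
// Callers must hold gContextsMutex().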
NCCLContext* getNCCLContext(const NCCLExecution& ex) {
  auto& contexts = gContexts();
  const auto key = ncclKey(ex);
  if (!contexts[key]) {
    LOG(INFO) << "Creating NCCLContext for key: " << key;
    contexts[key].reset(new NCCLContext(ex));
  }
  return TORCH_CHECK_NOTNULL(contexts[key].get());
}

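// Maps C++ element types to the corresponding ncclDataType_t.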
template <typename T>
class ncclTypeWrapper;

template <>
class ncclTypeWrapper<float> {
 public:
  static const ncclDataType_t type = ncclFloat;
};

template <>
class ncclTypeWrapper<int> {
 public:
  static const ncclDataType_t type = ncclInt;
};

#ifdef CAFFE_HAS_CUDA_FP16
template <>
class ncclTypeWrapper<at::Half> {
 public:
  static const ncclDataType_t type = ncclHalf;
};
#endif

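// Shared driver for all collectives: init_f is run on each element's device
// (typically to size and allocate the output), then f launches the NCCL call
// for that element on its communicator and high-priority stream. Events are
// used to order the per-device streams against the caller's stream.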
template <typename T, typename InitF, typename F>
void runNCCL(const NCCLExecution& ex, InitF&& init_f, F&& f) {
  // Run the per-element initialization on each element's own device.
  for (auto i = 0U; i < ex.elements.size(); ++i) {
    auto& ctx = ex.elements[i];
    CUDAGuard g(ctx.device);
    init_f(ex.elements[i]);
  }

  std::lock_guard<std::mutex> g(gContextsMutex());
  auto* context = getNCCLContext(ex);
  auto& comms = context->comms_;
  auto& streams = context->streams_;
  auto& events = context->events_;
  // Record an event on the master stream and wait on it in each of the
  // child streams, so the child streams are synchronized with respect to
  // the original stream.
  {
    CUDAGuard g(ex.stream_gpu_id);
    CUDA_ENFORCE(cudaEventRecord(context->master_event_, ex.stream));
  }

  {
    // lock out alloc / free while NCCL launches
    std::lock_guard<std::mutex> lock(CUDAContext::mutex());

#if NCCL_VERSION_MIN(2, 0, 0)
    CAFFE_NCCL_CHECK(ncclGroupStart());
#endif

    for (auto i = 0U; i < ex.elements.size(); ++i) {
      auto& ctx = ex.elements[i];
      CUDAGuard g(ctx.device);
      auto& comm = comms[i];
      auto& stream = streams[i];

      TORCH_DCHECK_EQ(ctx.device, GetGPUIDForPointer(ctx.src->raw_data()));
      CUDA_ENFORCE(cudaStreamWaitEvent(stream, context->master_event_, 0));
      f(ctx, comm, stream);
    }

#if NCCL_VERSION_MIN(2, 0, 0)
    CAFFE_NCCL_CHECK(ncclGroupEnd());
#endif

    for (auto i = 0U; i < ex.elements.size(); ++i) {
      auto& ctx = ex.elements[i];
      CUDAGuard g(ctx.device);
      auto& stream = streams[i];
      auto& event = events[i];

      // Record an event on each child stream marking the end of its
      // NCCL work.
      CUDA_ENFORCE(cudaEventRecord(event, stream));
    }
  }

  // Now, wait on all the events in the original stream.
  CUDAGuard dg(ex.stream_gpu_id);
  for (auto& event : events) {
    CUDA_ENFORCE(cudaStreamWaitEvent(TORCH_CHECK_NOTNULL(ex.stream), event, 0));
  }
}

} // namespace

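// Destroys every cached NCCLContext, releasing its CUDA streams/events and
// NCCL communicators.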
void destroyContexts() {
  std::lock_guard<std::mutex> g(gContextsMutex());
  auto& contexts = gContexts();
  contexts.clear();
}

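// AllReduce: element-wise sum of src across all participating devices; every
// device receives the full result in dst (resized to src's shape).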
template <typename T>
void NCCL<T>::AllReduce(const NCCLExecution& ex) {
  return runNCCL<T>(
      ex,
      [](const NCCLElement& ctx) {
        ctx.dst->Resize(ctx.src->sizes());
        ctx.dst->template mutable_data<T>();
      },
      [](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
        CAFFE_NCCL_CHECK(ncclAllReduce(
            ctx.src->raw_data(),
            ctx.dst->raw_mutable_data(),
            ctx.dst->numel(),
            ncclTypeWrapper<T>::type,
            ncclSum,
            comm,
            stream));
      });
}

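// Broadcast: ncclBcast operates in place on dst, copying the root device's
// dst buffer into dst on every other device.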
template <typename T>
void NCCL<T>::Broadcast(const NCCLExecution& ex) {
  return runNCCL<T>(
      ex,
      [](const NCCLElement& ctx) {
        ctx.dst->Resize(ctx.src->sizes());
        ctx.dst->template mutable_data<T>();
      },
      [&ex](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
        CAFFE_NCCL_CHECK(ncclBcast(
            ctx.dst->raw_mutable_data(),
            ctx.dst->numel(),
            ncclTypeWrapper<T>::type,
            ex.root,
            comm,
            stream));
      });
}

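// Reduce: element-wise sum of src across devices, delivered only to the root
// device's dst; on non-root devices dst may be null and is left untouched.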
template <typename T>
void NCCL<T>::Reduce(const NCCLExecution& ex) {
  return runNCCL<T>(
      ex,
      [](const NCCLElement& ctx) {
        if (ctx.dst) {
          ctx.dst->Resize(ctx.src->sizes());
          ctx.dst->template mutable_data<T>();
        }
      },
      [&ex](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
        CAFFE_NCCL_CHECK(ncclReduce(
            ctx.src->raw_data(),
            ctx.dst ? ctx.dst->raw_mutable_data() : nullptr,
            ctx.src->numel(),
            ncclTypeWrapper<T>::type,
            ncclSum,
            ex.root,
            comm,
            stream));
      });
}

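// AllGather: concatenates each device's src along a new leading dimension of
// size n (the number of participants), so dst has shape [n, src_dims...] on
// every device. The two call forms reflect the argument-order change between
// NCCL 1.x and 2.x.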
template <typename T>
void NCCL<T>::AllGather(const NCCLExecution& ex) {
  const auto n = ex.elements.size();
  return runNCCL<T>(
      ex,
      [n](const NCCLElement& ctx) {
        CAFFE_ENFORCE_NE(ctx.src, ctx.dst);
        std::vector<int64_t> dims;
        dims.reserve(ctx.src->dim() + 1);
        dims.push_back(n);
        for (auto d : ctx.src->sizes()) {
          dims.push_back(d);
        }
        ctx.dst->Resize(dims);
        ctx.dst->template mutable_data<T>();
      },
      [](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
#if NCCL_VERSION_MIN(2, 0, 0)
        CAFFE_NCCL_CHECK(ncclAllGather(
            ctx.src->raw_data(),
            ctx.dst->raw_mutable_data(),
            ctx.src->numel(),
            ncclTypeWrapper<T>::type,
            comm,
            stream));
#else
        CAFFE_NCCL_CHECK(ncclAllGather(
            ctx.src->raw_data(),
            ctx.src->size(),
            ncclTypeWrapper<T>::type,
            ctx.dst->raw_mutable_data(),
            comm,
            stream));
#endif
      });
}

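// ReduceScatter: element-wise sum across devices, with each device receiving
// one contiguous slice of the result; dst has src's shape with the leading
// dimension dropped.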
template <typename T>
void NCCL<T>::ReduceScatter(const NCCLExecution& ex) {
  return runNCCL<T>(
      ex,
      [](const NCCLElement& ctx) {
        CAFFE_ENFORCE_NE(ctx.src, ctx.dst);
        const auto& srcDims = ctx.src->sizes();
        std::vector<int64_t> dstDims(srcDims.begin() + 1, srcDims.end());
        ctx.dst->Resize(dstDims);
        ctx.dst->template mutable_data<T>();
      },
      [](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
        CAFFE_NCCL_CHECK(ncclReduceScatter(
            ctx.src->raw_data(),
            ctx.dst->raw_mutable_data(),
            ctx.dst->numel(),
            ncclTypeWrapper<T>::type,
            ncclSum,
            comm,
            stream));
      });
}

// Explicit instantiation
template class NCCL<float>;
template class NCCL<int>;
#ifdef CAFFE_HAS_CUDA_FP16
template class NCCL<at::Half>;
#endif

} // namespace nccl
} // namespace caffe2