aten/src/ATen/cuda/PeerToPeerAccess.cpp - platform/external/pytorch - Git at Google

 #include <ATen/cuda/PeerToPeerAccess.h>

 #include <ATen/cuda/CUDAContext.h>

 #include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/util/Exception.h>
 #include <c10/util/irange.h>

 #include <vector>

 namespace at::cuda {

 static std::vector<int8_t> p2pAccessEnabled_;
 static int64_t num_devices_ = -1;

 namespace detail {

 /// (Re)initializes the pairwise peer-to-peer access cache.
 ///
 /// The cache is a flattened num_devices x num_devices table stored in
 /// p2pAccessEnabled_; the entry at [a * num_devices + b] is one of:
 ///    1  -> copy allowed
 ///    0  -> copy not allowed
 ///   -1  -> unknown
 ///
 /// NOTE(review): an earlier comment here said the max number of GPUs in a
 /// P2P group is 8 and that P2P is enabled in groups of 8 when there are more.
 /// Nothing in this function implements such grouping -- confirm whether that
 /// remark is still relevant.
 ///
 /// @param num_devices number of devices the table should cover.
 void init_p2p_access_cache(int64_t num_devices) {
   const int64_t table_size = num_devices * num_devices;

   // Throw away any previous contents, then mark every pair as unknown.
   p2pAccessEnabled_.clear();
   p2pAccessEnabled_.resize(table_size, -1);
   num_devices_ = num_devices;

   // Mark every device as allowed to copy to itself. Diagonal entries of the
   // flattened table are (num_devices + 1) slots apart.
   const int64_t diagonal_step = num_devices + 1;
   for (int64_t device = 0; device < num_devices; ++device) {
     p2pAccessEnabled_[device * diagonal_step] = 1;
   }
 }

 }  // namespace detail

 /// Reports whether peer-to-peer access from device `dev` to device
 /// `dev_to_access` is available, caching the answer per ordered pair.
 ///
 /// The first query for a pair asks the CUDA runtime through
 /// cudaDeviceCanAccessPeer. When access is possible, peer access is enabled
 /// via CUDACachingAllocator::enablePeerAccess before returning. Later
 /// queries for the same pair are served from p2pAccessEnabled_.
 ///
 /// @param dev            index of the accessing device.
 /// @param dev_to_access  index of the device being accessed.
 /// @return true if the cached/queried value says access is allowed.
 /// @throws via TORCH_CHECK if either index is outside [0, num_devices_);
 ///         via TORCH_INTERNAL_ASSERT if the cache was never initialized;
 ///         via C10_CUDA_CHECK if the CUDA runtime query fails.
 bool get_p2p_access(int dev, int dev_to_access) {
   at::globalContext().lazyInitCUDA();

   // Check initialization first: with num_devices_ still at its -1 sentinel
   // every range check below would fail with a misleading "is not a device".
   TORCH_INTERNAL_ASSERT(num_devices_ >= 0, "p2p access cache not initialized");

   // Both bounds must hold. The previous `||` form was true for every integer,
   // so out-of-range indices fell through to an out-of-bounds vector access.
   TORCH_CHECK(dev >= 0 && dev < num_devices_,
               dev, " is not a device");
   TORCH_CHECK(dev_to_access >= 0 && dev_to_access < num_devices_,
               dev_to_access, " is not a device");

   // Reference into the flattened table so the result can be stored in place.
   auto &cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access];

   // Anything other than -1 (unknown) is an already-resolved answer.
   if (cache != -1) {
     return cache;
   }

   int result = 0;
   C10_CUDA_CHECK(cudaDeviceCanAccessPeer(&result, dev, dev_to_access));
   cache = result ? 1 : 0;
   if (cache) {
     CUDACachingAllocator::enablePeerAccess(dev, dev_to_access);
   }

   return cache;
 }

 }  // namespace at::cuda
	#include <ATen/cuda/PeerToPeerAccess.h>

	#include <ATen/cuda/CUDAContext.h>

	#include <c10/cuda/CUDACachingAllocator.h>
	#include <c10/cuda/CUDAGuard.h>
	#include <c10/util/Exception.h>
	#include <c10/util/irange.h>

	#include <vector>

	namespace at::cuda {

	static std::vector<int8_t> p2pAccessEnabled_;
	static int64_t num_devices_ = -1;

	namespace detail {

	/// (Re)initializes the pairwise peer-to-peer access cache.
	///
	/// The cache is a flattened num_devices x num_devices table stored in
	/// p2pAccessEnabled_; the entry at [a * num_devices + b] is one of:
	///    1  -> copy allowed
	///    0  -> copy not allowed
	///   -1  -> unknown
	///
	/// NOTE(review): an earlier comment here said the max number of GPUs in a
	/// P2P group is 8 and that P2P is enabled in groups of 8 when there are more.
	/// Nothing in this function implements such grouping -- confirm whether that
	/// remark is still relevant.
	///
	/// @param num_devices number of devices the table should cover.
	void init_p2p_access_cache(int64_t num_devices) {
	  const int64_t table_size = num_devices * num_devices;

	  // Throw away any previous contents, then mark every pair as unknown.
	  p2pAccessEnabled_.clear();
	  p2pAccessEnabled_.resize(table_size, -1);
	  num_devices_ = num_devices;

	  // Mark every device as allowed to copy to itself. Diagonal entries of the
	  // flattened table are (num_devices + 1) slots apart.
	  const int64_t diagonal_step = num_devices + 1;
	  for (int64_t device = 0; device < num_devices; ++device) {
	    p2pAccessEnabled_[device * diagonal_step] = 1;
	  }
	}

	} // namespace detail

	/// Reports whether peer-to-peer access from device `dev` to device
	/// `dev_to_access` is available, caching the answer per ordered pair.
	///
	/// The first query for a pair asks the CUDA runtime through
	/// cudaDeviceCanAccessPeer. When access is possible, peer access is enabled
	/// via CUDACachingAllocator::enablePeerAccess before returning. Later
	/// queries for the same pair are served from p2pAccessEnabled_.
	///
	/// @param dev            index of the accessing device.
	/// @param dev_to_access  index of the device being accessed.
	/// @return true if the cached/queried value says access is allowed.
	/// @throws via TORCH_CHECK if either index is outside [0, num_devices_);
	///         via TORCH_INTERNAL_ASSERT if the cache was never initialized;
	///         via C10_CUDA_CHECK if the CUDA runtime query fails.
	bool get_p2p_access(int dev, int dev_to_access) {
	  at::globalContext().lazyInitCUDA();

	  // Check initialization first: with num_devices_ still at its -1 sentinel
	  // every range check below would fail with a misleading "is not a device".
	  TORCH_INTERNAL_ASSERT(num_devices_ >= 0, "p2p access cache not initialized");

	  // Both bounds must hold. The previous condition (an escaped logical OR)
	  // was not valid C++, and as an OR it would have been true for every
	  // integer, letting out-of-range indices reach the vector access below.
	  TORCH_CHECK(dev >= 0 && dev < num_devices_,
	              dev, " is not a device");
	  TORCH_CHECK(dev_to_access >= 0 && dev_to_access < num_devices_,
	              dev_to_access, " is not a device");

	  // Reference into the flattened table so the result can be stored in place.
	  auto &cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access];

	  // Anything other than -1 (unknown) is an already-resolved answer.
	  if (cache != -1) {
	    return cache;
	  }

	  int result = 0;
	  C10_CUDA_CHECK(cudaDeviceCanAccessPeer(&result, dev, dev_to_access));
	  cache = result ? 1 : 0;
	  if (cache) {
	    CUDACachingAllocator::enablePeerAccess(dev, dev_to_access);
	  }

	  return cache;
	}

	} // namespace at::cuda