// gloo/test/cuda_base_test.h - platform/external/pytorch - Git at Google

 /**
  * Copyright (c) 2017-present, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree. An additional grant
  * of patent rights can be found in the PATENTS file in the same directory.
  */

 #pragma once

 #include "gloo/test/base_test.h"

 #include "gloo/cuda_private.h"

 namespace gloo {
 namespace test {

 // Declared here; the definition is not part of this header.
 // Fixture::assignValuesAsync() calls this to insert a delay on `stream`
 // ahead of the work that populates its buffers.
 // NOTE(review): `clocks` is presumably a duration expressed in device clock
 // cycles -- confirm against the definition.
 void cudaSleep(cudaStream_t stream, size_t clocks);

 /**
  * Base class for tests that need to ask the CUDA runtime about devices.
  * Extends BaseTest with a single device-count helper.
  */
 class CudaBaseTest : public BaseTest {
  public:
   // Returns the value written by cudaGetDeviceCount. The runtime call is
   // wrapped in CUDA_CHECK, so its status code is not silently dropped.
   int getDeviceCount() {
     int deviceCount = 0;
     CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
     return deviceCount;
   }
 };

 class Fixture {
  public:
   Fixture(const std::shared_ptr<Context> context, int devices, int count)
       : context(context),
         count(count) {
     for (int i = 0; i < devices; i++) {
       CudaDeviceScope scope(i);
       srcs.push_back(CudaMemory<float>(count));
       ptrs.push_back(
         CudaDevicePointer<float>::create(*srcs.back(), count));
     }
   }

   Fixture(Fixture&& other) noexcept : count(other.count) {
     srcs = std::move(other.srcs);
     ptrs = std::move(other.ptrs);
   }

   void assignValues() {
     const auto stride = context->size_ * srcs.size();
     for (int i = 0; i < srcs.size(); i++) {
       const auto& stream = ptrs[i].getStream();
       srcs[i].set((context->rank_ * srcs.size()) + i, stride, stream);
       CUDA_CHECK(cudaStreamSynchronize(stream));
     }
   }

   void assignValuesAsync() {
     const auto stride = context->size_ * srcs.size();
     for (int i = 0; i < srcs.size(); i++) {
       const auto& stream = ptrs[i].getStream();
       // Insert sleep on stream to force to artificially delay the
       // kernel that actually populates the memory to surface
       // synchronization errors.
       cudaSleep(stream, 100000);
       srcs[i].set((context->rank_ * srcs.size()) + i, stride, stream);
     }
   }

   std::vector<float*> getFloatPointers() const {
     std::vector<float*> out;
     for (const auto& src : srcs) {
       out.push_back(*src);
     }
     return out;
   }

   std::vector<cudaStream_t> getCudaStreams() const {
     std::vector<cudaStream_t> out;
     for (const auto& ptr : ptrs) {
       out.push_back(ptr.getStream());
     }
     return out;
   }

   std::vector<std::unique_ptr<float[]> > getHostBuffers() {
     std::vector<std::unique_ptr<float[]> > out;
     for (auto& src : srcs) {
       out.push_back(src.copyToHost());
     }
     return out;
   }

   std::shared_ptr<Context> context;
   const int count;
   std::vector<CudaDevicePointer<float> > ptrs;
   std::vector<CudaMemory<float> > srcs;
 };

 } // namespace test
 } // namespace gloo
	/**
	* Copyright (c) 2017-present, Facebook, Inc.
	* All rights reserved.
	*
	* This source code is licensed under the BSD-style license found in the
	* LICENSE file in the root directory of this source tree. An additional grant
	* of patent rights can be found in the PATENTS file in the same directory.
	*/

	#pragma once

	#include "gloo/test/base_test.h"

	#include "gloo/cuda_private.h"

	namespace gloo {
	namespace test {

	// Declared here; the definition is not part of this header.
	// Fixture::assignValuesAsync() calls this to insert a delay on `stream`
	// ahead of the work that populates its buffers.
	// NOTE(review): `clocks` is presumably a duration expressed in device clock
	// cycles -- confirm against the definition.
	void cudaSleep(cudaStream_t stream, size_t clocks);

	/**
	 * Base class for tests that need to ask the CUDA runtime about devices.
	 * Extends BaseTest with a single device-count helper.
	 */
	class CudaBaseTest : public BaseTest {
	 public:
	  // Returns the value written by cudaGetDeviceCount. The runtime call is
	  // wrapped in CUDA_CHECK, so its status code is not silently dropped.
	  int getDeviceCount() {
	    int deviceCount = 0;
	    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
	    return deviceCount;
	  }
	};

	class Fixture {
	public:
	Fixture(const std::shared_ptr<Context> context, int devices, int count)
	: context(context),
	count(count) {
	for (int i = 0; i < devices; i++) {
	CudaDeviceScope scope(i);
	srcs.push_back(CudaMemory<float>(count));
	ptrs.push_back(
	CudaDevicePointer<float>::create(*srcs.back(), count));
	}
	}

	Fixture(Fixture&& other) noexcept : count(other.count) {
	srcs = std::move(other.srcs);
	ptrs = std::move(other.ptrs);
	}

	void assignValues() {
	const auto stride = context->size_ * srcs.size();
	for (int i = 0; i < srcs.size(); i++) {
	const auto& stream = ptrs[i].getStream();
	srcs[i].set((context->rank_ * srcs.size()) + i, stride, stream);
	CUDA_CHECK(cudaStreamSynchronize(stream));
	}
	}

	void assignValuesAsync() {
	const auto stride = context->size_ * srcs.size();
	for (int i = 0; i < srcs.size(); i++) {
	const auto& stream = ptrs[i].getStream();
	// Insert sleep on stream to force to artificially delay the
	// kernel that actually populates the memory to surface
	// synchronization errors.
	cudaSleep(stream, 100000);
	srcs[i].set((context->rank_ * srcs.size()) + i, stride, stream);
	}
	}

	std::vector<float*> getFloatPointers() const {
	std::vector<float*> out;
	for (const auto& src : srcs) {
	out.push_back(*src);
	}
	return out;
	}

	std::vector<cudaStream_t> getCudaStreams() const {
	std::vector<cudaStream_t> out;
	for (const auto& ptr : ptrs) {
	out.push_back(ptr.getStream());
	}
	return out;
	}

	std::vector<std::unique_ptr<float[]> > getHostBuffers() {
	std::vector<std::unique_ptr<float[]> > out;
	for (auto& src : srcs) {
	out.push_back(src.copyToHost());
	}
	return out;
	}

	std::shared_ptr<Context> context;
	const int count;
	std::vector<CudaDevicePointer<float> > ptrs;
	std::vector<CudaMemory<float> > srcs;
	};

	} // namespace test
	} // namespace gloo