#include <catch.hpp>

#include <torch/csrc/autograd/functions/comm.h>
#include <torch/nn/module.h>
#include <torch/nn/modules/linear.h>
#include <torch/nn/parallel/data_parallel.h>
#include <torch/nn/pimpl.h>
#include <torch/tensor.h>

#include <iostream>
#include <memory>
#include <stdexcept>
#include <utility>
#include <vector>

using Catch::StartsWith;

using namespace torch::autograd;
using namespace torch::nn;

#ifdef USE_CUDA

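// Scatter splits a CPU tensor across two CUDA devices; the operation is
// differentiable, so gradients must flow back to the original CPU input.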
TEST_CASE("Parallel/DifferentiableScatter", "[multi-cuda]") {
  Scatter scatter(
      {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)});

  auto input = torch::ones(10, torch::requires_grad(true));
  auto output = scatter.apply({input});

  REQUIRE(output.size() == 2);
  REQUIRE(output[0].size(0) == 5);
  REQUIRE(output[1].size(0) == 5);

  REQUIRE(torch::cat({output[0].to(torch::kCPU), output[1].to(torch::kCPU)})
              .allclose(input));

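  // Add the shards on a common device; the gradient of this sum with respect
  // to the CPU input should be all ones.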
  auto sum = output[0].to({torch::kCUDA, 1}) + output[1];
  // backward() requires an explicit gradient for non-scalar outputs.
  sum.backward(torch::ones_like(sum));

  REQUIRE(input.grad().defined());
  REQUIRE(input.grad().device().is_cpu());
  REQUIRE(input.grad().sum().toCInt() == 10);
}

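// Gather concatenates tensors from multiple CUDA devices onto a single target
// device and propagates gradients back to each source device.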
TEST_CASE("Parallel/DifferentiableGather", "[multi-cuda]") {
  Gather gather(torch::Device(torch::kCUDA, 1));

  auto a = torch::ones(5, torch::requires_grad(true).device({torch::kCUDA, 0}));
  auto b = torch::ones(5, torch::requires_grad(true).device({torch::kCUDA, 1}));

  auto outputs = gather.apply({a, b});
  REQUIRE(outputs.size() == 1);
  auto& output = outputs.front();

  REQUIRE(output.size(0) == 10);
  REQUIRE(output.device() == torch::Device(torch::kCUDA, 1));

  auto chunks = output.chunk(2);
  REQUIRE(chunks[0].to({torch::kCUDA, 0}).allclose(a));
  REQUIRE(chunks[1].allclose(b));

  // As above, pass an explicit gradient because the output is not a scalar.
  output.backward(torch::ones_like(output));

  REQUIRE(a.grad().defined());
  REQUIRE(a.grad().device() == torch::Device(torch::kCUDA, 0));
  REQUIRE(a.grad().sum().toCInt() == 5);

  REQUIRE(b.grad().defined());
  REQUIRE(b.grad().device() == torch::Device(torch::kCUDA, 1));
  REQUIRE(b.grad().sum().toCInt() == 5);
}

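// replicate() clones a module onto each requested CUDA device; every replica
// must hold its own copy of the parameters rather than aliasing the original.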
TEST_CASE("Parallel/Replicate", "[multi-cuda]") {
  Linear linear(3, 4);
  auto replicas = parallel::replicate(
      linear, {torch::Device(torch::kCUDA, 0), torch::Device(torch::kCUDA, 1)});
  REQUIRE(replicas.size() == 2);

  auto original_parameters = linear->parameters();

  auto replica1_parameters = replicas[0]->parameters();
  for (auto& parameter : replica1_parameters) {
    REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 0));
  }
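  // Move the replica back to CPU so its values can be compared with the
  // original's; the parameters must be equal but must not share storage.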
  replicas[0]->to(torch::kCPU);
  REQUIRE(replica1_parameters.size() == original_parameters.size());
  for (size_t i = 0; i < original_parameters.size(); ++i) {
    REQUIRE(replica1_parameters[i]->allclose(*original_parameters[i]));
    REQUIRE(
        replica1_parameters[i]->data<float>() !=
        original_parameters[i]->data<float>());
  }

  auto replica2_parameters = replicas[1]->parameters();
  for (auto& parameter : replica2_parameters) {
    REQUIRE(parameter->device() == torch::Device(torch::kCUDA, 1));
  }
  replicas[1]->to(torch::kCPU);
  REQUIRE(replica2_parameters.size() == original_parameters.size());
  for (size_t i = 0; i < original_parameters.size(); ++i) {
    REQUIRE(replica2_parameters[i]->allclose(*original_parameters[i]));
    REQUIRE(
        replica2_parameters[i]->data<float>() !=
        original_parameters[i]->data<float>());
  }
}

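// parallel_apply() runs each module on its corresponding input; without an
// explicit device list, each output stays on the device that produced it.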
TEST_CASE("Parallel/ParallelApply", "[multi-cuda]") {
  Linear a(3, 4);

  Linear b(std::static_pointer_cast<LinearImpl>(a->clone()));
  b->to({torch::kCUDA, 0});

  Linear c(std::static_pointer_cast<LinearImpl>(a->clone()));
  c->to({torch::kCUDA, 1});

  std::vector<Linear> modules = {a, b, c};
  std::vector<torch::Tensor> inputs = {
      torch::ones({2, 3}),
      torch::ones({2, 3}, torch::device({torch::kCUDA, 0})),
      torch::ones({2, 3}, torch::device({torch::kCUDA, 1}))};

  auto outputs = parallel::parallel_apply(modules, inputs);

  REQUIRE(outputs.size() == 3);
  REQUIRE(outputs[0].device().is_cpu());

  REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0));
  REQUIRE(outputs[1].to(torch::kCPU).allclose(outputs[0]));

  REQUIRE(outputs[2].device() == torch::Device(torch::kCUDA, 1));
  REQUIRE(outputs[2].to(torch::kCPU).allclose(outputs[0]));
}

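// Here each module produces a CPU tensor; passing an explicit device list
// should land each output on the corresponding requested device.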
TEST_CASE("Parallel/ParallelApplyWithDifferentOutputDevice", "[multi-cuda]") {
  struct M : torch::nn::Module {
    torch::Tensor forward(torch::Tensor input) {
      return torch::ones({5}, torch::dtype(torch::kInt32));
    }
  };

  std::vector<std::shared_ptr<M>> modules = {
      std::make_shared<M>(), std::make_shared<M>(), std::make_shared<M>()};
  std::vector<torch::Tensor> inputs = {
      torch::empty({}), torch::empty({}), torch::empty({})};
  std::vector<torch::Device> devices = {
      {torch::kCUDA, 1}, {torch::kCUDA, 0}, {torch::kCPU}};

  auto outputs = parallel::parallel_apply(modules, inputs, devices);

  REQUIRE(outputs.size() == 3);
  REQUIRE(outputs[0].device().is_cuda());
  REQUIRE(outputs[0].device() == torch::Device(torch::kCUDA, 1));

  REQUIRE(outputs[1].device().is_cuda());
  REQUIRE(outputs[1].device() == torch::Device(torch::kCUDA, 0));

  REQUIRE(outputs[2].device().is_cpu());
}

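// Exceptions thrown inside a replica's forward() must be rethrown on the
// calling thread.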
TEST_CASE("Parallel/ParallelApplyRethrowsException", "[multi-cuda]") {
  struct M : torch::nn::Cloneable<M> {
    void reset() override {}
    torch::Tensor forward(torch::Tensor input) {
      throw std::runtime_error("Badness!");
    }
  };

  auto m = std::make_shared<M>();
  auto input = torch::ones({10, 3});
  REQUIRE_THROWS_WITH(
      parallel::data_parallel(m, input), StartsWith("Badness!"));
}

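// data_parallel() should gather the result onto the requested output device,
// both when scattering across devices and in the single-device case.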
TEST_CASE(
    "Parallel/DataParallelPlacesTheOutputOnTheRequestedDevice",
    "[multi-cuda]") {
  struct M : torch::nn::Cloneable<M> {
    void reset() override {}
    torch::Tensor forward(torch::Tensor input) {
      // Intermediate tensors should be on the replica's current device.
      intermediate_tensor = torch::rand(5);
      // The returned tensor should be on the output device.
      return torch::ones(3);
    }
    torch::Tensor intermediate_tensor;
  };
  auto m = std::make_shared<M>();
  auto input = torch::ones({10, 3});
  {
    auto output = parallel::data_parallel(
        m,
        input,
        /*devices=*/at::nullopt,
        /*output_device=*/torch::Device(torch::kCUDA, 1));
    REQUIRE(output.defined());
    REQUIRE(output.device().is_cuda());
    REQUIRE(output.device().index() == 1);
  }
  {
    // Verify for the single-device case (where we don't scatter/gather).
    auto output = parallel::data_parallel(
        m,
        input,
        /*devices=*/std::vector<torch::Device>{torch::Device(torch::kCUDA, 0)},
        /*output_device=*/torch::Device(torch::kCUDA, 1));
    REQUIRE(m->intermediate_tensor.defined());
    REQUIRE(m->intermediate_tensor.device().is_cuda());
    REQUIRE(m->intermediate_tensor.device().index() == 0);
    REQUIRE(output.defined());
    REQUIRE(output.device().is_cuda());
    REQUIRE(output.device().index() == 1);
  }
}

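// With no device list, data_parallel() should scatter the input across every
// visible CUDA device; each replica returns the device index it ran on.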
TEST_CASE("Parallel/DataParallelUsesAllAvailableCUDADevices", "[cuda]") {
  struct M : torch::nn::Cloneable<M> {
    void reset() override {}
    torch::Tensor forward(torch::Tensor input) {
      return torch::tensor(torch::DefaultTensorOptions::get().device().index());
    }
  };

  auto m = std::make_shared<M>();
  auto input = torch::ones({10, 3});
  auto output = parallel::data_parallel(m, input);

  const auto device_count = torch::cuda::device_count();
  REQUIRE(output.numel() == device_count);
  for (size_t i = 0; i < device_count; ++i) {
    REQUIRE(output[i].toCInt() == i);
  }
}

#endif