| #include <gtest/gtest.h> |
| |
| #include <torch/torch.h> |
| |
| #include <test/cpp/api/support.h> |
| |
| using namespace torch::nn; |
| using namespace torch::test; |
| |
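| // Trains a small Linear -> RNN -> Linear stack on a toy task: each input is |
| // a sequence of random 0/1 values and the target is the number of ones in |
| // the sequence. Returns true if the smoothed training loss drops below 1e-2 |
| // within max_epoch iterations, and false otherwise. |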
| template <typename R, typename Func> |
| bool test_RNN_xor(Func&& model_maker, bool cuda = false) { |
| torch::manual_seed(0); |
| |
| auto nhid = 32; |
| auto model = std::make_shared<SimpleContainer>(); |
| auto l1 = model->add(Linear(1, nhid), "l1"); |
| auto rnn = model->add(model_maker(nhid), "rnn"); |
| auto lo = model->add(Linear(nhid, 1), "lo"); |
| |
| torch::optim::Adam optimizer(model->parameters(), 1e-2); |
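| // Forward pass: flatten the (T, B, 1) input to (T * B, 1) for the input |
| // projection, apply tanh, reshape back to (T, B, nhid), run the RNN, and |
| // feed the last time step's output through the output projection. |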
| auto forward_op = [&](torch::Tensor x) { |
| auto T = x.size(0); |
| auto B = x.size(1); |
| x = x.view({T * B, 1}); |
| x = l1->forward(x).view({T, B, nhid}).tanh_(); |
| x = rnn->forward(x).output[T - 1]; |
| x = lo->forward(x); |
| return x; |
| }; |
| |
| if (cuda) { |
| model->to(torch::kCUDA); |
| } |
| |
| float running_loss = 1; |
| int epoch = 0; |
| auto max_epoch = 1500; |
| while (running_loss > 1e-2) { |
| auto bs = 16U; |
| auto nlen = 5U; |
| |
| const auto backend = cuda ? torch::kCUDA : torch::kCPU; |
| auto inputs = |
| torch::rand({nlen, bs, 1}, backend).round().to(torch::kFloat32); |
| auto labels = inputs.sum(0).detach(); |
| inputs.set_requires_grad(true); |
| |
| auto outputs = forward_op(inputs); |
| torch::Tensor loss = torch::mse_loss(outputs, labels); |
| |
| optimizer.zero_grad(); |
| loss.backward(); |
| optimizer.step(); |
| |
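| // Exponential moving average of the loss; training stops once it drops |
| // below 1e-2. |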
| running_loss = running_loss * 0.99 + loss.item<float>() * 0.01; |
| if (epoch > max_epoch) { |
| return false; |
| } |
| epoch++; |
| } |
| return true; |
| } |
| |
| void check_lstm_sizes(RNNOutput output) { |
| // Expect the LSTM to have 64 hidden units and 3 layers, applied to an |
| // input with sequence length 10 and batch size 16 (10 x 16 x n). |
| |
| ASSERT_EQ(output.output.ndimension(), 3); |
| ASSERT_EQ(output.output.size(0), 10); |
| ASSERT_EQ(output.output.size(1), 16); |
| ASSERT_EQ(output.output.size(2), 64); |
| |
| ASSERT_EQ(output.state.ndimension(), 4); |
| ASSERT_EQ(output.state.size(0), 2); // (hx, cx) |
| ASSERT_EQ(output.state.size(1), 3); // layers |
| ASSERT_EQ(output.state.size(2), 16); // batch size |
| ASSERT_EQ(output.state.size(3), 64); // 64 hidden dims |
| |
| // The hidden state should be non-zero |
| ASSERT_GT(output.state.norm().item<float>(), 0); |
| } |
| |
| struct RNNTest : torch::test::SeedingFixture {}; |
| |
| TEST_F(RNNTest, CheckOutputSizes) { |
| LSTM model(LSTMOptions(128, 64).layers(3).dropout(0.2)); |
| // Input size is: sequence length, batch size, input size |
| auto x = torch::randn({10, 16, 128}, torch::requires_grad()); |
| auto output = model->forward(x); |
| auto y = x.mean(); |
| |
| y.backward(); |
| check_lstm_sizes(output); |
| |
| auto next = model->forward(x, output.state); |
| |
| check_lstm_sizes(next); |
| |
| torch::Tensor diff = next.state - output.state; |
| |
| // The hidden state should have changed |
| ASSERT_GT(diff.abs().sum().item<float>(), 1e-3); |
| } |
| |
| TEST_F(RNNTest, CheckOutputValuesMatchPyTorch) { |
| torch::manual_seed(0); |
| // Make sure the outputs match the reference PyTorch (Python) outputs |
| LSTM model(2, 2); |
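| // Deterministically fill every parameter with i / numel so the outputs are |
| // reproducible and comparable against the reference values below. |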
| for (auto& v : model->parameters()) { |
| float size = v.numel(); |
| auto p = static_cast<float*>(v.storage().data()); |
| for (size_t i = 0; i < size; i++) { |
| p[i] = i / size; |
| } |
| } |
| |
| auto x = torch::empty({3, 4, 2}, torch::requires_grad()); |
| float size = x.numel(); |
| auto p = static_cast<float*>(x.storage().data()); |
| for (size_t i = 0; i < size; i++) { |
| p[i] = (size - i) / size; |
| } |
| |
| auto out = model->forward(x); |
| ASSERT_EQ(out.output.ndimension(), 3); |
| ASSERT_EQ(out.output.size(0), 3); |
| ASSERT_EQ(out.output.size(1), 4); |
| ASSERT_EQ(out.output.size(2), 2); |
| |
| auto flat = out.output.view(3 * 4 * 2); |
| float c_out[] = {0.4391, 0.5402, 0.4330, 0.5324, 0.4261, 0.5239, |
| 0.4183, 0.5147, 0.6822, 0.8064, 0.6726, 0.7968, |
| 0.6620, 0.7860, 0.6501, 0.7741, 0.7889, 0.9003, |
| 0.7769, 0.8905, 0.7635, 0.8794, 0.7484, 0.8666}; |
| for (size_t i = 0; i < 3 * 4 * 2; i++) { |
| ASSERT_LT(std::abs(flat[i].item<float>() - c_out[i]), 1e-3); |
| } |
| |
| ASSERT_EQ(out.state.ndimension(), 4); // (hx, cx) x layers x B x 2 |
| ASSERT_EQ(out.state.size(0), 2); |
| ASSERT_EQ(out.state.size(1), 1); |
| ASSERT_EQ(out.state.size(2), 4); |
| ASSERT_EQ(out.state.size(3), 2); |
| flat = out.state.view(16); |
| float h_out[] = {0.7889, |
| 0.9003, |
| 0.7769, |
| 0.8905, |
| 0.7635, |
| 0.8794, |
| 0.7484, |
| 0.8666, |
| 1.1647, |
| 1.6106, |
| 1.1425, |
| 1.5726, |
| 1.1187, |
| 1.5329, |
| 1.0931, |
| 1.4911}; |
| for (size_t i = 0; i < 16; i++) { |
| ASSERT_LT(std::abs(flat[i].item<float>() - h_out[i]), 1e-3); |
| } |
| } |
| |
| TEST_F(RNNTest, EndToEndLSTM) { |
| ASSERT_TRUE(test_RNN_xor<LSTM>( |
| [](int s) { return LSTM(LSTMOptions(s, s).layers(2)); })); |
| } |
| |
| TEST_F(RNNTest, EndToEndGRU) { |
| ASSERT_TRUE( |
| test_RNN_xor<GRU>([](int s) { return GRU(GRUOptions(s, s).layers(2)); })); |
| } |
| |
| TEST_F(RNNTest, EndToEndRNNRelu) { |
| ASSERT_TRUE(test_RNN_xor<RNN>( |
| [](int s) { return RNN(RNNOptions(s, s).relu().layers(2)); })); |
| } |
| |
| TEST_F(RNNTest, EndToEndRNNTanh) { |
| ASSERT_TRUE(test_RNN_xor<RNN>( |
| [](int s) { return RNN(RNNOptions(s, s).tanh().layers(2)); })); |
| } |
| |
| TEST_F(RNNTest, Sizes_CUDA) { |
| torch::manual_seed(0); |
| LSTM model(LSTMOptions(128, 64).layers(3).dropout(0.2)); |
| model->to(torch::kCUDA); |
| auto x = |
| torch::randn({10, 16, 128}, torch::requires_grad().device(torch::kCUDA)); |
| auto output = model->forward(x); |
| auto y = x.mean(); |
| |
| y.backward(); |
| check_lstm_sizes(output); |
| |
| auto next = model->forward(x, output.state); |
| |
| check_lstm_sizes(next); |
| |
| torch::Tensor diff = next.state - output.state; |
| |
| // The hidden state should have changed |
| ASSERT_GT(diff.abs().sum().item<float>(), 1e-3); |
| } |
| |
| TEST_F(RNNTest, EndToEndLSTM_CUDA) { |
| ASSERT_TRUE(test_RNN_xor<LSTM>( |
| [](int s) { return LSTM(LSTMOptions(s, s).layers(2)); }, true)); |
| } |
| |
| TEST_F(RNNTest, EndToEndGRU_CUDA) { |
| ASSERT_TRUE(test_RNN_xor<GRU>( |
| [](int s) { return GRU(GRUOptions(s, s).layers(2)); }, true)); |
| } |
| |
| TEST_F(RNNTest, EndToEndRNNRelu_CUDA) { |
| ASSERT_TRUE(test_RNN_xor<RNN>( |
| [](int s) { return RNN(RNNOptions(s, s).relu().layers(2)); }, true)); |
| } |
| |
| TEST_F(RNNTest, EndToEndRNNTanh_CUDA) { |
| ASSERT_TRUE(test_RNN_xor<RNN>( |
| [](int s) { return RNN(RNNOptions(s, s).tanh().layers(2)); }, true)); |
| } |
| |
| TEST_F(RNNTest, PrettyPrintRNNs) { |
| ASSERT_EQ( |
| c10::str(LSTM(LSTMOptions(128, 64).layers(3).dropout(0.2))), |
| "torch::nn::LSTM(input_size=128, hidden_size=64, layers=3, dropout=0.2)"); |
| ASSERT_EQ( |
| c10::str(GRU(GRUOptions(128, 64).layers(3).dropout(0.5))), |
| "torch::nn::GRU(input_size=128, hidden_size=64, layers=3, dropout=0.5)"); |
| ASSERT_EQ( |
| c10::str(RNN(RNNOptions(128, 64).layers(3).dropout(0.2).tanh())), |
| "torch::nn::RNN(input_size=128, hidden_size=64, layers=3, dropout=0.2, activation=tanh)"); |
| } |
| |
| // This test ensures that flatten_parameters does not crash |
| // when bidirectional is set to true. |
| // https://github.com/pytorch/pytorch/issues/19545 |
| TEST_F(RNNTest, BidirectionalFlattenParameters) { |
| GRU gru(GRUOptions(100, 256).layers(2).bidirectional(true)); |
| gru->flatten_parameters(); |
| } |
| |
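| // Copies the input-hidden and hidden-hidden weights and biases of |
| // layer/direction slot `s_i` of `source` into slot `t_i` of `target`, |
| // without recording gradients. |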
| template <typename Impl> |
| void copyParameters(torch::nn::ModuleHolder<Impl>& target, size_t t_i, |
| const torch::nn::ModuleHolder<Impl>& source, size_t s_i) { |
| at::NoGradGuard guard; |
| target->w_ih[t_i].copy_(source->w_ih[s_i]); |
| target->w_hh[t_i].copy_(source->w_hh[s_i]); |
| target->b_ih[t_i].copy_(source->b_ih[s_i]); |
| target->b_hh[t_i].copy_(source->b_hh[s_i]); |
| } |
| |
| // This test is a port of the Python code introduced here: |
| // https://towardsdatascience.com/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66 |
| // The reverse pass of a bidirectional GRU should behave like the forward |
| // pass of a unidirectional GRU fed the reversed input. |
| void BidirectionalGRUReverseForward(bool cuda) { |
| auto opt = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false) |
| .device(cuda ? torch::kCUDA : torch::kCPU); |
| auto input = torch::tensor({1, 2, 3, 4, 5}, opt).reshape({5, 1, 1}); |
| auto input_reversed = torch::tensor({5, 4, 3, 2, 1}, opt).reshape({5, 1, 1}); |
| |
| auto gru_options = GRUOptions(1, 1).layers(1).batch_first(false); |
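| // Construct one bidirectional GRU and one unidirectional GRU from the same |
| // base options. |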
| GRU bi_grus {gru_options.bidirectional(true)}; |
| GRU reverse_gru {gru_options.bidirectional(false)}; |
| |
| if (cuda) { |
| bi_grus->to(torch::kCUDA); |
| reverse_gru->to(torch::kCUDA); |
| } |
| |
| // Make sure the weights of the unidirectional GRU match those of the |
| // reverse direction of the bidirectional GRU: |
| copyParameters(reverse_gru, 0, bi_grus, 1); |
| |
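| // Feed the original sequence to the bidirectional GRU and the reversed |
| // sequence to the unidirectional GRU. |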
| auto bi_output = bi_grus->forward(input); |
| auto reverse_output = reverse_gru->forward(input_reversed); |
| |
| if (cuda) { |
| bi_output.output = bi_output.output.to(torch::kCPU); |
| bi_output.state = bi_output.state.to(torch::kCPU); |
| reverse_output.output = reverse_output.output.to(torch::kCPU); |
| reverse_output.state = reverse_output.state.to(torch::kCPU); |
| } |
| |
| ASSERT_EQ(bi_output.output.size(0), reverse_output.output.size(0)); |
| auto size = bi_output.output.size(0); |
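| // Channel 1 of the bidirectional output holds the reverse direction; read |
| // back to front, it should match channel 0 of the unidirectional output. |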
| for (int i = 0; i < size; i++) { |
| ASSERT_EQ(bi_output.output[i][0][1].item<float>(), |
| reverse_output.output[size - 1 - i][0][0].item<float>()); |
| } |
| // The hidden state of the reverse direction sits at the odd indices |
| // of the first dimension. |
| ASSERT_EQ(bi_output.state[1][0][0].item<float>(), |
| reverse_output.state[0][0][0].item<float>()); |
| } |
| |
| TEST_F(RNNTest, BidirectionalGRUReverseForward) { |
| BidirectionalGRUReverseForward(false); |
| } |
| |
| TEST_F(RNNTest, BidirectionalGRUReverseForward_CUDA) { |
| BidirectionalGRUReverseForward(true); |
| } |
| |
| // The reverse pass of a bidirectional LSTM should behave like the forward |
| // pass of a unidirectional LSTM fed the reversed input. |
| void BidirectionalLSTMReverseForwardTest(bool cuda) { |
| auto opt = torch::TensorOptions().dtype(torch::kFloat32).requires_grad(false) |
| .device(cuda ? torch::kCUDA : torch::kCPU); |
| auto input = torch::tensor({1, 2, 3, 4, 5}, opt).reshape({5, 1, 1}); |
| auto input_reversed = torch::tensor({5, 4, 3, 2, 1}, opt).reshape({5, 1, 1}); |
| |
| auto lstm_opt = LSTMOptions(1, 1).layers(1).batch_first(false); |
| |
| LSTM bi_lstm {lstm_opt.bidirectional(true)}; |
| LSTM reverse_lstm {lstm_opt.bidirectional(false)}; |
| |
| if (cuda) { |
| bi_lstm->to(torch::kCUDA); |
| reverse_lstm->to(torch::kCUDA); |
| } |
| |
| // Make sure the weights of the unidirectional LSTM match those of the |
| // reverse direction of the bidirectional LSTM: |
| copyParameters(reverse_lstm, 0, bi_lstm, 1); |
| |
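| // Feed the original sequence to the bidirectional LSTM and the reversed |
| // sequence to the unidirectional LSTM. |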
| auto bi_output = bi_lstm->forward(input); |
| auto reverse_output = reverse_lstm->forward(input_reversed); |
| |
| if (cuda) { |
| bi_output.output = bi_output.output.to(torch::kCPU); |
| bi_output.state = bi_output.state.to(torch::kCPU); |
| reverse_output.output = reverse_output.output.to(torch::kCPU); |
| reverse_output.state = reverse_output.state.to(torch::kCPU); |
| } |
| |
| ASSERT_EQ(bi_output.output.size(0), reverse_output.output.size(0)); |
| auto size = bi_output.output.size(0); |
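| // Channel 1 of the bidirectional output holds the reverse direction; read |
| // back to front, it should match channel 0 of the unidirectional output. |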
| for (int i = 0; i < size; i++) { |
| ASSERT_EQ(bi_output.output[i][0][1].item<float>(), |
| reverse_output.output[size - 1 - i][0][0].item<float>()); |
| } |
| // The hidden and cell states of the reverse direction sit at the odd |
| // indices of the second dimension. |
| ASSERT_EQ(bi_output.state[0][1][0][0].item<float>(), |
| reverse_output.state[0][0][0][0].item<float>()); |
| ASSERT_EQ(bi_output.state[1][1][0][0].item<float>(), |
| reverse_output.state[1][0][0][0].item<float>()); |
| } |
| |
| TEST_F(RNNTest, BidirectionalLSTMReverseForward) { |
| BidirectionalLSTMReverseForwardTest(false); |
| } |
| |
| TEST_F(RNNTest, BidirectionalLSTMReverseForward_CUDA) { |
| BidirectionalLSTMReverseForwardTest(true); |
| } |
| |
| TEST_F(RNNTest, BidirectionalMultilayerGRU_CPU_vs_CUDA) { |
| // Create two GRUs with the same options |
| auto opt = GRUOptions(2, 4).layers(3).batch_first(false).bidirectional(true); |
| GRU gru_cpu {opt}; |
| GRU gru_cuda {opt}; |
| |
| // Copy weights and biases from CPU GRU to CUDA GRU |
| { |
| at::NoGradGuard guard; |
| const auto num_directions = gru_cpu->options.bidirectional() ? 2 : 1; |
| for (int64_t layer = 0; layer < gru_cpu->options.layers(); layer++) { |
| for (auto direction = 0; direction < num_directions; direction++) { |
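| // Flat parameter slots are ordered as layer * num_directions + direction. |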
| const auto layer_idx = (layer * num_directions) + direction; |
| copyParameters(gru_cuda, layer_idx, gru_cpu, layer_idx); |
| } |
| } |
| } |
| |
| gru_cpu->flatten_parameters(); |
| gru_cuda->flatten_parameters(); |
| |
| // Move GRU to CUDA |
| gru_cuda->to(torch::kCUDA); |
| |
| // Create the same inputs |
| auto input_opt = torch::TensorOptions() |
| .dtype(torch::kFloat32).requires_grad(false); |
| auto input_cpu = torch::tensor({1, 2, 3, 4, 5, 6}, input_opt) |
| .reshape({3, 1, 2}); |
| auto input_cuda = torch::tensor({1, 2, 3, 4, 5, 6}, input_opt) |
| .reshape({3, 1, 2}).to(torch::kCUDA); |
| |
| // Call forward on both GRUs |
| auto output_cpu = gru_cpu->forward(input_cpu); |
| auto output_cuda = gru_cuda->forward(input_cuda); |
| |
| // Move the CUDA outputs back to the CPU for comparison |
| output_cuda.output = output_cuda.output.to(torch::kCPU); |
| output_cuda.state = output_cuda.state.to(torch::kCPU); |
| |
| // Assert that the outputs have the same shape and values on CPU and CUDA |
| ASSERT_EQ(output_cpu.output.dim(), output_cuda.output.dim()); |
| for (int i = 0; i < output_cpu.output.dim(); i++) { |
| ASSERT_EQ(output_cpu.output.size(i), output_cuda.output.size(i)); |
| } |
| for (int i = 0; i < output_cpu.output.size(0); i++) { |
| for (int j = 0; j < output_cpu.output.size(1); j++) { |
| for (int k = 0; k < output_cpu.output.size(2); k++) { |
| ASSERT_NEAR( |
| output_cpu.output[i][j][k].item<float>(), |
| output_cuda.output[i][j][k].item<float>(), 1e-5); |
| } |
| } |
| } |
| } |
| |
| TEST_F(RNNTest, BidirectionalMultilayerLSTM_CPU_vs_CUDA) { |
| // Create two LSTMs with the same options |
| auto opt = LSTMOptions(2, 4).layers(3).batch_first(false).bidirectional(true); |
| LSTM lstm_cpu {opt}; |
| LSTM lstm_cuda {opt}; |
| |
| // Copy weights and biases from CPU LSTM to CUDA LSTM |
| { |
| at::NoGradGuard guard; |
| const auto num_directions = lstm_cpu->options.bidirectional() ? 2 : 1; |
| for (int64_t layer = 0; layer < lstm_cpu->options.layers(); layer++) { |
| for (auto direction = 0; direction < num_directions; direction++) { |
| const auto layer_idx = (layer * num_directions) + direction; |
| copyParameters(lstm_cuda, layer_idx, lstm_cpu, layer_idx); |
| } |
| } |
| } |
| |
| lstm_cpu->flatten_parameters(); |
| lstm_cuda->flatten_parameters(); |
| |
| // Move LSTM to CUDA |
| lstm_cuda->to(torch::kCUDA); |
| |
| auto options = torch::TensorOptions() |
| .dtype(torch::kFloat32).requires_grad(false); |
| auto input_cpu = torch::tensor({1, 2, 3, 4, 5, 6}, options) |
| .reshape({3, 1, 2}); |
| auto input_cuda = torch::tensor({1, 2, 3, 4, 5, 6}, options) |
| .reshape({3, 1, 2}).to(torch::kCUDA); |
| |
| // Call forward on both LSTMs |
| auto output_cpu = lstm_cpu->forward(input_cpu); |
| auto output_cuda = lstm_cuda->forward(input_cuda); |
| |
| // Move the CUDA outputs back to the CPU for comparison |
| output_cuda.output = output_cuda.output.to(torch::kCPU); |
| output_cuda.state = output_cuda.state.to(torch::kCPU); |
| |
| // Assert that the outputs have the same shape and values on CPU and CUDA |
| ASSERT_EQ(output_cpu.output.dim(), output_cuda.output.dim()); |
| for (int i = 0; i < output_cpu.output.dim(); i++) { |
| ASSERT_EQ(output_cpu.output.size(i), output_cuda.output.size(i)); |
| } |
| for (int i = 0; i < output_cpu.output.size(0); i++) { |
| for (int j = 0; j < output_cpu.output.size(1); j++) { |
| for (int k = 0; k < output_cpu.output.size(2); k++) { |
| ASSERT_NEAR( |
| output_cpu.output[i][j][k].item<float>(), |
| output_cuda.output[i][j][k].item<float>(), 1e-5); |
| } |
| } |
| } |
| } |