/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <map>
#include <typeindex>
#include <variant>
#include <vector>

#include <executorch/kernels/test/FunctionHeaderWrapper.h> // Declares the operator
#include <executorch/kernels/test/TestUtil.h>
#include <executorch/kernels/test/supported_features.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>

#include <gtest/gtest.h>

using namespace ::testing;
using exec_aten::MemoryFormat;
using exec_aten::optional;
using exec_aten::ScalarType;
using exec_aten::Tensor;
using torch::executor::testing::TensorFactory;

// To further verify the accuracy of op_to, we test conversions from
// floating-point types to signed integer types against test cases generated
// by core PyTorch. The data is randomly generated in [-5, 5].

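// A minimal sketch (illustrative only, not used by the tests) of the cast
// being verified, with C++ static_cast standing in for PyTorch's conversion,
// which truncates toward zero for float -> int:
//
//   float x = -4.59277725f;              // sampled from [-5, 5]
//   int32_t y = static_cast<int32_t>(x); // y == -4
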
// clang-format off
typedef std::map<
    std::type_index,
    std::variant<
        std::vector<float>,
        std::vector<double>>>
    FloatingTypeToDataMap;

typedef std::map<
    std::type_index,
    std::variant<
        std::vector<int64_t>,
        std::vector<int32_t>,
        std::vector<int16_t>,
        std::vector<int8_t>,
        std::vector<uint8_t>>>
    IntTypeToDataMap;
// clang-format on
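
// These typeid-keyed maps let a single templated test runner pick the
// matching reference vector at runtime. A minimal usage sketch with
// hypothetical values:
//
//   FloatingTypeToDataMap data;
//   data[typeid(float)] = std::vector<float>{1.5f, -2.5f};
//   auto& v = std::get<std::vector<float>>(data[typeid(float)]);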

class OpToTest : public OperatorTest {
 protected:
  Tensor& op_to_copy_out(
      const Tensor& self,
      bool non_blocking,
      optional<MemoryFormat> memory_format,
      Tensor& out) {
    return torch::executor::aten::_to_copy_outf(
        context_, self, non_blocking, memory_format, out);
  }

  // Cast a vector of INPUT_CTYPE values to a vector of OUTPUT_CTYPE values.
  template <typename INPUT_CTYPE, typename OUTPUT_CTYPE>
  std::vector<OUTPUT_CTYPE> vector_type_cast(
      const std::vector<INPUT_CTYPE>& input) {
    std::vector<OUTPUT_CTYPE> output(input.size());
    std::transform(
        input.begin(), input.end(), output.begin(), [](INPUT_CTYPE x) {
          return static_cast<OUTPUT_CTYPE>(x);
        });
    return output;
  }

  template <typename INPUT_CTYPE, typename OUTPUT_CTYPE>
  struct ToTestCase {
    const std::vector<int32_t> sizes;
    const std::vector<INPUT_CTYPE> data_in;
    const std::vector<OUTPUT_CTYPE> data_out;
  };

  // Each test exercises a different combination of input and output types, so
  // creating templated test cases and custom data types for both the input
  // and the output data would get messy. Instead, every test case stores its
  // data as double, and we cast it to the desired type just before handing it
  // to the tf.make function. In our experiments, core PyTorch's type casting
  // matches C++ static_cast within the representable range, so we consider
  // static_cast a reasonable way to generate the ground truth.
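  // For example, with INPUT_CTYPE = float and OUTPUT_CTYPE = int32_t, a
  // stored value of 2.11 becomes static_cast<float>(2.11) == 2.11f on the
  // input side, and static_cast<int32_t>(2.11f) == 2 as the expected output.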
  template <
      typename INPUT_CTYPE,
      ScalarType INPUT_DTYPE,
      typename OUTPUT_CTYPE,
      ScalarType OUTPUT_DTYPE>
  void test_runner_static_cast(
      std::vector<ToTestCase<double, double>> test_cases) {
    TensorFactory<INPUT_DTYPE> tf_in;
    TensorFactory<OUTPUT_DTYPE> tf_out;

    for (const auto& test_case : test_cases) {
      auto data_in = vector_type_cast<double, INPUT_CTYPE>(test_case.data_in);
      auto data_out = vector_type_cast<INPUT_CTYPE, OUTPUT_CTYPE>(data_in);

      Tensor input = tf_in.make(test_case.sizes, data_in);
      Tensor output = tf_out.zeros_like(input);

      Tensor ret = op_to_copy_out(
          /*self=*/input,
          /*non_blocking=*/false,
          exec_aten::MemoryFormat::Contiguous,
          output);

      Tensor expected = tf_out.make(test_case.sizes, data_out);

      // The return value of op_to_copy_out should hold the same values as the
      // out argument and match the expected result.
      EXPECT_TENSOR_EQ(ret, output);
      EXPECT_TENSOR_EQ(ret, expected);
    }
  }

  template <typename INPUT_CTYPE, ScalarType INPUT_DTYPE>
  void test_runner_to_bool(
      std::vector<double> test_case,
      std::vector<uint8_t> data_out) {
    TensorFactory<INPUT_DTYPE> tf_in;
    TensorFactory<ScalarType::Bool> tf_out;

    auto data_in = vector_type_cast<double, INPUT_CTYPE>(test_case);

    Tensor input = tf_in.make({(int)test_case.size()}, data_in);
    Tensor output = tf_out.zeros_like(input);

    Tensor ret = op_to_copy_out(
        /*self=*/input,
        /*non_blocking=*/false,
        exec_aten::MemoryFormat::Contiguous,
        output);

    Tensor expected = tf_out.make({(int)data_out.size()}, data_out);

    // The return value of op_to_copy_out should hold the same values that
    // were written to output.
    EXPECT_TENSOR_EQ(ret, output);
    // The return value of op_to_copy_out should match the reference values
    // in expected.
    EXPECT_TENSOR_EQ(ret, expected);
  }

  template <typename OUT_CTYPE, ScalarType OUT_DTYPE>
  void test_runner_from_bool(
      std::vector<uint8_t> test_case,
      std::vector<double> out) {
    TensorFactory<ScalarType::Bool> tf_in;
    TensorFactory<OUT_DTYPE> tf_out;

    auto data_out = vector_type_cast<double, OUT_CTYPE>(out);

    Tensor input = tf_in.make({(int)test_case.size()}, test_case);
    Tensor output = tf_out.zeros_like(input);

    Tensor ret = op_to_copy_out(
        /*self=*/input,
        /*non_blocking=*/false,
        exec_aten::MemoryFormat::Contiguous,
        output);

    Tensor expected = tf_out.make({(int)data_out.size()}, data_out);

    // The return value of op_to_copy_out should hold the same values that
    // were written to output.
    EXPECT_TENSOR_EQ(ret, output);
    // The return value of op_to_copy_out should match the reference values
    // in expected.
    EXPECT_TENSOR_EQ(ret, expected);
  }

  template <
      typename INPUT_CTYPE,
      ScalarType INPUT_DTYPE,
      typename OUTPUT_CTYPE,
      ScalarType OUTPUT_DTYPE>
  void test_runner_hardcode_data(
      FloatingTypeToDataMap floating_point_data,
      IntTypeToDataMap int_data) {
    TensorFactory<INPUT_DTYPE> tf_in;
    TensorFactory<OUTPUT_DTYPE> tf_out;

    if (typeid(OUTPUT_CTYPE) == typeid(uint8_t)) {
      // The hardcoded data contains negative values, which cannot be
      // represented in uint8_t, so skip unsigned outputs.
      return;
    }

    ToTestCase<INPUT_CTYPE, OUTPUT_CTYPE> test_case = {
        /*sizes=*/{3, 5}, /*data_in=*/
        std::get<std::vector<INPUT_CTYPE>>(
            floating_point_data[typeid(INPUT_CTYPE)]),
        /*data_out=*/
        std::get<std::vector<OUTPUT_CTYPE>>(int_data[typeid(OUTPUT_CTYPE)])};

    Tensor input = tf_in.make(test_case.sizes, test_case.data_in);
    Tensor output = tf_out.zeros_like(input);

    Tensor ret = op_to_copy_out(
        /*self=*/input,
        /*non_blocking=*/false,
        exec_aten::MemoryFormat::Contiguous,
        output);

    Tensor expected = tf_out.make(test_case.sizes, test_case.data_out);

    // The return value of op_to_copy_out should hold the same values as the
    // out argument and match the expected result.
    EXPECT_TENSOR_EQ(ret, output);
    EXPECT_TENSOR_EQ(ret, expected);
  }

/* %python
import torch
torch.manual_seed(0)
x = torch.rand(2, 3)
res = x.to(non_blocking = False, memory_format = torch.preserve_format)
op = "op_to_copy_out"
opt_setup_params = """
bool non_blocking = false;
optional<MemoryFormat> memory_format;
"""
opt_extra_params = "non_blocking, memory_format,"
out_args = "out_shape, dynamism"
dtype = "ScalarType::Float"
check = "EXPECT_TENSOR_EQ" */

  void test_dynamic_shape(
      const std::vector<int32_t>& out_shape,
      enum torch::executor::TensorShapeDynamism dynamism) {
    /* %python
    %rewrite(unary_op) */

    TensorFactory<ScalarType::Float> tf;

    Tensor x = tf.make(
        {2, 3},
        {0.49625658988952637,
         0.7682217955589294,
         0.08847743272781372,
         0.13203048706054688,
         0.30742281675338745,
         0.6340786814689636});
    Tensor expected = tf.make(
        {2, 3},
        {0.49625658988952637,
         0.7682217955589294,
         0.08847743272781372,
         0.13203048706054688,
         0.30742281675338745,
         0.6340786814689636});

    bool non_blocking = false;
    optional<MemoryFormat> memory_format;

    Tensor out = tf.zeros(out_shape, dynamism);
    op_to_copy_out(x, non_blocking, memory_format, out);
    EXPECT_TENSOR_EQ(out, expected);
  }
};

/* We deliberately do not implement or test the behavior of casting a value
 * that cannot be represented in the target type (e.g. inf to int32_t, nan to
 * int64_t, or 2147483648 to int32_t), because:
 * - a. The result of such a cast is undefined according to the C++ standard;
 * - b. Core PyTorch defines no explicit rules for such conversions (they do
 *      not match static_cast or any other C++ casting function);
 * - c. Users who cast an unrepresentable value to a given type do so at their
 *      own risk;
 * - d. Even though we could cover these boundary cases with if/switch
 *      statements, the code would be lengthy and jumbled. Writing convoluted
 *      code to pin down undefined behavior is not worthwhile, and we could
 *      never cover every such case.
 */

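// Illustrative examples (not part of the tests) of casts whose results are
// undefined in C++ because the value is outside the destination type's range:
//
//   static_cast<int32_t>(std::numeric_limits<float>::infinity());
//   static_cast<int64_t>(std::nan(""));
//   static_cast<int32_t>(2147483648.0); // 2^31 is not representable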
// Regular test for to_copy.out.
// Verify that to_copy.out works across all pairs of real input and output
// dtypes.
TEST_F(OpToTest, AllDtypesSupported) {
  std::vector<ToTestCase<double, double>> test_cases = {
      {
          /*sizes=*/{2, 4}, /*data_in=*/
          {2.11, 3.2, 2.3, 4.0, 1.1, 5.2, 1.1, 6.3}, /*data_out=*/
          {}, // data_out is not used by test_runner_static_cast
      },
      {
          /*sizes=*/{3, 4, 0, 5},
          /*data_in=*/{},
          /*data_out=*/{},
      },
      {
          /*sizes=*/{},
          /*data_in=*/{10.0},
          /*data_out=*/{}, // data_out is not used by
                           // test_runner_static_cast
      },
  };

#define TEST_KERNEL(INPUT_CTYPE, INPUT_DTYPE, OUTPUT_CTYPE, OUTPUT_DTYPE) \
  test_runner_static_cast<                                                \
      INPUT_CTYPE,                                                        \
      ScalarType::INPUT_DTYPE,                                            \
      OUTPUT_CTYPE,                                                       \
      ScalarType::OUTPUT_DTYPE>(test_cases);

#define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \
  ET_FORALL_REAL_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL);

  ET_FORALL_REAL_TYPES(TEST_ENTRY);

#undef TEST_ENTRY
#undef TEST_KERNEL
}

TEST_F(OpToTest, BoolTests) {
  std::vector<double> test_case_to_bool = {1.1, 2.2, 0};
  std::vector<uint8_t> result_to_bool = {true, true, false};
#define TEST_TO_BOOL(INPUT_CTYPE, INPUT_DTYPE)               \
  test_runner_to_bool<INPUT_CTYPE, ScalarType::INPUT_DTYPE>( \
      test_case_to_bool, result_to_bool);
  ET_FORALL_REAL_TYPES(TEST_TO_BOOL);
#undef TEST_TO_BOOL

  std::vector<uint8_t> test_case_from_bool = {true, true, false};
  std::vector<double> result_from_bool = {1.0, 1.0, 0};
#define TEST_FROM_BOOL(OUTPUT_CTYPE, OUTPUT_DTYPE)               \
  test_runner_from_bool<OUTPUT_CTYPE, ScalarType::OUTPUT_DTYPE>( \
      test_case_from_bool, result_from_bool);
  ET_FORALL_REAL_TYPES(TEST_FROM_BOOL);
#undef TEST_FROM_BOOL
}

TEST_F(OpToTest, NanInfSupported) {
  constexpr auto floatInfinity = std::numeric_limits<float>::infinity();
  std::vector<ToTestCase<double, double>> test_cases = {{
      /*sizes=*/{2, 4},
      /*data_in=*/{2, 3, NAN, 4, floatInfinity, 5, -floatInfinity, 6},
      /*data_out=*/{2, 3, NAN, 4, floatInfinity, 5, -floatInfinity, 6},
  }};

#define TEST_KERNEL(INPUT_CTYPE, INPUT_DTYPE, OUTPUT_CTYPE, OUTPUT_DTYPE) \
  test_runner_static_cast<                                                \
      INPUT_CTYPE,                                                        \
      ScalarType::INPUT_DTYPE,                                            \
      OUTPUT_CTYPE,                                                       \
      ScalarType::OUTPUT_DTYPE>(test_cases);

#define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \
  ET_FORALL_FLOAT_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL);

  ET_FORALL_FLOAT_TYPES(TEST_ENTRY);

#undef TEST_ENTRY
#undef TEST_KERNEL
}

TEST_F(OpToTest, HardcodeFloatConvertInt) {
  // Hardcoded input and output generated from core PyTorch.
  // clang-format off
  std::vector<float> float_data = {
      -1.47900056838989257812, -4.59277725219726562500,
      2.15365791320800781250, -2.55494546890258789062,
      3.06999135017395019531, 3.27460670471191406250,
      -3.98865103721618652344, -4.81065988540649414062,
      3.67902207374572753906, 3.72226405143737792969,
      0.80567771196365356445, 2.23788332939147949219,
      -0.52035576105117797852, -1.58493483066558837891,
      -0.30919688940048217773};

  std::vector<double> double_data = {
      -1.47900053955270172068, -4.59277735274143061872,
      2.15365796963871947156, -2.55494554556038755422,
      3.06999137834642255029, 3.27460679459944969949,
      -3.98865109243288795682, -4.81065977167646074975,
      3.67902198302105531980, 3.72226414774102742911,
      0.80567768667100203572, 2.23788335717029518435,
      -0.52035578832931150828, -1.58493480710766210251,
      -0.30919688936285893988};
  // clang-format on

  std::vector<int64_t> int64_data = {
      -1, -4, 2, -2, 3, 3, -3, -4, 3, 3, 0, 2, 0, -1, 0};
  std::vector<int32_t> int32_data = {
      -1, -4, 2, -2, 3, 3, -3, -4, 3, 3, 0, 2, 0, -1, 0};
  std::vector<int16_t> int16_data = {
      -1, -4, 2, -2, 3, 3, -3, -4, 3, 3, 0, 2, 0, -1, 0};
  std::vector<int8_t> int8_data = {
      -1, -4, 2, -2, 3, 3, -3, -4, 3, 3, 0, 2, 0, -1, 0};

  // Gather all floating-point data together for easier traversal.
  FloatingTypeToDataMap floating_point_data;
  floating_point_data[typeid(float)] = float_data;
  floating_point_data[typeid(double)] = double_data;

  // Gather all integer data together for easier traversal.
  IntTypeToDataMap int_data;
  int_data[typeid(int64_t)] = int64_data;
  int_data[typeid(int32_t)] = int32_data;
  int_data[typeid(int16_t)] = int16_data;
  int_data[typeid(int8_t)] = int8_data;

#define TEST_KERNEL(INPUT_CTYPE, INPUT_DTYPE, OUTPUT_CTYPE, OUTPUT_DTYPE) \
  test_runner_hardcode_data<                                              \
      INPUT_CTYPE,                                                        \
      ScalarType::INPUT_DTYPE,                                            \
      OUTPUT_CTYPE,                                                       \
      ScalarType::OUTPUT_DTYPE>(floating_point_data, int_data);

#define TEST_ENTRY(INPUT_CTYPE, INPUT_DTYPE) \
  ET_FORALL_INT_TYPES_WITH2(INPUT_CTYPE, INPUT_DTYPE, TEST_KERNEL);

  ET_FORALL_FLOAT_TYPES(TEST_ENTRY);

#undef TEST_ENTRY
#undef TEST_KERNEL
}

TEST_F(OpToTest, MismatchedSizesDie) {
  if (torch::executor::testing::SupportedFeatures::get()->is_aten) {
    GTEST_SKIP() << "ATen kernel can handle mismatched sizes";
  }
  TensorFactory<ScalarType::Int> tf;
  Tensor input = tf.make(/*sizes=*/{3, 1, 1, 2}, /*data=*/{1, 2, 3, 4, 5, 6});
  Tensor out = tf.zeros({3, 2, 1, 1});
  ET_EXPECT_KERNEL_FAILURE(
      context_,
      op_to_copy_out(
          input,
          /*non_blocking=*/false,
          exec_aten::MemoryFormat::Contiguous,
          out));
}

// Only contiguous memory is supported; any memory format other than
// MemoryFormat::Contiguous should be rejected. The kernel is expected to
// fail when given an illegal memory format.
TEST_F(OpToTest, MismatchedMemoryFormatDies) {
  if (torch::executor::testing::SupportedFeatures::get()->is_aten) {
    GTEST_SKIP() << "ATen kernel can handle non-contiguous memory formats";
  }
  TensorFactory<ScalarType::Float> tf_in;
  TensorFactory<ScalarType::Float> tf_out;
  Tensor input =
      tf_in.make(/*sizes=*/{3, 1, 1, 2}, /*data=*/{1, 2, 3, 4, 5, 6});
  Tensor out = tf_out.zeros({3, 1, 1, 2});

  ET_EXPECT_KERNEL_FAILURE(
      context_,
      op_to_copy_out(
          input,
          /*non_blocking=*/false,
          static_cast<exec_aten::MemoryFormat>(55),
          out));
  // The memory format may also be null.
  EXPECT_TENSOR_EQ(
      op_to_copy_out(
          input,
          /*non_blocking=*/false,
          /*memory_format=*/exec_aten::nullopt,
          out),
      input);
}

// Only blocking data transfers are supported.
TEST_F(OpToTest, MismatchedBlockingDie) {
  if (torch::executor::testing::SupportedFeatures::get()->is_aten) {
    GTEST_SKIP() << "ATen kernel can handle non-blocking data transfer";
  }
  TensorFactory<ScalarType::Int> tf;
  Tensor input = tf.make(/*sizes=*/{3, 1, 1, 2}, /*data=*/{1, 2, 3, 4, 5, 6});
  Tensor out = tf.zeros(/*sizes=*/{3, 1, 1, 2});
  ET_EXPECT_KERNEL_FAILURE(
      context_,
      op_to_copy_out(
          input,
          /*non_blocking=*/true,
          exec_aten::MemoryFormat::Contiguous,
          out));
}

TEST_F(OpToTest, DynamicShapeUpperBoundSameAsExpected) {
  test_dynamic_shape(
      {2, 3}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND);
}

TEST_F(OpToTest, DynamicShapeUpperBoundLargerThanExpected) {
  test_dynamic_shape(
      {10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND);
}

TEST_F(OpToTest, DynamicShapeUnbound) {
  if (!torch::executor::testing::SupportedFeatures::get()->output_resize) {
    GTEST_SKIP() << "Dynamic shape unbound not supported";
  }
  test_dynamic_shape(
      {1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND);
}