test/test_cpp_extensions.py - platform/external/pytorch - Git at Google

 import os
 import shutil
 import sys
 import unittest
 import warnings

 import common_utils as common
 import torch
 import torch.backends.cudnn
 import torch.utils.cpp_extension
 from torch.utils.cpp_extension import CUDA_HOME


 try:
     import torch_test_cpp_extension.cpp as cpp_extension
 except ImportError:
     warnings.warn(
         "test_cpp_extensions.py cannot be invoked directly. Run "
         "`python run_test.py -i cpp_extensions` instead."
     )


 TEST_CUDA = torch.cuda.is_available() and CUDA_HOME is not None
 TEST_CUDNN = False
 if TEST_CUDA:
     CUDNN_HEADER_EXISTS = os.path.isfile(os.path.join(CUDA_HOME, "include/cudnn.h"))
     TEST_CUDNN = (
         TEST_CUDA and CUDNN_HEADER_EXISTS and torch.backends.cudnn.is_available()
     )
 IS_WINDOWS = sys.platform == "win32"


 # This effectively allows re-using the same extension (compiled once) in
 # multiple tests, just to split up the tested properties.
 def dont_wipe_extensions_build_folder(func):
     func.dont_wipe = True
     return func


 class TestCppExtension(common.TestCase):
     def setUp(self):
         test_name = self.id().split(".")[-1]
         dont_wipe = hasattr(getattr(self, test_name), "dont_wipe")
         if dont_wipe:
             print(
                 "Test case {} has 'dont_wipe' attribute set, ".format(test_name)
                 + "therefore not wiping extensions build folder before running the test"
             )
             return
         if sys.platform == "win32":
             print("Not wiping extensions build folder because Windows")
             return
         default_build_root = torch.utils.cpp_extension.get_default_build_root()
         if os.path.exists(default_build_root):
             shutil.rmtree(default_build_root)

     def test_extension_function(self):
         x = torch.randn(4, 4)
         y = torch.randn(4, 4)
         z = cpp_extension.sigmoid_add(x, y)
         self.assertEqual(z, x.sigmoid() + y.sigmoid())

     def test_extension_module(self):
         mm = cpp_extension.MatrixMultiplier(4, 8)
         weights = torch.rand(8, 4)
         expected = mm.get().mm(weights)
         result = mm.forward(weights)
         self.assertEqual(expected, result)

     def test_backward(self):
         mm = cpp_extension.MatrixMultiplier(4, 8)
         weights = torch.rand(8, 4, requires_grad=True)
         result = mm.forward(weights)
         result.sum().backward()
         tensor = mm.get()

         expected_weights_grad = tensor.t().mm(torch.ones([4, 4]))
         self.assertEqual(weights.grad, expected_weights_grad)

         expected_tensor_grad = torch.ones([4, 4]).mm(weights.t())
         self.assertEqual(tensor.grad, expected_tensor_grad)

     def test_jit_compile_extension(self):
         module = torch.utils.cpp_extension.load(
             name="jit_extension",
             sources=[
                 "cpp_extensions/jit_extension.cpp",
                 "cpp_extensions/jit_extension2.cpp",
             ],
             extra_include_paths=["cpp_extensions"],
             extra_cflags=["-g"],
             verbose=True,
         )
         x = torch.randn(4, 4)
         y = torch.randn(4, 4)

         z = module.tanh_add(x, y)
         self.assertEqual(z, x.tanh() + y.tanh())

         # Checking we can call a method defined not in the main C++ file.
         z = module.exp_add(x, y)
         self.assertEqual(z, x.exp() + y.exp())

         # Checking we can use this JIT-compiled class.
         doubler = module.Doubler(2, 2)
         self.assertIsNone(doubler.get().grad)
         self.assertEqual(doubler.get().sum(), 4)
         self.assertEqual(doubler.forward().sum(), 8)

     @unittest.skipIf(not TEST_CUDA, "CUDA not found")
     def test_cuda_extension(self):
         import torch_test_cpp_extension.cuda as cuda_extension

         x = torch.zeros(100, device="cuda", dtype=torch.float32)
         y = torch.zeros(100, device="cuda", dtype=torch.float32)

         z = cuda_extension.sigmoid_add(x, y).cpu()

         # 2 * sigmoid(0) = 2 * 0.5 = 1
         self.assertEqual(z, torch.ones_like(z))

     @unittest.skipIf(not TEST_CUDA, "CUDA not found")
     def test_jit_cuda_extension(self):
         # NOTE: The name of the extension must equal the name of the module.
         module = torch.utils.cpp_extension.load(
             name="torch_test_cuda_extension",
             sources=[
                 "cpp_extensions/cuda_extension.cpp",
                 "cpp_extensions/cuda_extension.cu",
             ],
             extra_cuda_cflags=["-O2"],
             verbose=True,
         )

         x = torch.zeros(100, device="cuda", dtype=torch.float32)
         y = torch.zeros(100, device="cuda", dtype=torch.float32)

         z = module.sigmoid_add(x, y).cpu()

         # 2 * sigmoid(0) = 2 * 0.5 = 1
         self.assertEqual(z, torch.ones_like(z))

     @unittest.skipIf(not TEST_CUDNN, "CuDNN not found")
     def test_jit_cudnn_extension(self):
         # implementation of CuDNN ReLU
         if IS_WINDOWS:
             extra_ldflags = ["cudnn.lib"]
         else:
             extra_ldflags = ["-lcudnn"]
         module = torch.utils.cpp_extension.load(
             name="torch_test_cudnn_extension",
             sources=["cpp_extensions/cudnn_extension.cpp"],
             extra_ldflags=extra_ldflags,
             verbose=True,
             with_cuda=True,
         )

         x = torch.randn(100, device="cuda", dtype=torch.float32)
         y = torch.zeros(100, device="cuda", dtype=torch.float32)
         module.cudnn_relu(x, y)  # y=relu(x)
         self.assertEqual(torch.nn.functional.relu(x), y)
         with self.assertRaisesRegex(RuntimeError, "same size"):
             y_incorrect = torch.zeros(20, device="cuda", dtype=torch.float32)
             module.cudnn_relu(x, y_incorrect)

     def test_optional(self):
         has_value = cpp_extension.function_taking_optional(torch.ones(5))
         self.assertTrue(has_value)
         has_value = cpp_extension.function_taking_optional(None)
         self.assertFalse(has_value)

     def test_inline_jit_compile_extension_with_functions_as_list(self):
         cpp_source = """
         torch::Tensor tanh_add(torch::Tensor x, torch::Tensor y) {
           return x.tanh() + y.tanh();
         }
         """

         module = torch.utils.cpp_extension.load_inline(
             name="inline_jit_extension_with_functions_list",
             cpp_sources=cpp_source,
             functions="tanh_add",
             verbose=True,
         )

         self.assertEqual(module.tanh_add.__doc__.split("\n")[2], "tanh_add")

         x = torch.randn(4, 4)
         y = torch.randn(4, 4)

         z = module.tanh_add(x, y)
         self.assertEqual(z, x.tanh() + y.tanh())

     def test_inline_jit_compile_extension_with_functions_as_dict(self):
         cpp_source = """
         torch::Tensor tanh_add(torch::Tensor x, torch::Tensor y) {
           return x.tanh() + y.tanh();
         }
         """

         module = torch.utils.cpp_extension.load_inline(
             name="inline_jit_extension_with_functions_dict",
             cpp_sources=cpp_source,
             functions={"tanh_add": "Tanh and then sum :D"},
             verbose=True,
         )

         self.assertEqual(module.tanh_add.__doc__.split("\n")[2], "Tanh and then sum :D")

     def test_inline_jit_compile_extension_multiple_sources_and_no_functions(self):
         cpp_source1 = """
         torch::Tensor sin_add(torch::Tensor x, torch::Tensor y) {
           return x.sin() + y.sin();
         }
         """

         cpp_source2 = """
         #include <torch/extension.h>
         torch::Tensor sin_add(torch::Tensor x, torch::Tensor y);
         PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
           m.def("sin_add", &sin_add, "sin(x) + sin(y)");
         }
         """

         module = torch.utils.cpp_extension.load_inline(
             name="inline_jit_extension",
             cpp_sources=[cpp_source1, cpp_source2],
             verbose=True,
         )

         x = torch.randn(4, 4)
         y = torch.randn(4, 4)

         z = module.sin_add(x, y)
         self.assertEqual(z, x.sin() + y.sin())

     @unittest.skipIf(not TEST_CUDA, "CUDA not found")
     def test_inline_jit_compile_extension_cuda(self):
         cuda_source = """
         __global__ void cos_add_kernel(
             const float* __restrict__ x,
             const float* __restrict__ y,
             float* __restrict__ output,
             const int size) {
           const auto index = blockIdx.x * blockDim.x + threadIdx.x;
           if (index < size) {
             output[index] = __cosf(x[index]) + __cosf(y[index]);
           }
         }

         torch::Tensor cos_add(torch::Tensor x, torch::Tensor y) {
           auto output = torch::zeros_like(x);
           const int threads = 1024;
           const int blocks = (output.numel() + threads - 1) / threads;
           cos_add_kernel<<<blocks, threads>>>(x.data<float>(), y.data<float>(), output.data<float>(), output.numel());
           return output;
         }
         """

         # Here, the C++ source need only declare the function signature.
         cpp_source = "torch::Tensor cos_add(torch::Tensor x, torch::Tensor y);"

         module = torch.utils.cpp_extension.load_inline(
             name="inline_jit_extension_cuda",
             cpp_sources=cpp_source,
             cuda_sources=cuda_source,
             functions=["cos_add"],
             verbose=True,
         )

         self.assertEqual(module.cos_add.__doc__.split("\n")[2], "cos_add")

         x = torch.randn(4, 4, device="cuda", dtype=torch.float32)
         y = torch.randn(4, 4, device="cuda", dtype=torch.float32)

         z = module.cos_add(x, y)
         self.assertEqual(z, x.cos() + y.cos())

     def test_inline_jit_compile_extension_throws_when_functions_is_bad(self):
         with self.assertRaises(ValueError):
             torch.utils.cpp_extension.load_inline(
                 name="invalid_jit_extension", cpp_sources="", functions=5
             )

     def test_lenient_flag_handling_in_jit_extensions(self):
         cpp_source = """
         torch::Tensor tanh_add(torch::Tensor x, torch::Tensor y) {
           return x.tanh() + y.tanh();
         }
         """

         module = torch.utils.cpp_extension.load_inline(
             name="lenient_flag_handling_extension",
             cpp_sources=cpp_source,
             functions="tanh_add",
             extra_cflags=["-g\n\n", "-O0 -Wall"],
             extra_include_paths=["       cpp_extensions\n"],
             verbose=True,
         )

         x = torch.zeros(100, dtype=torch.float32)
         y = torch.zeros(100, dtype=torch.float32)
         z = module.tanh_add(x, y).cpu()
         self.assertEqual(z, x.tanh() + y.tanh())

     def test_complex_registration(self):
         module = torch.utils.cpp_extension.load(
             name="complex_registration_extension",
             sources="cpp_extensions/complex_registration_extension.cpp",
             verbose=True,
         )

         torch.empty(2, 2, dtype=torch.complex64)

     @unittest.skipIf(not TEST_CUDA, "CUDA not found")
     def test_half_support(self):
         """
         Checks for an issue with operator< ambiguity for half when certain
         THC headers are included.

         See https://github.com/pytorch/pytorch/pull/10301#issuecomment-416773333
         for the corresponding issue.
         """
         cuda_source = """
         #include <THC/THCNumerics.cuh>

         template<typename T, typename U>
         __global__ void half_test_kernel(const T* input, U* output) {
             if (input[0] < input[1] || input[0] >= input[1]) {
                 output[0] = 123;
             }
         }

         torch::Tensor half_test(torch::Tensor input) {
             auto output = torch::empty(1, input.options().dtype(torch::kFloat));
             AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "half_test", [&] {
                 half_test_kernel<scalar_t><<<1, 1>>>(
                     input.data<scalar_t>(),
                     output.data<float>());
             });
             return output;
         }
         """

         module = torch.utils.cpp_extension.load_inline(
             name="half_test_extension",
             cpp_sources="torch::Tensor half_test(torch::Tensor input);",
             cuda_sources=cuda_source,
             functions=["half_test"],
             verbose=True,
         )

         x = torch.randn(3, device="cuda", dtype=torch.half)
         result = module.half_test(x)
         self.assertEqual(result[0], 123)

     def test_reload_jit_extension(self):
         def compile(code):
             return torch.utils.cpp_extension.load_inline(
                 name="reloaded_jit_extension",
                 cpp_sources=code,
                 functions="f",
                 verbose=True,
             )

         module = compile("int f() { return 123; }")
         self.assertEqual(module.f(), 123)

         module = compile("int f() { return 456; }")
         self.assertEqual(module.f(), 456)
         module = compile("int f() { return 456; }")
         self.assertEqual(module.f(), 456)

         module = compile("int f() { return 789; }")
         self.assertEqual(module.f(), 789)

     @dont_wipe_extensions_build_folder
     @common.skipIfRocm
     def test_cpp_frontend_module_has_same_output_as_python(self):
         extension = torch.utils.cpp_extension.load(
             name="cpp_frontend_extension",
             sources="cpp_extensions/cpp_frontend_extension.cpp",
             verbose=True,
         )

         input = torch.randn(2, 5)
         cpp_linear = extension.Net(5, 2)
         cpp_linear.to(torch.float64)
         python_linear = torch.nn.Linear(5, 2)

         # First make sure they have the same parameters
         cpp_parameters = dict(cpp_linear.named_parameters())
         with torch.no_grad():
             python_linear.weight.copy_(cpp_parameters["fc.weight"])
             python_linear.bias.copy_(cpp_parameters["fc.bias"])

         cpp_output = cpp_linear.forward(input)
         python_output = python_linear(input)
         self.assertEqual(cpp_output, python_output)

         cpp_output.sum().backward()
         python_output.sum().backward()

         for p in cpp_linear.parameters():
             self.assertFalse(p.grad is None)

         self.assertEqual(cpp_parameters["fc.weight"].grad, python_linear.weight.grad)
         self.assertEqual(cpp_parameters["fc.bias"].grad, python_linear.bias.grad)

     @dont_wipe_extensions_build_folder
     @common.skipIfRocm
     def test_cpp_frontend_module_python_inter_op(self):
         extension = torch.utils.cpp_extension.load(
             name="cpp_frontend_extension",
             sources="cpp_extensions/cpp_frontend_extension.cpp",
             verbose=True,
         )

         # Create a torch.nn.Module which uses the C++ module as a submodule.
         class M(torch.nn.Module):
             def __init__(self):
                 super(M, self).__init__()
                 self.x = torch.nn.Parameter(torch.tensor(1.0))
                 self.net = extension.Net(3, 5)

             def forward(self, input):
                 return self.net.forward(input) + self.x

         net = extension.Net(5, 2)
         net.double()
         net.to(torch.get_default_dtype())
         self.assertEqual(str(net), "Net")

         # Further embed the torch.nn.Module into a Sequential, and also add the
         # C++ module as an element of the Sequential.
         sequential = torch.nn.Sequential(M(), torch.nn.Tanh(), net, torch.nn.Sigmoid())

         input = torch.randn(2, 3)
         # Try calling the module!
         output = sequential.forward(input)
         # The call operator is bound to forward too.
         self.assertEqual(output, sequential(input))
         self.assertEqual(list(output.shape), [2, 2])

         # Do changes on the module hierarchy.
         old_dtype = torch.get_default_dtype()
         sequential.to(torch.float64)
         sequential.to(torch.float32)
         sequential.to(old_dtype)
         self.assertEqual(sequential[2].parameters()[0].dtype, old_dtype)

         # Make sure we can access these method recursively.
         self.assertEqual(len(list(sequential.parameters())), len(net.parameters()) * 2 + 1)
         self.assertEqual(len(list(sequential.named_parameters())), len(net.named_parameters()) * 2 + 1)
         self.assertEqual(len(list(sequential.buffers())), len(net.buffers()) * 2)
         self.assertEqual(len(list(sequential.modules())), 8)

         # Test clone()
         net2 = net.clone()
         self.assertEqual(len(net.parameters()), len(net2.parameters()))
         self.assertEqual(len(net.buffers()), len(net2.buffers()))
         self.assertEqual(len(net.modules()), len(net2.modules()))

         # Try differentiating through the whole module.
         for parameter in net.parameters():
             self.assertIsNone(parameter.grad)
         output.sum().backward()
         for parameter in net.parameters():
             self.assertFalse(parameter.grad is None)
             self.assertGreater(parameter.grad.sum(), 0)

         # Try calling zero_grad()
         net.zero_grad()
         for p in net.parameters():
             self.assertEqual(p.grad, torch.zeros_like(p))

         # Test train(), eval(), training (a property)
         self.assertTrue(net.training)
         net.eval()
         self.assertFalse(net.training)
         net.train()
         self.assertTrue(net.training)
         net.eval()

         # Try calling the additional methods we registered.
         biased_input = torch.randn(4, 5)
         output_before = net.forward(biased_input)
         bias = net.get_bias().clone()
         self.assertEqual(list(bias.shape), [2])
         net.set_bias(bias + 1)
         self.assertEqual(net.get_bias(), bias + 1)
         output_after = net.forward(biased_input)

         self.assertNotEqual(output_before, output_after)

         # Try accessing parameters
         self.assertEqual(len(net.parameters()), 2)
         np = net.named_parameters()
         self.assertEqual(len(np), 2)
         self.assertIn("fc.weight", np)
         self.assertIn("fc.bias", np)

         self.assertEqual(len(net.buffers()), 1)
         nb = net.named_buffers()
         self.assertEqual(len(nb), 1)
         self.assertIn("buf", nb)
         self.assertEqual(nb[0][1], torch.eye(5))

     @dont_wipe_extensions_build_folder
     @common.skipIfRocm
     def test_cpp_frontend_module_has_up_to_date_attributes(self):
         extension = torch.utils.cpp_extension.load(
             name="cpp_frontend_extension",
             sources="cpp_extensions/cpp_frontend_extension.cpp",
             verbose=True,
         )

         net = extension.Net(5, 2)

         self.assertEqual(len(net._parameters), 0)
         net.add_new_parameter("foo", torch.eye(5))
         self.assertEqual(len(net._parameters), 1)

         self.assertEqual(len(net._buffers), 1)
         net.add_new_buffer("bar", torch.eye(5))
         self.assertEqual(len(net._buffers), 2)

         self.assertEqual(len(net._modules), 1)
         net.add_new_submodule("fc2")
         self.assertEqual(len(net._modules), 2)

     @dont_wipe_extensions_build_folder
     @unittest.skipIf(not TEST_CUDA, "CUDA not found")
     @common.skipIfRocm
     def test_cpp_frontend_module_python_inter_op_with_cuda(self):
         extension = torch.utils.cpp_extension.load(
             name="cpp_frontend_extension",
             sources="cpp_extensions/cpp_frontend_extension.cpp",
             verbose=True,
         )

         net = extension.Net(5, 2)
         for p in net.parameters():
             self.assertTrue(p.device.type == "cpu")
         cpu_parameters = [p.clone() for p in net.parameters()]

         device = torch.device("cuda", 0)
         net.to(device)

         for i, p in enumerate(net.parameters()):
             self.assertTrue(p.device.type == "cuda")
             self.assertTrue(p.device.index == 0)
             self.assertEqual(cpu_parameters[i], p)

     def test_returns_shared_library_path_when_is_python_module_is_true(self):
         source = """
         #include <torch/script.h>
         torch::Tensor func(torch::Tensor x) { return x; }
         static torch::jit::RegisterOperators r("test::func", &func);
         """
         torch.utils.cpp_extension.load_inline(
             name="is_python_module",
             cpp_sources=source,
             functions="func",
             verbose=True,
             is_python_module=False,
         )
         self.assertEqual(torch.ops.test.func(torch.eye(5)), torch.eye(5))

     @unittest.skipIf(IS_WINDOWS, "Not available on Windows")
     def test_no_python_abi_suffix_sets_the_correct_library_name(self):
         # For this test, run_test.py will call `python setup.py install` in the
         # cpp_extensions/no_python_abi_suffix_test folder, where the
         # `BuildExtension` class has a `no_python_abi_suffix` option set to
         # `True`. This *should* mean that on Python 3, the produced shared
         # library does not have an ABI suffix like
         # "cpython-37m-x86_64-linux-gnu" before the library suffix, e.g. "so".
         # On Python 2 there is no ABI suffix anyway.
         root = os.path.join("cpp_extensions", "no_python_abi_suffix_test", "build")
         matches = [f for _, _, fs in os.walk(root) for f in fs if f.endswith("so")]
         self.assertEqual(len(matches), 1, str(matches))
         self.assertEqual(matches[0], "no_python_abi_suffix_test.so", str(matches))

     def test_set_default_type_also_changes_aten_default_type(self):
         module = torch.utils.cpp_extension.load_inline(
             name="test_set_default_type",
             cpp_sources="torch::Tensor get() { return torch::empty({}); }",
             functions="get",
             verbose=True,
         )

         initial_default = torch.get_default_dtype()
         try:
             self.assertEqual(module.get().dtype, initial_default)
             torch.set_default_dtype(torch.float64)
             self.assertEqual(module.get().dtype, torch.float64)
             torch.set_default_dtype(torch.float32)
             self.assertEqual(module.get().dtype, torch.float32)
             torch.set_default_dtype(torch.float16)
             self.assertEqual(module.get().dtype, torch.float16)
         finally:
             torch.set_default_dtype(initial_default)


 if __name__ == "__main__":
     common.run_tests()
	import os
	import shutil
	import sys
	import unittest
	import warnings

	import common_utils as common
	import torch
	import torch.backends.cudnn
	import torch.utils.cpp_extension
	from torch.utils.cpp_extension import CUDA_HOME


	try:
	import torch_test_cpp_extension.cpp as cpp_extension
	except ImportError:
	warnings.warn(
	"test_cpp_extensions.py cannot be invoked directly. Run "
	"`python run_test.py -i cpp_extensions` instead."
	)


	TEST_CUDA = torch.cuda.is_available() and CUDA_HOME is not None
	TEST_CUDNN = False
	if TEST_CUDA:
	CUDNN_HEADER_EXISTS = os.path.isfile(os.path.join(CUDA_HOME, "include/cudnn.h"))
	TEST_CUDNN = (
	TEST_CUDA and CUDNN_HEADER_EXISTS and torch.backends.cudnn.is_available()
	)
	IS_WINDOWS = sys.platform == "win32"


	# This effectively allows re-using the same extension (compiled once) in
	# multiple tests, just to split up the tested properties.
	def dont_wipe_extensions_build_folder(func):
	func.dont_wipe = True
	return func


	class TestCppExtension(common.TestCase):
	def setUp(self):
	test_name = self.id().split(".")[-1]
	dont_wipe = hasattr(getattr(self, test_name), "dont_wipe")
	if dont_wipe:
	print(
	"Test case {} has 'dont_wipe' attribute set, ".format(test_name)
	+ "therefore not wiping extensions build folder before running the test"
	)
	return
	if sys.platform == "win32":
	print("Not wiping extensions build folder because Windows")
	return
	default_build_root = torch.utils.cpp_extension.get_default_build_root()
	if os.path.exists(default_build_root):
	shutil.rmtree(default_build_root)

	def test_extension_function(self):
	x = torch.randn(4, 4)
	y = torch.randn(4, 4)
	z = cpp_extension.sigmoid_add(x, y)
	self.assertEqual(z, x.sigmoid() + y.sigmoid())

	def test_extension_module(self):
	mm = cpp_extension.MatrixMultiplier(4, 8)
	weights = torch.rand(8, 4)
	expected = mm.get().mm(weights)
	result = mm.forward(weights)
	self.assertEqual(expected, result)

	def test_backward(self):
	mm = cpp_extension.MatrixMultiplier(4, 8)
	weights = torch.rand(8, 4, requires_grad=True)
	result = mm.forward(weights)
	result.sum().backward()
	tensor = mm.get()

	expected_weights_grad = tensor.t().mm(torch.ones([4, 4]))
	self.assertEqual(weights.grad, expected_weights_grad)

	expected_tensor_grad = torch.ones([4, 4]).mm(weights.t())
	self.assertEqual(tensor.grad, expected_tensor_grad)

	def test_jit_compile_extension(self):
	module = torch.utils.cpp_extension.load(
	name="jit_extension",
	sources=[
	"cpp_extensions/jit_extension.cpp",
	"cpp_extensions/jit_extension2.cpp",
	],
	extra_include_paths=["cpp_extensions"],
	extra_cflags=["-g"],
	verbose=True,
	)
	x = torch.randn(4, 4)
	y = torch.randn(4, 4)

	z = module.tanh_add(x, y)
	self.assertEqual(z, x.tanh() + y.tanh())

	# Checking we can call a method defined not in the main C++ file.
	z = module.exp_add(x, y)
	self.assertEqual(z, x.exp() + y.exp())

	# Checking we can use this JIT-compiled class.
	doubler = module.Doubler(2, 2)
	self.assertIsNone(doubler.get().grad)
	self.assertEqual(doubler.get().sum(), 4)
	self.assertEqual(doubler.forward().sum(), 8)

	@unittest.skipIf(not TEST_CUDA, "CUDA not found")
	def test_cuda_extension(self):
	import torch_test_cpp_extension.cuda as cuda_extension

	x = torch.zeros(100, device="cuda", dtype=torch.float32)
	y = torch.zeros(100, device="cuda", dtype=torch.float32)

	z = cuda_extension.sigmoid_add(x, y).cpu()

	# 2 * sigmoid(0) = 2 * 0.5 = 1
	self.assertEqual(z, torch.ones_like(z))

	@unittest.skipIf(not TEST_CUDA, "CUDA not found")
	def test_jit_cuda_extension(self):
	# NOTE: The name of the extension must equal the name of the module.
	module = torch.utils.cpp_extension.load(
	name="torch_test_cuda_extension",
	sources=[
	"cpp_extensions/cuda_extension.cpp",
	"cpp_extensions/cuda_extension.cu",
	],
	extra_cuda_cflags=["-O2"],
	verbose=True,
	)

	x = torch.zeros(100, device="cuda", dtype=torch.float32)
	y = torch.zeros(100, device="cuda", dtype=torch.float32)

	z = module.sigmoid_add(x, y).cpu()

	# 2 * sigmoid(0) = 2 * 0.5 = 1
	self.assertEqual(z, torch.ones_like(z))

	@unittest.skipIf(not TEST_CUDNN, "CuDNN not found")
	def test_jit_cudnn_extension(self):
	# implementation of CuDNN ReLU
	if IS_WINDOWS:
	extra_ldflags = ["cudnn.lib"]
	else:
	extra_ldflags = ["-lcudnn"]
	module = torch.utils.cpp_extension.load(
	name="torch_test_cudnn_extension",
	sources=["cpp_extensions/cudnn_extension.cpp"],
	extra_ldflags=extra_ldflags,
	verbose=True,
	with_cuda=True,
	)

	x = torch.randn(100, device="cuda", dtype=torch.float32)
	y = torch.zeros(100, device="cuda", dtype=torch.float32)
	module.cudnn_relu(x, y) # y=relu(x)
	self.assertEqual(torch.nn.functional.relu(x), y)
	with self.assertRaisesRegex(RuntimeError, "same size"):
	y_incorrect = torch.zeros(20, device="cuda", dtype=torch.float32)
	module.cudnn_relu(x, y_incorrect)

	def test_optional(self):
	has_value = cpp_extension.function_taking_optional(torch.ones(5))
	self.assertTrue(has_value)
	has_value = cpp_extension.function_taking_optional(None)
	self.assertFalse(has_value)

	def test_inline_jit_compile_extension_with_functions_as_list(self):
	cpp_source = """
	torch::Tensor tanh_add(torch::Tensor x, torch::Tensor y) {
	return x.tanh() + y.tanh();
	}
	"""

	module = torch.utils.cpp_extension.load_inline(
	name="inline_jit_extension_with_functions_list",
	cpp_sources=cpp_source,
	functions="tanh_add",
	verbose=True,
	)

	self.assertEqual(module.tanh_add.__doc__.split("\n")[2], "tanh_add")

	x = torch.randn(4, 4)
	y = torch.randn(4, 4)

	z = module.tanh_add(x, y)
	self.assertEqual(z, x.tanh() + y.tanh())

	def test_inline_jit_compile_extension_with_functions_as_dict(self):
	cpp_source = """
	torch::Tensor tanh_add(torch::Tensor x, torch::Tensor y) {
	return x.tanh() + y.tanh();
	}
	"""

	module = torch.utils.cpp_extension.load_inline(
	name="inline_jit_extension_with_functions_dict",
	cpp_sources=cpp_source,
	functions={"tanh_add": "Tanh and then sum :D"},
	verbose=True,
	)

	self.assertEqual(module.tanh_add.__doc__.split("\n")[2], "Tanh and then sum :D")

	def test_inline_jit_compile_extension_multiple_sources_and_no_functions(self):
	cpp_source1 = """
	torch::Tensor sin_add(torch::Tensor x, torch::Tensor y) {
	return x.sin() + y.sin();
	}
	"""

	cpp_source2 = """
	#include <torch/extension.h>
	torch::Tensor sin_add(torch::Tensor x, torch::Tensor y);
	PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
	m.def("sin_add", &sin_add, "sin(x) + sin(y)");
	}
	"""

	module = torch.utils.cpp_extension.load_inline(
	name="inline_jit_extension",
	cpp_sources=[cpp_source1, cpp_source2],
	verbose=True,
	)

	x = torch.randn(4, 4)
	y = torch.randn(4, 4)

	z = module.sin_add(x, y)
	self.assertEqual(z, x.sin() + y.sin())

	@unittest.skipIf(not TEST_CUDA, "CUDA not found")
	def test_inline_jit_compile_extension_cuda(self):
	cuda_source = """
	__global__ void cos_add_kernel(
	const float* __restrict__ x,
	const float* __restrict__ y,
	float* __restrict__ output,
	const int size) {
	const auto index = blockIdx.x * blockDim.x + threadIdx.x;
	if (index < size) {
	output[index] = __cosf(x[index]) + __cosf(y[index]);
	}
	}

	torch::Tensor cos_add(torch::Tensor x, torch::Tensor y) {
	auto output = torch::zeros_like(x);
	const int threads = 1024;
	const int blocks = (output.numel() + threads - 1) / threads;
	cos_add_kernel<<<blocks, threads>>>(x.data<float>(), y.data<float>(), output.data<float>(), output.numel());
	return output;
	}
	"""

	# Here, the C++ source need only declare the function signature.
	cpp_source = "torch::Tensor cos_add(torch::Tensor x, torch::Tensor y);"

	module = torch.utils.cpp_extension.load_inline(
	name="inline_jit_extension_cuda",
	cpp_sources=cpp_source,
	cuda_sources=cuda_source,
	functions=["cos_add"],
	verbose=True,
	)

	self.assertEqual(module.cos_add.__doc__.split("\n")[2], "cos_add")

	x = torch.randn(4, 4, device="cuda", dtype=torch.float32)
	y = torch.randn(4, 4, device="cuda", dtype=torch.float32)

	z = module.cos_add(x, y)
	self.assertEqual(z, x.cos() + y.cos())

	def test_inline_jit_compile_extension_throws_when_functions_is_bad(self):
	with self.assertRaises(ValueError):
	torch.utils.cpp_extension.load_inline(
	name="invalid_jit_extension", cpp_sources="", functions=5
	)

	def test_lenient_flag_handling_in_jit_extensions(self):
	cpp_source = """
	torch::Tensor tanh_add(torch::Tensor x, torch::Tensor y) {
	return x.tanh() + y.tanh();
	}
	"""

	module = torch.utils.cpp_extension.load_inline(
	name="lenient_flag_handling_extension",
	cpp_sources=cpp_source,
	functions="tanh_add",
	extra_cflags=["-g\n\n", "-O0 -Wall"],
	extra_include_paths=[" cpp_extensions\n"],
	verbose=True,
	)

	x = torch.zeros(100, dtype=torch.float32)
	y = torch.zeros(100, dtype=torch.float32)
	z = module.tanh_add(x, y).cpu()
	self.assertEqual(z, x.tanh() + y.tanh())

	def test_complex_registration(self):
	module = torch.utils.cpp_extension.load(
	name="complex_registration_extension",
	sources="cpp_extensions/complex_registration_extension.cpp",
	verbose=True,
	)

	torch.empty(2, 2, dtype=torch.complex64)

	@unittest.skipIf(not TEST_CUDA, "CUDA not found")
	def test_half_support(self):
	"""
	Checks for an issue with operator< ambiguity for half when certain
	THC headers are included.

	See https://github.com/pytorch/pytorch/pull/10301#issuecomment-416773333
	for the corresponding issue.
	"""
	cuda_source = """
	#include <THC/THCNumerics.cuh>

	template<typename T, typename U>
	__global__ void half_test_kernel(const T* input, U* output) {
	if (input[0] < input[1] \|\| input[0] >= input[1]) {
	output[0] = 123;
	}
	}

	torch::Tensor half_test(torch::Tensor input) {
	auto output = torch::empty(1, input.options().dtype(torch::kFloat));
	AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "half_test", [&] {
	half_test_kernel<scalar_t><<<1, 1>>>(
	input.data<scalar_t>(),
	output.data<float>());
	});
	return output;
	}
	"""

	module = torch.utils.cpp_extension.load_inline(
	name="half_test_extension",
	cpp_sources="torch::Tensor half_test(torch::Tensor input);",
	cuda_sources=cuda_source,
	functions=["half_test"],
	verbose=True,
	)

	x = torch.randn(3, device="cuda", dtype=torch.half)
	result = module.half_test(x)
	self.assertEqual(result[0], 123)

	def test_reload_jit_extension(self):
	def compile(code):
	return torch.utils.cpp_extension.load_inline(
	name="reloaded_jit_extension",
	cpp_sources=code,
	functions="f",
	verbose=True,
	)

	module = compile("int f() { return 123; }")
	self.assertEqual(module.f(), 123)

	module = compile("int f() { return 456; }")
	self.assertEqual(module.f(), 456)
	module = compile("int f() { return 456; }")
	self.assertEqual(module.f(), 456)

	module = compile("int f() { return 789; }")
	self.assertEqual(module.f(), 789)

	@dont_wipe_extensions_build_folder
	@common.skipIfRocm
	def test_cpp_frontend_module_has_same_output_as_python(self):
	extension = torch.utils.cpp_extension.load(
	name="cpp_frontend_extension",
	sources="cpp_extensions/cpp_frontend_extension.cpp",
	verbose=True,
	)

	input = torch.randn(2, 5)
	cpp_linear = extension.Net(5, 2)
	cpp_linear.to(torch.float64)
	python_linear = torch.nn.Linear(5, 2)

	# First make sure they have the same parameters
	cpp_parameters = dict(cpp_linear.named_parameters())
	with torch.no_grad():
	python_linear.weight.copy_(cpp_parameters["fc.weight"])
	python_linear.bias.copy_(cpp_parameters["fc.bias"])

	cpp_output = cpp_linear.forward(input)
	python_output = python_linear(input)
	self.assertEqual(cpp_output, python_output)

	cpp_output.sum().backward()
	python_output.sum().backward()

	for p in cpp_linear.parameters():
	self.assertFalse(p.grad is None)

	self.assertEqual(cpp_parameters["fc.weight"].grad, python_linear.weight.grad)
	self.assertEqual(cpp_parameters["fc.bias"].grad, python_linear.bias.grad)

	@dont_wipe_extensions_build_folder
	@common.skipIfRocm
	def test_cpp_frontend_module_python_inter_op(self):
	extension = torch.utils.cpp_extension.load(
	name="cpp_frontend_extension",
	sources="cpp_extensions/cpp_frontend_extension.cpp",
	verbose=True,
	)

	# Create a torch.nn.Module which uses the C++ module as a submodule.
	class M(torch.nn.Module):
	def __init__(self):
	super(M, self).__init__()
	self.x = torch.nn.Parameter(torch.tensor(1.0))
	self.net = extension.Net(3, 5)

	def forward(self, input):
	return self.net.forward(input) + self.x

	net = extension.Net(5, 2)
	net.double()
	net.to(torch.get_default_dtype())
	self.assertEqual(str(net), "Net")

	# Further embed the torch.nn.Module into a Sequential, and also add the
	# C++ module as an element of the Sequential.
	sequential = torch.nn.Sequential(M(), torch.nn.Tanh(), net, torch.nn.Sigmoid())

	input = torch.randn(2, 3)
	# Try calling the module!
	output = sequential.forward(input)
	# The call operator is bound to forward too.
	self.assertEqual(output, sequential(input))
	self.assertEqual(list(output.shape), [2, 2])

	# Do changes on the module hierarchy.
	old_dtype = torch.get_default_dtype()
	sequential.to(torch.float64)
	sequential.to(torch.float32)
	sequential.to(old_dtype)
	self.assertEqual(sequential[2].parameters()[0].dtype, old_dtype)

	# Make sure we can access these method recursively.
	self.assertEqual(len(list(sequential.parameters())), len(net.parameters()) * 2 + 1)
	self.assertEqual(len(list(sequential.named_parameters())), len(net.named_parameters()) * 2 + 1)
	self.assertEqual(len(list(sequential.buffers())), len(net.buffers()) * 2)
	self.assertEqual(len(list(sequential.modules())), 8)

	# Test clone()
	net2 = net.clone()
	self.assertEqual(len(net.parameters()), len(net2.parameters()))
	self.assertEqual(len(net.buffers()), len(net2.buffers()))
	self.assertEqual(len(net.modules()), len(net2.modules()))

	# Try differentiating through the whole module.
	for parameter in net.parameters():
	self.assertIsNone(parameter.grad)
	output.sum().backward()
	for parameter in net.parameters():
	self.assertFalse(parameter.grad is None)
	self.assertGreater(parameter.grad.sum(), 0)

	# Try calling zero_grad()
	net.zero_grad()
	for p in net.parameters():
	self.assertEqual(p.grad, torch.zeros_like(p))

	# Test train(), eval(), training (a property)
	self.assertTrue(net.training)
	net.eval()
	self.assertFalse(net.training)
	net.train()
	self.assertTrue(net.training)
	net.eval()

	# Try calling the additional methods we registered.
	biased_input = torch.randn(4, 5)
	output_before = net.forward(biased_input)
	bias = net.get_bias().clone()
	self.assertEqual(list(bias.shape), [2])
	net.set_bias(bias + 1)
	self.assertEqual(net.get_bias(), bias + 1)
	output_after = net.forward(biased_input)

	self.assertNotEqual(output_before, output_after)

	# Try accessing parameters
	self.assertEqual(len(net.parameters()), 2)
	np = net.named_parameters()
	self.assertEqual(len(np), 2)
	self.assertIn("fc.weight", np)
	self.assertIn("fc.bias", np)

	self.assertEqual(len(net.buffers()), 1)
	nb = net.named_buffers()
	self.assertEqual(len(nb), 1)
	self.assertIn("buf", nb)
	self.assertEqual(nb[0][1], torch.eye(5))

	@dont_wipe_extensions_build_folder
	@common.skipIfRocm
	def test_cpp_frontend_module_has_up_to_date_attributes(self):
	extension = torch.utils.cpp_extension.load(
	name="cpp_frontend_extension",
	sources="cpp_extensions/cpp_frontend_extension.cpp",
	verbose=True,
	)

	net = extension.Net(5, 2)

	self.assertEqual(len(net._parameters), 0)
	net.add_new_parameter("foo", torch.eye(5))
	self.assertEqual(len(net._parameters), 1)

	self.assertEqual(len(net._buffers), 1)
	net.add_new_buffer("bar", torch.eye(5))
	self.assertEqual(len(net._buffers), 2)

	self.assertEqual(len(net._modules), 1)
	net.add_new_submodule("fc2")
	self.assertEqual(len(net._modules), 2)

	@dont_wipe_extensions_build_folder
	@unittest.skipIf(not TEST_CUDA, "CUDA not found")
	@common.skipIfRocm
	def test_cpp_frontend_module_python_inter_op_with_cuda(self):
	extension = torch.utils.cpp_extension.load(
	name="cpp_frontend_extension",
	sources="cpp_extensions/cpp_frontend_extension.cpp",
	verbose=True,
	)

	net = extension.Net(5, 2)
	for p in net.parameters():
	self.assertTrue(p.device.type == "cpu")
	cpu_parameters = [p.clone() for p in net.parameters()]

	device = torch.device("cuda", 0)
	net.to(device)

	for i, p in enumerate(net.parameters()):
	self.assertTrue(p.device.type == "cuda")
	self.assertTrue(p.device.index == 0)
	self.assertEqual(cpu_parameters[i], p)

	def test_returns_shared_library_path_when_is_python_module_is_true(self):
	source = """
	#include <torch/script.h>
	torch::Tensor func(torch::Tensor x) { return x; }
	static torch::jit::RegisterOperators r("test::func", &func);
	"""
	torch.utils.cpp_extension.load_inline(
	name="is_python_module",
	cpp_sources=source,
	functions="func",
	verbose=True,
	is_python_module=False,
	)
	self.assertEqual(torch.ops.test.func(torch.eye(5)), torch.eye(5))

	@unittest.skipIf(IS_WINDOWS, "Not available on Windows")
	def test_no_python_abi_suffix_sets_the_correct_library_name(self):
	# For this test, run_test.py will call `python setup.py install` in the
	# cpp_extensions/no_python_abi_suffix_test folder, where the
	# `BuildExtension` class has a `no_python_abi_suffix` option set to
	# `True`. This should mean that on Python 3, the produced shared
	# library does not have an ABI suffix like
	# "cpython-37m-x86_64-linux-gnu" before the library suffix, e.g. "so".
	# On Python 2 there is no ABI suffix anyway.
	root = os.path.join("cpp_extensions", "no_python_abi_suffix_test", "build")
	matches = [f for _, _, fs in os.walk(root) for f in fs if f.endswith("so")]
	self.assertEqual(len(matches), 1, str(matches))
	self.assertEqual(matches[0], "no_python_abi_suffix_test.so", str(matches))

	def test_set_default_type_also_changes_aten_default_type(self):
	module = torch.utils.cpp_extension.load_inline(
	name="test_set_default_type",
	cpp_sources="torch::Tensor get() { return torch::empty({}); }",
	functions="get",
	verbose=True,
	)

	initial_default = torch.get_default_dtype()
	try:
	self.assertEqual(module.get().dtype, initial_default)
	torch.set_default_dtype(torch.float64)
	self.assertEqual(module.get().dtype, torch.float64)
	torch.set_default_dtype(torch.float32)
	self.assertEqual(module.get().dtype, torch.float32)
	torch.set_default_dtype(torch.float16)
	self.assertEqual(module.get().dtype, torch.float16)
	finally:
	torch.set_default_dtype(initial_default)


	if __name__ == "__main__":
	common.run_tests()