# tensorflow/core/kernels/mlir_generated/BUILD - platform/external/tensorflow - Git at Google

 # Generates GPU and CPU kernels using MLIR codegen.

 load(
     "//tensorflow/core/kernels/mlir_generated:build_defs.bzl",
     "cpu_kernel_library",
     "gpu_kernel_library",
     "if_mlir_experimental_kernels_enabled",
     "if_mlir_generated_gpu_kernels_enabled",
 )
 load(
     "//tensorflow:tensorflow.bzl",
     "if_cuda_or_rocm",
     "tf_cc_test",
 )
 load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud")  # buildifier: disable=same-origin-load
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")  # buildifier: disable=same-origin-load
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")  # buildifier: disable=same-origin-load
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_cuda_tests_tags",
 )

 # Package defaults: unless a target overrides it, visibility is limited to
 # //tensorflow/core/kernels and the packages beneath it.
 package(
     licenses = ["notice"],  # Apache 2.0
     default_visibility = ["//tensorflow/core/kernels:__subpackages__"],
 )

 # Matches builds invoked with
 # --define=tensorflow_enable_mlir_generated_gpu_kernels=0.
 config_setting(
     name = "mlir_generated_gpu_kernels_disabled",
     define_values = {
         "tensorflow_enable_mlir_generated_gpu_kernels": "0",
     },
 )

 # Matches builds invoked with --define=enable_unranked_kernels=1.
 config_setting(
     name = "mlir_experimental_kernels_enabled",
     define_values = {
         "enable_unranked_kernels": "1",
     },
 )

 # Sources of the unary GPU kernels that are always part of
 # :unary_gpu_kernel_srcs, regardless of the experimental-kernels setting.
 filegroup(
     name = "enabled_unary_gpu_kernel_srcs",
     compatible_with = get_compatible_with_cloud(),
     srcs = [
         "gpu_op_abs.cc",
         "gpu_op_acos.cc",
         "gpu_op_acosh.cc",
         "gpu_op_angle.cc",
         "gpu_op_asin.cc",
         "gpu_op_asinh.cc",
         "gpu_op_atan.cc",
         "gpu_op_atanh.cc",
         "gpu_op_ceil.cc",
         "gpu_op_complex.cc",
         "gpu_op_complex_abs.cc",
         "gpu_op_conj.cc",
         "gpu_op_cos.cc",
         "gpu_op_cosh.cc",
         "gpu_op_digamma.cc",
         "gpu_op_erf.cc",
         "gpu_op_erfc.cc",
         "gpu_op_floor.cc",
         "gpu_op_imag.cc",
         "gpu_op_invert.cc",
         "gpu_op_is_finite.cc",
         "gpu_op_is_inf.cc",
         "gpu_op_is_nan.cc",
         "gpu_op_lgamma.cc",
         "gpu_op_log.cc",
         "gpu_op_log1p.cc",
         "gpu_op_logical_not.cc",
         "gpu_op_neg.cc",
         "gpu_op_real.cc",
         "gpu_op_rsqrt.cc",
         "gpu_op_sin.cc",
         "gpu_op_sinh.cc",
         "gpu_op_sqrt.cc",
         "gpu_op_square.cc",
         "gpu_op_tan.cc",
         "gpu_op_tanh.cc",
     ],
 )

 # Unary GPU kernel sources that are only added to :unary_gpu_kernel_srcs when
 # experimental kernels are enabled.
 filegroup(
     name = "experimental_unary_gpu_kernel_srcs",
     compatible_with = get_compatible_with_cloud(),
     srcs = [
         "gpu_op_exp.cc",
         "gpu_op_expm1.cc",
         "gpu_op_sign.cc",
     ],
 )

 # Full unary GPU source set: the always-enabled sources plus, when
 # experimental kernels are enabled, the experimental ones.
 filegroup(
     name = "unary_gpu_kernel_srcs",
     compatible_with = get_compatible_with_cloud(),
     srcs = [":enabled_unary_gpu_kernel_srcs"] + if_mlir_experimental_kernels_enabled(
         if_true = [":experimental_unary_gpu_kernel_srcs"],
     ),
 )

 # Shared base library that the GPU and CPU op headers below depend on.
 cc_library(
     name = "base_op",
     compatible_with = get_compatible_with_cloud(),
     hdrs = ["base_op.h"],
     srcs = ["base_op.cc"],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/framework:allocation_description_proto_cc",
         "//tensorflow/core/framework:op_requires",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:mlir_c_runner_utils",
     ],
 )

 # Header-only GPU layer on top of :base_op.
 cc_library(
     name = "base_gpu_op",
     compatible_with = get_compatible_with_cloud(),
     hdrs = ["base_gpu_op.h"],
     deps = [":base_op"],
 )

 # Header-only CPU layer on top of :base_op.
 cc_library(
     name = "base_cpu_op",
     compatible_with = get_compatible_with_cloud(),
     hdrs = ["base_cpu_op.h"],
     deps = [":base_op"],
 )

 # Kernel library for the unary GPU ops. Tagged "manual", so it is only built
 # when something depends on it or it is requested explicitly.
 tf_kernel_library(
     name = "gpu_cwise_unary_op",
     srcs = [":unary_gpu_kernel_srcs"],
     # Every unary kernel library is linked here, including the ones whose
     # sources are only compiled when experimental kernels are enabled.
     # Depending only on the default-enabled kernels would make the BUILD
     # target structure uglier, and those targets have to build anyway, so
     # linking them even when not yet needed should not hurt.
     deps = [
         ":base_gpu_op",
         ":gpu_abs_kernels",
         ":gpu_acos_kernels",
         ":gpu_acosh_kernels",
         ":gpu_angle_kernels",
         ":gpu_asin_kernels",
         ":gpu_asinh_kernels",
         ":gpu_atan_kernels",
         ":gpu_atanh_kernels",
         ":gpu_ceil_kernels",
         ":gpu_complex_abs_kernels",
         ":gpu_complex_kernels",
         ":gpu_conj_kernels",
         ":gpu_cos_kernels",
         ":gpu_cosh_kernels",
         ":gpu_digamma_kernels",
         ":gpu_erf_kernels",
         ":gpu_erfc_kernels",
         ":gpu_exp_kernels",
         ":gpu_expm1_kernels",
         ":gpu_floor_kernels",
         ":gpu_imag_kernels",
         ":gpu_invert_kernels",
         ":gpu_is_finite_kernels",
         ":gpu_is_inf_kernels",
         ":gpu_is_nan_kernels",
         ":gpu_lgamma_kernels",
         ":gpu_log1p_kernels",
         ":gpu_log_kernels",
         ":gpu_logical_not_kernels",
         ":gpu_neg_kernels",
         ":gpu_real_kernels",
         ":gpu_rsqrt_kernels",
         ":gpu_sign_kernels",
         ":gpu_sin_kernels",
         ":gpu_sinh_kernels",
         ":gpu_sqrt_kernels",
         ":gpu_square_kernels",
         ":gpu_tan_kernels",
         ":gpu_tanh_kernels",
         "//third_party/eigen3",
     ],
     tags = ["manual"],
 )

 # Kernel library for the unary CPU ops; currently it only contains Abs.
 # Tagged "manual", so it is only built when something depends on it or it is
 # requested explicitly.
 tf_kernel_library(
     name = "cpu_cwise_unary_op",
     # Source files are listed by plain file name, as in :cpu_cwise_binary_op
     # and the GPU libraries; the former ":cpu_op_abs.cc" spelling named the
     # same file but read like a reference to a rule target.
     srcs = ["cpu_op_abs.cc"],
     tags = ["manual"],
     # The generated kernel library is linked unconditionally: the target has
     # to build anyway, so linking it even when it is not needed yet should
     # not hurt.
     deps = [
         ":base_cpu_op",
         ":cpu_abs_kernels",
         "//third_party/eigen3",
     ],
 )

 # Kernel library for the binary GPU ops. Tagged "manual", so it is only built
 # when something depends on it or it is requested explicitly.
 tf_kernel_library(
     name = "gpu_cwise_binary_op",
     tags = ["manual"],
     srcs = [
         "gpu_op_add.cc",
         "gpu_op_atan2.cc",
         "gpu_op_bitwise_and.cc",
         "gpu_op_bitwise_or.cc",
         "gpu_op_bitwise_xor.cc",
         "gpu_op_div.cc",
         "gpu_op_equal.cc",
         "gpu_op_floor_div.cc",
         "gpu_op_greater.cc",
         "gpu_op_greater_equal.cc",
         "gpu_op_left_shift.cc",
         "gpu_op_less.cc",
         "gpu_op_less_equal.cc",
         "gpu_op_logical_and.cc",
         "gpu_op_logical_or.cc",
         "gpu_op_maximum.cc",
         "gpu_op_minimum.cc",
         "gpu_op_mul.cc",
         "gpu_op_not_equal.cc",
         "gpu_op_pow.cc",
         "gpu_op_right_shift.cc",
         "gpu_op_squared_difference.cc",
         "gpu_op_sub.cc",
         "gpu_op_zeta.cc",
     ],
     deps = [
         ":base_gpu_op",
         ":gpu_add_v2_kernels",
         ":gpu_atan2_kernels",
         ":gpu_bitwise_and_kernels",
         ":gpu_bitwise_or_kernels",
         ":gpu_bitwise_xor_kernels",
         ":gpu_div_kernels",
         ":gpu_equal_kernels",
         ":gpu_floor_div_kernels",
         ":gpu_greater_equal_kernels",
         ":gpu_greater_kernels",
         ":gpu_left_shift_kernels",
         ":gpu_less_equal_kernels",
         ":gpu_less_kernels",
         ":gpu_logical_and_kernels",
         ":gpu_logical_or_kernels",
         ":gpu_maximum_kernels",
         ":gpu_minimum_kernels",
         ":gpu_mul_kernels",
         ":gpu_not_equal_kernels",
         ":gpu_pow_kernels",
         ":gpu_right_shift_kernels",
         ":gpu_squared_difference_kernels",
         ":gpu_sub_kernels",
         ":gpu_zeta_kernels",
         "//third_party/eigen3",
     ],
 )

 # Kernel library for the binary CPU ops; currently it only contains Add.
 tf_kernel_library(
     name = "cpu_cwise_binary_op",
     srcs = ["cpu_op_add.cc"],
     deps = [
         ":base_cpu_op",
         ":cpu_add_v2_kernels",
         "//third_party/eigen3",
     ],
     tags = ["manual"],
 )

 # Aggregate target: the unary GPU kernels under a CUDA/ROCm configuration,
 # plus :experimental_cwise_op when experimental kernels are enabled.
 tf_kernel_library(
     name = "cwise_op",
     srcs = [],
     # The GPU library would build without --config=cuda or --config=rocm, but
     # it is gated on them so that it is not built when it is not needed.
     deps = if_cuda_or_rocm(
         [":gpu_cwise_unary_op"],
     ) + if_mlir_experimental_kernels_enabled(
         [":experimental_cwise_op"],
     ),
     tags = ["no_rocm"],
 )

 # Experimental kernels: the CPU libraries always, the binary GPU library only
 # under a CUDA/ROCm configuration.
 tf_kernel_library(
     name = "experimental_cwise_op",
     srcs = [],
     deps = [
         ":cpu_cwise_unary_op",
         ":cpu_cwise_binary_op",
     ] + if_cuda_or_rocm(
         [":gpu_cwise_binary_op"],
     ),
 )

 # Test-only helpers shared by the unary and binary op test libraries.
 cc_library(
     name = "base_ops_test",
     testonly = 1,
     hdrs = ["base_ops_test.h"],
     srcs = ["base_ops_test.cc"],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:tensorflow",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
     ],
 )

 # Header-only, test-only library used by the unary op tests.
 cc_library(
     name = "base_unary_ops_test",
     hdrs = ["base_unary_ops_test.h"],
     testonly = 1,
     deps = [
         ":base_ops_test",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/framework:types_proto_cc",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:ops_testutil",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@llvm-project//llvm:Support",
     ],
 )

 # Unary op test for GPU. The test source is only included when MLIR-generated
 # GPU kernels are enabled.
 tf_cuda_cc_test(
     name = "gpu_unary_ops_test",
     size = "small",
     srcs = if_mlir_generated_gpu_kernels_enabled(
         ["gpu_unary_ops_test.cc"],
     ),
     deps = [
         ":base_ops_test",
         ":base_unary_ops_test",
         "//tensorflow/core/common_runtime:device",
         "//tensorflow/core/common_runtime:device_factory",
     ],
     tags = tf_cuda_tests_tags() + [
         "no_cuda_asan",  # TODO(b/171341759): re-enable.
     ],
 )

 # Unary op test for CPU.
 # NOTE(review): the source is gated on the GPU-kernel setting even though
 # this is a CPU test -- confirm that this is intended.
 tf_cc_test(
     name = "cpu_unary_ops_test",
     size = "small",
     srcs = if_mlir_generated_gpu_kernels_enabled(
         ["cpu_unary_ops_test.cc"],
     ),
     deps = [
         ":base_ops_test",
         ":base_unary_ops_test",
         "//tensorflow/core/common_runtime:device",
         "//tensorflow/core/common_runtime:device_factory",
     ],
 )

 # Header-only, test-only library used by the binary op tests.
 cc_library(
     name = "base_binary_ops_test",
     hdrs = ["base_binary_ops_test.h"],
     testonly = 1,
     deps = [
         ":base_ops_test",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/common_runtime:device",
         "//tensorflow/core/common_runtime:device_factory",
         "//tensorflow/core/framework:types_proto_cc",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:ops_testutil",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@llvm-project//llvm:Support",
     ],
 )

 # Binary op test for GPU. The test source is only included when
 # MLIR-generated GPU kernels are enabled.
 tf_cuda_cc_test(
     name = "gpu_binary_ops_test",
     size = "medium",
     srcs = if_mlir_generated_gpu_kernels_enabled(
         ["gpu_binary_ops_test.cc"],
     ),
     deps = [
         ":base_binary_ops_test",
         ":base_ops_test",
         "//tensorflow/core/common_runtime:device",
         "//tensorflow/core/common_runtime:device_factory",
     ],
     tags = tf_cuda_tests_tags() + [
         "no_cuda_asan",  # b/173033461
     ],
 )

 # Binary op test for CPU. Declared with tf_cc_test, like :cpu_unary_ops_test,
 # rather than with the CUDA test macro, since this is the CPU test.
 # NOTE(review): the source is gated on the GPU-kernel setting even though
 # this is a CPU test -- confirm that this is intended.
 tf_cc_test(
     name = "cpu_binary_ops_test",
     size = "medium",
     srcs = if_mlir_generated_gpu_kernels_enabled(["cpu_binary_ops_test.cc"]),
     deps = [
         ":base_binary_ops_test",
         ":base_ops_test",
         "//tensorflow/core/common_runtime:device",
         "//tensorflow/core/common_runtime:device_factory",
     ],
 )

 # TODO(b/160731748): Re-enable when it works again.
 # gpu_kernel_library(
 #     name = "gpu_bias_add_lib",
 #     op = "bias_add",
 #     tile_size = "16x16",
 #     types = [
 #         "f16",
 #         "f32",
 #         "f64",
 #     ],
 # )

 # TODO(b/160190568): Re-enable when it works again.
 # gpu_kernel_library(
 #     name = "gpu_relu_lib",
 #     op = "relu",
 #     tile_size = "256",
 #     types = [
 #         "f16",
 #         "f32",
 #         "f64",
 #     ],
 # )

 # Abs for f16/f32/f64 and i64.
 # TODO(b/25387198): Add an int32 kernel.
 gpu_kernel_library(
     name = "gpu_abs_lib",
     op = "abs",
     types = ["f16", "f32", "f64", "i64"],
     tile_size = "256",
     unroll_factors = "4",
 )

 # Inverse trigonometric and inverse hyperbolic kernels. All six share one
 # configuration: f32/f64, tile size 256, unroll factor 4.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_lib",
         op = op,
         types = ["f32", "f64"],
         tile_size = "256",
         unroll_factors = "4",
     )
     for op in [
         "acos",
         "acosh",
         "asin",
         "asinh",
         "atan",
         "atanh",
     ]
 ]

 # Angle takes complex inputs and produces real outputs; types and
 # output_types are paired by position (c64 -> f32, c128 -> f64).
 gpu_kernel_library(
     name = "gpu_angle_lib",
     op = "angle",
     types = ["c64", "c128"],
     output_types = ["f32", "f64"],
     tile_size = "256",
     unroll_factors = "2",
 )

 # Complex conjugate for c64/c128.
 gpu_kernel_library(
     name = "gpu_conj_lib",
     op = "conj",
     types = ["c64", "c128"],
     tile_size = "256",
     unroll_factors = "2",
 )

 # Hyperbolic cosine for f32/f64.
 gpu_kernel_library(
     name = "gpu_cosh_lib",
     op = "cosh",
     types = ["f32", "f64"],
     tile_size = "256",
     unroll_factors = "4",
 )

 # Error-function kernels (erf, erfc) share one configuration.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_lib",
         op = op,
         types = ["f16", "f32", "f64"],
         tile_size = "256",
         unroll_factors = "4",
     )
     for op in [
         "erf",
         "erfc",
     ]
 ]

 # Imag and Real take complex inputs and produce real outputs (c64 -> f32,
 # c128 -> f64). No unroll factor is set for them.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_lib",
         op = op,
         types = ["c64", "c128"],
         output_types = ["f32", "f64"],
         tile_size = "256",
     )
     for op in [
         "imag",
         "real",
     ]
 ]

 # Bitwise inversion for the signed integer types.
 gpu_kernel_library(
     name = "gpu_invert_lib",
     op = "invert",
     types = ["i8", "i16", "i32", "i64"],
     tile_size = "256",
     unroll_factors = "4",
 )

 # Logical negation on booleans (i1). No unroll factor is set.
 gpu_kernel_library(
     name = "gpu_logical_not_lib",
     op = "logical_not",
     types = ["i1"],
     tile_size = "256",
 )

 # Sign for floating-point and i32/i64 types.
 gpu_kernel_library(
     name = "gpu_sign_lib",
     op = "sign",
     # TODO(b/162577610): Add bf16, c64 and c128.
     types = ["f16", "f32", "f64", "i32", "i64"],
     tile_size = "256",
     unroll_factors = "4",
 )

 # Hyperbolic sine for f32/f64.
 gpu_kernel_library(
     name = "gpu_sinh_lib",
     op = "sinh",
     types = ["f32", "f64"],
     tile_size = "256",
     unroll_factors = "4",
 )

 # Arithmetic kernels generated for floating-point types and i64, tiled at
 # 1024 with an unroll factor of 4.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_lib",
         op = op,
         types = ["f16", "f32", "f64", "i64"],
         tile_size = "1024",
         unroll_factors = "4",
     )
     for op in [
         "square",
         "add_v2",
         "squared_difference",
         "sub",
     ]
 ]

 # Complex takes real inputs and produces complex outputs (f32 -> c64,
 # f64 -> c128).
 gpu_kernel_library(
     name = "gpu_complex_lib",
     op = "complex",
     types = ["f32", "f64"],
     output_types = ["c64", "c128"],
     tile_size = "1024",
     unroll_factors = "2",
 )

 # ComplexAbs takes complex inputs and produces real outputs (c64 -> f32,
 # c128 -> f64). No unroll factor is set.
 gpu_kernel_library(
     name = "gpu_complex_abs_lib",
     op = "complex_abs",
     types = ["c64", "c128"],
     output_types = ["f32", "f64"],
     tile_size = "256",
 )

 # Division. No unroll factor is set.
 gpu_kernel_library(
     name = "gpu_div_lib",
     op = "div",
     types = ["f16", "f32", "f64", "i16", "i64"],
     tile_size = "1024",
 )

 # Multiplication.
 gpu_kernel_library(
     name = "gpu_mul_lib",
     op = "mul",
     types = ["f16", "f32", "f64", "i8", "i16", "i64"],
     tile_size = "1024",
     unroll_factors = "4",
 )

 # Bitwise and shift operations; generated for signed integer types only.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_lib",
         op = op,
         unroll_factors = "4",
         tile_size = "1024",
         types = [
             "i8",
             "i16",
             "i32",
             "i64",
             # TODO(b/172804967): Enable once fixed.
             # "ui8",
             # "ui16",
             # "ui32",
             # "ui64",
         ],
     )
     for op in [
         "bitwise_and",
         "bitwise_or",
         "bitwise_xor",
         "left_shift",
         "right_shift",
     ]
 ]

 # Atan2 for f32/f64; note the three-component tile size.
 gpu_kernel_library(
     name = "gpu_atan2_lib",
     op = "atan2",
     types = ["f32", "f64"],
     tile_size = "256,1,1",
     unroll_factors = "4",
 )

 # Logical operations on booleans (i1).
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_lib",
         op = op,
         types = ["i1"],
         tile_size = "1024",
         unroll_factors = "4",
     )
     for op in [
         "logical_and",
         "logical_or",
     ]
 ]

 # Equality comparisons: eight input types, each with an i1 output.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_lib",
         op = op,
         types = [
             "f16",
             "f32",
             "f64",
             "i1",
             "i8",
             "i16",
             "i32",
             "i64",
         ],
         output_types = ["i1"] * 8,
         tile_size = "1024",
         unroll_factors = "4",
     )
     for op in [
         "equal",
         "not_equal",
     ]
 ]

 # Ordering comparisons: seven input types (no i1), each with an i1 output.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_lib",
         op = op,
         types = [
             "f16",
             "f32",
             "f64",
             "i8",
             "i16",
             "i32",
             "i64",
         ],
         output_types = ["i1"] * 7,
         tile_size = "1024",
         unroll_factors = "4",
     )
     for op in [
         "less",
         "less_equal",
         "greater",
         "greater_equal",
     ]
 ]

 # Element-wise maximum and minimum.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_lib",
         op = op,
         types = ["f16", "f32", "f64", "i16", "i32", "i64"],
         tile_size = "1024",
         unroll_factors = "4",
     )
     for op in [
         "maximum",
         "minimum",
     ]
 ]

 # Neg supports all floating-point and signed int types.
 gpu_kernel_library(
     name = "gpu_neg_lib",
     op = "neg",
     types = [
         "f16",
         "f32",
         "f64",
         "i8",
         "i16",
         "i32",
         "i64",
     ],
     tile_size = "256",
     unroll_factors = "4",
 )

 # Floor division; currently generated for floating-point types only (see the
 # TODO on `types`).
 gpu_kernel_library(
     name = "gpu_floor_div_lib",
     op = "floor_div",
     tile_size = "1024",
     # TODO(b/172804967): Enable for integer types also once unsigned integers
     # are supported.
     types = [
         "f16",
         "f32",
         "f64",
     ],
     unroll_factors = "4",
 )

 # Kernels that support all floating-point types.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_lib",
         op = op,
         types = ["f16", "f32", "f64"],
         tile_size = "256",
         unroll_factors = "4",
     )
     for op in [
         "ceil",
         "digamma",
         "exp",
         "expm1",
         "floor",
         "lgamma",
         "log",
         "log1p",
         "rsqrt",
         "sqrt",
         "tanh",
     ]
 ]

 # Kernels that support all floating-point types but have i1 output.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_lib",
         op = op,
         types = ["f16", "f32", "f64"],
         output_types = ["i1"] * 3,
         tile_size = "256",
         unroll_factors = "4",
     )
     for op in [
         "is_finite",
         "is_inf",
         "is_nan",
     ]
 ]

 # Kernels that support all floating-point types but cannot be vectorized,
 # hence no unroll factor.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_lib",
         op = op,
         types = ["f16", "f32", "f64"],
         tile_size = "256",
     )
     for op in [
         "cos",
         "sin",
         "tan",
     ]
 ]

 # Cast kernels for every (input, output) combination over the set
 # {i1, i8, i16, i32, i64, f16, f32, f64}: 64 pairs in total.
 gpu_kernel_library(
     name = "gpu_cast_lib",
     op = "cast",
     # Inputs: each of the 8 types repeated 8 times in a row, so that it is
     # matched with each of the 8 output types below.
     types = [
         input_type
         for input_type in [
             "i1",
             "i8",
             "i16",
             "i32",
             "i64",
             "f16",
             "f32",
             "f64",
         ]
         for _ in range(8)
     ],
     # Outputs: the whole list of 8 types, repeated once per input type.
     output_types = [
         "i1",
         "i8",
         "i16",
         "i32",
         "i64",
         "f16",
         "f32",
         "f64",
     ] * 8,
     tile_size = "256",
     unroll_factors = "4",
 )

 # Pow. No unroll factor is set.
 gpu_kernel_library(
     name = "gpu_pow_lib",
     op = "pow",
     types = ["f16", "f32", "f64", "i64"],
     tile_size = "1024",
 )

 # Zeta for f32/f64.
 gpu_kernel_library(
     name = "gpu_zeta_lib",
     op = "zeta",
     types = ["f32", "f64"],
     # The zeta kernels need many registers, so tile at 256.
     tile_size = "256",
     # TODO(b/178388085): Enable unrolling after vectorization is fixed.
     # unroll_factors = "4",
 )

 # CPU Abs for floating-point and signed integer types.
 cpu_kernel_library(
     name = "cpu_abs_lib",
     op = "abs",
     types = [
         "f16",
         "f32",
         "f64",
         "i8",
         "i16",
         "i32",
         "i64",
     ],
     tile_size = "256",
     unroll_factors = "4",
 )

 # CPU AddV2 for floating-point types and i32/i64.
 cpu_kernel_library(
     name = "cpu_add_v2_lib",
     op = "add_v2",
     types = ["f16", "f32", "f64", "i32", "i64"],
     tile_size = "1024",
     unroll_factors = "4",
 )
	# Generates GPU and CPU kernels using MLIR codegen.

	load(
	"//tensorflow/core/kernels/mlir_generated:build_defs.bzl",
	"cpu_kernel_library",
	"gpu_kernel_library",
	"if_mlir_experimental_kernels_enabled",
	"if_mlir_generated_gpu_kernels_enabled",
	)
	load(
	"//tensorflow:tensorflow.bzl",
	"if_cuda_or_rocm",
	"tf_cc_test",
	)
	load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud") # buildifier: disable=same-origin-load
	load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") # buildifier: disable=same-origin-load
	load("//tensorflow:tensorflow.bzl", "tf_kernel_library") # buildifier: disable=same-origin-load
	load(
	"//tensorflow/core/platform:build_config_root.bzl",
	"tf_cuda_tests_tags",
	)

	# Package defaults and the two --define based config settings.
	package(
	    default_visibility = [
	        "//tensorflow/core/kernels:__subpackages__",
	    ],
	    licenses = ["notice"],  # Apache 2.0
	)

	config_setting(
	    name = "mlir_generated_gpu_kernels_disabled",
	    define_values = {"tensorflow_enable_mlir_generated_gpu_kernels": "0"},
	)

	config_setting(
	    name = "mlir_experimental_kernels_enabled",
	    define_values = {"enable_unranked_kernels": "1"},
	)

	# Always-enabled unary GPU kernel sources.
	filegroup(
	    name = "enabled_unary_gpu_kernel_srcs",
	    srcs = [
	        "gpu_op_abs.cc",
	        "gpu_op_acos.cc",
	        "gpu_op_acosh.cc",
	        "gpu_op_angle.cc",
	        "gpu_op_asin.cc",
	        "gpu_op_asinh.cc",
	        "gpu_op_atan.cc",
	        "gpu_op_atanh.cc",
	        "gpu_op_ceil.cc",
	        "gpu_op_complex.cc",
	        "gpu_op_complex_abs.cc",
	        "gpu_op_conj.cc",
	        "gpu_op_cos.cc",
	        "gpu_op_cosh.cc",
	        "gpu_op_digamma.cc",
	        "gpu_op_erf.cc",
	        "gpu_op_erfc.cc",
	        "gpu_op_floor.cc",
	        "gpu_op_imag.cc",
	        "gpu_op_invert.cc",
	        "gpu_op_is_finite.cc",
	        "gpu_op_is_inf.cc",
	        "gpu_op_is_nan.cc",
	        "gpu_op_lgamma.cc",
	        "gpu_op_log.cc",
	        "gpu_op_log1p.cc",
	        "gpu_op_logical_not.cc",
	        "gpu_op_neg.cc",
	        "gpu_op_real.cc",
	        "gpu_op_rsqrt.cc",
	        "gpu_op_sin.cc",
	        "gpu_op_sinh.cc",
	        "gpu_op_sqrt.cc",
	        "gpu_op_square.cc",
	        "gpu_op_tan.cc",
	        "gpu_op_tanh.cc",
	    ],
	    compatible_with = get_compatible_with_cloud(),
	)

	# Unary GPU kernel sources added only when experimental kernels are enabled.
	filegroup(
	    name = "experimental_unary_gpu_kernel_srcs",
	    srcs = [
	        "gpu_op_exp.cc",
	        "gpu_op_expm1.cc",
	        "gpu_op_sign.cc",
	    ],
	    compatible_with = get_compatible_with_cloud(),
	)

	# Union of the two groups above, the second one conditionally.
	filegroup(
	    name = "unary_gpu_kernel_srcs",
	    srcs = [
	        ":enabled_unary_gpu_kernel_srcs",
	    ] + if_mlir_experimental_kernels_enabled(
	        if_true = [":experimental_unary_gpu_kernel_srcs"],
	    ),
	    compatible_with = get_compatible_with_cloud(),
	)

	# Shared base library and the header-only GPU/CPU layers on top of it.
	cc_library(
	    name = "base_op",
	    srcs = ["base_op.cc"],
	    hdrs = ["base_op.h"],
	    compatible_with = get_compatible_with_cloud(),
	    deps = [
	        "//tensorflow/core:framework",
	        "//tensorflow/core:lib",
	        "//tensorflow/core/framework:allocation_description_proto_cc",
	        "//tensorflow/core/framework:op_requires",
	        "@llvm-project//llvm:Support",
	        "@llvm-project//mlir:mlir_c_runner_utils",
	    ],
	)

	cc_library(
	    name = "base_gpu_op",
	    hdrs = ["base_gpu_op.h"],
	    compatible_with = get_compatible_with_cloud(),
	    deps = [":base_op"],
	)

	cc_library(
	    name = "base_cpu_op",
	    hdrs = ["base_cpu_op.h"],
	    compatible_with = get_compatible_with_cloud(),
	    deps = [":base_op"],
	)

	# Kernel library for the unary GPU ops.
	tf_kernel_library(
	    name = "gpu_cwise_unary_op",
	    srcs = [":unary_gpu_kernel_srcs"],
	    tags = ["manual"],
	    # Only the kernel libraries for the default-enabled kernels are strictly
	    # required, but restricting deps to them would make the BUILD target
	    # structure uglier. All of these targets have to build anyway, so
	    # linking them even when not yet needed should not hurt.
	    deps = [
	        ":base_gpu_op",
	        ":gpu_abs_kernels",
	        ":gpu_acos_kernels",
	        ":gpu_acosh_kernels",
	        ":gpu_angle_kernels",
	        ":gpu_asin_kernels",
	        ":gpu_asinh_kernels",
	        ":gpu_atan_kernels",
	        ":gpu_atanh_kernels",
	        ":gpu_ceil_kernels",
	        ":gpu_complex_abs_kernels",
	        ":gpu_complex_kernels",
	        ":gpu_conj_kernels",
	        ":gpu_cos_kernels",
	        ":gpu_cosh_kernels",
	        ":gpu_digamma_kernels",
	        ":gpu_erf_kernels",
	        ":gpu_erfc_kernels",
	        ":gpu_exp_kernels",
	        ":gpu_expm1_kernels",
	        ":gpu_floor_kernels",
	        ":gpu_imag_kernels",
	        ":gpu_invert_kernels",
	        ":gpu_is_finite_kernels",
	        ":gpu_is_inf_kernels",
	        ":gpu_is_nan_kernels",
	        ":gpu_lgamma_kernels",
	        ":gpu_log1p_kernels",
	        ":gpu_log_kernels",
	        ":gpu_logical_not_kernels",
	        ":gpu_neg_kernels",
	        ":gpu_real_kernels",
	        ":gpu_rsqrt_kernels",
	        ":gpu_sign_kernels",
	        ":gpu_sin_kernels",
	        ":gpu_sinh_kernels",
	        ":gpu_sqrt_kernels",
	        ":gpu_square_kernels",
	        ":gpu_tan_kernels",
	        ":gpu_tanh_kernels",
	        "//third_party/eigen3",
	    ],
	)

	# Kernel library for the unary CPU ops; currently it only contains Abs.
	tf_kernel_library(
	    name = "cpu_cwise_unary_op",
	    # Source files are listed by plain file name, as in the other kernel
	    # libraries; the former ":cpu_op_abs.cc" spelling named the same file
	    # but read like a reference to a rule target.
	    srcs = ["cpu_op_abs.cc"],
	    tags = ["manual"],
	    # The generated kernel library is linked unconditionally: the target
	    # has to build anyway, so linking it even when it is not needed yet
	    # should not hurt.
	    deps = [
	        ":base_cpu_op",
	        ":cpu_abs_kernels",
	        "//third_party/eigen3",
	    ],
	)

	# Kernel library for the binary GPU ops.
	tf_kernel_library(
	    name = "gpu_cwise_binary_op",
	    srcs = [
	        "gpu_op_add.cc",
	        "gpu_op_atan2.cc",
	        "gpu_op_bitwise_and.cc",
	        "gpu_op_bitwise_or.cc",
	        "gpu_op_bitwise_xor.cc",
	        "gpu_op_div.cc",
	        "gpu_op_equal.cc",
	        "gpu_op_floor_div.cc",
	        "gpu_op_greater.cc",
	        "gpu_op_greater_equal.cc",
	        "gpu_op_left_shift.cc",
	        "gpu_op_less.cc",
	        "gpu_op_less_equal.cc",
	        "gpu_op_logical_and.cc",
	        "gpu_op_logical_or.cc",
	        "gpu_op_maximum.cc",
	        "gpu_op_minimum.cc",
	        "gpu_op_mul.cc",
	        "gpu_op_not_equal.cc",
	        "gpu_op_pow.cc",
	        "gpu_op_right_shift.cc",
	        "gpu_op_squared_difference.cc",
	        "gpu_op_sub.cc",
	        "gpu_op_zeta.cc",
	    ],
	    tags = [
	        "manual",
	    ],
	    deps = [
	        ":base_gpu_op",
	        ":gpu_add_v2_kernels",
	        ":gpu_atan2_kernels",
	        ":gpu_bitwise_and_kernels",
	        ":gpu_bitwise_or_kernels",
	        ":gpu_bitwise_xor_kernels",
	        ":gpu_div_kernels",
	        ":gpu_equal_kernels",
	        ":gpu_floor_div_kernels",
	        ":gpu_greater_equal_kernels",
	        ":gpu_greater_kernels",
	        ":gpu_left_shift_kernels",
	        ":gpu_less_equal_kernels",
	        ":gpu_less_kernels",
	        ":gpu_logical_and_kernels",
	        ":gpu_logical_or_kernels",
	        ":gpu_maximum_kernels",
	        ":gpu_minimum_kernels",
	        ":gpu_mul_kernels",
	        ":gpu_not_equal_kernels",
	        ":gpu_pow_kernels",
	        ":gpu_right_shift_kernels",
	        ":gpu_squared_difference_kernels",
	        ":gpu_sub_kernels",
	        ":gpu_zeta_kernels",
	        "//third_party/eigen3",
	    ],
	)

	# Kernel library for the binary CPU ops; currently it only contains Add.
	tf_kernel_library(
	    name = "cpu_cwise_binary_op",
	    srcs = [
	        "cpu_op_add.cc",
	    ],
	    tags = ["manual"],
	    deps = [
	        ":base_cpu_op",
	        ":cpu_add_v2_kernels",
	        "//third_party/eigen3",
	    ],
	)

	# Aggregate target over the GPU and experimental kernel libraries.
	tf_kernel_library(
	    name = "cwise_op",
	    srcs = [],
	    tags = ["no_rocm"],
	    # The GPU library would build without --config=cuda or --config=rocm,
	    # but it is gated on them so that it is not built when not needed.
	    deps = if_cuda_or_rocm([
	        ":gpu_cwise_unary_op",
	    ]) + if_mlir_experimental_kernels_enabled([":experimental_cwise_op"]),
	)

	# Experimental kernels: CPU libraries plus, under CUDA/ROCm, binary GPU ops.
	tf_kernel_library(
	    name = "experimental_cwise_op",
	    srcs = [],
	    deps = [
	        ":cpu_cwise_unary_op",
	        ":cpu_cwise_binary_op",
	    ] + if_cuda_or_rocm([":gpu_cwise_binary_op"]),
	)

	# Test-only helpers shared by the op test libraries.
	cc_library(
	    name = "base_ops_test",
	    testonly = 1,
	    srcs = ["base_ops_test.cc"],
	    hdrs = ["base_ops_test.h"],
	    deps = [
	        "//tensorflow/core:framework",
	        "//tensorflow/core:tensorflow",
	        "@com_google_absl//absl/container:inlined_vector",
	        "@com_google_absl//absl/strings",
	        "@llvm-project//llvm:Support",
	    ],
	)

	# Header-only, test-only library used by the unary op tests.
	cc_library(
	    name = "base_unary_ops_test",
	    testonly = 1,
	    hdrs = ["base_unary_ops_test.h"],
	    deps = [
	        ":base_ops_test",
	        "//tensorflow/core:framework",
	        "//tensorflow/core:framework_internal",
	        "//tensorflow/core:tensorflow",
	        "//tensorflow/core:test",
	        "//tensorflow/core:test_main",
	        "//tensorflow/core:testlib",
	        "//tensorflow/core/framework:types_proto_cc",
	        "//tensorflow/core/kernels:cwise_op",
	        "//tensorflow/core/kernels:ops_testutil",
	        "@com_google_absl//absl/container:inlined_vector",
	        "@com_google_absl//absl/strings",
	        "@com_google_absl//absl/types:optional",
	        "@llvm-project//llvm:Support",
	    ],
	)

	# Unary op test for GPU.
	tf_cuda_cc_test(
	    name = "gpu_unary_ops_test",
	    size = "small",
	    srcs = if_mlir_generated_gpu_kernels_enabled(["gpu_unary_ops_test.cc"]),
	    tags = tf_cuda_tests_tags() + [
	        "no_cuda_asan",  # TODO(b/171341759): re-enable.
	    ],
	    deps = [
	        ":base_ops_test",
	        ":base_unary_ops_test",
	        "//tensorflow/core/common_runtime:device",
	        "//tensorflow/core/common_runtime:device_factory",
	    ],
	)

	# Unary op test for CPU.
	tf_cc_test(
	    name = "cpu_unary_ops_test",
	    size = "small",
	    srcs = if_mlir_generated_gpu_kernels_enabled(["cpu_unary_ops_test.cc"]),
	    deps = [
	        ":base_ops_test",
	        ":base_unary_ops_test",
	        "//tensorflow/core/common_runtime:device",
	        "//tensorflow/core/common_runtime:device_factory",
	    ],
	)

	# Test-only, header-only library (base_binary_ops_test.h) layered on top of
	# :base_ops_test; the GPU and CPU binary op tests depend on it.
	cc_library(
	    name = "base_binary_ops_test",
	    testonly = True,
	    hdrs = ["base_binary_ops_test.h"],
	    deps = [
	        ":base_ops_test",
	        "//tensorflow/core:framework",
	        "//tensorflow/core:framework_internal",
	        "//tensorflow/core:tensorflow",
	        "//tensorflow/core:test",
	        "//tensorflow/core:test_main",
	        "//tensorflow/core:testlib",
	        "//tensorflow/core/common_runtime:device",
	        "//tensorflow/core/common_runtime:device_factory",
	        "//tensorflow/core/framework:types_proto_cc",
	        "//tensorflow/core/kernels:cwise_op",
	        "//tensorflow/core/kernels:ops_testutil",
	        "@com_google_absl//absl/container:inlined_vector",
	        "@com_google_absl//absl/strings",
	        "@com_google_absl//absl/types:optional",
	        "@llvm-project//llvm:Support",
	    ],
	)

	# GPU test for the binary op kernels. The test source is wrapped in
	# if_mlir_generated_gpu_kernels_enabled(), so it is presumably dropped when
	# those kernels are disabled.
	tf_cuda_cc_test(
	    name = "gpu_binary_ops_test",
	    size = "medium",
	    srcs = if_mlir_generated_gpu_kernels_enabled([
	        "gpu_binary_ops_test.cc",
	    ]),
	    # no_cuda_asan: see b/173033461.
	    tags = tf_cuda_tests_tags() + ["no_cuda_asan"],
	    deps = [
	        ":base_binary_ops_test",
	        ":base_ops_test",
	        "//tensorflow/core/common_runtime:device",
	        "//tensorflow/core/common_runtime:device_factory",
	    ],
	)

	# CPU test for the binary op kernels.
	# Declared with tf_cc_test, matching cpu_unary_ops_test: this target carries
	# no tf_cuda_tests_tags() and exercises CPU kernels only, so the CUDA test
	# macro (which is meant for tests that also run on GPU) is not appropriate.
	# NOTE(review): srcs is gated on if_mlir_generated_gpu_kernels_enabled() even
	# though this is a CPU test -- confirm that this is the intended switch.
	tf_cc_test(
	    name = "cpu_binary_ops_test",
	    size = "medium",
	    srcs = if_mlir_generated_gpu_kernels_enabled([
	        "cpu_binary_ops_test.cc",
	    ]),
	    deps = [
	        ":base_binary_ops_test",
	        ":base_ops_test",
	        "//tensorflow/core/common_runtime:device",
	        "//tensorflow/core/common_runtime:device_factory",
	    ],
	)

	# TODO(b/160731748): Re-enable when it works again.
	# gpu_kernel_library(
	# name = "gpu_bias_add_lib",
	# op = "bias_add",
	# tile_size = "16x16",
	# types = [
	# "f16",
	# "f32",
	# "f64",
	# ],
	# )

	# TODO(b/160190568): Re-enable when it works again.
	# gpu_kernel_library(
	# name = "gpu_relu_lib",
	# op = "relu",
	# tile_size = "256",
	# types = [
	# "f16",
	# "f32",
	# "f64",
	# ],
	# )

	# MLIR-generated GPU kernels for the "abs" op (f16, f32, f64, i64).
	# TODO(b/25387198): Add an int32 kernel.
	gpu_kernel_library(
	    name = "gpu_abs_lib",
	    op = "abs",
	    tile_size = "256",
	    types = ["f16", "f32", "f64", "i64"],
	    unroll_factors = "4",
	)

	# MLIR-generated GPU kernels for "acos" and "acosh" (f32, f64). Both ops
	# share one configuration, so they are declared from a single template;
	# the resulting targets are gpu_acos_lib and gpu_acosh_lib.
	[
	    gpu_kernel_library(
	        name = "gpu_%s_lib" % op_name,
	        op = op_name,
	        tile_size = "256",
	        types = ["f32", "f64"],
	        unroll_factors = "4",
	    )
	    for op_name in [
	        "acos",
	        "acosh",
	    ]
	]

	# MLIR-generated GPU kernels for the "angle" op: complex inputs (c64, c128)
	# with real outputs (f32, f64).
	gpu_kernel_library(
	    name = "gpu_angle_lib",
	    op = "angle",
	    output_types = ["f32", "f64"],
	    tile_size = "256",
	    types = ["c64", "c128"],
	    unroll_factors = "2",
	)

	# MLIR-generated GPU kernels for "asin", "asinh", "atan" and "atanh"
	# (f32, f64). All four ops share one configuration, so they are declared
	# from a single template; the resulting targets are gpu_asin_lib,
	# gpu_asinh_lib, gpu_atan_lib and gpu_atanh_lib.
	[
	    gpu_kernel_library(
	        name = "gpu_%s_lib" % op_name,
	        op = op_name,
	        tile_size = "256",
	        types = ["f32", "f64"],
	        unroll_factors = "4",
	    )
	    for op_name in [
	        "asin",
	        "asinh",
	        "atan",
	        "atanh",
	    ]
	]

	# MLIR-generated GPU kernels for the "conj" op on complex types (c64, c128).
	gpu_kernel_library(
	    name = "gpu_conj_lib",
	    op = "conj",
	    tile_size = "256",
	    types = ["c64", "c128"],
	    unroll_factors = "2",
	)

	# MLIR-generated GPU kernels for the "cosh" op (f32, f64).
	gpu_kernel_library(
	    name = "gpu_cosh_lib",
	    op = "cosh",
	    tile_size = "256",
	    types = ["f32", "f64"],
	    unroll_factors = "4",
	)

	# MLIR-generated GPU kernels for "erf" and "erfc" (f16, f32, f64). Both ops
	# share one configuration, so they are declared from a single template;
	# the resulting targets are gpu_erf_lib and gpu_erfc_lib.
	[
	    gpu_kernel_library(
	        name = "gpu_%s_lib" % op_name,
	        op = op_name,
	        tile_size = "256",
	        types = ["f16", "f32", "f64"],
	        unroll_factors = "4",
	    )
	    for op_name in [
	        "erf",
	        "erfc",
	    ]
	]

	# MLIR-generated GPU kernels for the "imag" op: complex inputs (c64, c128)
	# with real outputs (f32, f64). No unroll_factors is set for this library.
	gpu_kernel_library(
	    name = "gpu_imag_lib",
	    op = "imag",
	    output_types = ["f32", "f64"],
	    tile_size = "256",
	    types = ["c64", "c128"],
	)

	# MLIR-generated GPU kernels for the "invert" op on signed integer types
	# (i8, i16, i32, i64).
	gpu_kernel_library(
	    name = "gpu_invert_lib",
	    op = "invert",
	    tile_size = "256",
	    types = ["i8", "i16", "i32", "i64"],
	    unroll_factors = "4",
	)

	# MLIR-generated GPU kernels for the "logical_not" op on i1 only.
	# No unroll_factors is set for this library.
	gpu_kernel_library(
	    name = "gpu_logical_not_lib",
	    op = "logical_not",
	    tile_size = "256",
	    types = [
	        "i1",
	    ],
	)

	# MLIR-generated GPU kernels for the "real" op: complex inputs (c64, c128)
	# with real outputs (f32, f64). No unroll_factors is set for this library.
	gpu_kernel_library(
	    name = "gpu_real_lib",
	    op = "real",
	    output_types = ["f32", "f64"],
	    tile_size = "256",
	    types = ["c64", "c128"],
	)

	# MLIR-generated GPU kernels for the "sign" op (f16, f32, f64, i32, i64).
	gpu_kernel_library(
	    name = "gpu_sign_lib",
	    op = "sign",
	    tile_size = "256",
	    # TODO(b/162577610): Add bf16, c64 and c128.
	    types = ["f16", "f32", "f64", "i32", "i64"],
	    unroll_factors = "4",
	)

	# MLIR-generated GPU kernels for the "sinh" op (f32, f64).
	gpu_kernel_library(
	    name = "gpu_sinh_lib",
	    op = "sinh",
	    tile_size = "256",
	    types = ["f32", "f64"],
	    unroll_factors = "4",
	)

	# MLIR-generated GPU kernels for "square", "add_v2", "squared_difference"
	# and "sub" (f16, f32, f64, i64), one gpu_<op>_lib target per op.
	[
	    gpu_kernel_library(
	        name = "gpu_%s_lib" % op_name,
	        op = op_name,
	        tile_size = "1024",
	        types = ["f16", "f32", "f64", "i64"],
	        unroll_factors = "4",
	    )
	    for op_name in [
	        "square",
	        "add_v2",
	        "squared_difference",
	        "sub",
	    ]
	]

	# MLIR-generated GPU kernels for the "complex" op: real inputs (f32, f64)
	# with complex outputs (c64, c128).
	gpu_kernel_library(
	    name = "gpu_complex_lib",
	    op = "complex",
	    output_types = ["c64", "c128"],
	    tile_size = "1024",
	    types = ["f32", "f64"],
	    unroll_factors = "2",
	)

	# MLIR-generated GPU kernels for the "complex_abs" op: complex inputs
	# (c64, c128) with real outputs (f32, f64). No unroll_factors is set for
	# this library.
	gpu_kernel_library(
	    name = "gpu_complex_abs_lib",
	    op = "complex_abs",
	    output_types = ["f32", "f64"],
	    tile_size = "256",
	    types = ["c64", "c128"],
	)

	# MLIR-generated GPU kernels for the "div" op (f16, f32, f64, i16, i64).
	# No unroll_factors is set for this library.
	gpu_kernel_library(
	    name = "gpu_div_lib",
	    op = "div",
	    tile_size = "1024",
	    types = ["f16", "f32", "f64", "i16", "i64"],
	)

	# MLIR-generated GPU kernels for the "mul" op
	# (f16, f32, f64, i8, i16, i64).
	gpu_kernel_library(
	    name = "gpu_mul_lib",
	    op = "mul",
	    tile_size = "1024",
	    types = ["f16", "f32", "f64", "i8", "i16", "i64"],
	    unroll_factors = "4",
	)

	# Bitwise and shift kernels for the signed integer types, one gpu_<op>_lib
	# target per op. The unsigned types are listed but commented out.
	[
	    gpu_kernel_library(
	        name = "gpu_%s_lib" % op_name,
	        op = op_name,
	        tile_size = "1024",
	        types = [
	            "i8",
	            "i16",
	            "i32",
	            "i64",
	            # TODO(b/172804967): Enable once fixed.
	            # "ui8",
	            # "ui16",
	            # "ui32",
	            # "ui64",
	        ],
	        unroll_factors = "4",
	    )
	    for op_name in [
	        "bitwise_and",
	        "bitwise_or",
	        "bitwise_xor",
	        "left_shift",
	        "right_shift",
	    ]
	]

	# MLIR-generated GPU kernels for the "atan2" op (f32, f64). Unlike its
	# neighbours, tile_size here is given as the three-component string
	# "256,1,1".
	gpu_kernel_library(
	    name = "gpu_atan2_lib",
	    op = "atan2",
	    tile_size = "256,1,1",
	    types = ["f32", "f64"],
	    unroll_factors = "4",
	)

	# Binary logical kernels ("logical_and", "logical_or") on i1, one
	# gpu_<op>_lib target per op.
	[
	    gpu_kernel_library(
	        name = "gpu_%s_lib" % op_name,
	        op = op_name,
	        tile_size = "1024",
	        types = ["i1"],
	        unroll_factors = "4",
	    )
	    for op_name in [
	        "logical_and",
	        "logical_or",
	    ]
	]

	# Equality comparison kernels ("equal", "not_equal"). Eight input types are
	# listed and output_types holds eight "i1" entries, one per input type.
	[
	    gpu_kernel_library(
	        name = "gpu_%s_lib" % op_name,
	        op = op_name,
	        output_types = ["i1"] * 8,
	        tile_size = "1024",
	        types = ["f16", "f32", "f64", "i1", "i8", "i16", "i32", "i64"],
	        unroll_factors = "4",
	    )
	    for op_name in [
	        "equal",
	        "not_equal",
	    ]
	]

	# Ordering comparison kernels ("less", "less_equal", "greater",
	# "greater_equal"). Seven input types are listed and output_types holds
	# seven "i1" entries, one per input type.
	[
	    gpu_kernel_library(
	        name = "gpu_%s_lib" % op_name,
	        op = op_name,
	        output_types = ["i1"] * 7,
	        tile_size = "1024",
	        types = ["f16", "f32", "f64", "i8", "i16", "i32", "i64"],
	        unroll_factors = "4",
	    )
	    for op_name in [
	        "less",
	        "less_equal",
	        "greater",
	        "greater_equal",
	    ]
	]

	# MLIR-generated GPU kernels for "maximum" and "minimum"
	# (f16, f32, f64, i16, i32, i64), one gpu_<op>_lib target per op.
	[
	    gpu_kernel_library(
	        name = "gpu_%s_lib" % op_name,
	        op = op_name,
	        tile_size = "1024",
	        types = ["f16", "f32", "f64", "i16", "i32", "i64"],
	        unroll_factors = "4",
	    )
	    for op_name in [
	        "maximum",
	        "minimum",
	    ]
	]

	# Kernels covering every floating-point and signed integer type listed
	# below. Only "neg" is in the op list at present.
	[
	    gpu_kernel_library(
	        name = "gpu_%s_lib" % op_name,
	        op = op_name,
	        tile_size = "256",
	        types = ["f16", "f32", "f64", "i8", "i16", "i32", "i64"],
	        unroll_factors = "4",
	    )
	    for op_name in [
	        "neg",
	    ]
	]

	# MLIR-generated GPU kernels for the "floor_div" op, floating-point types
	# only for now.
	gpu_kernel_library(
	    name = "gpu_floor_div_lib",
	    op = "floor_div",
	    tile_size = "1024",
	    # TODO(b/172804967): Also enable for integer types once unsigned integers
	    # are supported.
	    types = ["f16", "f32", "f64"],
	    unroll_factors = "4",
	)

	# Kernels covering the floating-point types f16, f32 and f64, one
	# gpu_<op>_lib target per op in the list.
	[
	    gpu_kernel_library(
	        name = "gpu_%s_lib" % op_name,
	        op = op_name,
	        tile_size = "256",
	        types = ["f16", "f32", "f64"],
	        unroll_factors = "4",
	    )
	    for op_name in [
	        "ceil",
	        "digamma",
	        "exp",
	        "expm1",
	        "floor",
	        "lgamma",
	        "log",
	        "log1p",
	        "rsqrt",
	        "sqrt",
	        "tanh",
	    ]
	]

	# Floating-point predicates ("is_finite", "is_inf", "is_nan"): inputs are
	# f16, f32 and f64, and output_types holds three "i1" entries, one per
	# input type.
	[
	    gpu_kernel_library(
	        name = "gpu_%s_lib" % op_name,
	        op = op_name,
	        output_types = ["i1"] * 3,
	        tile_size = "256",
	        types = ["f16", "f32", "f64"],
	        unroll_factors = "4",
	    )
	    for op_name in [
	        "is_finite",
	        "is_inf",
	        "is_nan",
	    ]
	]

	# Floating-point kernels (f16, f32, f64) that cannot be vectorized; note
	# that unroll_factors is not set for these libraries.
	[
	    gpu_kernel_library(
	        name = "gpu_%s_lib" % op_name,
	        op = op_name,
	        tile_size = "256",
	        types = ["f16", "f32", "f64"],
	    )
	    for op_name in [
	        "cos",
	        "sin",
	        "tan",
	    ]
	]

	# Cast kernels for every (input, output) pair drawn from the set
	# {i1, i8, i16, i32, i64, f16, f32, f64}.
	gpu_kernel_library(
	    name = "gpu_cast_lib",
	    op = "cast",
	    # The 8 output types, cycled 8 times (64 entries). Entry k here is
	    # matched with entry k of `types`, so together the two lists enumerate
	    # all 64 combinations.
	    output_types = ["i1", "i8", "i16", "i32", "i64", "f16", "f32", "f64"] * 8,
	    tile_size = "256",
	    # Each input type repeated 8 times in a row (64 entries), in the same
	    # type order as above.
	    types = [
	        in_type
	        for in_type in ["i1", "i8", "i16", "i32", "i64", "f16", "f32", "f64"]
	        for _ in range(8)
	    ],
	    unroll_factors = "4",
	)

	# MLIR-generated GPU kernels for the "pow" op (f16, f32, f64, i64).
	# No unroll_factors is set for this library.
	gpu_kernel_library(
	    name = "gpu_pow_lib",
	    op = "pow",
	    tile_size = "1024",
	    types = ["f16", "f32", "f64", "i64"],
	)

	# MLIR-generated GPU kernels for the "zeta" op (f32, f64).
	gpu_kernel_library(
	    # The zeta kernels need many registers, so tile at 256.
	    name = "gpu_zeta_lib",
	    op = "zeta",
	    tile_size = "256",
	    types = ["f32", "f64"],
	    # TODO(b/178388085): Enable unrolling after vectorization is fixed.
	    # unroll_factors = "4",
	)

	# MLIR-generated CPU kernels for the "abs" op
	# (f16, f32, f64, i8, i16, i32, i64).
	cpu_kernel_library(
	    name = "cpu_abs_lib",
	    op = "abs",
	    tile_size = "256",
	    types = ["f16", "f32", "f64", "i8", "i16", "i32", "i64"],
	    unroll_factors = "4",
	)

	# MLIR-generated CPU kernels for the "add_v2" op
	# (f16, f32, f64, i32, i64).
	cpu_kernel_library(
	    name = "cpu_add_v2_lib",
	    op = "add_v2",
	    tile_size = "1024",
	    types = ["f16", "f32", "f64", "i32", "i64"],
	    unroll_factors = "4",
	)