# tensorflow/core/kernels/mlir_generated/BUILD - platform/external/tensorflow - Git at Google

 # Generates GPU (CUDA/ROCm) and CPU kernels using MLIR codegen.

 load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
 load(
     "//tensorflow/core/kernels/mlir_generated:build_defs.bzl",
     "cpu_kernel_library",
     "gpu_kernel_library",
     "if_mlir_generated_cpu_kernels_enabled",
     "if_mlir_generated_experimental_kernels_enabled",
     "if_mlir_generated_gpu_kernels_enabled",
 )
 load(
     "//tensorflow:tensorflow.bzl",
     "if_cuda_or_rocm",
     "tf_cc_test",
 )
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")  # buildifier: disable=same-origin-load
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")  # buildifier: disable=same-origin-load
 load(
     "//tensorflow/core/platform:build_config_root.bzl",
     "tf_cuda_tests_tags",
 )

 # Package-wide defaults: unless a target sets its own `visibility`, it is only
 # visible to //tensorflow/core/kernels and the packages beneath it.
 package(
     default_visibility = [
         "//tensorflow/core/kernels:__subpackages__",
     ],
     licenses = ["notice"],
 )

 # Additional packages that may depend on targets listing `:friends` in their
 # `visibility` (see :cwise_op).
 package_group(
     name = "friends",
     packages = [
         "//third_party/car/...",
     ],
 )

 # Build setting, True by default, together with the config_setting that
 # matches while it is True.
 # NOTE(review): presumably what if_mlir_generated_gpu_kernels_enabled() keys
 # on; confirm in build_defs.bzl.
 bool_flag(
     name = "enable_gpu",
     build_setting_default = True,
 )

 config_setting(
     name = "is_gpu_enabled",
     flag_values = {":enable_gpu": "True"},
 )

 # Build setting, True by default, together with the config_setting that
 # matches while it is True.
 # NOTE(review): presumably what if_mlir_generated_cpu_kernels_enabled() keys
 # on; confirm in build_defs.bzl.
 bool_flag(
     name = "enable_cpu",
     build_setting_default = True,
 )

 config_setting(
     name = "is_cpu_enabled",
     flag_values = {":enable_cpu": "True"},
 )

 # This flag may only be enabled when both enable_gpu and enable_cpu are true.
 # It is off by default; `:is_experimental_enabled` matches once it is set to
 # True.
 bool_flag(
     name = "enable_experimental",
     build_setting_default = False,
 )

 config_setting(
     name = "is_experimental_enabled",
     flag_values = {":enable_experimental": "True"},
 )

 # Common base library for the generated-kernel op wrappers; both :base_gpu_op
 # and :base_cpu_op depend on it.
 cc_library(
     name = "base_op",
     srcs = ["base_op.cc"],
     hdrs = ["base_op.h"],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/framework:allocation_description_proto_cc",
         "//tensorflow/core/framework:op_requires",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:mlir_c_runner_utils",
     ],
 )

 # Header-only GPU layer on top of :base_op; every `gpu_*_op` kernel library in
 # this file lists it as a dependency.
 cc_library(
     name = "base_gpu_op",
     hdrs = ["base_gpu_op.h"],
     deps = [
         ":base_op",
         "//tensorflow/core:framework",
     ],
 )

 # Header-only CPU layer on top of :base_op, used by the `cpu_cwise_*_op`
 # kernel libraries.
 cc_library(
     name = "base_cpu_op",
     hdrs = ["base_cpu_op.h"],
     deps = [":base_op"],
 )

 # GPU unary element-wise ops: the `gpu_op_*.cc` sources plus the matching
 # `gpu_*_kernels` libraries. Both lists are wrapped in
 # if_mlir_generated_gpu_kernels_enabled(). The target is tagged "manual", so
 # wildcard patterns skip it and it is built only when requested or depended on
 # (see :cwise_op).
 tf_kernel_library(
     name = "gpu_cwise_unary_op",
     srcs = if_mlir_generated_gpu_kernels_enabled([
         "gpu_op_abs.cc",
         "gpu_op_acos.cc",
         "gpu_op_acosh.cc",
         "gpu_op_angle.cc",
         "gpu_op_asin.cc",
         "gpu_op_asinh.cc",
         "gpu_op_atan.cc",
         "gpu_op_atanh.cc",
         "gpu_op_ceil.cc",
         "gpu_op_complex_abs.cc",
         "gpu_op_conj.cc",
         "gpu_op_cos.cc",
         "gpu_op_cosh.cc",
         "gpu_op_digamma.cc",
         "gpu_op_erf.cc",
         "gpu_op_erfc.cc",
         "gpu_op_exp.cc",
         "gpu_op_expm1.cc",
         "gpu_op_floor.cc",
         "gpu_op_imag.cc",
         "gpu_op_invert.cc",
         "gpu_op_is_finite.cc",
         "gpu_op_is_inf.cc",
         "gpu_op_is_nan.cc",
         "gpu_op_lgamma.cc",
         "gpu_op_log.cc",
         "gpu_op_log1p.cc",
         "gpu_op_logical_not.cc",
         "gpu_op_neg.cc",
         "gpu_op_real.cc",
         "gpu_op_reciprocal.cc",
         "gpu_op_rint.cc",
         "gpu_op_round.cc",
         "gpu_op_rsqrt.cc",
         "gpu_op_sigmoid.cc",
         "gpu_op_sign.cc",
         "gpu_op_sin.cc",
         "gpu_op_sinh.cc",
         "gpu_op_sqrt.cc",
         "gpu_op_square.cc",
         "gpu_op_tan.cc",
         "gpu_op_tanh.cc",
     ]),
     # The define is only passed when experimental kernels are enabled.
     copts = if_mlir_generated_experimental_kernels_enabled([
         "-DMLIR_GENERATED_EXPERIMENTAL_KERNELS_ENABLED",
     ]),
     tags = ["manual"],
     deps = if_mlir_generated_gpu_kernels_enabled([
         ":base_gpu_op",
         ":gpu_abs_kernels",
         ":gpu_acos_kernels",
         ":gpu_acosh_kernels",
         ":gpu_angle_kernels",
         ":gpu_asin_kernels",
         ":gpu_asinh_kernels",
         ":gpu_atan_kernels",
         ":gpu_atanh_kernels",
         ":gpu_ceil_kernels",
         ":gpu_complex_abs_kernels",
         ":gpu_conj_kernels",
         ":gpu_cos_kernels",
         ":gpu_cosh_kernels",
         ":gpu_digamma_kernels",
         ":gpu_erf_kernels",
         ":gpu_erfc_kernels",
         ":gpu_exp_kernels",
         ":gpu_expm1_kernels",
         ":gpu_floor_kernels",
         ":gpu_imag_kernels",
         ":gpu_invert_kernels",
         ":gpu_is_finite_kernels",
         ":gpu_is_inf_kernels",
         ":gpu_is_nan_kernels",
         ":gpu_lgamma_kernels",
         ":gpu_log1p_kernels",
         ":gpu_log_kernels",
         ":gpu_logical_not_kernels",
         ":gpu_neg_kernels",
         ":gpu_real_kernels",
         ":gpu_reciprocal_kernels",
         ":gpu_rint_kernels",
         ":gpu_round_kernels",
         ":gpu_rsqrt_kernels",
         ":gpu_sigmoid_kernels",
         ":gpu_sign_kernels",
         ":gpu_sin_kernels",
         ":gpu_sinh_kernels",
         ":gpu_sqrt_kernels",
         ":gpu_square_kernels",
         ":gpu_tan_kernels",
         ":gpu_tanh_kernels",
         "//third_party/eigen3",
     ]),
 )

 # CPU unary element-wise ops. The if_mlir_generated_cpu_kernels_enabled([])
 # lists are empty, so every source and dependency currently comes from the
 # experimental list. Tagged "manual"; reached through :cwise_op.
 tf_kernel_library(
     name = "cpu_cwise_unary_op",
     srcs = if_mlir_generated_cpu_kernels_enabled([
     ]) + if_mlir_generated_experimental_kernels_enabled([
         "cpu_op_abs.cc",
         "cpu_op_angle.cc",
         "cpu_op_ceil.cc",
         "cpu_op_cos.cc",
         "cpu_op_floor.cc",
         "cpu_op_invert.cc",
         "cpu_op_rsqrt.cc",
         "cpu_op_sin.cc",
         "cpu_op_sqrt.cc",
         "cpu_op_square.cc",
         "cpu_op_tan.cc",
         "cpu_op_tanh.cc",
     ]),
     tags = ["manual"],
     deps = if_mlir_generated_cpu_kernels_enabled([
     ]) + if_mlir_generated_experimental_kernels_enabled([
         ":base_cpu_op",
         ":cpu_abs_kernels",
         ":cpu_angle_kernels",
         ":cpu_ceil_kernels",
         ":cpu_cos_kernels",
         ":cpu_floor_kernels",
         ":cpu_invert_kernels",
         ":cpu_rsqrt_kernels",
         ":cpu_sin_kernels",
         ":cpu_sqrt_kernels",
         ":cpu_square_kernels",
         ":cpu_tan_kernels",
         ":cpu_tanh_kernels",
         "//third_party/eigen3",
     ]),
 )

 # GPU binary element-wise ops: the `gpu_op_*.cc` sources plus the matching
 # `gpu_*_kernels` libraries, both wrapped in
 # if_mlir_generated_gpu_kernels_enabled(). Note that the "add" and "select"
 # sources pair with the `add_v2` and `select_v2` kernel libraries. Tagged
 # "manual"; reached through :cwise_op.
 tf_kernel_library(
     name = "gpu_cwise_binary_op",
     srcs = if_mlir_generated_gpu_kernels_enabled([
         "gpu_op_add.cc",
         "gpu_op_atan2.cc",
         "gpu_op_bitwise_and.cc",
         "gpu_op_bitwise_or.cc",
         "gpu_op_bitwise_xor.cc",
         "gpu_op_complex.cc",
         "gpu_op_div.cc",
         "gpu_op_div_no_nan.cc",
         "gpu_op_equal.cc",
         "gpu_op_floor_div.cc",
         "gpu_op_greater.cc",
         "gpu_op_greater_equal.cc",
         "gpu_op_left_shift.cc",
         "gpu_op_less.cc",
         "gpu_op_less_equal.cc",
         "gpu_op_logical_and.cc",
         "gpu_op_logical_or.cc",
         "gpu_op_maximum.cc",
         "gpu_op_minimum.cc",
         "gpu_op_mul.cc",
         "gpu_op_mul_no_nan.cc",
         "gpu_op_not_equal.cc",
         "gpu_op_polygamma.cc",
         "gpu_op_pow.cc",
         "gpu_op_right_shift.cc",
         "gpu_op_select.cc",
         "gpu_op_squared_difference.cc",
         "gpu_op_sub.cc",
         "gpu_op_xdivy.cc",
         "gpu_op_xlog1py.cc",
         "gpu_op_xlogy.cc",
         "gpu_op_zeta.cc",
     ]),
     # The define is only passed when experimental kernels are enabled.
     copts = if_mlir_generated_experimental_kernels_enabled([
         "-DMLIR_GENERATED_EXPERIMENTAL_KERNELS_ENABLED",
     ]),
     tags = ["manual"],
     deps = if_mlir_generated_gpu_kernels_enabled([
         ":base_gpu_op",
         ":gpu_add_v2_kernels",
         ":gpu_atan2_kernels",
         ":gpu_bitwise_and_kernels",
         ":gpu_bitwise_or_kernels",
         ":gpu_bitwise_xor_kernels",
         ":gpu_complex_kernels",
         ":gpu_div_kernels",
         ":gpu_div_no_nan_kernels",
         ":gpu_equal_kernels",
         ":gpu_floor_div_kernels",
         ":gpu_greater_equal_kernels",
         ":gpu_greater_kernels",
         ":gpu_left_shift_kernels",
         ":gpu_less_equal_kernels",
         ":gpu_less_kernels",
         ":gpu_logical_and_kernels",
         ":gpu_logical_or_kernels",
         ":gpu_maximum_kernels",
         ":gpu_minimum_kernels",
         ":gpu_mul_kernels",
         ":gpu_mul_no_nan_kernels",
         ":gpu_not_equal_kernels",
         ":gpu_polygamma_kernels",
         ":gpu_pow_kernels",
         ":gpu_right_shift_kernels",
         ":gpu_select_v2_kernels",
         ":gpu_squared_difference_kernels",
         ":gpu_sub_kernels",
         ":gpu_xdivy_kernels",
         ":gpu_xlog1py_kernels",
         ":gpu_xlogy_kernels",
         ":gpu_zeta_kernels",
         "//third_party/eigen3",
     ]),
 )

 # CPU binary element-wise ops. As with the unary CPU library, the
 # if_mlir_generated_cpu_kernels_enabled([]) lists are empty and everything is
 # contributed by the experimental list. Tagged "manual"; reached through
 # :cwise_op.
 tf_kernel_library(
     name = "cpu_cwise_binary_op",
     srcs = if_mlir_generated_cpu_kernels_enabled([
     ]) + if_mlir_generated_experimental_kernels_enabled([
         "cpu_op_add.cc",
         "cpu_op_bitwise_and.cc",
         "cpu_op_bitwise_or.cc",
         "cpu_op_bitwise_xor.cc",
         "cpu_op_left_shift.cc",
         "cpu_op_right_shift.cc",
     ]),
     tags = ["manual"],
     deps = if_mlir_generated_cpu_kernels_enabled([
     ]) + if_mlir_generated_experimental_kernels_enabled([
         ":base_cpu_op",
         ":cpu_add_v2_kernels",
         ":cpu_bitwise_and_kernels",
         ":cpu_bitwise_or_kernels",
         ":cpu_bitwise_xor_kernels",
         ":cpu_left_shift_kernels",
         ":cpu_right_shift_kernels",
         "//third_party/eigen3",
     ]),
 )

 # Aggregates the element-wise op libraries. The CPU libraries are always
 # listed; the GPU ones are added through if_cuda_or_rocm(). Unlike the other
 # targets here, it is also visible to `:friends`.
 tf_kernel_library(
     name = "cwise_op",
     srcs = [],
     visibility = [
         ":friends",
         "//tensorflow/core/kernels:__subpackages__",
     ],
     deps = [
         ":cpu_cwise_binary_op",
         ":cpu_cwise_unary_op",
     ] + if_cuda_or_rocm([
         ":gpu_cwise_unary_op",
         ":gpu_cwise_binary_op",
     ]),
 )

 # GPU cast op: `gpu_op_cast.cc` plus :gpu_cast_kernels, both wrapped in
 # if_mlir_generated_gpu_kernels_enabled(). Tagged "manual"; reached through
 # :cast_op.
 tf_kernel_library(
     name = "gpu_cast_op",
     srcs = if_mlir_generated_gpu_kernels_enabled([
         "gpu_op_cast.cc",
     ]),
     tags = ["manual"],
     deps = if_mlir_generated_gpu_kernels_enabled([
         ":base_gpu_op",
         ":gpu_cast_kernels",
         "//third_party/eigen3",
     ]),
 )

 # Wrapper target for the generated cast kernels. It has no sources of its own
 # and forwards to :gpu_cast_op through if_cuda_or_rocm().
 tf_kernel_library(
     name = "cast_op",
     srcs = [],
     deps = if_cuda_or_rocm([":gpu_cast_op"]),
 )

 # GPU ones_like / zeros_like ops. Unlike most GPU op libraries in this file,
 # both srcs and deps are wrapped in
 # if_mlir_generated_experimental_kernels_enabled(). Tagged "manual"; reached
 # through :constant_op.
 tf_kernel_library(
     name = "gpu_constant_op",
     srcs = if_mlir_generated_experimental_kernels_enabled([
         "gpu_op_ones_like.cc",
         "gpu_op_zeros_like.cc",
     ]),
     tags = ["manual"],
     deps = if_mlir_generated_experimental_kernels_enabled([
         ":base_gpu_op",
         ":gpu_ones_like_kernels",
         ":gpu_zeros_like_kernels",
         "//third_party/eigen3",
     ]),
 )

 # Wrapper target for the generated ones_like / zeros_like kernels. It has no
 # sources of its own and forwards to :gpu_constant_op through
 # if_cuda_or_rocm().
 tf_kernel_library(
     name = "constant_op",
     srcs = [],
     deps = if_cuda_or_rocm([":gpu_constant_op"]),
 )

 # GPU next_after op: `gpu_op_next_after.cc` plus :gpu_next_after_kernels, both
 # wrapped in if_mlir_generated_gpu_kernels_enabled(). Tagged "manual"; reached
 # through :nextafter_op.
 tf_kernel_library(
     name = "gpu_nextafter_op",
     srcs = if_mlir_generated_gpu_kernels_enabled([
         "gpu_op_next_after.cc",
     ]),
     tags = ["manual"],
     deps = if_mlir_generated_gpu_kernels_enabled([
         ":base_gpu_op",
         ":gpu_next_after_kernels",
     ]),
 )

 # Wrapper target for the generated next_after kernels. It has no sources of
 # its own and forwards to :gpu_nextafter_op through if_cuda_or_rocm().
 tf_kernel_library(
     name = "nextafter_op",
     srcs = [],
     deps = if_cuda_or_rocm([
         ":gpu_nextafter_op",
     ]),
 )

 # GPU elu / relu / selu ops and their kernel libraries, both wrapped in
 # if_mlir_generated_gpu_kernels_enabled(). Tagged "manual"; reached through
 # :relu_op.
 tf_kernel_library(
     name = "gpu_relu_op",
     srcs = if_mlir_generated_gpu_kernels_enabled([
         "gpu_op_elu.cc",
         "gpu_op_relu.cc",
         "gpu_op_selu.cc",
     ]),
     tags = ["manual"],
     deps = if_mlir_generated_gpu_kernels_enabled([
         ":base_gpu_op",
         ":gpu_elu_kernels",
         ":gpu_relu_kernels",
         ":gpu_selu_kernels",
         "//third_party/eigen3",
     ]),
 )

 # Wrapper target for the generated elu / relu / selu kernels. It has no
 # sources of its own and forwards to :gpu_relu_op through if_cuda_or_rocm().
 tf_kernel_library(
     name = "relu_op",
     srcs = [],
     deps = if_cuda_or_rocm([":gpu_relu_op"]),
 )

 # GPU softplus op: `gpu_op_softplus.cc` plus :gpu_softplus_kernels, both
 # wrapped in if_mlir_generated_gpu_kernels_enabled(). Tagged "manual"; reached
 # through :softplus_op.
 tf_kernel_library(
     name = "gpu_softplus_op",
     srcs = if_mlir_generated_gpu_kernels_enabled([
         "gpu_op_softplus.cc",
     ]),
     tags = ["manual"],
     deps = if_mlir_generated_gpu_kernels_enabled([
         ":base_gpu_op",
         ":gpu_softplus_kernels",
         "//third_party/eigen3",
     ]),
 )

 # Wrapper target for the generated softplus kernels. It has no sources of its
 # own and forwards to :gpu_softplus_op through if_cuda_or_rocm().
 tf_kernel_library(
     name = "softplus_op",
     srcs = [],
     deps = if_cuda_or_rocm([":gpu_softplus_op"]),
 )

 # GPU softsign op: `gpu_op_softsign.cc` plus :gpu_softsign_kernels, both
 # wrapped in if_mlir_generated_gpu_kernels_enabled(). Tagged "manual"; reached
 # through :softsign_op.
 tf_kernel_library(
     name = "gpu_softsign_op",
     srcs = if_mlir_generated_gpu_kernels_enabled([
         "gpu_op_softsign.cc",
     ]),
     tags = ["manual"],
     deps = if_mlir_generated_gpu_kernels_enabled([
         ":base_gpu_op",
         ":gpu_softsign_kernels",
         "//third_party/eigen3",
     ]),
 )

 # Wrapper target for the generated softsign kernels. It has no sources of its
 # own and forwards to :gpu_softsign_op through if_cuda_or_rocm().
 tf_kernel_library(
     name = "softsign_op",
     srcs = [],
     deps = if_cuda_or_rocm([":gpu_softsign_op"]),
 )

 # Test-only helper library shared by the unary and binary op test targets in
 # this file.
 cc_library(
     name = "base_ops_test",
     testonly = 1,
     srcs = ["base_ops_test.cc"],
     hdrs = ["base_ops_test.h"],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:tensorflow",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
     ],
 )

 # Test-only, header-only base for the unary op tests (used by
 # :gpu_unary_ops_test and :cpu_unary_ops_test). It already carries
 # //tensorflow/core:test_main, so the test targets do not list it again.
 cc_library(
     name = "base_unary_ops_test",
     testonly = 1,
     hdrs = ["base_unary_ops_test.h"],
     deps = [
         ":base_ops_test",
         "//tensorflow/compiler/mlir/tools/kernel_gen:tf_jit_cache",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/framework:types_proto_cc",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:ops_testutil",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@llvm-project//llvm:Support",
     ],
 )

 # GPU test for the unary ops. The -D flags are passed only when the
 # corresponding kernel set is enabled.
 # NOTE(review): presumably the test source uses these defines for conditional
 # compilation; confirm in gpu_unary_ops_test.cc.
 tf_cuda_cc_test(
     name = "gpu_unary_ops_test",
     size = "small",
     srcs = ["gpu_unary_ops_test.cc"],
     extra_copts = if_mlir_generated_experimental_kernels_enabled([
         "-DMLIR_GENERATED_EXPERIMENTAL_KERNELS_ENABLED",
     ]) + if_mlir_generated_gpu_kernels_enabled(
         ["-DMLIR_GENERATED_GPU_KERNELS_ENABLED"],
     ),
     tags = tf_cuda_tests_tags() + [
         "no_cuda_asan",  # TODO(b/171341759): re-enable.
         "no_cuda",  # TODO(b/196608406): re-enable
     ],
     deps = [
         ":base_ops_test",
         ":base_unary_ops_test",
         "//tensorflow/core/common_runtime:device",
         "//tensorflow/core/common_runtime:device_factory",
     ],
 )

 # CPU test for the unary ops. It shares its base libraries with the GPU test
 # but passes no extra copts.
 tf_cc_test(
     name = "cpu_unary_ops_test",
     size = "small",
     srcs = ["cpu_unary_ops_test.cc"],
     deps = [
         ":base_ops_test",
         ":base_unary_ops_test",
         "//tensorflow/core/common_runtime:device",
         "//tensorflow/core/common_runtime:device_factory",
     ],
 )

 # Test-only, header-only base for the binary op tests (used by
 # :gpu_binary_ops_test and :cpu_binary_ops_test). It already carries
 # //tensorflow/core:test_main, so the test targets do not list it again.
 cc_library(
     name = "base_binary_ops_test",
     testonly = 1,
     hdrs = ["base_binary_ops_test.h"],
     deps = [
         ":base_ops_test",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/common_runtime:device",
         "//tensorflow/core/common_runtime:device_factory",
         "//tensorflow/core/framework:types_proto_cc",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:ops_testutil",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@llvm-project//llvm:Support",
     ],
 )

 # GPU test for the binary ops. The -D flags are passed only when the
 # corresponding kernel set is enabled.
 # NOTE(review): presumably the test source uses these defines for conditional
 # compilation; confirm in gpu_binary_ops_test.cc.
 tf_cuda_cc_test(
     name = "gpu_binary_ops_test",
     size = "medium",
     srcs = ["gpu_binary_ops_test.cc"],
     extra_copts = if_mlir_generated_gpu_kernels_enabled(
         ["-DMLIR_GENERATED_GPU_KERNELS_ENABLED"],
     ) + if_mlir_generated_experimental_kernels_enabled(
         [
             "-DMLIR_GENERATED_EXPERIMENTAL_KERNELS_ENABLED",
         ],
     ),
     tags = tf_cuda_tests_tags() + [
         "no_cuda_asan",  # b/173033461
     ],
     deps = [
         ":base_binary_ops_test",
         ":base_ops_test",
         "//tensorflow/core/common_runtime:device",
         "//tensorflow/core/common_runtime:device_factory",
     ],
 )

 # CPU test for the binary ops. It shares its base libraries with the GPU test
 # but passes no extra copts.
 tf_cc_test(
     name = "cpu_binary_ops_test",
     size = "medium",
     srcs = ["cpu_binary_ops_test.cc"],
     deps = [
         ":base_binary_ops_test",
         ":base_ops_test",
         "//tensorflow/core/common_runtime:device",
         "//tensorflow/core/common_runtime:device_factory",
     ],
 )

 # TODO(b/160731748): Re-enable when it works again.
 # gpu_kernel_library(
 #     name = "gpu_bias_add_kernels",
 #     op = "bias_add",
 #     tile_size = "16x16",
 #     types = [
 #         "f16",
 #         "f32",
 #         "f64",
 #     ],
 # )

 # The relu-family activations and sigmoid share one configuration, so the
 # `gpu_<op>_kernels` targets are stamped out from a single template.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_kernels",
         op = op,
         tile_size = "256",
         types = [
             "f16",
             "f32",
             "f64",
         ],
     )
     for op in [
         "relu",
         "elu",
         "selu",
         "sigmoid",
     ]
 ]

 # GPU kernels for "abs" over the floating-point types and i64.
 # TODO(b/25387198): Add an int32 kernel.
 gpu_kernel_library(
     name = "gpu_abs_kernels",
     op = "abs",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
         "i64",
     ],
     unroll_factors = "4",
 )

 # GPU kernels for "acos" over f16/f32/f64; no unroll factor is set.
 gpu_kernel_library(
     name = "gpu_acos_kernels",
     op = "acos",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
     ],
     # Cannot vectorize.
 )

 # GPU kernels for "acosh" over f16/f32/f64; unrolling is deliberately left
 # commented out.
 gpu_kernel_library(
     name = "gpu_acosh_kernels",
     op = "acosh",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
     ],
     # May be compute-bound.
     # unroll_factors = "4",
 )

 # GPU kernels for "angle": complex inputs with real outputs. `types` and
 # `output_types` line up by position (c64 -> f32, c128 -> f64).
 gpu_kernel_library(
     name = "gpu_angle_kernels",
     op = "angle",
     output_types = [
         "f32",
         "f64",
     ],
     tile_size = "256",
     types = [
         "c64",
         "c128",
     ],
     unroll_factors = "2",
 )

 # "asin" and "asinh" use the same configuration, so both `gpu_<op>_kernels`
 # targets come from one template.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_kernels",
         op = op,
         tile_size = "256",
         types = [
             "f16",
             "f32",
             "f64",
         ],
         # Cannot vectorize.
     )
     for op in [
         "asin",
         "asinh",
     ]
 ]

 # "atan" and "atanh" use the same configuration, so both `gpu_<op>_kernels`
 # targets come from one template.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_kernels",
         op = op,
         tile_size = "256",
         types = [
             "f16",
             "f32",
             "f64",
         ],
         unroll_factors = "4",
     )
     for op in [
         "atan",
         "atanh",
     ]
 ]

 # GPU kernels for "conj" over the complex types.
 gpu_kernel_library(
     name = "gpu_conj_kernels",
     op = "conj",
     tile_size = "256",
     types = [
         "c64",
         "c128",
     ],
     unroll_factors = "2",
 )

 # GPU kernels for "cosh" over f16/f32/f64; unrolling is deliberately left
 # commented out.
 gpu_kernel_library(
     name = "gpu_cosh_kernels",
     op = "cosh",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
     ],
     # May be compute-bound.
     # unroll_factors = "4",
 )

 # "erf" and "erfc" use the same configuration, so both `gpu_<op>_kernels`
 # targets come from one template.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_kernels",
         op = op,
         tile_size = "256",
         types = [
             "f16",
             "f32",
             "f64",
         ],
         unroll_factors = "4",
     )
     for op in [
         "erf",
         "erfc",
     ]
 ]

 # GPU kernels for "imag": complex inputs with real outputs. `types` and
 # `output_types` line up by position (c64 -> f32, c128 -> f64).
 gpu_kernel_library(
     name = "gpu_imag_kernels",
     op = "imag",
     output_types = [
         "f32",
         "f64",
     ],
     tile_size = "256",
     types = [
         "c64",
         "c128",
     ],
 )

 # GPU kernel for "logical_not"; i1 is the only type listed.
 gpu_kernel_library(
     name = "gpu_logical_not_kernels",
     op = "logical_not",
     tile_size = "256",
     types = ["i1"],
 )

 # GPU kernels for "real": complex inputs with real outputs. `types` and
 # `output_types` line up by position (c64 -> f32, c128 -> f64).
 gpu_kernel_library(
     name = "gpu_real_kernels",
     op = "real",
     output_types = [
         "f32",
         "f64",
     ],
     tile_size = "256",
     types = [
         "c64",
         "c128",
     ],
 )

 # GPU kernels for "reciprocal" over complex, floating-point and i64 inputs.
 gpu_kernel_library(
     name = "gpu_reciprocal_kernels",
     op = "reciprocal",
     tile_size = "256",
     types = [
         "c64",
         "c128",
         "f16",
         "f32",
         "f64",
         "i64",
     ],
     unroll_factors = "4",
 )

 # GPU kernels for "polygamma"; only f32 and f64 are listed (no f16).
 gpu_kernel_library(
     name = "gpu_polygamma_kernels",
     op = "polygamma",
     tile_size = "256",
     types = [
         "f32",
         "f64",
     ],
 )

 # "digamma" and "lgamma" use the same configuration, so both
 # `gpu_<op>_kernels` targets come from one template.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_kernels",
         op = op,
         tile_size = "256",
         types = [
             "f16",
             "f32",
             "f64",
         ],
     )
     for op in [
         "digamma",
         "lgamma",
     ]
 ]

 # GPU kernels for "sign" over floating-point, i32/i64 and complex inputs.
 gpu_kernel_library(
     name = "gpu_sign_kernels",
     op = "sign",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
         "i32",
         "i64",
         "c64",
         "c128",
     ],
     unroll_factors = "4",
 )

 # GPU kernels for "sinh" over f16/f32/f64; unrolling is deliberately left
 # commented out.
 gpu_kernel_library(
     name = "gpu_sinh_kernels",
     op = "sinh",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
     ],
     # May be compute-bound.
     # unroll_factors = "4",
 )

 # Declares `gpu_square_kernels` and `gpu_squared_difference_kernels` from one
 # shared configuration (floating-point types plus i64).
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_kernels",
         op = op,
         tile_size = "1024",
         types = [
             "f16",
             "f32",
             "f64",
             "i64",
         ],
         unroll_factors = "4",
     )
     for op in [
         "square",
         "squared_difference",
     ]
 ]

 # GPU kernels for "add_v2" over floating-point, signed-integer and complex
 # inputs.
 gpu_kernel_library(
     name = "gpu_add_v2_kernels",
     op = "add_v2",
     tile_size = "1024",
     types = [
         "f16",
         "f32",
         "f64",
         "i8",
         "i16",
         "i32",
         "i64",
         "c64",
         "c128",
     ],
     unroll_factors = "4",
 )

 # GPU kernels for "sub".
 # NOTE(review): i16 is listed under `jit_types` rather than `types`;
 # presumably it is compiled at run time instead of ahead of time. Confirm in
 # build_defs.bzl.
 gpu_kernel_library(
     name = "gpu_sub_kernels",
     jit_types = ["i16"],
     op = "sub",
     tile_size = "1024",
     types = [
         "f16",
         "f32",
         "f64",
         "i32",
         "i64",
         "c64",
         "c128",
     ],
     unroll_factors = "4",
 )

 # GPU kernels for "complex": real inputs with complex outputs. `types` and
 # `output_types` line up by position (f32 -> c64, f64 -> c128).
 gpu_kernel_library(
     name = "gpu_complex_kernels",
     op = "complex",
     output_types = [
         "c64",
         "c128",
     ],
     tile_size = "1024",
     types = [
         "f32",
         "f64",
     ],
     unroll_factors = "2",
 )

 # GPU kernels for "complex_abs": complex inputs with real outputs. `types`
 # and `output_types` line up by position (c64 -> f32, c128 -> f64).
 gpu_kernel_library(
     name = "gpu_complex_abs_kernels",
     op = "complex_abs",
     output_types = [
         "f32",
         "f64",
     ],
     tile_size = "256",
     types = [
         "c64",
         "c128",
     ],
 )

 # GPU kernels for "div". No unroll factor is set for this op.
 gpu_kernel_library(
     name = "gpu_div_kernels",
     op = "div",
     tile_size = "1024",
     types = [
         "f16",
         "f32",
         "f64",
         "i16",
         "i64",
         "ui8",
         "ui16",
         "c64",
         "c128",
     ],
 )

 # GPU kernels for "mul" over floating-point, complex and signed-integer
 # inputs; the complex types are excluded from unrolling.
 gpu_kernel_library(
     name = "gpu_mul_kernels",
     op = "mul",
     tile_size = "1024",
     types = [
         "f16",
         "f32",
         "f64",
         "c64",
         "c128",
         "i8",
         "i16",
         "i32",
         "i64",
     ],
     # For complex MulOp kernels, we don't use unrolling because it would only
     # cause slowdowns.
     types_with_unrolling_disabled = [
         "c64",
         "c128",
     ],
     unroll_factors = "4",
 )

 # GPU kernels for "mul_no_nan" over floating-point and complex inputs; the
 # complex types are excluded from unrolling.
 gpu_kernel_library(
     name = "gpu_mul_no_nan_kernels",
     op = "mul_no_nan",
     tile_size = "1024",
     types = [
         "f16",
         "f32",
         "f64",
         "c64",
         "c128",
     ],
     # For complex MulNoNanOp kernels, we don't use unrolling because it would
     # only cause slowdowns.
     types_with_unrolling_disabled = [
         "c64",
         "c128",
     ],
     unroll_factors = "4",
 )

 # Bitwise operations.
 # Declares one `gpu_<op>_kernels` target per op over the signed integer
 # types. "right_shift" is declared separately because it also covers the
 # unsigned types.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_kernels",
         op = op,
         tile_size = "1024",
         types = [
             "i8",
             "i16",
             "i32",
             "i64",
         ],
         unroll_factors = "4",
     )
     for op in [
         "bitwise_and",
         "bitwise_or",
         "bitwise_xor",
         "invert",
         "left_shift",
     ]
 ]

 # GPU kernels for "right_shift" over both signed and unsigned integer types.
 gpu_kernel_library(
     name = "gpu_right_shift_kernels",
     op = "right_shift",
     tile_size = "1024",
     types = [
         "i8",
         "i16",
         "i32",
         "i64",
         "ui8",
         "ui16",
         "ui32",
         "ui64",
     ],
     unroll_factors = "4",
 )

 # GPU kernels for "atan2" over f16/f32/f64.
 # NOTE(review): this is the only visible target whose tile_size is written as
 # a comma-separated triple ("256,1,1"); confirm the format against
 # build_defs.bzl.
 gpu_kernel_library(
     name = "gpu_atan2_kernels",
     op = "atan2",
     tile_size = "256,1,1",
     types = [
         "f16",
         "f32",
         "f64",
     ],
     unroll_factors = "4",
 )

 # Logical operations.
 # Declares `gpu_logical_and_kernels` and `gpu_logical_or_kernels`; i1 is the
 # only type listed.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_kernels",
         op = op,
         tile_size = "1024",
         types = [
             "i1",
         ],
     )
     for op in [
         "logical_and",
         "logical_or",
     ]
 ]

 # Equality comparisons: every input type produces an i1 result. The `* 10`
 # must stay equal to the number of entries in `types`.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_kernels",
         op = op,
         output_types = ["i1"] * 10,
         tile_size = "1024",
         types = [
             "c64",
             "c128",
             "f16",
             "f32",
             "f64",
             "i1",
             "i8",
             "i16",
             "i32",
             "i64",
         ],
         unroll_factors = "4",
     )
     for op in [
         "equal",
         "not_equal",
     ]
 ]

 # Ordering comparisons: every input type produces an i1 result. The `* 10`
 # must stay equal to the number of entries in `types`.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_kernels",
         op = op,
         output_types = ["i1"] * 10,
         tile_size = "1024",
         types = [
             "f16",
             "f32",
             "f64",
             "i8",
             "i16",
             "i64",
             "ui8",
             "ui16",
             "ui32",
             "ui64",
         ],
         unroll_factors = "4",
     )
     for op in [
         "less",
         "less_equal",
         "greater",
         "greater_equal",
     ]
 ]

 # Declares `gpu_maximum_kernels` and `gpu_minimum_kernels` from one shared
 # configuration.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_kernels",
         op = op,
         tile_size = "1024",
         types = [
             "f16",
             "f32",
             "f64",
             "i16",
             "i64",
             "ui8",
         ],
         unroll_factors = "4",
     )
     for op in [
         "maximum",
         "minimum",
     ]
 ]

 # GPU kernels for "neg" over floating-point, signed-integer and complex
 # inputs.
 gpu_kernel_library(
     name = "gpu_neg_kernels",
     op = "neg",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
         "i8",
         "i16",
         "i32",
         "i64",
         "c64",
         "c128",
     ],
     unroll_factors = "4",
 )

 # GPU kernels for "floor_div"; currently floating-point only (see the TODO).
 gpu_kernel_library(
     name = "gpu_floor_div_kernels",
     op = "floor_div",
     tile_size = "1024",
     # TODO(b/196539835): Enable for integer types also once unsigned integers
     # and unranked tensors are supported.
     types = [
         "f16",
         "f32",
         "f64",
     ],
     unroll_factors = "4",
 )

 # Kernels that support all floating-point types.
 # Each op in the list yields a `gpu_<op>_kernels` target with the same tile
 # size and unroll factor.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_kernels",
         op = op,
         tile_size = "256",
         types = [
             "f16",
             "f32",
             "f64",
         ],
         unroll_factors = "4",
     )
     for op in [
         "ceil",
         "expm1",
         "floor",
         "log",
         "log1p",
         "relu_grad",
         "rsqrt",
         "softplus",
         "softsign",
         "sqrt",
         "tanh",
     ]
 ]

 # GPU kernels for "exp" over floating-point and complex inputs.
 gpu_kernel_library(
     name = "gpu_exp_kernels",
     op = "exp",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
         "c64",
         "c128",
     ],
     unroll_factors = "4",
 )

 # Kernels that support all floating-point types but have i1 output.
 # The `* 3` must stay equal to the number of entries in `types`.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_kernels",
         op = op,
         output_types = ["i1"] * 3,
         tile_size = "256",
         types = [
             "f16",
             "f32",
             "f64",
         ],
         unroll_factors = "4",
     )
     for op in [
         "is_finite",
         "is_inf",
         "is_nan",
     ]
 ]

 # Kernels that support all floating-point types but cannot be vectorized.
 # For that reason no `unroll_factors` is set on these targets.
 [
     gpu_kernel_library(
         name = "gpu_" + op + "_kernels",
         op = op,
         tile_size = "256",
         types = [
             "f16",
             "f32",
             "f64",
         ],
     )
     for op in [
         "cos",
         "sin",
         "tan",
     ]
 ]

 # Element types supported by the cast kernels. A kernel is generated for every
 # (input type, output type) pair drawn from this list.
 _GPU_CAST_TYPES = [
     "i1",
     "i8",
     "i16",
     "i32",
     "i64",
     "ui8",
     "ui16",
     "ui32",
     "ui64",
     "f16",
     "f32",
     "f64",
 ]

 gpu_kernel_library(
     name = "gpu_cast_kernels",
     op = "cast",
     # `types` and `output_types` are matched up position by position. To cover
     # the full cross product, the whole list of output types is repeated once
     # per input type, while each input type is repeated once per output type.
     output_types = _GPU_CAST_TYPES * len(_GPU_CAST_TYPES),
     tile_size = "256",
     types = [
         input_type
         for input_type in _GPU_CAST_TYPES
         for _ in _GPU_CAST_TYPES
     ],
 )

 # GPU kernels for "pow" over the floating-point types and i64; no unroll
 # factor is set.
 gpu_kernel_library(
     name = "gpu_pow_kernels",
     op = "pow",
     tile_size = "1024",
     types = [
         "f16",
         "f32",
         "f64",
         "i64",
     ],
 )

 # GPU kernels for "zeta"; only f32 and f64 are listed (no f16).
 gpu_kernel_library(
     # The zeta kernels need many registers, so tile at 256.
     name = "gpu_zeta_kernels",
     op = "zeta",
     tile_size = "256",
     types = [
         "f32",
         "f64",
     ],
     # TODO(b/178388085): Enable unrolling after vectorization is fixed.
     # unroll_factors = "4",
 )

 # GPU kernels for "select_v2".
 # NOTE(review): this is the only visible target that sets
 # `max_supported_rank`; its exact meaning is defined in build_defs.bzl.
 gpu_kernel_library(
     name = "gpu_select_v2_kernels",
     max_supported_rank = 8,
     op = "select_v2",
     tile_size = "256",
     types = [
         "i1",
         "i32",
         "i64",
         "f16",
         "f32",
         "f64",
         "c64",
         "c128",
     ],
 )

 # CPU kernels for "abs" over floating-point and signed-integer inputs. The
 # CPU list includes i8/i16/i32, which the GPU "abs" target does not.
 cpu_kernel_library(
     name = "cpu_abs_kernels",
     op = "abs",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
         "i8",
         "i16",
         "i32",
         "i64",
     ],
     unroll_factors = "4",
 )

 # CPU kernels for "invert" over the signed integer types.
 cpu_kernel_library(
     name = "cpu_invert_kernels",
     op = "invert",
     tile_size = "256",
     types = [
         "i8",
         "i16",
         "i32",
         "i64",
     ],
     unroll_factors = "4",
 )

 cpu_kernel_library(
     name = "cpu_ceil_kernels",
     op = "ceil",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
     ],
     unroll_factors = "4",
 )

 # MLIR-generated CPU kernels for the "cos" op: f16/f32/f64,
 # tile size 256, unroll factor 4.
 cpu_kernel_library(
     name = "cpu_cos_kernels",
     op = "cos",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
     ],
     unroll_factors = "4",
 )

 # MLIR-generated CPU kernels for the "floor" op: f16/f32/f64,
 # tile size 256, unroll factor 4.
 cpu_kernel_library(
     name = "cpu_floor_kernels",
     op = "floor",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
     ],
     unroll_factors = "4",
 )

 # MLIR-generated CPU kernels for the "rsqrt" op: f16/f32/f64,
 # tile size 256, unroll factor 4.
 cpu_kernel_library(
     name = "cpu_rsqrt_kernels",
     op = "rsqrt",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
     ],
     unroll_factors = "4",
 )

 # MLIR-generated CPU kernels for the "sin" op: f16/f32/f64,
 # tile size 256, unroll factor 4.
 cpu_kernel_library(
     name = "cpu_sin_kernels",
     op = "sin",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
     ],
     unroll_factors = "4",
 )

 # MLIR-generated CPU kernels for the "sqrt" op: f16/f32/f64,
 # tile size 256, unroll factor 4.
 cpu_kernel_library(
     name = "cpu_sqrt_kernels",
     op = "sqrt",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
     ],
     unroll_factors = "4",
 )

 # MLIR-generated CPU kernels for the "square" op: float, i32/i64 and complex
 # types, tile size 256, unroll factor 4.
 cpu_kernel_library(
     name = "cpu_square_kernels",
     op = "square",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
         "i32",
         "i64",
         "c64",
         "c128",
     ],
     unroll_factors = "4",
 )

 # MLIR-generated CPU kernels for the "tan" op: f16/f32/f64,
 # tile size 256, unroll factor 4.
 cpu_kernel_library(
     name = "cpu_tan_kernels",
     op = "tan",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
     ],
     unroll_factors = "4",
 )

 # MLIR-generated CPU kernels for the "add_v2" op: float, i8-i64 and complex
 # types, tile size 1024, unroll factor 4.
 cpu_kernel_library(
     name = "cpu_add_v2_kernels",
     op = "add_v2",
     tile_size = "1024",
     types = [
         "f16",
         "f32",
         "f64",
         "i8",
         "i16",
         "i32",
         "i64",
         "c64",
         "c128",
     ],
     unroll_factors = "4",
 )

 # Binary bitwise operations: one "cpu_<op>_kernels" target is declared per
 # listed op, all sharing the same i8-i64 type list, tile size and unroll
 # factor.
 [
     cpu_kernel_library(
         name = "cpu_%s_kernels" % bitwise_op,
         op = bitwise_op,
         tile_size = "1024",
         types = [
             "i8",
             "i16",
             "i32",
             "i64",
         ],
         unroll_factors = "4",
     )
     for bitwise_op in [
         "bitwise_and",
         "bitwise_or",
         "bitwise_xor",
         "left_shift",
     ]
 ]

 # MLIR-generated CPU kernels for the "right_shift" op, tile size 1024,
 # unroll factor 4. The type list covers both i8-i64 and ui8-ui64.
 cpu_kernel_library(
     name = "cpu_right_shift_kernels",
     op = "right_shift",
     tile_size = "1024",
     types = [
         "i8",
         "i16",
         "i32",
         "i64",
         "ui8",
         "ui16",
         "ui32",
         "ui64",
     ],
     unroll_factors = "4",
 )

 # MLIR-generated GPU kernels for the "xlogy" op: float and complex types,
 # tile size 1024, unroll factor 4 (complex types are excluded from unrolling).
 gpu_kernel_library(
     name = "gpu_xlogy_kernels",
     op = "xlogy",
     tile_size = "1024",
     types = [
         "f16",
         "f32",
         "f64",
         "c64",
         "c128",
     ],
     # For complex XlogyOp kernels, we don't use unrolling, it would only cause
     # slowdowns.
     types_with_unrolling_disabled = [
         "c64",
         "c128",
     ],
     unroll_factors = "4",
 )

 # MLIR-generated GPU kernels for the "xdivy" op: float and complex types,
 # tile size 1024, unroll factor 4.
 # NOTE(review): unlike gpu_xlogy_kernels and gpu_xlog1py_kernels, this target
 # does not set types_with_unrolling_disabled for c64/c128 - confirm that
 # unrolling the complex kernels is intended here.
 gpu_kernel_library(
     name = "gpu_xdivy_kernels",
     op = "xdivy",
     tile_size = "1024",
     types = [
         "f16",
         "f32",
         "f64",
         "c64",
         "c128",
     ],
     unroll_factors = "4",
 )

 # MLIR-generated GPU kernels for the "xlog1py" op: float and complex types,
 # tile size 1024, unroll factor 4 (complex types are excluded from unrolling).
 gpu_kernel_library(
     name = "gpu_xlog1py_kernels",
     op = "xlog1py",
     tile_size = "1024",
     types = [
         "f16",
         "f32",
         "f64",
         "c64",
         "c128",
     ],
     # For complex Xlog1pyOp kernels, we don't use unrolling, it would only cause
     # slowdowns.
     types_with_unrolling_disabled = [
         "c64",
         "c128",
     ],
     unroll_factors = "4",
 )

 # MLIR-generated CPU kernels for the "angle" op, tile size 256, unroll
 # factor 2. The input types are complex (c64/c128) while output_types lists
 # f32/f64.
 # NOTE(review): presumably output_types is paired with types by position
 # (c64 -> f32, c128 -> f64) - confirm in build_defs.bzl.
 cpu_kernel_library(
     name = "cpu_angle_kernels",
     op = "angle",
     output_types = [
         "f32",
         "f64",
     ],
     tile_size = "256",
     types = [
         "c64",
         "c128",
     ],
     unroll_factors = "2",
 )

 # MLIR-generated CPU kernels for the "tanh" op: f16/f32/f64,
 # tile size 256, unroll factor 4.
 cpu_kernel_library(
     name = "cpu_tanh_kernels",
     op = "tanh",
     tile_size = "256",
     types = [
         "f16",
         "f32",
         "f64",
     ],
     unroll_factors = "4",
 )

 # MLIR-generated GPU kernels for the "div_no_nan" op: float and complex
 # types, tile size 1024. No unroll_factors is set for this target.
 gpu_kernel_library(
     name = "gpu_div_no_nan_kernels",
     op = "div_no_nan",
     tile_size = "1024",
     types = [
         "f16",
         "f32",
         "f64",
         "c64",
         "c128",
     ],
 )

 # MLIR-generated GPU kernels for the "rint" op, tile size 1024. f32/f64 are
 # listed under types, while f16 is listed under jit_types instead.
 # NOTE(review): presumably jit_types entries are JIT-compiled rather than
 # precompiled - confirm in build_defs.bzl.
 gpu_kernel_library(
     name = "gpu_rint_kernels",
     jit_types = ["f16"],
     op = "rint",
     tile_size = "1024",
     types = [
         "f32",
         "f64",
     ],
 )

 # MLIR-generated GPU kernels for the "round" op: float types and i32/i64,
 # tile size 1024. No unroll_factors is set for this target.
 gpu_kernel_library(
     name = "gpu_round_kernels",
     op = "round",
     tile_size = "1024",
     types = [
         "f16",
         "f32",
         "f64",
         "i32",
         "i64",
     ],
 )

 # MLIR-generated GPU kernels for the "zeros_like" op: i1, i64 and float
 # types, tile size 1024. Complex types are currently commented out.
 gpu_kernel_library(
     name = "gpu_zeros_like_kernels",
     op = "zeros_like",
     tile_size = "1024",
     types = [
         "i1",
         "i64",
         "f16",
         "f32",
         "f64",
         # TODO(b/190374484): Enable it for complex types once it is supported.
         # "c64",
         # "c128",
     ],
 )

 # MLIR-generated GPU kernels for the "ones_like" op: i1, i64 and float
 # types, tile size 1024. Complex types are currently commented out.
 gpu_kernel_library(
     name = "gpu_ones_like_kernels",
     op = "ones_like",
     tile_size = "1024",
     types = [
         "i1",
         "i64",
         "f16",
         "f32",
         "f64",
         # TODO(b/190374484): Enable it for complex types once it is supported.
         # "c64",
         # "c128",
     ],
 )

 # MLIR-generated GPU kernels for the "next_after" op: f32 and f64 only,
 # tile size 1024. No unroll_factors is set for this target.
 gpu_kernel_library(
     name = "gpu_next_after_kernels",
     op = "next_after",
     tile_size = "1024",
     types = [
         "f32",
         "f64",
     ],
 )