# blob: e04adefe59a644013488ffbddb2796d01e842ec9 [file] [log] [blame]
# Generates GPU (CUDA/ROCm) and CPU kernels using MLIR codegen.
load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
load(
"//tensorflow/core/kernels/mlir_generated:build_defs.bzl",
"cpu_kernel_library",
"gpu_kernel_library",
"if_mlir_generated_cpu_kernels_enabled",
"if_mlir_generated_experimental_kernels_enabled",
"if_mlir_generated_gpu_kernels_enabled",
)
load(
"//tensorflow:tensorflow.bzl",
"if_cuda_or_rocm",
"tf_cc_test",
)
load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") # buildifier: disable=same-origin-load
load("//tensorflow:tensorflow.bzl", "tf_kernel_library") # buildifier: disable=same-origin-load
load(
"//tensorflow/core/platform:build_config_root.bzl",
"tf_cuda_tests_tags",
)
# Package-wide defaults: unless a rule sets its own `visibility`, targets are
# visible to //tensorflow/core/kernels and its subpackages.
package(
    default_visibility = ["//tensorflow/core/kernels:__subpackages__"],
    licenses = ["notice"],
)
# Additional packages that may be granted access by listing ":friends" in a
# rule's `visibility`.
package_group(
    name = "friends",
    packages = ["//third_party/car/..."],
)
# Boolean build flag `enable_gpu` (defaults to True) together with the
# config_setting that matches while the flag is True.
bool_flag(name = "enable_gpu", build_setting_default = True)

config_setting(
    name = "is_gpu_enabled",
    flag_values = {":enable_gpu": "True"},
)
# Boolean build flag `enable_cpu` (defaults to True) together with the
# config_setting that matches while the flag is True.
bool_flag(name = "enable_cpu", build_setting_default = True)

config_setting(
    name = "is_cpu_enabled",
    flag_values = {":enable_cpu": "True"},
)
# This flag may only be enabled when both enable_gpu and enable_cpu are true.
# Boolean build flag `enable_experimental` (defaults to False) together with
# the config_setting that matches while the flag is True.
bool_flag(name = "enable_experimental", build_setting_default = False)

config_setting(
    name = "is_experimental_enabled",
    flag_values = {":enable_experimental": "True"},
)
# Shared base library (base_op.cc / base_op.h) that the GPU and CPU base-op
# header libraries in this package depend on.
cc_library(
    name = "base_op",
    srcs = ["base_op.cc"],
    hdrs = ["base_op.h"],
    deps = [
        "//tensorflow/core:framework",
        "//tensorflow/core:lib",
        "//tensorflow/core/framework:allocation_description_proto_cc",
        "//tensorflow/core/framework:op_requires",
        "@llvm-project//llvm:Support",
        "@llvm-project//mlir:mlir_c_runner_utils",
    ],
)
# Header-only library exposing base_gpu_op.h; layered on top of :base_op.
cc_library(
    name = "base_gpu_op",
    hdrs = ["base_gpu_op.h"],
    deps = [
        ":base_op",
        "//tensorflow/core:framework",
    ],
)
# Header-only library exposing base_cpu_op.h; layered on top of :base_op.
cc_library(
    name = "base_cpu_op",
    hdrs = ["base_cpu_op.h"],
    deps = [
        ":base_op",
    ],
)
# Kernel library bundling the GPU unary element-wise ops of this package.
# Both `srcs` and `deps` are wrapped in if_mlir_generated_gpu_kernels_enabled;
# each gpu_op_<op>.cc source has a matching :gpu_<op>_kernels dependency.
# This target is pulled in by :cwise_op through if_cuda_or_rocm.
tf_kernel_library(
name = "gpu_cwise_unary_op",
# One registration source per unary op.
srcs = if_mlir_generated_gpu_kernels_enabled([
"gpu_op_abs.cc",
"gpu_op_acos.cc",
"gpu_op_acosh.cc",
"gpu_op_angle.cc",
"gpu_op_asin.cc",
"gpu_op_asinh.cc",
"gpu_op_atan.cc",
"gpu_op_atanh.cc",
"gpu_op_ceil.cc",
"gpu_op_complex_abs.cc",
"gpu_op_conj.cc",
"gpu_op_cos.cc",
"gpu_op_cosh.cc",
"gpu_op_digamma.cc",
"gpu_op_erf.cc",
"gpu_op_erfc.cc",
"gpu_op_exp.cc",
"gpu_op_expm1.cc",
"gpu_op_floor.cc",
"gpu_op_imag.cc",
"gpu_op_invert.cc",
"gpu_op_is_finite.cc",
"gpu_op_is_inf.cc",
"gpu_op_is_nan.cc",
"gpu_op_lgamma.cc",
"gpu_op_log.cc",
"gpu_op_log1p.cc",
"gpu_op_logical_not.cc",
"gpu_op_neg.cc",
"gpu_op_real.cc",
"gpu_op_reciprocal.cc",
"gpu_op_rint.cc",
"gpu_op_round.cc",
"gpu_op_rsqrt.cc",
"gpu_op_sigmoid.cc",
"gpu_op_sign.cc",
"gpu_op_sin.cc",
"gpu_op_sinh.cc",
"gpu_op_sqrt.cc",
"gpu_op_square.cc",
"gpu_op_tan.cc",
"gpu_op_tanh.cc",
]),
# The define is only passed when the experimental-kernels helper yields it.
copts = if_mlir_generated_experimental_kernels_enabled([
"-DMLIR_GENERATED_EXPERIMENTAL_KERNELS_ENABLED",
]),
# "manual" keeps this target out of wildcard patterns such as //...
tags = ["manual"],
# Base op header plus the generated kernel library for every op in `srcs`.
deps = if_mlir_generated_gpu_kernels_enabled([
":base_gpu_op",
":gpu_abs_kernels",
":gpu_acos_kernels",
":gpu_acosh_kernels",
":gpu_angle_kernels",
":gpu_asin_kernels",
":gpu_asinh_kernels",
":gpu_atan_kernels",
":gpu_atanh_kernels",
":gpu_ceil_kernels",
":gpu_complex_abs_kernels",
":gpu_conj_kernels",
":gpu_cos_kernels",
":gpu_cosh_kernels",
":gpu_digamma_kernels",
":gpu_erf_kernels",
":gpu_erfc_kernels",
":gpu_exp_kernels",
":gpu_expm1_kernels",
":gpu_floor_kernels",
":gpu_imag_kernels",
":gpu_invert_kernels",
":gpu_is_finite_kernels",
":gpu_is_inf_kernels",
":gpu_is_nan_kernels",
":gpu_lgamma_kernels",
":gpu_log1p_kernels",
":gpu_log_kernels",
":gpu_logical_not_kernels",
":gpu_neg_kernels",
":gpu_real_kernels",
":gpu_reciprocal_kernels",
":gpu_rint_kernels",
":gpu_round_kernels",
":gpu_rsqrt_kernels",
":gpu_sigmoid_kernels",
":gpu_sign_kernels",
":gpu_sin_kernels",
":gpu_sinh_kernels",
":gpu_sqrt_kernels",
":gpu_square_kernels",
":gpu_tan_kernels",
":gpu_tanh_kernels",
"//third_party/eigen3",
]),
)
# Kernel library bundling the CPU unary element-wise ops of this package.
# The list passed to if_mlir_generated_cpu_kernels_enabled is empty, so every
# source and dependency currently comes from the experimental-kernels list.
# This target is a direct dependency of :cwise_op.
tf_kernel_library(
name = "cpu_cwise_unary_op",
srcs = if_mlir_generated_cpu_kernels_enabled([
]) + if_mlir_generated_experimental_kernels_enabled([
"cpu_op_abs.cc",
"cpu_op_angle.cc",
"cpu_op_ceil.cc",
"cpu_op_cos.cc",
"cpu_op_floor.cc",
"cpu_op_invert.cc",
"cpu_op_rsqrt.cc",
"cpu_op_sin.cc",
"cpu_op_sqrt.cc",
"cpu_op_square.cc",
"cpu_op_tan.cc",
"cpu_op_tanh.cc",
]),
# "manual" keeps this target out of wildcard patterns such as //...
tags = ["manual"],
# Base op header plus the generated kernel library for every op in `srcs`.
deps = if_mlir_generated_cpu_kernels_enabled([
]) + if_mlir_generated_experimental_kernels_enabled([
":base_cpu_op",
":cpu_abs_kernels",
":cpu_angle_kernels",
":cpu_ceil_kernels",
":cpu_cos_kernels",
":cpu_floor_kernels",
":cpu_invert_kernels",
":cpu_rsqrt_kernels",
":cpu_sin_kernels",
":cpu_sqrt_kernels",
":cpu_square_kernels",
":cpu_tan_kernels",
":cpu_tanh_kernels",
"//third_party/eigen3",
]),
)
# Kernel library bundling the GPU binary element-wise ops of this package.
# Both `srcs` and `deps` are wrapped in if_mlir_generated_gpu_kernels_enabled.
# Note the two sources whose kernel library carries a "_v2" suffix:
# gpu_op_add.cc -> :gpu_add_v2_kernels, gpu_op_select.cc -> :gpu_select_v2_kernels.
# This target is pulled in by :cwise_op through if_cuda_or_rocm.
tf_kernel_library(
name = "gpu_cwise_binary_op",
# One registration source per binary op.
srcs = if_mlir_generated_gpu_kernels_enabled([
"gpu_op_add.cc",
"gpu_op_atan2.cc",
"gpu_op_bitwise_and.cc",
"gpu_op_bitwise_or.cc",
"gpu_op_bitwise_xor.cc",
"gpu_op_complex.cc",
"gpu_op_div.cc",
"gpu_op_div_no_nan.cc",
"gpu_op_equal.cc",
"gpu_op_floor_div.cc",
"gpu_op_greater.cc",
"gpu_op_greater_equal.cc",
"gpu_op_left_shift.cc",
"gpu_op_less.cc",
"gpu_op_less_equal.cc",
"gpu_op_logical_and.cc",
"gpu_op_logical_or.cc",
"gpu_op_maximum.cc",
"gpu_op_minimum.cc",
"gpu_op_mul.cc",
"gpu_op_mul_no_nan.cc",
"gpu_op_not_equal.cc",
"gpu_op_polygamma.cc",
"gpu_op_pow.cc",
"gpu_op_right_shift.cc",
"gpu_op_select.cc",
"gpu_op_squared_difference.cc",
"gpu_op_sub.cc",
"gpu_op_xdivy.cc",
"gpu_op_xlog1py.cc",
"gpu_op_xlogy.cc",
"gpu_op_zeta.cc",
]),
# The define is only passed when the experimental-kernels helper yields it.
copts = if_mlir_generated_experimental_kernels_enabled([
"-DMLIR_GENERATED_EXPERIMENTAL_KERNELS_ENABLED",
]),
# "manual" keeps this target out of wildcard patterns such as //...
tags = ["manual"],
# Base op header plus the generated kernel library for every op in `srcs`.
deps = if_mlir_generated_gpu_kernels_enabled([
":base_gpu_op",
":gpu_add_v2_kernels",
":gpu_atan2_kernels",
":gpu_bitwise_and_kernels",
":gpu_bitwise_or_kernels",
":gpu_bitwise_xor_kernels",
":gpu_complex_kernels",
":gpu_div_kernels",
":gpu_div_no_nan_kernels",
":gpu_equal_kernels",
":gpu_floor_div_kernels",
":gpu_greater_equal_kernels",
":gpu_greater_kernels",
":gpu_left_shift_kernels",
":gpu_less_equal_kernels",
":gpu_less_kernels",
":gpu_logical_and_kernels",
":gpu_logical_or_kernels",
":gpu_maximum_kernels",
":gpu_minimum_kernels",
":gpu_mul_kernels",
":gpu_mul_no_nan_kernels",
":gpu_not_equal_kernels",
":gpu_polygamma_kernels",
":gpu_pow_kernels",
":gpu_right_shift_kernels",
":gpu_select_v2_kernels",
":gpu_squared_difference_kernels",
":gpu_sub_kernels",
":gpu_xdivy_kernels",
":gpu_xlog1py_kernels",
":gpu_xlogy_kernels",
":gpu_zeta_kernels",
"//third_party/eigen3",
]),
)
# Kernel library bundling the CPU binary element-wise ops of this package.
# The list passed to if_mlir_generated_cpu_kernels_enabled is empty, so every
# source and dependency currently comes from the experimental-kernels list.
# cpu_op_add.cc is paired with :cpu_add_v2_kernels.
# This target is a direct dependency of :cwise_op.
tf_kernel_library(
name = "cpu_cwise_binary_op",
srcs = if_mlir_generated_cpu_kernels_enabled([
]) + if_mlir_generated_experimental_kernels_enabled([
"cpu_op_add.cc",
"cpu_op_bitwise_and.cc",
"cpu_op_bitwise_or.cc",
"cpu_op_bitwise_xor.cc",
"cpu_op_left_shift.cc",
"cpu_op_right_shift.cc",
]),
# "manual" keeps this target out of wildcard patterns such as //...
tags = ["manual"],
# Base op header plus the generated kernel library for every op in `srcs`.
deps = if_mlir_generated_cpu_kernels_enabled([
]) + if_mlir_generated_experimental_kernels_enabled([
":base_cpu_op",
":cpu_add_v2_kernels",
":cpu_bitwise_and_kernels",
":cpu_bitwise_or_kernels",
":cpu_bitwise_xor_kernels",
":cpu_left_shift_kernels",
":cpu_right_shift_kernels",
"//third_party/eigen3",
]),
)
# Aggregate target for the element-wise kernel libraries of this package.
# The CPU libraries are always listed; the GPU libraries are appended through
# if_cuda_or_rocm. Visible to the kernels tree and to the ":friends" group.
tf_kernel_library(
    name = "cwise_op",
    srcs = [],
    visibility = [
        ":friends",
        "//tensorflow/core/kernels:__subpackages__",
    ],
    deps = [
        ":cpu_cwise_binary_op",
        ":cpu_cwise_unary_op",
    ] + if_cuda_or_rocm([
        ":gpu_cwise_unary_op",
        ":gpu_cwise_binary_op",
    ]),
)
# GPU cast op library; sources and deps are both gated by
# if_mlir_generated_gpu_kernels_enabled. Tagged "manual", so it is only built
# when something (e.g. :cast_op) depends on it or it is named explicitly.
tf_kernel_library(
    name = "gpu_cast_op",
    srcs = if_mlir_generated_gpu_kernels_enabled(["gpu_op_cast.cc"]),
    tags = ["manual"],
    deps = if_mlir_generated_gpu_kernels_enabled([
        ":base_gpu_op",
        ":gpu_cast_kernels",
        "//third_party/eigen3",
    ]),
)
# Source-less wrapper: depends on :gpu_cast_op only through if_cuda_or_rocm.
tf_kernel_library(
    name = "cast_op",
    srcs = [],
    deps = [] + if_cuda_or_rocm([":gpu_cast_op"]),
)
# GPU ones_like / zeros_like op library. Unlike the other GPU op libraries in
# this file, its sources and deps are gated by the experimental-kernels helper.
tf_kernel_library(
    name = "gpu_constant_op",
    srcs = if_mlir_generated_experimental_kernels_enabled([
        "gpu_op_ones_like.cc",
        "gpu_op_zeros_like.cc",
    ]),
    tags = ["manual"],
    deps = if_mlir_generated_experimental_kernels_enabled([
        ":base_gpu_op",
        ":gpu_ones_like_kernels",
        ":gpu_zeros_like_kernels",
        "//third_party/eigen3",
    ]),
)
# Source-less wrapper: depends on :gpu_constant_op only through
# if_cuda_or_rocm.
tf_kernel_library(
    name = "constant_op",
    srcs = [],
    deps = [] + if_cuda_or_rocm([":gpu_constant_op"]),
)
# GPU next_after op library; sources and deps are both gated by
# if_mlir_generated_gpu_kernels_enabled.
tf_kernel_library(
    name = "gpu_nextafter_op",
    srcs = if_mlir_generated_gpu_kernels_enabled(["gpu_op_next_after.cc"]),
    tags = ["manual"],
    deps = if_mlir_generated_gpu_kernels_enabled([
        ":base_gpu_op",
        ":gpu_next_after_kernels",
    ]),
)
# Source-less wrapper: depends on :gpu_nextafter_op only through
# if_cuda_or_rocm.
tf_kernel_library(
    name = "nextafter_op",
    srcs = [],
    deps = [] + if_cuda_or_rocm([
        ":gpu_nextafter_op",
    ]),
)
# GPU elu / relu / selu op library; sources and deps are both gated by
# if_mlir_generated_gpu_kernels_enabled.
tf_kernel_library(
    name = "gpu_relu_op",
    srcs = if_mlir_generated_gpu_kernels_enabled([
        "gpu_op_elu.cc",
        "gpu_op_relu.cc",
        "gpu_op_selu.cc",
    ]),
    tags = ["manual"],
    deps = if_mlir_generated_gpu_kernels_enabled([
        ":base_gpu_op",
        ":gpu_elu_kernels",
        ":gpu_relu_kernels",
        ":gpu_selu_kernels",
        "//third_party/eigen3",
    ]),
)
# Source-less wrapper: depends on :gpu_relu_op only through if_cuda_or_rocm.
tf_kernel_library(
    name = "relu_op",
    srcs = [],
    deps = [] + if_cuda_or_rocm([":gpu_relu_op"]),
)
# GPU softplus op library; sources and deps are both gated by
# if_mlir_generated_gpu_kernels_enabled.
tf_kernel_library(
    name = "gpu_softplus_op",
    srcs = if_mlir_generated_gpu_kernels_enabled(["gpu_op_softplus.cc"]),
    tags = ["manual"],
    deps = if_mlir_generated_gpu_kernels_enabled([
        ":base_gpu_op",
        ":gpu_softplus_kernels",
        "//third_party/eigen3",
    ]),
)
# Source-less wrapper: depends on :gpu_softplus_op only through
# if_cuda_or_rocm.
tf_kernel_library(
    name = "softplus_op",
    srcs = [],
    deps = [] + if_cuda_or_rocm([":gpu_softplus_op"]),
)
# GPU softsign op library; sources and deps are both gated by
# if_mlir_generated_gpu_kernels_enabled.
tf_kernel_library(
    name = "gpu_softsign_op",
    srcs = if_mlir_generated_gpu_kernels_enabled(["gpu_op_softsign.cc"]),
    tags = ["manual"],
    deps = if_mlir_generated_gpu_kernels_enabled([
        ":base_gpu_op",
        ":gpu_softsign_kernels",
        "//third_party/eigen3",
    ]),
)
# Source-less wrapper: depends on :gpu_softsign_op only through
# if_cuda_or_rocm.
tf_kernel_library(
    name = "softsign_op",
    srcs = [],
    deps = [] + if_cuda_or_rocm([":gpu_softsign_op"]),
)
# Test-only support library (base_ops_test.cc / .h) shared by the unary and
# binary op test helpers below.
cc_library(
    name = "base_ops_test",
    testonly = 1,
    srcs = ["base_ops_test.cc"],
    hdrs = ["base_ops_test.h"],
    deps = [
        "//tensorflow/core:framework",
        "//tensorflow/core:tensorflow",
        "@com_google_absl//absl/container:inlined_vector",
        "@com_google_absl//absl/strings",
        "@llvm-project//llvm:Support",
    ],
)
# Test-only header library (base_unary_ops_test.h) used by the GPU and CPU
# unary op tests. Carries the test main and the cwise_op kernels as deps.
cc_library(
    name = "base_unary_ops_test",
    testonly = 1,
    hdrs = ["base_unary_ops_test.h"],
    deps = [
        ":base_ops_test",
        "//tensorflow/compiler/mlir/tools/kernel_gen:tf_jit_cache",
        "//tensorflow/core:framework",
        "//tensorflow/core:framework_internal",
        "//tensorflow/core:tensorflow",
        "//tensorflow/core:test",
        "//tensorflow/core:test_main",
        "//tensorflow/core:testlib",
        "//tensorflow/core/framework:types_proto_cc",
        "//tensorflow/core/kernels:cwise_op",
        "//tensorflow/core/kernels:ops_testutil",
        "@com_google_absl//absl/container:inlined_vector",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/types:optional",
        "@llvm-project//llvm:Support",
    ],
)
# GPU test for the unary ops. The experimental and GPU kernel defines are each
# added through their respective if_mlir_generated_* helper.
tf_cuda_cc_test(
    name = "gpu_unary_ops_test",
    size = "small",
    srcs = ["gpu_unary_ops_test.cc"],
    extra_copts = if_mlir_generated_experimental_kernels_enabled(
        ["-DMLIR_GENERATED_EXPERIMENTAL_KERNELS_ENABLED"],
    ) + if_mlir_generated_gpu_kernels_enabled(
        ["-DMLIR_GENERATED_GPU_KERNELS_ENABLED"],
    ),
    tags = tf_cuda_tests_tags() + [
        "no_cuda_asan",  # TODO(b/171341759): re-enable.
        "no_cuda",  # TODO(b/196608406): re-enable
    ],
    deps = [
        ":base_ops_test",
        ":base_unary_ops_test",
        "//tensorflow/core/common_runtime:device",
        "//tensorflow/core/common_runtime:device_factory",
    ],
)
# CPU test for the unary ops; shares its helpers with the GPU variant.
tf_cc_test(
    name = "cpu_unary_ops_test",
    size = "small",
    srcs = ["cpu_unary_ops_test.cc"],
    deps = [
        ":base_ops_test",
        ":base_unary_ops_test",
        "//tensorflow/core/common_runtime:device",
        "//tensorflow/core/common_runtime:device_factory",
    ],
)
# Test-only header library (base_binary_ops_test.h) used by the GPU and CPU
# binary op tests. Carries the test main and the cwise_op kernels as deps.
cc_library(
    name = "base_binary_ops_test",
    testonly = 1,
    hdrs = ["base_binary_ops_test.h"],
    deps = [
        ":base_ops_test",
        "//tensorflow/core:framework",
        "//tensorflow/core:framework_internal",
        "//tensorflow/core:tensorflow",
        "//tensorflow/core:test",
        "//tensorflow/core:test_main",
        "//tensorflow/core:testlib",
        "//tensorflow/core/common_runtime:device",
        "//tensorflow/core/common_runtime:device_factory",
        "//tensorflow/core/framework:types_proto_cc",
        "//tensorflow/core/kernels:cwise_op",
        "//tensorflow/core/kernels:ops_testutil",
        "@com_google_absl//absl/container:inlined_vector",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/types:optional",
        "@llvm-project//llvm:Support",
    ],
)
# GPU test for the binary ops. The GPU and experimental kernel defines are each
# added through their respective if_mlir_generated_* helper.
tf_cuda_cc_test(
    name = "gpu_binary_ops_test",
    size = "medium",
    srcs = ["gpu_binary_ops_test.cc"],
    extra_copts = if_mlir_generated_gpu_kernels_enabled(
        ["-DMLIR_GENERATED_GPU_KERNELS_ENABLED"],
    ) + if_mlir_generated_experimental_kernels_enabled(
        ["-DMLIR_GENERATED_EXPERIMENTAL_KERNELS_ENABLED"],
    ),
    tags = tf_cuda_tests_tags() + [
        "no_cuda_asan",  # b/173033461
    ],
    deps = [
        ":base_binary_ops_test",
        ":base_ops_test",
        "//tensorflow/core/common_runtime:device",
        "//tensorflow/core/common_runtime:device_factory",
    ],
)
# CPU test for the binary ops; shares its helpers with the GPU variant.
tf_cc_test(
    name = "cpu_binary_ops_test",
    size = "medium",
    srcs = ["cpu_binary_ops_test.cc"],
    deps = [
        ":base_binary_ops_test",
        ":base_ops_test",
        "//tensorflow/core/common_runtime:device",
        "//tensorflow/core/common_runtime:device_factory",
    ],
)
# TODO(b/160731748): Re-enable when it works again.
# gpu_kernel_library(
# name = "gpu_bias_add_kernels",
# op = "bias_add",
# tile_size = "16x16",
# types = [
# "f16",
# "f32",
# "f64",
# ],
# )
# relu, elu, selu and sigmoid share one configuration: tile size 256 over
# f16/f32/f64, with no unroll factor set. Declares gpu_<op>_kernels for each.
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        tile_size = "256",
        types = ["f16", "f32", "f64"],
    )
    for op in [
        "relu",
        "elu",
        "selu",
        "sigmoid",
    ]
]
# TODO(b/25387198): Add an int32 kernel.
# GPU `abs` kernels for the floating-point types plus i64.
gpu_kernel_library(
    name = "gpu_abs_kernels",
    op = "abs",
    tile_size = "256",
    types = ["f16", "f32", "f64", "i64"],
    unroll_factors = "4",
)
# GPU `acos` kernels for f16/f32/f64.
gpu_kernel_library(
    name = "gpu_acos_kernels",
    op = "acos",
    tile_size = "256",
    types = ["f16", "f32", "f64"],
    # No unroll_factors: cannot vectorize.
)
# GPU `acosh` kernels for f16/f32/f64.
gpu_kernel_library(
    name = "gpu_acosh_kernels",
    op = "acosh",
    tile_size = "256",
    types = ["f16", "f32", "f64"],
    # Unrolling left disabled: may be compute-bound.
    # unroll_factors = "4",
)
# GPU `angle` kernels: complex inputs (c64, c128) paired position-wise with
# real outputs (f32, f64).
gpu_kernel_library(
    name = "gpu_angle_kernels",
    op = "angle",
    output_types = ["f32", "f64"],
    tile_size = "256",
    types = ["c64", "c128"],
    unroll_factors = "2",
)
# GPU `asin` and `asinh` kernels for f16/f32/f64. Both use the same
# configuration and neither sets unroll_factors (cannot vectorize).
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        tile_size = "256",
        types = ["f16", "f32", "f64"],
        # Cannot vectorize.
    )
    for op in [
        "asin",
        "asinh",
    ]
]
# GPU `atan` and `atanh` kernels for f16/f32/f64, unrolled by 4.
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        tile_size = "256",
        types = ["f16", "f32", "f64"],
        unroll_factors = "4",
    )
    for op in [
        "atan",
        "atanh",
    ]
]
# GPU `conj` kernels for the complex types.
gpu_kernel_library(
    name = "gpu_conj_kernels",
    op = "conj",
    tile_size = "256",
    types = ["c64", "c128"],
    unroll_factors = "2",
)
# GPU `cosh` kernels for f16/f32/f64.
gpu_kernel_library(
    name = "gpu_cosh_kernels",
    op = "cosh",
    tile_size = "256",
    types = ["f16", "f32", "f64"],
    # Unrolling left disabled: may be compute-bound.
    # unroll_factors = "4",
)
# GPU `erf` and `erfc` kernels for f16/f32/f64, unrolled by 4.
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        tile_size = "256",
        types = ["f16", "f32", "f64"],
        unroll_factors = "4",
    )
    for op in [
        "erf",
        "erfc",
    ]
]
# GPU `imag` kernels: complex inputs (c64, c128) paired position-wise with
# real outputs (f32, f64).
gpu_kernel_library(
    name = "gpu_imag_kernels",
    op = "imag",
    output_types = ["f32", "f64"],
    tile_size = "256",
    types = ["c64", "c128"],
)
# GPU `logical_not` kernel; boolean (i1) only.
gpu_kernel_library(
    name = "gpu_logical_not_kernels",
    op = "logical_not",
    tile_size = "256",
    types = [
        "i1",
    ],
)
# GPU `real` kernels: complex inputs (c64, c128) paired position-wise with
# real outputs (f32, f64).
gpu_kernel_library(
    name = "gpu_real_kernels",
    op = "real",
    output_types = ["f32", "f64"],
    tile_size = "256",
    types = ["c64", "c128"],
)
# GPU `reciprocal` kernels for complex, floating-point and i64 types.
gpu_kernel_library(
    name = "gpu_reciprocal_kernels",
    op = "reciprocal",
    tile_size = "256",
    types = ["c64", "c128", "f16", "f32", "f64", "i64"],
    unroll_factors = "4",
)
# GPU `polygamma` kernels; f32 and f64 only.
gpu_kernel_library(
    name = "gpu_polygamma_kernels",
    op = "polygamma",
    tile_size = "256",
    types = ["f32", "f64"],
)
# GPU `digamma` and `lgamma` kernels for f16/f32/f64; no unroll factor set.
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        tile_size = "256",
        types = ["f16", "f32", "f64"],
    )
    for op in [
        "digamma",
        "lgamma",
    ]
]
# GPU `sign` kernels for floating-point, i32/i64 and complex types.
gpu_kernel_library(
    name = "gpu_sign_kernels",
    op = "sign",
    tile_size = "256",
    types = ["f16", "f32", "f64", "i32", "i64", "c64", "c128"],
    unroll_factors = "4",
)
# GPU `sinh` kernels for f16/f32/f64.
gpu_kernel_library(
    name = "gpu_sinh_kernels",
    op = "sinh",
    tile_size = "256",
    types = ["f16", "f32", "f64"],
    # Unrolling left disabled: may be compute-bound.
    # unroll_factors = "4",
)
# GPU `square` and `squared_difference` kernels for floating-point types and
# i64; tile size 1024, unrolled by 4.
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        tile_size = "1024",
        types = ["f16", "f32", "f64", "i64"],
        unroll_factors = "4",
    )
    for op in ["square", "squared_difference"]
]
# GPU `add_v2` kernels for floating-point, signed integer and complex types.
gpu_kernel_library(
    name = "gpu_add_v2_kernels",
    op = "add_v2",
    tile_size = "1024",
    types = [
        "f16", "f32", "f64",
        "i8", "i16", "i32", "i64",
        "c64", "c128",
    ],
    unroll_factors = "4",
)
# GPU `sub` kernels. i16 is listed under `jit_types` rather than `types`.
gpu_kernel_library(
    name = "gpu_sub_kernels",
    jit_types = ["i16"],
    op = "sub",
    tile_size = "1024",
    types = [
        "f16", "f32", "f64",
        "i32", "i64",
        "c64", "c128",
    ],
    unroll_factors = "4",
)
# GPU `complex` kernels: real inputs (f32, f64) paired position-wise with
# complex outputs (c64, c128).
gpu_kernel_library(
    name = "gpu_complex_kernels",
    op = "complex",
    output_types = ["c64", "c128"],
    tile_size = "1024",
    types = ["f32", "f64"],
    unroll_factors = "2",
)
# GPU `complex_abs` kernels: complex inputs (c64, c128) paired position-wise
# with real outputs (f32, f64).
gpu_kernel_library(
    name = "gpu_complex_abs_kernels",
    op = "complex_abs",
    output_types = ["f32", "f64"],
    tile_size = "256",
    types = ["c64", "c128"],
)
# GPU `div` kernels; no unroll factor set.
gpu_kernel_library(
    name = "gpu_div_kernels",
    op = "div",
    tile_size = "1024",
    types = [
        "f16", "f32", "f64",
        "i16", "i64",
        "ui8", "ui16",
        "c64", "c128",
    ],
)
# GPU `mul` kernels for floating-point, complex and signed integer types.
gpu_kernel_library(
    name = "gpu_mul_kernels",
    op = "mul",
    tile_size = "1024",
    types = [
        "f16", "f32", "f64",
        "c64", "c128",
        "i8", "i16", "i32", "i64",
    ],
    # Unrolling is switched off for the complex MulOp kernels because it would
    # only slow them down.
    types_with_unrolling_disabled = ["c64", "c128"],
    unroll_factors = "4",
)
# GPU `mul_no_nan` kernels for floating-point and complex types.
gpu_kernel_library(
    name = "gpu_mul_no_nan_kernels",
    op = "mul_no_nan",
    tile_size = "1024",
    types = [
        "f16", "f32", "f64",
        "c64", "c128",
    ],
    # Unrolling is switched off for the complex MulNoNanOp kernels because it
    # would only slow them down.
    types_with_unrolling_disabled = ["c64", "c128"],
    unroll_factors = "4",
)
# Bitwise operations.
# Declares gpu_<op>_kernels for the bitwise ops (and `invert`, `left_shift`)
# over the signed integer types; tile size 1024, unrolled by 4.
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        tile_size = "1024",
        types = ["i8", "i16", "i32", "i64"],
        unroll_factors = "4",
    )
    for op in [
        "bitwise_and",
        "bitwise_or",
        "bitwise_xor",
        "invert",
        "left_shift",
    ]
]
# GPU `right_shift` kernels; unlike the other bitwise ops, unsigned integer
# types are listed as well.
gpu_kernel_library(
    name = "gpu_right_shift_kernels",
    op = "right_shift",
    tile_size = "1024",
    types = [
        "i8", "i16", "i32", "i64",
        "ui8", "ui16", "ui32", "ui64",
    ],
    unroll_factors = "4",
)
# GPU `atan2` kernels for f16/f32/f64. The tile size is given as a
# three-component string ("256,1,1") here.
gpu_kernel_library(
    name = "gpu_atan2_kernels",
    op = "atan2",
    tile_size = "256,1,1",
    types = ["f16", "f32", "f64"],
    unroll_factors = "4",
)
# Logical operations.
# Declares gpu_logical_and_kernels and gpu_logical_or_kernels; boolean (i1)
# only, no unroll factor set.
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        tile_size = "1024",
        types = ["i1"],
    )
    for op in ["logical_and", "logical_or"]
]
# Declares gpu_equal_kernels and gpu_not_equal_kernels. Each of the 10 input
# types is matched with an i1 output type.
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        output_types = ["i1"] * 10,
        tile_size = "1024",
        types = [
            "c64", "c128",
            "f16", "f32", "f64",
            "i1", "i8", "i16", "i32", "i64",
        ],
        unroll_factors = "4",
    )
    for op in ["equal", "not_equal"]
]
# Declares the ordering-comparison kernel libraries. Each of the 10 input
# types is matched with an i1 output type.
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        output_types = ["i1"] * 10,
        tile_size = "1024",
        types = [
            "f16", "f32", "f64",
            "i8", "i16", "i64",
            "ui8", "ui16", "ui32", "ui64",
        ],
        unroll_factors = "4",
    )
    for op in [
        "less",
        "less_equal",
        "greater",
        "greater_equal",
    ]
]
# Declares gpu_maximum_kernels and gpu_minimum_kernels.
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        tile_size = "1024",
        types = [
            "f16", "f32", "f64",
            "i16", "i64",
            "ui8",
        ],
        unroll_factors = "4",
    )
    for op in ["maximum", "minimum"]
]
# GPU `neg` kernels for floating-point, signed integer and complex types.
gpu_kernel_library(
    name = "gpu_neg_kernels",
    op = "neg",
    tile_size = "256",
    types = [
        "f16", "f32", "f64",
        "i8", "i16", "i32", "i64",
        "c64", "c128",
    ],
    unroll_factors = "4",
)
# GPU `floor_div` kernels; floating-point types only for now.
gpu_kernel_library(
    name = "gpu_floor_div_kernels",
    op = "floor_div",
    tile_size = "1024",
    # TODO(b/196539835): Also enable integer types once unsigned integers and
    # unranked tensors are supported.
    types = ["f16", "f32", "f64"],
    unroll_factors = "4",
)
# Kernels that support all floating-point types.
# Declares gpu_<op>_kernels for each op below over f16/f32/f64; tile size 256,
# unrolled by 4.
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        tile_size = "256",
        types = ["f16", "f32", "f64"],
        unroll_factors = "4",
    )
    for op in [
        "ceil",
        "expm1",
        "floor",
        "log",
        "log1p",
        "relu_grad",
        "rsqrt",
        "softplus",
        "softsign",
        "sqrt",
        "tanh",
    ]
]
# GPU `exp` kernels for floating-point and complex types.
gpu_kernel_library(
    name = "gpu_exp_kernels",
    op = "exp",
    tile_size = "256",
    types = [
        "f16", "f32", "f64",
        "c64", "c128",
    ],
    unroll_factors = "4",
)
# Kernels that support all floating-point types but have i1 output.
# Declares the is_finite / is_inf / is_nan kernel libraries. Each of the three
# floating-point input types is matched with an i1 output type.
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        output_types = ["i1"] * 3,
        tile_size = "256",
        types = ["f16", "f32", "f64"],
        unroll_factors = "4",
    )
    for op in ["is_finite", "is_inf", "is_nan"]
]
# Kernels that support all floating-point types but cannot be vectorized.
# Declares the cos / sin / tan kernel libraries over f16/f32/f64. No unroll
# factor is set because these cannot be vectorized.
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        tile_size = "256",
        types = ["f16", "f32", "f64"],
    )
    for op in ["cos", "sin", "tan"]
]
# The set of element types that `cast` converts between.
_GPU_CAST_TYPES = [
    "i1",
    "i8",
    "i16",
    "i32",
    "i64",
    "ui8",
    "ui16",
    "ui32",
    "ui64",
    "f16",
    "f32",
    "f64",
]

# GPU `cast` kernels for every (input type, output type) combination drawn
# from _GPU_CAST_TYPES. `types` and `output_types` are parallel lists:
#   * `types` repeats each input type once per output type, and
#   * `output_types` repeats the full type list once per input type,
# so position i of both lists names one input/output pair.
gpu_kernel_library(
    name = "gpu_cast_kernels",
    op = "cast",
    output_types = _GPU_CAST_TYPES * len(_GPU_CAST_TYPES),
    tile_size = "256",
    types = [
        in_type
        for in_type in _GPU_CAST_TYPES
        for _ in _GPU_CAST_TYPES
    ],
)
# GPU `pow` kernels for floating-point types and i64; no unroll factor set.
gpu_kernel_library(
    name = "gpu_pow_kernels",
    op = "pow",
    tile_size = "1024",
    types = ["f16", "f32", "f64", "i64"],
)
# GPU `zeta` kernels; f32 and f64 only.
gpu_kernel_library(
    # The zeta kernels need many registers, so tile at 256.
    name = "gpu_zeta_kernels",
    op = "zeta",
    tile_size = "256",
    types = ["f32", "f64"],
    # TODO(b/178388085): Enable unrolling after vectorization is fixed.
    # unroll_factors = "4",
)
# GPU `select_v2` kernels. This is the only kernel library in this file that
# sets `max_supported_rank`.
gpu_kernel_library(
    name = "gpu_select_v2_kernels",
    max_supported_rank = 8,
    op = "select_v2",
    tile_size = "256",
    types = [
        "i1", "i32", "i64",
        "f16", "f32", "f64",
        "c64", "c128",
    ],
)
# CPU `abs` kernels for floating-point and signed integer types.
cpu_kernel_library(
    name = "cpu_abs_kernels",
    op = "abs",
    tile_size = "256",
    types = [
        "f16", "f32", "f64",
        "i8", "i16", "i32", "i64",
    ],
    unroll_factors = "4",
)
# CPU `invert` kernels for the signed integer types.
cpu_kernel_library(
    name = "cpu_invert_kernels",
    op = "invert",
    tile_size = "256",
    types = ["i8", "i16", "i32", "i64"],
    unroll_factors = "4",
)
# CPU kernels that share one configuration: f16/f32/f64, tile size 256,
# unrolled by 4. Declares cpu_<op>_kernels for each op below.
[
    cpu_kernel_library(
        name = "cpu_" + op + "_kernels",
        op = op,
        tile_size = "256",
        types = ["f16", "f32", "f64"],
        unroll_factors = "4",
    )
    for op in [
        "ceil",
        "cos",
        "floor",
        "rsqrt",
        "sin",
        "sqrt",
    ]
]
# CPU `square` kernels for floating-point, i32/i64 and complex types.
cpu_kernel_library(
    name = "cpu_square_kernels",
    op = "square",
    tile_size = "256",
    types = [
        "f16", "f32", "f64",
        "i32", "i64",
        "c64", "c128",
    ],
    unroll_factors = "4",
)
# CPU `tan` kernels for f16/f32/f64.
cpu_kernel_library(
    name = "cpu_tan_kernels",
    op = "tan",
    tile_size = "256",
    types = ["f16", "f32", "f64"],
    unroll_factors = "4",
)
# CPU `add_v2` kernels for floating-point, signed integer and complex types.
cpu_kernel_library(
    name = "cpu_add_v2_kernels",
    op = "add_v2",
    tile_size = "1024",
    types = [
        "f16", "f32", "f64",
        "i8", "i16", "i32", "i64",
        "c64", "c128",
    ],
    unroll_factors = "4",
)
# Binary bitwise operations.
# Declares cpu_<op>_kernels for the binary bitwise ops over the signed integer
# types; tile size 1024, unrolled by 4.
[
    cpu_kernel_library(
        name = "cpu_" + op + "_kernels",
        op = op,
        tile_size = "1024",
        types = ["i8", "i16", "i32", "i64"],
        unroll_factors = "4",
    )
    for op in [
        "bitwise_and",
        "bitwise_or",
        "bitwise_xor",
        "left_shift",
    ]
]
# CPU `right_shift` kernels; unlike the other CPU bitwise ops, unsigned
# integer types are listed as well.
cpu_kernel_library(
    name = "cpu_right_shift_kernels",
    op = "right_shift",
    tile_size = "1024",
    types = [
        "i8", "i16", "i32", "i64",
        "ui8", "ui16", "ui32", "ui64",
    ],
    unroll_factors = "4",
)
# GPU `xlogy` kernels for floating-point and complex types.
gpu_kernel_library(
    name = "gpu_xlogy_kernels",
    op = "xlogy",
    tile_size = "1024",
    types = [
        "f16", "f32", "f64",
        "c64", "c128",
    ],
    # Unrolling is switched off for the complex XlogyOp kernels because it
    # would only slow them down.
    types_with_unrolling_disabled = ["c64", "c128"],
    unroll_factors = "4",
)
# GPU `xdivy` kernels for floating-point and complex types.
# NOTE(review): unlike xlogy and xlog1py, no types_with_unrolling_disabled is
# set for the complex types here; confirm whether that is intentional.
gpu_kernel_library(
    name = "gpu_xdivy_kernels",
    op = "xdivy",
    tile_size = "1024",
    types = [
        "f16", "f32", "f64",
        "c64", "c128",
    ],
    unroll_factors = "4",
)
# GPU `xlog1py` kernels for floating-point and complex types.
gpu_kernel_library(
    name = "gpu_xlog1py_kernels",
    op = "xlog1py",
    tile_size = "1024",
    types = [
        "f16", "f32", "f64",
        "c64", "c128",
    ],
    # Unrolling is switched off for the complex Xlog1pyOp kernels because it
    # would only slow them down.
    types_with_unrolling_disabled = ["c64", "c128"],
    unroll_factors = "4",
)
# CPU `angle` kernels: complex inputs (c64, c128) paired position-wise with
# real outputs (f32, f64).
cpu_kernel_library(
    name = "cpu_angle_kernels",
    op = "angle",
    output_types = ["f32", "f64"],
    tile_size = "256",
    types = ["c64", "c128"],
    unroll_factors = "2",
)
# CPU `tanh` kernels for f16/f32/f64.
cpu_kernel_library(
    name = "cpu_tanh_kernels",
    op = "tanh",
    tile_size = "256",
    types = ["f16", "f32", "f64"],
    unroll_factors = "4",
)
# GPU `div_no_nan` kernels for floating-point and complex types; no unroll
# factor set.
gpu_kernel_library(
    name = "gpu_div_no_nan_kernels",
    op = "div_no_nan",
    tile_size = "1024",
    types = [
        "f16", "f32", "f64",
        "c64", "c128",
    ],
)
# GPU `rint` kernels. f16 is listed under `jit_types` rather than `types`.
gpu_kernel_library(
    name = "gpu_rint_kernels",
    jit_types = ["f16"],
    op = "rint",
    tile_size = "1024",
    types = ["f32", "f64"],
)
# GPU `round` kernels for floating-point types and i32/i64.
gpu_kernel_library(
    name = "gpu_round_kernels",
    op = "round",
    tile_size = "1024",
    types = [
        "f16", "f32", "f64",
        "i32", "i64",
    ],
)
# Declares gpu_zeros_like_kernels and gpu_ones_like_kernels, which share one
# configuration: i1, i64 and the floating-point types at tile size 1024.
[
    gpu_kernel_library(
        name = "gpu_" + op + "_kernels",
        op = op,
        tile_size = "1024",
        types = [
            "i1",
            "i64",
            "f16",
            "f32",
            "f64",
            # TODO(b/190374484): Enable it for complex types once it is supported.
            # "c64",
            # "c128",
        ],
    )
    for op in [
        "zeros_like",
        "ones_like",
    ]
]
# GPU `next_after` kernels; f32 and f64 only.
gpu_kernel_library(
    name = "gpu_next_after_kernels",
    op = "next_after",
    tile_size = "1024",
    types = ["f32", "f64"],
)