blob: 851e0b6bf0b5638bd645f0475563ba568188f588 [file] [log] [blame]
"""Generates cubin headers for TF dialect ops."""
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_gpu_architectures")
load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud")
load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
# Returns a select() that yields `if_false` when the
# mlir_generated_gpu_kernels_disabled config setting matches, and `if_true`
# in every other configuration (the "//conditions:default" branch).
# NOTE(review): the closing "})" of the select() call is not present in this
# view; confirm against the upstream file.
def if_mlir_generated_gpu_kernels_enabled(if_true, if_false = []):
return select({
"//tensorflow/core/kernels/mlir_generated:mlir_generated_gpu_kernels_disabled": if_false,
"//conditions:default": if_true,
def _lookup_file(filegroup, path):
    """Finds the file in a filegroup whose path ends with the given path.

    A candidate matches when its path ends with `path` itself or with
    `path + ".exe"`.

    Args:
      filegroup: Target whose `files` depset is searched.
      path: (Relative) path suffix to look for.

    Returns:
      The first matching file in `filegroup.files.to_list()` order, or None
      when nothing matches.
    """
    exe_path = path + ".exe"  # Built once rather than on every iteration.
    for candidate in filegroup.files.to_list():
        if candidate.path.endswith(path) or candidate.path.endswith(exe_path):
            return candidate
    return None
# Provider declaring a single field, `gpu_bins`.
# NOTE(review): the closing ")" of the provider() call is not present in this
# view; confirm against the upstream file.
GpuBinaryInfo = provider(
"GPU binaries in either cubin format or hsaco format",
fields = ["gpu_bins"],
# Maps short type names to their MLIR spelling. _get_mlir_type consults this
# table and passes any name that is not listed here through unchanged.
# NOTE(review): the closing "}" of the dict is not present in this view, so
# further entries may exist; confirm against the upstream file.
type_to_mlir = {
"c64": "complex<f32>",
"c128": "complex<f64>",
def _get_mlir_type(type):
    """Returns the MLIR type corresponding to `type`.

    Args:
      type: Short type name, e.g. "f32" or "c64".

    Returns:
      The entry of `type_to_mlir` for `type` if there is one; otherwise `type`
      itself, unchanged.
    """

    # Single lookup with a fallback instead of a membership test plus indexing.
    return type_to_mlir.get(type, type)
# Rule implementation that instantiates an MLIR op template with a shell
# pipeline: the template is piped through a chain of sed substitutions and the
# result is redirected into the declared output.
#
# Placeholders rewritten, in pipeline order:
#   platform, _elem_type (replacement keeps the leading "_"), elem_type,
#   _output_type (replacement keeps the leading "_"), output_type.
#
# NOTE(review): the tuple that fills the %s slots, and the closing of the
# run_shell() call, are not present in this view. Presumably the
# underscore-prefixed placeholders receive the raw attribute values and the
# bare ones receive mlir_type / mlir_output_type — confirm against the
# upstream file.
def _gen_mlir_op_impl(ctx):
# Translate the short type names (e.g. "c64") into MLIR type spellings.
mlir_type = _get_mlir_type(ctx.attr.type)
mlir_output_type = _get_mlir_type(ctx.attr.output_type)
cmd = ctx.actions.run_shell(
inputs = [ctx.file.template],
outputs = [ctx.outputs.out],
command = (
(("cat %s | sed 's/platform/%s/g' | sed 's/_elem_type/_%s/g' | " +
"sed 's/elem_type/%s/g' | " + "sed 's/_output_type/_%s/g' | " +
"sed 's/output_type/%s/g' > %s")) % (
# Rule wrapping _gen_mlir_op_impl. All attributes are mandatory:
#   template:    single-file label of the template to instantiate.
#   type:        string attribute (read as ctx.attr.type by the implementation).
#   output_type: string attribute (read as ctx.attr.output_type).
#   platform:    string attribute.
#   out:         the output file the implementation writes.
# NOTE(review): the closing "}" of attrs and ")" of rule() are not present in
# this view; confirm against the upstream file.
_gen_mlir_op_rule = rule(
implementation = _gen_mlir_op_impl,
output_to_genfiles = True,
attrs = {
"template": attr.label(mandatory = True, allow_single_file = True),
"type": attr.string(mandatory = True),
"output_type": attr.string(mandatory = True),
"platform": attr.string(mandatory = True),
"out": attr.output(mandatory = True),
# Macro that derives the names used to instantiate one op template for a
# given (name, platform, type, output_type) combination:
#   target name: "generate_{name}_{platform}_{type}_{output_type}_mlir"
#   template:    "op_definitions/{name}.mlir.tmpl"
#   output file: "{name}_{platform}_{type}_{output_type}.mlir"
# NOTE(review): the call these keyword arguments belong to (presumably
# _gen_mlir_op_rule) and the closing parentheses of the .format() calls are
# not present in this view; confirm against the upstream file.
def _gen_mlir_op(name, type, platform, output_type):
name = "generate_{name}_{platform}_{type}_{output_type}_mlir".format(
name = name,
platform = platform,
type = type,
output_type = output_type,
template = "op_definitions/{name}.mlir.tmpl".format(name = name),
platform = platform,
type = type,
output_type = output_type,
out = "{name}_{platform}_{type}_{output_type}.mlir".format(
name = name,
platform = platform,
type = type,
output_type = output_type,
# Kernels build rules.
# Returns a select() that yields `if_true` when the
# mlir_experimental_kernels_enabled config setting matches, and `if_false`
# in every other configuration (the "//conditions:default" branch).
# NOTE(review): the closing "})" of the select() call is not present in this
# view; confirm against the upstream file.
def if_mlir_experimental_kernels_enabled(if_true, if_false = []):
return select({
"//tensorflow/core/kernels/mlir_generated:mlir_experimental_kernels_enabled": if_true,
"//conditions:default": if_false,
# Rule implementation that turns one MLIR op file into a kernel object file
# (ctx.outputs.kernel) and exposes that object through CcInfo.
#
# Steps visible in this block:
#   * resolve the C++ toolchain and configure its features;
#   * assemble the tool's flags: --unroll_factors (only when the attribute is
#     set), --tile_sizes (tile_size with "x" replaced by ","), --arch (the
#     gpu_archs joined with ","), --input, --output and --enable_ftz, whose
#     value is True only when data_type == "f32";
#   * register the produced object as both the regular and the PIC object of
#     a compilation-outputs structure, build a linking context from it and
#     return that wrapped in CcInfo.
#
# NOTE(review): this block is incomplete in this view — `name =` appears twice
# without a value, `if ctx.attr.extra_args:` has no body, and the call that
# receives inputs/outputs/executable/arguments/mnemonic is not shown
# (presumably ctx.actions.run). Confirm against the upstream file.
def _gen_kernel_fatbin_impl(ctx):
cc_toolchain = find_cpp_toolchain(ctx)
feature_configuration = cc_common.configure_features(
ctx = ctx,
cc_toolchain = cc_toolchain,
requested_features = ctx.features,
unsupported_features = ctx.disabled_features,
name =
cmd_args = []
if ctx.attr.unroll_factors:
cmd_args.append("--unroll_factors=%s" % ctx.attr.unroll_factors)
if ctx.attr.extra_args:
# The tool takes comma-separated tile sizes; the attribute uses "16x16" style.
tile_sizes = ctx.attr.tile_size.replace("x", ",")
arch_flag = ",".join(ctx.attr.gpu_archs)
gpu_bin = ctx.outputs.kernel
# cc_binary seems not to bring its dependencies with it, so do that explicitly here.
inputs = [ctx.file.mlir_op, ctx.file._tfso],
outputs = [gpu_bin],
executable = ctx.executable._tool,
arguments = cmd_args + [
"--tile_sizes=%s" % tile_sizes,
"--arch=%s" % arch_flag,
"--input=%s" % ctx.file.mlir_op.path,
"--output=%s" % gpu_bin.path,
"--enable_ftz=%s" % (ctx.attr.data_type == "f32"),
mnemonic = "compile",
compilation_outputs = cc_common.create_compilation_outputs(
# We always produce PIC object files, so use the same object files for both.
objects = depset([gpu_bin]),
pic_objects = depset([gpu_bin]),
(linking_context, linking_outputs) = cc_common.create_linking_context_from_compilation_outputs(
name =,
actions = ctx.actions,
feature_configuration = feature_configuration,
cc_toolchain = cc_toolchain,
compilation_outputs = compilation_outputs,
return [CcInfo(linking_context = linking_context)]
# Rule wrapping _gen_kernel_fatbin_impl.
#
# Attributes:
#   mlir_op:        (mandatory) single-file label of the MLIR op to compile.
#   data_type:      (mandatory) string; the implementation enables FTZ only
#                   when this is "f32".
#   tile_size:      (mandatory) string, passed on as --tile_sizes.
#   unroll_factors: optional string, passed on as --unroll_factors when set.
#   gpu_archs:      (mandatory) list of strings, joined into --arch.
#   extra_args:     optional list of strings.
#   _tfso, _tool:   implicit dependencies built for the host configuration;
#                   _tool is the executable the implementation invokes.
#   _cc_toolchain:  the current C++ toolchain.
#
# The single declared output is "<name>_kernel.o".
#
# NOTE(review): the closing brackets of the nested calls are not present in
# this view, and the `_tfso` default Label("//") looks truncated; confirm
# against the upstream file.
_gen_kernel_fatbin_rule = rule(
attrs = {
"mlir_op": attr.label(mandatory = True, allow_single_file = True),
"data_type": attr.string(mandatory = True),
"tile_size": attr.string(mandatory = True),
"unroll_factors": attr.string(),
"gpu_archs": attr.string_list(mandatory = True),
"extra_args": attr.string_list(),
# cc_binary seems not to bring its dependencies with it, so do that explicitly here.
"_tfso": attr.label(
default = Label("//"),
cfg = "host",
allow_single_file = True,
"_tool": attr.label(
executable = True,
default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_kernel"),
cfg = "host",
"_cc_toolchain": attr.label(default = "@bazel_tools//tools/cpp:current_cc_toolchain"),
fragments = ["cpp"],
outputs = {"kernel": "%{name}_kernel.o"},
implementation = _gen_kernel_fatbin_impl,
# Macro that generates a library of kernels for one TensorFlow op (the
# docstring below lists the documented arguments).
#
# What the visible body does:
#   * falls back to output_types = types when output_types is not given;
#   * when cuda_gpu_architectures() or rocm_gpu_architectures() is truthy,
#     walks zip(types, output_types) and, for each pair, supplies arguments
#     for a "{name}_{platform}_{type}_{output_type}_kernel_generator" target
#     fed by the matching generated .mlir file, plus a
#     "{name}_{platform}_{type}_{output_type}_gen_test" target tagged
#     "no_rocm";
#   * declares a "<name>_kernels" target whose deps are the per-type
#     "..._kernel_generator" targets, wrapped in if_gpu_is_configured().
#
# NOTE(review): this block is incomplete in this view and needs checking
# against the upstream file:
#   * `name`, `types` and `tile_size` are documented and used in the body but
#     are absent from the parameter list shown here;
#   * the docstring has no "Args:" heading and no closing quotes, and the
#     output_types entry ends mid-sentence ("... output_types = types is");
#   * the rule/macro calls that the keyword arguments belong to, and their
#     closing brackets, are not shown; `srcs = [""]` and the `data` list look
#     truncated;
#   * rocm_gpu_architectures and if_gpu_is_configured are used, but no load()
#     for them is visible among the load() statements at the top of this view.
def gen_kernel_library(
output_types = None,
tags = [],
platform = "gpu",
unroll_factors = None,
extra_args = []):
""" Generate a library with kernels for a specific tensorflow op.
name: The name of the tensorflow op.
types: The types ("f16", "f32", "f64") for which a kernel should be generated.
tile_size: The tiling specification, e.g. "16x16".
output_types: The output types for which a kernel should be generated. If
specified, the i-th entry in types corresponds to the i-th
entry in output_types. By default, output_types = types is
tags: The tags which should be added to the library.
platform: Platform for which to compile, i.e. "cpu" or "gpu"
unroll_factors: The unrolling specification, e.g. "4,4"
extra_args: Extra arguments to pass to the generator tool.
if not output_types:
output_types = types
if cuda_gpu_architectures() or rocm_gpu_architectures():
for (type, output_type) in zip(types, output_types):
name = name,
platform = platform,
type = type,
output_type = output_type,
name = "{name}_{platform}_{type}_{output_type}_kernel_generator".format(
name = name,
platform = platform,
type = type,
output_type = output_type,
mlir_op = "{name}_{platform}_{type}_{output_type}.mlir".format(
name = name,
platform = platform,
type = type,
output_type = output_type,
data_type = type,
gpu_archs = rocm_gpu_architectures() + cuda_gpu_architectures(),
tile_size = tile_size,
unroll_factors = unroll_factors,
extra_args = extra_args,
# We have to use a sh_test instead of build_test because it doesn't properly find the dependent targets.
name = "{name}_{platform}_{type}_{output_type}_gen_test".format(
name = name,
platform = platform,
type = type,
output_type = output_type,
srcs = [""],
tags = ["no_rocm"],
args = [
"$(location //tensorflow/compiler/mlir/tools/kernel_gen:tf_to_kernel)",
"$(location {name}_{platform}_{type}_{output_type}.mlir)".format(
name = name,
platform = platform,
type = type,
output_type = output_type,
size = "medium",
data = [
name = name,
platform = platform,
type = type,
output_type = output_type,
name = name + "_kernels",
compatible_with = get_compatible_with_cloud(),
deps = if_gpu_is_configured([":{name}_{platform}_{type}_{output_type}_kernel_generator".format(
name = name,
platform = platform,
type = type,
output_type = output_type,
) for (type, output_type) in zip(types, output_types)]),
linkstatic = 1,
tags = tags,