"""Generates cubin headers for TF dialect ops."""
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_gpu_architectures", "if_cuda")
def _lookup_file(filegroup, path):
    """Returns the first file of `filegroup` whose path ends with `path`.

    Args:
      filegroup: Object exposing `files.to_list()`; each listed entry must
        have a `path` string.
      path: Path suffix to look for, e.g. "bin/fatbinary".

    Returns:
      The first entry whose `path` ends with the suffix, or None when no
      entry matches.
    """
    matches = [
        candidate
        for candidate in filegroup.files.to_list()
        if candidate.path.endswith(path)
    ]
    return matches[0] if matches else None
# Rule implementation: declares one .cubin file per GPU architecture, a single
# .fatbin file, and finally the header at ctx.outputs.out produced with bin2c.
#
# NOTE(review): this function is not valid Starlark as shown. Indentation is
# gone and several lines appear to be missing (call openers, closing brackets,
# the right-hand side of `name =`). Each gap is marked inline below; restore
# the missing lines from version control rather than by guesswork.
def _gen_kernel_image_hdr_impl(ctx):
# Abort when the rule was instantiated with an empty gpu_archs list.
if not ctx.attr.gpu_archs:
fail("No GPU architecture specified, use --config=cuda or similar")
# NOTE(review): the right-hand side of this assignment is missing. `name` is
# used below as the stem of the .cubin and .fatbin file names; presumably it
# is the target's name -- confirm.
name =
# The tile_size attribute separates sizes with "x"; --tile_sizes gets commas.
tile_sizes = ctx.attr.tile_size.replace("x", ",")
# Optional leading tool argument, present only when same_shape is set.
same_shape = []
if ctx.attr.same_shape:
same_shape.append("--same_shape=%s" % ctx.attr.same_shape)
# cubins is later passed as the inputs of the fatbinary step.
# NOTE(review): no visible line ever adds to cubins -- presumably an append
# inside the loop below is missing; confirm.
cubins = []
# One "--image=..." argument per architecture for the fatbinary step.
images = []
for arch in ctx.attr.gpu_archs:
# TODO(b/152737872): 'compute_' should generate both SASS and PTX.
arch = arch.replace("compute_", "sm_")
filename = "%s.%s.cubin" % (name, arch)
cubin = ctx.actions.declare_file(filename)
# NOTE(review): the opener of the call that takes the keyword arguments below
# is missing, as are the closing "]" of `arguments` and the closing ")".
# NOTE(review): ctx.attr.op is not read in any visible line; presumably it was
# passed to the tool in a line missing from this argument list -- confirm.
outputs = [cubin],
executable = ctx.executable._tool,
arguments = same_shape + [
"--tile_sizes=%s" % tile_sizes,
# Only the part after the first "_" of the arch string is passed as --arch.
"--arch=%s" % arch.split("_")[1],
"--output=%s" % cubin.path,
mnemonic = "compile",
images.append("--image=profile=%s,file=%s" % (arch, cubin.path))
# Generate fatbin file from all cubins.
fatbin = ctx.actions.declare_file("%s.fatbin" % name)
# NOTE(review): call opener and closing ")" missing around the keyword
# arguments below; further entries of `arguments` may be missing too.
outputs = [fatbin],
inputs = cubins,
# The fatbinary executable is looked up by path suffix in the _cuda_root files.
executable = _lookup_file(ctx.attr._cuda_root, "bin/fatbinary"),
arguments = [
"--create=%s" % fatbin.path,
] + images,
mnemonic = "fatbinary",
# bin2c is likewise looked up by path suffix in the _cuda_root files.
bin2c = _lookup_file(ctx.attr._cuda_root, "bin/bin2c")
# NOTE(review): call opener and closing ")" missing around the keyword
# arguments below. The `command` string redirects bin2c's stdout into the
# output header.
outputs = [ctx.outputs.out],
inputs = [fatbin],
tools = [bin2c],
command = "%s --static --const --type=char --name=%s %s 1> %s" %
(bin2c.path, ctx.attr.symbol, fatbin.path, ctx.outputs.out.path),
mnemonic = "bin2c",
# Rule that produces a header file (`out`) for a single op; the work is done
# by _gen_kernel_image_hdr_impl.
_gen_kernel_image_hdr_rule = rule(
    implementation = _gen_kernel_image_hdr_impl,
    output_to_genfiles = True,
    attrs = {
        "op": attr.string(mandatory = True),
        # Tile sizes separated by "x"; the implementation converts them to a
        # comma-separated list.
        "tile_size": attr.string(mandatory = True),
        # Optional; when non-empty it is forwarded as --same_shape=<value>.
        "same_shape": attr.string(),
        # The generated header.
        "out": attr.output(mandatory = True),
        # Passed to bin2c as --name.
        "symbol": attr.string(mandatory = True),
        # The implementation fails if this list is empty.
        "gpu_archs": attr.string_list(mandatory = True),
        # Files searched for "bin/fatbinary" and "bin/bin2c".
        "_cuda_root": attr.label(
            default = Label("@local_config_cuda//cuda:cuda_root"),
        ),
        # Executable run once per GPU architecture to write a .cubin file.
        "_tool": attr.label(
            executable = True,
            default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_cubin"),
            cfg = "host",
        ),
    },
)
def _gen_kernel_image_hdr(name, op, tile_size, tags = [], same_shape = None):
    """Generates a C header with fatbin data from a Tensorflow op.

    Instantiates _gen_kernel_image_hdr_rule for the configured CUDA GPU
    architectures. Nothing is created when cuda_gpu_architectures() returns
    an empty value.

    Args:
      name: Target name. The header is written to "<name>.h" and the symbol
        is "k" followed by the CamelCase form of the name, e.g.
        "my_op_kernel" gives "kMyOpKernel".
      op: Value for the rule's `op` attribute.
      tile_size: Value for the rule's `tile_size` attribute.
      tags: Tags applied to the generated target.
      same_shape: Optional value for the rule's `same_shape` attribute.
    """
    gpu_archs = cuda_gpu_architectures()
    if gpu_archs:
        _gen_kernel_image_hdr_rule(
            name = name,
            op = op,
            tile_size = tile_size,
            same_shape = same_shape,
            out = "%s.h" % name,
            # snake_case -> CamelCase with a leading "k".
            symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""),
            gpu_archs = gpu_archs,
            tags = tags,
        )
# Macro that handles every entry of `types` for one op and then groups the
# resulting ":<name>_<type>_kernel" labels under "<name>_kernels".
#
# NOTE(review): this function is not valid Starlark as shown. Indentation is
# gone, the closing "}" of type_to_dtype is missing, and both call openers
# (with their closing ")") are missing. Restore them from version control.
def gen_kernel_library(name, op, types, tile_size, tags = [], same_shape = None):
# Nothing is generated unless CUDA GPU architectures are configured.
if cuda_gpu_architectures():
# Maps each supported type string to the text substituted for the "DT_TYPE"
# placeholder in `op`. A type not listed here fails on the lookup below.
type_to_dtype = {
"f16": "DT_HALF",
"f32": "DT_FLOAT",
"f64": "DT_DOUBLE",
for type in types:
# NOTE(review): call opener missing. The keyword arguments below match the
# parameters of _gen_kernel_image_hdr, which is presumably the callee --
# confirm.
name = "{name}_{type}_kernel".format(name = name, type = type),
# "f99" in `op` is a placeholder replaced by the concrete type string.
op = op.replace("f99", type).replace("DT_TYPE", type_to_dtype[type]),
tile_size = tile_size,
tags = tags,
same_shape = same_shape,
# NOTE(review): call opener missing. The name/hdrs/tags arguments suggest a
# library rule collecting the per-type targets as headers -- confirm which
# rule was called here.
name = name + "_kernels",
hdrs = if_cuda(if_true = [":{name}_{type}_kernel".format(name = name, type = type) for type in types]),
tags = tags,