blob: e20257dbbd95c9627c32911751b1220db8a3759a [file] [log] [blame]
"""Generates cubin headers for TF dialect ops."""
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_gpu_architectures")
load(
"@local_config_rocm//rocm:build_defs.bzl",
"rocm_gpu_architectures",
"rocm_is_configured",
)
load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud")
load(
"//tensorflow/stream_executor:build_defs.bzl",
"if_gpu_is_configured",
)
def if_mlir_generated_gpu_kernels_enabled(if_true, if_false = []):
return select({
"//tensorflow/core/kernels/mlir_generated:mlir_generated_gpu_kernels_disabled": if_false,
"//conditions:default": if_true,
})
def _lookup_file(filegroup, path):
"""Extracts file at (relative) path in filegroup."""
for file in filegroup.files.to_list():
if file.path.endswith(path) or file.path.endswith(path + ".exe"):
return file
return None
GpuBinaryInfo = provider(
"GPU binaries in either cubin format or hsaco format",
fields = ["gpu_bins"],
)
def _gen_kernel_gpu_bin_impl(ctx):
name = ctx.attr.name
tile_sizes = ctx.attr.tile_size.replace("x", ",")
cmd_args = []
if ctx.attr.unroll_factors:
cmd_args.append("--unroll_factors=%s" % ctx.attr.unroll_factors)
if ctx.attr.extra_args:
cmd_args.extend(ctx.attr.extra_args)
gpu_bins = []
for arch in ctx.attr.gpu_archs:
# TODO(b/170283783): 'compute_' should generate both SASS and PTX.
arch = arch.replace("compute_", "sm_")
filename = "%s.%s.bin" % (name, arch)
gpu_bin = ctx.actions.declare_file(filename)
ctx.actions.run(
inputs = [ctx.file.mlir_op, ctx.file._tfso],
outputs = [gpu_bin],
executable = ctx.executable._tool,
arguments = cmd_args + [
"--tile_sizes=%s" % tile_sizes,
"--arch=%s" % arch,
"--input=%s" % ctx.file.mlir_op.path,
"--output=%s" % gpu_bin.path,
],
mnemonic = "compile",
)
gpu_bins.append(gpu_bin)
return [GpuBinaryInfo(gpu_bins = gpu_bins)]
_gen_kernel_gpu_bin_rule = rule(
attrs = {
"mlir_op": attr.label(mandatory = True, allow_single_file = True),
"tile_size": attr.string(mandatory = True),
"unroll_factors": attr.string(),
"gpu_archs": attr.string_list(mandatory = True),
"extra_args": attr.string_list(),
"_tfso": attr.label(
default = Label("//tensorflow:libtensorflow_framework.so.2"),
cfg = "host",
allow_single_file = True,
),
"_tool": attr.label(
executable = True,
default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_gpu_binary"),
cfg = "host",
),
},
output_to_genfiles = True,
implementation = _gen_kernel_gpu_bin_impl,
)
def _gen_kernel_image_hdr_impl_cuda(ctx):
images = []
for cubin in ctx.attr.input[GpuBinaryInfo].gpu_bins:
arch = cubin.path.split(".")[-2]
images.append("--image=profile=%s,file=%s" % (arch, cubin.path))
# Generate fatbin file from all cubins.
fatbin = ctx.actions.declare_file("%s.fatbin" % ctx.attr.name)
ctx.actions.run(
outputs = [fatbin],
inputs = ctx.attr.input[GpuBinaryInfo].gpu_bins,
executable = _lookup_file(ctx.attr._gpu_root, "bin/fatbinary"),
arguments = [
"--64",
"--cmdline=--compile-only",
"--link",
"--compress-all",
"--create=%s" % fatbin.path,
] + images,
mnemonic = "fatbinary",
)
bin2c = _lookup_file(ctx.attr._gpu_root, "bin/bin2c")
ctx.actions.run_shell(
outputs = [ctx.outputs.out],
inputs = [fatbin],
tools = [bin2c],
command = "%s --static --const --type=char --name=%s %s 1> %s" %
(bin2c.path, ctx.attr.symbol, fatbin.path, ctx.outputs.out.path),
mnemonic = "bin2c",
)
def _gen_kernel_image_hdr_impl_rocm(ctx):
hsaco_files = []
hsaco_targets = []
# Add a dummy host target triple...clang-offload-bundler requires 1 and only 1 host target triple
hsaco_files.append("/dev/null")
hsaco_targets.append("host-x86_64-unknown-linux")
hsacos = ctx.attr.input[GpuBinaryInfo].gpu_bins
for hsaco in hsacos:
gfx_arch = hsaco.path.split(".")[-2]
hsaco_files.append(hsaco.path)
hsaco_targets.append("hip-amdgcn-amd-amdhsa-%s" % gfx_arch)
# Generate fatbin file from all hsacos.
fatbin = ctx.actions.declare_file("%s.fatbin" % ctx.attr.name)
ctx.actions.run(
outputs = [fatbin],
inputs = hsacos,
executable = _lookup_file(ctx.attr._gpu_root, "bin/clang-offload-bundler"),
arguments = [
"--inputs=%s" % ",".join(hsaco_files),
"--targets=%s" % ",".join(hsaco_targets),
"--type=o",
"--outputs=%s" % fatbin.path,
],
mnemonic = "fatbinary",
)
ctx.actions.run_shell(
outputs = [ctx.outputs.out],
inputs = [fatbin],
command = (
("hex=`hexdump -v -e \'/1 \"0x%%02x, \"\' %s` && " +
"len=`echo $hex | wc -c` && " +
"echo 'static const unsigned char %s['$len' + 1] = {' > %s && " +
"echo $hex | cat >> %s && " +
"echo '};' >> %s") % (
fatbin.path,
ctx.attr.symbol,
ctx.outputs.out.path,
ctx.outputs.out.path,
ctx.outputs.out.path,
)
),
)
_gen_kernel_image_hdr_rule = rule(
implementation = _gen_kernel_image_hdr_impl_rocm if rocm_is_configured() else _gen_kernel_image_hdr_impl_cuda,
output_to_genfiles = True,
attrs = {
"input": attr.label(mandatory = True, providers = [GpuBinaryInfo]),
"out": attr.output(mandatory = True),
"symbol": attr.string(mandatory = True),
"_gpu_root": attr.label(
default = Label("@local_config_rocm//rocm:rocm_root") if rocm_is_configured() else Label("@local_config_cuda//cuda:cuda_root"),
),
},
)
def _gen_kernel_image_hdr(name, mlir_op, gpu_archs, tile_size, unroll_factors = None, extra_args = []):
"""Generates a C header with fatbin data from a Tensorflow op."""
_gen_kernel_gpu_bin_rule(
name = name + "_cubin",
mlir_op = mlir_op,
tile_size = tile_size,
unroll_factors = unroll_factors,
gpu_archs = gpu_archs,
extra_args = extra_args,
)
_gen_kernel_image_hdr_rule(
name = name,
input = ":" + name + "_cubin",
out = "%s.h" % name,
symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""),
)
type_to_mlir = {
"c64": "complex<f32>",
"c128": "complex<f64>",
}
def _gen_mlir_op_impl(ctx):
# Map attr.type to MLIR type.
mlir_type = ctx.attr.type
if mlir_type in type_to_mlir:
mlir_type = type_to_mlir[mlir_type]
# In order to generate a ranked kernel we change *xelem_type to ?xelem_type
# and remove element type from the entry function name.
convert_to_ranked = ""
if ctx.attr.unranked == False:
convert_to_ranked = "sed s/*x/?x/g | sed s/_elem_type//g |"
cmd = ctx.actions.run_shell(
inputs = [ctx.file.template],
outputs = [ctx.outputs.out],
command = (
("cat %s | %s sed 's/_elem_type/_%s/g' | sed 's/elem_type/%s/g' > %s") % (
ctx.file.template.path,
convert_to_ranked,
ctx.attr.type,
mlir_type,
ctx.outputs.out.path,
)
),
)
_gen_mlir_op_rule = rule(
implementation = _gen_mlir_op_impl,
output_to_genfiles = True,
attrs = {
"template": attr.label(mandatory = True, allow_single_file = True),
"type": attr.string(mandatory = True),
"out": attr.output(mandatory = True),
"unranked": attr.bool(mandatory = True),
},
)
def _gen_mlir_op(name, type, unranked):
tmpl_name = name.replace("_unranked", "") if unranked else name
_gen_mlir_op_rule(
name = "generate_{name}_{type}_mlir".format(name = name, type = type),
template = "op_definitions/{name}.mlir.tmpl".format(name = tmpl_name),
type = type,
out = "{name}_{type}.mlir".format(name = name, type = type),
unranked = unranked,
)
def gen_ranked_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = []):
""" Generate a library with kernels for a specific tensorflow op.
Args:
name: The name of the tensorflow op.
types: The types ("f16", "f32", "f64") for which a kernel should be generated.
tile_size: The tiling specification, e.g. "16x16".
unroll_factors: The unrolling specification, e.g. "4,4"
tags: The tags which should be added to the library.
extra_args: Extra arguments to pass to the generator tool.
"""
if cuda_gpu_architectures() or rocm_gpu_architectures():
for type in types:
_gen_mlir_op(
name = name,
type = type,
unranked = False,
)
_gen_kernel_image_hdr(
name = "{name}_{type}_kernel".format(name = name, type = type),
mlir_op = "{name}_{type}.mlir".format(name = name, type = type),
gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
tile_size = tile_size,
unroll_factors = unroll_factors,
extra_args = extra_args,
)
native.cc_library(
name = name + "_kernels",
hdrs = if_gpu_is_configured([":{name}_{type}_kernel".format(name = name, type = type) for type in types]),
tags = tags,
)
################################################################################
# Unranked kernels build rules.
################################################################################
def if_mlir_unranked_kernels_enabled(if_true, if_false = []):
return select({
"//tensorflow/core/kernels/mlir_generated:mlir_use_unranked_kernels": if_true,
"//conditions:default": if_false,
})
def _gen_unranked_kernel_fatbin_impl(ctx):
name = ctx.attr.name
cmd_args = []
if ctx.attr.unroll_factors:
cmd_args.append("--unroll_factors=%s" % ctx.attr.unroll_factors)
if ctx.attr.extra_args:
cmd_args.extend(ctx.attr.extra_args)
tile_sizes = ctx.attr.tile_size.replace("x", ",")
arch_flag = ",".join(ctx.attr.gpu_archs)
gpu_bin = ctx.outputs.output
ctx.actions.run(
inputs = [ctx.file.mlir_op, ctx.file._tfso],
outputs = [gpu_bin],
executable = ctx.executable._tool,
arguments = cmd_args + [
"--tile_sizes=%s" % tile_sizes,
"--arch=%s" % arch_flag,
"--input=%s" % ctx.file.mlir_op.path,
"--output=%s" % gpu_bin.path,
],
mnemonic = "compile",
)
_gen_unranked_kernel_fatbin_rule = rule(
attrs = {
"mlir_op": attr.label(mandatory = True, allow_single_file = True),
"output": attr.output(mandatory = True, doc = "The generated file"),
"tile_size": attr.string(mandatory = True),
"unroll_factors": attr.string(),
"gpu_archs": attr.string_list(mandatory = True),
"extra_args": attr.string_list(),
"_tfso": attr.label(
default = Label("//tensorflow:libtensorflow_framework.so.2"),
cfg = "host",
allow_single_file = True,
),
"_tool": attr.label(
executable = True,
default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_kernel"),
cfg = "host",
),
},
output_to_genfiles = True,
implementation = _gen_unranked_kernel_fatbin_impl,
)
def gen_unranked_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = []):
""" Generate a library with unranked kernels for a specific tensorflow op.
Args:
name: The name of the tensorflow op.
types: The types ("f16", "f32", "f64") for which a kernel should be generated.
tile_size: The tiling specification, e.g. "16x16".
unroll_factors: The unrolling specification, e.g. "4,4"
tags: The tags which should be added to the library.
extra_args: Extra arguments to pass to the generator tool.
"""
if cuda_gpu_architectures() or rocm_gpu_architectures():
for type in types:
_gen_mlir_op(
name = name,
type = type,
unranked = True,
)
_gen_unranked_kernel_fatbin_rule(
name = "{name}_{type}_kernel_generator".format(name = name, type = type),
mlir_op = "{name}_{type}.mlir".format(name = name, type = type),
output = "{name}_{type}.a".format(name = name, type = type),
gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
tile_size = tile_size,
unroll_factors = unroll_factors,
extra_args = extra_args,
)
native.cc_import(
name = "{name}_{type}_kernel".format(name = name, type = type),
static_library = "{name}_{type}.a".format(name = name, type = type),
)
# We have to use a sh_test instead of build_test because it doesn't properly find the dependent targets.
native.sh_test(
name = "{name}_{type}_gen_test".format(name = name, type = type),
srcs = ["build_test.sh"],
tags = ["no_rocm"],
args = [
"$(location //tensorflow/compiler/mlir/tools/kernel_gen:tf_to_kernel)",
"$(location {name}_{type}.mlir)".format(name = name, type = type),
],
size = "small",
data = [
":{name}_{type}.mlir".format(name = name, type = type),
"//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_kernel",
],
)
native.cc_library(
name = name + "_kernels",
compatible_with = get_compatible_with_cloud(),
deps = if_gpu_is_configured([":{name}_{type}_kernel".format(name = name, type = type) for type in types]),
linkstatic = 1,
tags = tags,
)
def gen_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = [], generate_ranked = True, generate_unranked = False):
if (generate_ranked):
gen_ranked_kernel_library(
name = name,
types = types,
tile_size = tile_size,
tags = tags,
unroll_factors = unroll_factors,
extra_args = extra_args,
)
if (generate_unranked):
gen_unranked_kernel_library(
name = name + "_unranked",
types = types,
tile_size = tile_size,
tags = tags,
unroll_factors = unroll_factors,
extra_args = extra_args,
)