tensorflow/core/kernels/mlir_generated/build_defs.bzl - platform/external/tensorflow - Git at Google

 """Generates cubin headers for TF dialect ops."""

 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_gpu_architectures")
 load(
     "@local_config_rocm//rocm:build_defs.bzl",
     "rocm_gpu_architectures",
     "rocm_is_configured",
 )
 load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud")
 load(
     "//tensorflow/stream_executor:build_defs.bzl",
     "if_gpu_is_configured",
 )

 def if_mlir_generated_gpu_kernels_enabled(if_true, if_false = []):
     """Selects a value depending on whether MLIR-generated GPU kernels are disabled.

     Args:
       if_true: Value used under `//conditions:default`.
       if_false: Value used when the `mlir_generated_gpu_kernels_disabled`
         config setting matches.

     Returns:
       A `select()` over the two values.
     """
     disabled_setting = "//tensorflow/core/kernels/mlir_generated:mlir_generated_gpu_kernels_disabled"
     branches = {
         disabled_setting: if_false,
         "//conditions:default": if_true,
     }
     return select(branches)

 def _lookup_file(filegroup, path):
     """Finds the first file of `filegroup` whose path ends with `path`.

     A path ending in `path` followed by ".exe" is accepted as well.

     Args:
       filegroup: Target whose `files` are searched.
       path: Path suffix to look for.

     Returns:
       The first matching file, or None if nothing matches.
     """
     accepted_suffixes = [path, path + ".exe"]
     for candidate in filegroup.files.to_list():
         for suffix in accepted_suffixes:
             if candidate.path.endswith(suffix):
                 return candidate
     return None

 # Carries the generated GPU binaries from the binary-producing rule to the
 # rule that packs them into a header.
 GpuBinaryInfo = provider(
     doc = "GPU binaries in either cubin format or hsaco format",
     fields = {
         "gpu_bins": "List of generated GPU binary files.",
     },
 )

 def _gen_kernel_gpu_bin_impl(ctx):
     """Compiles `mlir_op` into one GPU binary per requested architecture.

     Runs the `_tool` executable once per entry of `gpu_archs`; each run writes
     a file named NAME.ARCH.bin, where NAME is the rule name.

     Args:
       ctx: Rule context.

     Returns:
       A list holding a GpuBinaryInfo whose `gpu_bins` are the declared outputs.
     """

     # Flags shared by every invocation; the optional ones come first.
     shared_flags = []
     if ctx.attr.unroll_factors:
         shared_flags.append("--unroll_factors=%s" % ctx.attr.unroll_factors)
     if ctx.attr.extra_args:
         shared_flags.extend(ctx.attr.extra_args)
     shared_flags.append("--tile_sizes=%s" % ctx.attr.tile_size.replace("x", ","))

     binaries = []
     for requested_arch in ctx.attr.gpu_archs:
         # TODO(b/170283783): 'compute_' should generate both SASS and PTX.
         arch = requested_arch.replace("compute_", "sm_")
         binary = ctx.actions.declare_file("%s.%s.bin" % (ctx.attr.name, arch))
         ctx.actions.run(
             mnemonic = "compile",
             executable = ctx.executable._tool,
             inputs = [ctx.file.mlir_op, ctx.file._tfso],
             outputs = [binary],
             arguments = shared_flags + [
                 "--arch=%s" % arch,
                 "--input=%s" % ctx.file.mlir_op.path,
                 "--output=%s" % binary.path,
             ],
         )
         binaries.append(binary)
     return [GpuBinaryInfo(gpu_bins = binaries)]

 # Rule producing one GPU binary per architecture from an MLIR op definition.
 _gen_kernel_gpu_bin_rule = rule(
     implementation = _gen_kernel_gpu_bin_impl,
     output_to_genfiles = True,
     attrs = {
         # Attributes set by the calling macro.
         "mlir_op": attr.label(allow_single_file = True, mandatory = True),
         "gpu_archs": attr.string_list(mandatory = True),
         "tile_size": attr.string(mandatory = True),
         "unroll_factors": attr.string(),
         "extra_args": attr.string_list(),
         # Implicit dependencies: the generator tool and the library passed
         # to it as an action input.
         "_tool": attr.label(
             default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_gpu_binary"),
             executable = True,
             cfg = "host",
         ),
         "_tfso": attr.label(
             default = Label("//tensorflow:libtensorflow_framework.so.2"),
             allow_single_file = True,
             cfg = "host",
         ),
     },
 )

 def _gen_kernel_image_hdr_impl_cuda(ctx):
     """Bundles cubins into a fatbin and emits it as a C header (CUDA).

     Args:
       ctx: Rule context; `input` provides GpuBinaryInfo, `out` is the header.
     """
     cubins = ctx.attr.input[GpuBinaryInfo].gpu_bins

     # The profile is the second-to-last dot-separated component of the path.
     image_flags = [
         "--image=profile=%s,file=%s" % (cubin.path.split(".")[-2], cubin.path)
         for cubin in cubins
     ]

     # Generate fatbin file from all cubins.
     fatbin = ctx.actions.declare_file("%s.fatbin" % ctx.attr.name)
     fatbinary_flags = [
         "--64",
         "--cmdline=--compile-only",
         "--link",
         "--compress-all",
         "--create=%s" % fatbin.path,
     ]
     ctx.actions.run(
         mnemonic = "fatbinary",
         executable = _lookup_file(ctx.attr._gpu_root, "bin/fatbinary"),
         inputs = cubins,
         outputs = [fatbin],
         arguments = fatbinary_flags + image_flags,
     )

     # Run bin2c on the fatbin and redirect its stdout into the header.
     bin2c = _lookup_file(ctx.attr._gpu_root, "bin/bin2c")
     bin2c_command = "%s --static --const --type=char --name=%s %s 1> %s" % (
         bin2c.path,
         ctx.attr.symbol,
         fatbin.path,
         ctx.outputs.out.path,
     )
     ctx.actions.run_shell(
         mnemonic = "bin2c",
         command = bin2c_command,
         tools = [bin2c],
         inputs = [fatbin],
         outputs = [ctx.outputs.out],
     )

 def _gen_kernel_image_hdr_impl_rocm(ctx):
     """Bundles hsaco files into a fatbin and emits it as a C header (ROCm).

     The hsacos are bundled with `clang-offload-bundler`; a shell action then
     hex-dumps the bundle into a `static const unsigned char` array named
     `symbol` in the output header.

     Args:
       ctx: Rule context; `input` provides GpuBinaryInfo, `out` is the header.
     """
     hsacos = ctx.attr.input[GpuBinaryInfo].gpu_bins

     # Add a dummy host target triple...clang-offload-bundler requires 1 and only 1 host target triple
     bundle_inputs = ["/dev/null"] + [hsaco.path for hsaco in hsacos]

     # The gfx architecture is the second-to-last dot-separated path component.
     bundle_targets = ["host-x86_64-unknown-linux"] + [
         "hip-amdgcn-amd-amdhsa-%s" % hsaco.path.split(".")[-2]
         for hsaco in hsacos
     ]

     # Generate fatbin file from all hsacos.
     fatbin = ctx.actions.declare_file("%s.fatbin" % ctx.attr.name)
     ctx.actions.run(
         mnemonic = "fatbinary",
         executable = _lookup_file(ctx.attr._gpu_root, "bin/clang-offload-bundler"),
         inputs = hsacos,
         outputs = [fatbin],
         arguments = [
             "--inputs=%s" % ",".join(bundle_inputs),
             "--targets=%s" % ",".join(bundle_targets),
             "--type=o",
             "--outputs=%s" % fatbin.path,
         ],
     )

     # Shell pipeline: hex-dump the fatbin, then write the array declaration,
     # the dumped bytes and the closing brace to the header.
     header_path = ctx.outputs.out.path
     hexdump_template = (
         "hex=`hexdump -v -e \'/1 \"0x%%02x, \"\' %s` && " +
         "len=`echo $hex | wc -c` && " +
         "echo 'static const unsigned char %s['$len' + 1] = {' > %s && " +
         "echo $hex | cat >> %s && " +
         "echo '};' >> %s"
     )
     ctx.actions.run_shell(
         command = hexdump_template % (
             fatbin.path,
             ctx.attr.symbol,
             header_path,
             header_path,
             header_path,
         ),
         inputs = [fatbin],
         outputs = [ctx.outputs.out],
     )

 # Rule turning the GPU binaries of `input` into the C header `out`. Both the
 # implementation and the toolkit root depend on rocm_is_configured().
 _gen_kernel_image_hdr_rule = rule(
     attrs = {
         "symbol": attr.string(mandatory = True),
         "out": attr.output(mandatory = True),
         "input": attr.label(providers = [GpuBinaryInfo], mandatory = True),
         "_gpu_root": attr.label(
             default = (
                 Label("@local_config_rocm//rocm:rocm_root") if rocm_is_configured() else Label("@local_config_cuda//cuda:cuda_root")
             ),
         ),
     },
     implementation = (
         _gen_kernel_image_hdr_impl_rocm if rocm_is_configured() else _gen_kernel_image_hdr_impl_cuda
     ),
     output_to_genfiles = True,
 )

 def _gen_kernel_image_hdr(name, mlir_op, gpu_archs, tile_size, unroll_factors = None, extra_args = []):
     """Generates a C header with fatbin data from a Tensorflow op.

     Declares two targets: NAME_cubin, which compiles the op into GPU binaries,
     and NAME, which packs those binaries into the header NAME.h.

     Args:
       name: Name of the header target; also the stem of the header file.
       mlir_op: The MLIR file to compile.
       gpu_archs: GPU architectures to compile for.
       tile_size: Tiling specification forwarded to the binary rule.
       unroll_factors: Unrolling specification forwarded to the binary rule.
       extra_args: Extra arguments forwarded to the binary rule.
     """
     binaries = name + "_cubin"

     # The array symbol is "k" followed by the title-cased words of `name`
     # with the underscores dropped.
     camel_case = name.replace("_", " ").title().replace(" ", "")

     _gen_kernel_gpu_bin_rule(
         name = binaries,
         mlir_op = mlir_op,
         gpu_archs = gpu_archs,
         tile_size = tile_size,
         unroll_factors = unroll_factors,
         extra_args = extra_args,
     )
     _gen_kernel_image_hdr_rule(
         name = name,
         input = ":" + binaries,
         out = "%s.h" % name,
         symbol = "k%s" % camel_case,
     )

 # Maps kernel type names to the MLIR spelling of the element type.
 type_to_mlir = dict(
     c64 = "complex<f32>",
     c128 = "complex<f64>",
 )

 def _gen_mlir_op_impl(ctx):
     """Instantiates an MLIR op template for one element type.

     Streams the template through `sed`, replacing the `elem_type` placeholders
     with the requested type, and writes the result to `out`.

     Args:
       ctx: Rule context with the `template`, `type`, `out` and `unranked`
         attributes.
     """

     # Map attr.type to MLIR type; types without an entry are used as-is.
     mlir_type = type_to_mlir.get(ctx.attr.type, ctx.attr.type)

     # In order to generate a ranked kernel we change *xelem_type to ?xelem_type
     # and remove element type from the entry function name. The sed scripts
     # are quoted so the shell cannot glob-expand the `*` and `?` in them.
     convert_to_ranked = ""
     if not ctx.attr.unranked:
         convert_to_ranked = "sed 's/*x/?x/g' | sed 's/_elem_type//g' |"
     ctx.actions.run_shell(
         inputs = [ctx.file.template],
         outputs = [ctx.outputs.out],
         command = "cat %s | %s sed 's/_elem_type/_%s/g' | sed 's/elem_type/%s/g' > %s" % (
             ctx.file.template.path,
             convert_to_ranked,
             ctx.attr.type,
             mlir_type,
             ctx.outputs.out.path,
         ),
     )

 # Rule instantiating an MLIR op template for a single element type.
 _gen_mlir_op_rule = rule(
     attrs = {
         "out": attr.output(mandatory = True),
         "template": attr.label(allow_single_file = True, mandatory = True),
         "type": attr.string(mandatory = True),
         "unranked": attr.bool(mandatory = True),
     },
     output_to_genfiles = True,
     implementation = _gen_mlir_op_impl,
 )

 def _gen_mlir_op(name, type, unranked):
     """Declares the target that instantiates the MLIR template of an op.

     The template is looked up under `op_definitions/`; for unranked kernels
     the "_unranked" part of `name` is removed from the template name.

     Args:
       name: Name of the op; used in the target and output file names.
       type: Element type to instantiate the template for.
       unranked: Whether an unranked kernel is generated.
     """
     if unranked:
         tmpl_name = name.replace("_unranked", "")
     else:
         tmpl_name = name

     _gen_mlir_op_rule(
         name = "generate_{name}_{type}_mlir".format(name = name, type = type),
         out = "{name}_{type}.mlir".format(name = name, type = type),
         template = "op_definitions/{name}.mlir.tmpl".format(name = tmpl_name),
         type = type,
         unranked = unranked,
     )

 def gen_ranked_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = []):
     """Generates a header library with ranked kernels for a tensorflow op.

     When GPU architectures are configured, an MLIR file and a kernel header
     target are declared per type. The NAME_kernels cc_library lists the
     kernel headers if GPU is configured.

     Args:
       name: The name of the tensorflow op.
       types: The types ("f16", "f32", "f64") for which a kernel should be generated.
       tile_size: The tiling specification, e.g. "16x16".
       tags: The tags which should be added to the library.
       unroll_factors: The unrolling specification, e.g. "4,4"
       extra_args: Extra arguments to pass to the generator tool.
     """
     kernel_names = [
         "{name}_{type}_kernel".format(name = name, type = elem_type)
         for elem_type in types
     ]

     if cuda_gpu_architectures() or rocm_gpu_architectures():
         for elem_type, kernel_name in zip(types, kernel_names):
             _gen_mlir_op(name = name, type = elem_type, unranked = False)
             _gen_kernel_image_hdr(
                 name = kernel_name,
                 mlir_op = "{name}_{type}.mlir".format(name = name, type = elem_type),
                 gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
                 tile_size = tile_size,
                 unroll_factors = unroll_factors,
                 extra_args = extra_args,
             )

     native.cc_library(
         name = name + "_kernels",
         hdrs = if_gpu_is_configured([":" + kernel_name for kernel_name in kernel_names]),
         tags = tags,
     )

 ################################################################################
 # Unranked kernels build rules.
 ################################################################################

 def if_mlir_unranked_kernels_enabled(if_true, if_false = []):
     """Selects a value depending on whether unranked MLIR kernels are in use.

     Args:
       if_true: Value used when the `mlir_use_unranked_kernels` config setting
         matches.
       if_false: Value used under `//conditions:default`.

     Returns:
       A `select()` over the two values.
     """
     unranked_setting = "//tensorflow/core/kernels/mlir_generated:mlir_use_unranked_kernels"
     branches = {
         unranked_setting: if_true,
         "//conditions:default": if_false,
     }
     return select(branches)

 def _gen_unranked_kernel_fatbin_impl(ctx):
     """Compiles `mlir_op` into the single `output` file for all architectures.

     Runs the `_tool` executable once, passing every entry of `gpu_archs` as a
     comma-separated `--arch` value.

     Args:
       ctx: Rule context.
     """

     # Optional flags come first, followed by the mandatory ones.
     cmd_args = []
     if ctx.attr.unroll_factors:
         cmd_args.append("--unroll_factors=%s" % ctx.attr.unroll_factors)
     if ctx.attr.extra_args:
         cmd_args.extend(ctx.attr.extra_args)
     tile_sizes = ctx.attr.tile_size.replace("x", ",")
     arch_flag = ",".join(ctx.attr.gpu_archs)
     gpu_bin = ctx.outputs.output
     ctx.actions.run(
         inputs = [ctx.file.mlir_op, ctx.file._tfso],
         outputs = [gpu_bin],
         executable = ctx.executable._tool,
         arguments = cmd_args + [
             "--tile_sizes=%s" % tile_sizes,
             "--arch=%s" % arch_flag,
             "--input=%s" % ctx.file.mlir_op.path,
             "--output=%s" % gpu_bin.path,
         ],
         mnemonic = "compile",
     )

 # Rule compiling an MLIR op definition into a single output file covering
 # all requested GPU architectures.
 _gen_unranked_kernel_fatbin_rule = rule(
     implementation = _gen_unranked_kernel_fatbin_impl,
     output_to_genfiles = True,
     attrs = {
         # Attributes set by the calling macro.
         "mlir_op": attr.label(allow_single_file = True, mandatory = True),
         "output": attr.output(doc = "The generated file", mandatory = True),
         "gpu_archs": attr.string_list(mandatory = True),
         "tile_size": attr.string(mandatory = True),
         "unroll_factors": attr.string(),
         "extra_args": attr.string_list(),
         # Implicit dependencies: the generator tool and the library passed
         # to it as an action input.
         "_tool": attr.label(
             default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_kernel"),
             executable = True,
             cfg = "host",
         ),
         "_tfso": attr.label(
             default = Label("//tensorflow:libtensorflow_framework.so.2"),
             allow_single_file = True,
             cfg = "host",
         ),
     },
 )

 def gen_unranked_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = []):
     """Generates a library with unranked kernels for a specific tensorflow op.

     When GPU architectures are configured, each type gets an MLIR file, a
     generated static library wrapped in a cc_import, and an sh_test that is
     handed the generator tool and the MLIR file. The NAME_kernels cc_library
     depends on the imports if GPU is configured.

     Args:
       name: The name of the tensorflow op.
       types: The types ("f16", "f32", "f64") for which a kernel should be generated.
       tile_size: The tiling specification, e.g. "16x16".
       tags: The tags which should be added to the library.
       unroll_factors: The unrolling specification, e.g. "4,4"
       extra_args: Extra arguments to pass to the generator tool.
     """
     kernel_names = [
         "{name}_{type}_kernel".format(name = name, type = elem_type)
         for elem_type in types
     ]

     if cuda_gpu_architectures() or rocm_gpu_architectures():
         for elem_type, kernel_name in zip(types, kernel_names):
             mlir_file = "{name}_{type}.mlir".format(name = name, type = elem_type)
             archive = "{name}_{type}.a".format(name = name, type = elem_type)

             _gen_mlir_op(name = name, type = elem_type, unranked = True)
             _gen_unranked_kernel_fatbin_rule(
                 name = "{name}_{type}_kernel_generator".format(name = name, type = elem_type),
                 mlir_op = mlir_file,
                 output = archive,
                 gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
                 tile_size = tile_size,
                 unroll_factors = unroll_factors,
                 extra_args = extra_args,
             )
             native.cc_import(
                 name = kernel_name,
                 static_library = archive,
             )

             # We have to use a sh_test instead of build_test because it doesn't properly find the dependent targets.
             native.sh_test(
                 name = "{name}_{type}_gen_test".format(name = name, type = elem_type),
                 size = "small",
                 srcs = ["build_test.sh"],
                 args = [
                     "$(location //tensorflow/compiler/mlir/tools/kernel_gen:tf_to_kernel)",
                     "$(location {name}_{type}.mlir)".format(name = name, type = elem_type),
                 ],
                 data = [
                     ":" + mlir_file,
                     "//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_kernel",
                 ],
                 tags = ["no_rocm"],
             )

     native.cc_library(
         name = name + "_kernels",
         compatible_with = get_compatible_with_cloud(),
         deps = if_gpu_is_configured([":" + kernel_name for kernel_name in kernel_names]),
         linkstatic = 1,
         tags = tags,
     )

 def gen_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = [], generate_ranked = True, generate_unranked = False):
     """Generates the ranked and/or unranked kernel libraries for an op.

     Args:
       name: The name of the tensorflow op; the unranked library uses
         `name + "_unranked"`.
       types: The types for which a kernel should be generated.
       tile_size: The tiling specification.
       tags: The tags which should be added to the libraries.
       unroll_factors: The unrolling specification.
       extra_args: Extra arguments to pass to the generator tool.
       generate_ranked: Whether to call gen_ranked_kernel_library.
       generate_unranked: Whether to call gen_unranked_kernel_library.
     """
     shared_kwargs = dict(
         types = types,
         tile_size = tile_size,
         tags = tags,
         unroll_factors = unroll_factors,
         extra_args = extra_args,
     )
     if generate_ranked:
         gen_ranked_kernel_library(name = name, **shared_kwargs)
     if generate_unranked:
         gen_unranked_kernel_library(name = name + "_unranked", **shared_kwargs)
	"""Generates cubin headers for TF dialect ops."""

	load("@local_config_cuda//cuda:build_defs.bzl", "cuda_gpu_architectures")
	load(
	"@local_config_rocm//rocm:build_defs.bzl",
	"rocm_gpu_architectures",
	"rocm_is_configured",
	)
	load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud")
	load(
	"//tensorflow/stream_executor:build_defs.bzl",
	"if_gpu_is_configured",
	)

	def if_mlir_generated_gpu_kernels_enabled(if_true, if_false = []):
	    """Selects a value depending on whether MLIR-generated GPU kernels are disabled.

	    Args:
	      if_true: Value used under `//conditions:default`.
	      if_false: Value used when the `mlir_generated_gpu_kernels_disabled`
	        config setting matches.

	    Returns:
	      A `select()` over the two values.
	    """
	    return select({
	        "//tensorflow/core/kernels/mlir_generated:mlir_generated_gpu_kernels_disabled": if_false,
	        "//conditions:default": if_true,
	    })

	def _lookup_file(filegroup, path):
	    """Extracts file at (relative) path in filegroup.

	    A path ending in `path` followed by ".exe" is accepted as well.

	    Args:
	      filegroup: Target whose `files` are searched.
	      path: Path suffix to look for.

	    Returns:
	      The first matching file, or None if nothing matches.
	    """
	    for file in filegroup.files.to_list():
	        if file.path.endswith(path) or file.path.endswith(path + ".exe"):
	            return file
	    return None

	# Carries the generated GPU binaries from the binary-producing rule to the
	# rule that packs them into a header.
	GpuBinaryInfo = provider(
	    doc = "GPU binaries in either cubin format or hsaco format",
	    fields = {
	        "gpu_bins": "List of generated GPU binary files.",
	    },
	)

	def _gen_kernel_gpu_bin_impl(ctx):
	    """Compiles `mlir_op` into one GPU binary per requested architecture.

	    Runs the `_tool` executable once per entry of `gpu_archs`; each run writes
	    a file named NAME.ARCH.bin, where NAME is the rule name.

	    Args:
	      ctx: Rule context.

	    Returns:
	      A list holding a GpuBinaryInfo whose `gpu_bins` are the declared outputs.
	    """
	    name = ctx.attr.name
	    tile_sizes = ctx.attr.tile_size.replace("x", ",")

	    # Optional flags come first, followed by the mandatory ones.
	    cmd_args = []
	    if ctx.attr.unroll_factors:
	        cmd_args.append("--unroll_factors=%s" % ctx.attr.unroll_factors)

	    if ctx.attr.extra_args:
	        cmd_args.extend(ctx.attr.extra_args)

	    gpu_bins = []
	    for arch in ctx.attr.gpu_archs:
	        # TODO(b/170283783): 'compute_' should generate both SASS and PTX.
	        arch = arch.replace("compute_", "sm_")
	        filename = "%s.%s.bin" % (name, arch)
	        gpu_bin = ctx.actions.declare_file(filename)
	        ctx.actions.run(
	            inputs = [ctx.file.mlir_op, ctx.file._tfso],
	            outputs = [gpu_bin],
	            executable = ctx.executable._tool,
	            arguments = cmd_args + [
	                "--tile_sizes=%s" % tile_sizes,
	                "--arch=%s" % arch,
	                "--input=%s" % ctx.file.mlir_op.path,
	                "--output=%s" % gpu_bin.path,
	            ],
	            mnemonic = "compile",
	        )
	        gpu_bins.append(gpu_bin)
	    return [GpuBinaryInfo(gpu_bins = gpu_bins)]

	# Rule producing one GPU binary per architecture from an MLIR op definition.
	_gen_kernel_gpu_bin_rule = rule(
	    implementation = _gen_kernel_gpu_bin_impl,
	    output_to_genfiles = True,
	    attrs = {
	        # Attributes set by the calling macro.
	        "mlir_op": attr.label(allow_single_file = True, mandatory = True),
	        "gpu_archs": attr.string_list(mandatory = True),
	        "tile_size": attr.string(mandatory = True),
	        "unroll_factors": attr.string(),
	        "extra_args": attr.string_list(),
	        # Implicit dependencies: the generator tool and the library passed
	        # to it as an action input.
	        "_tool": attr.label(
	            default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_gpu_binary"),
	            executable = True,
	            cfg = "host",
	        ),
	        "_tfso": attr.label(
	            default = Label("//tensorflow:libtensorflow_framework.so.2"),
	            allow_single_file = True,
	            cfg = "host",
	        ),
	    },
	)

	def _gen_kernel_image_hdr_impl_cuda(ctx):
	    """Bundles cubins into a fatbin and emits it as a C header (CUDA).

	    Args:
	      ctx: Rule context; `input` provides GpuBinaryInfo, `out` is the header.
	    """
	    images = []
	    for cubin in ctx.attr.input[GpuBinaryInfo].gpu_bins:
	        # The profile is the second-to-last dot-separated path component.
	        arch = cubin.path.split(".")[-2]
	        images.append("--image=profile=%s,file=%s" % (arch, cubin.path))

	    # Generate fatbin file from all cubins.
	    fatbin = ctx.actions.declare_file("%s.fatbin" % ctx.attr.name)
	    ctx.actions.run(
	        outputs = [fatbin],
	        inputs = ctx.attr.input[GpuBinaryInfo].gpu_bins,
	        executable = _lookup_file(ctx.attr._gpu_root, "bin/fatbinary"),
	        arguments = [
	            "--64",
	            "--cmdline=--compile-only",
	            "--link",
	            "--compress-all",
	            "--create=%s" % fatbin.path,
	        ] + images,
	        mnemonic = "fatbinary",
	    )

	    # Run bin2c on the fatbin and redirect its stdout into the header.
	    bin2c = _lookup_file(ctx.attr._gpu_root, "bin/bin2c")
	    ctx.actions.run_shell(
	        outputs = [ctx.outputs.out],
	        inputs = [fatbin],
	        tools = [bin2c],
	        command = "%s --static --const --type=char --name=%s %s 1> %s" %
	                  (bin2c.path, ctx.attr.symbol, fatbin.path, ctx.outputs.out.path),
	        mnemonic = "bin2c",
	    )

	def _gen_kernel_image_hdr_impl_rocm(ctx):
	    """Bundles hsaco files into a fatbin and emits it as a C header (ROCm).

	    The hsacos are bundled with `clang-offload-bundler`; a shell action then
	    hex-dumps the bundle into a `static const unsigned char` array named
	    `symbol` in the output header.

	    Args:
	      ctx: Rule context; `input` provides GpuBinaryInfo, `out` is the header.
	    """
	    hsaco_files = []
	    hsaco_targets = []

	    # Add a dummy host target triple...clang-offload-bundler requires 1 and only 1 host target triple
	    hsaco_files.append("/dev/null")
	    hsaco_targets.append("host-x86_64-unknown-linux")

	    hsacos = ctx.attr.input[GpuBinaryInfo].gpu_bins
	    for hsaco in hsacos:
	        # The gfx architecture is the second-to-last dot-separated path component.
	        gfx_arch = hsaco.path.split(".")[-2]
	        hsaco_files.append(hsaco.path)
	        hsaco_targets.append("hip-amdgcn-amd-amdhsa-%s" % gfx_arch)

	    # Generate fatbin file from all hsacos.
	    fatbin = ctx.actions.declare_file("%s.fatbin" % ctx.attr.name)
	    ctx.actions.run(
	        outputs = [fatbin],
	        inputs = hsacos,
	        executable = _lookup_file(ctx.attr._gpu_root, "bin/clang-offload-bundler"),
	        arguments = [
	            "--inputs=%s" % ",".join(hsaco_files),
	            "--targets=%s" % ",".join(hsaco_targets),
	            "--type=o",
	            "--outputs=%s" % fatbin.path,
	        ],
	        mnemonic = "fatbinary",
	    )

	    # Shell pipeline: hex-dump the fatbin, then write the array declaration,
	    # the dumped bytes and the closing brace to the header. The pipes must be
	    # plain `|`; a backslash before them is not a valid string escape.
	    ctx.actions.run_shell(
	        outputs = [ctx.outputs.out],
	        inputs = [fatbin],
	        command = (
	            ("hex=`hexdump -v -e \'/1 \"0x%%02x, \"\' %s` && " +
	             "len=`echo $hex | wc -c` && " +
	             "echo 'static const unsigned char %s['$len' + 1] = {' > %s && " +
	             "echo $hex | cat >> %s && " +
	             "echo '};' >> %s") % (
	                fatbin.path,
	                ctx.attr.symbol,
	                ctx.outputs.out.path,
	                ctx.outputs.out.path,
	                ctx.outputs.out.path,
	            )
	        ),
	    )

	# Rule turning the GPU binaries of `input` into the C header `out`. Both the
	# implementation and the toolkit root depend on rocm_is_configured().
	_gen_kernel_image_hdr_rule = rule(
	    attrs = {
	        "symbol": attr.string(mandatory = True),
	        "out": attr.output(mandatory = True),
	        "input": attr.label(providers = [GpuBinaryInfo], mandatory = True),
	        "_gpu_root": attr.label(
	            default = (
	                Label("@local_config_rocm//rocm:rocm_root") if rocm_is_configured() else Label("@local_config_cuda//cuda:cuda_root")
	            ),
	        ),
	    },
	    implementation = (
	        _gen_kernel_image_hdr_impl_rocm if rocm_is_configured() else _gen_kernel_image_hdr_impl_cuda
	    ),
	    output_to_genfiles = True,
	)

	def _gen_kernel_image_hdr(name, mlir_op, gpu_archs, tile_size, unroll_factors = None, extra_args = []):
	    """Generates a C header with fatbin data from a Tensorflow op.

	    Declares two targets: NAME_cubin, which compiles the op into GPU binaries,
	    and NAME, which packs those binaries into the header NAME.h.

	    Args:
	      name: Name of the header target; also the stem of the header file.
	      mlir_op: The MLIR file to compile.
	      gpu_archs: GPU architectures to compile for.
	      tile_size: Tiling specification forwarded to the binary rule.
	      unroll_factors: Unrolling specification forwarded to the binary rule.
	      extra_args: Extra arguments forwarded to the binary rule.
	    """
	    _gen_kernel_gpu_bin_rule(
	        name = name + "_cubin",
	        mlir_op = mlir_op,
	        tile_size = tile_size,
	        unroll_factors = unroll_factors,
	        gpu_archs = gpu_archs,
	        extra_args = extra_args,
	    )
	    _gen_kernel_image_hdr_rule(
	        name = name,
	        input = ":" + name + "_cubin",
	        out = "%s.h" % name,
	        # "k" followed by the title-cased words of `name`, underscores dropped.
	        symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""),
	    )

	# Maps kernel type names to the MLIR spelling of the element type.
	type_to_mlir = dict(
	    c64 = "complex<f32>",
	    c128 = "complex<f64>",
	)

	def _gen_mlir_op_impl(ctx):
	    """Instantiates an MLIR op template for one element type.

	    Streams the template through `sed`, replacing the `elem_type` placeholders
	    with the requested type, and writes the result to `out`.

	    Args:
	      ctx: Rule context with the `template`, `type`, `out` and `unranked`
	        attributes.
	    """

	    # Map attr.type to MLIR type; types without an entry are used as-is.
	    mlir_type = type_to_mlir.get(ctx.attr.type, ctx.attr.type)

	    # In order to generate a ranked kernel we change *xelem_type to ?xelem_type
	    # and remove element type from the entry function name. The sed scripts
	    # are quoted so the shell cannot glob-expand the `*` and `?` in them, and
	    # the pipes are plain `|` (a backslash before them is not a valid escape).
	    convert_to_ranked = ""
	    if not ctx.attr.unranked:
	        convert_to_ranked = "sed 's/*x/?x/g' | sed 's/_elem_type//g' |"
	    ctx.actions.run_shell(
	        inputs = [ctx.file.template],
	        outputs = [ctx.outputs.out],
	        command = "cat %s | %s sed 's/_elem_type/_%s/g' | sed 's/elem_type/%s/g' > %s" % (
	            ctx.file.template.path,
	            convert_to_ranked,
	            ctx.attr.type,
	            mlir_type,
	            ctx.outputs.out.path,
	        ),
	    )

	# Rule instantiating an MLIR op template for a single element type.
	_gen_mlir_op_rule = rule(
	    attrs = {
	        "out": attr.output(mandatory = True),
	        "template": attr.label(allow_single_file = True, mandatory = True),
	        "type": attr.string(mandatory = True),
	        "unranked": attr.bool(mandatory = True),
	    },
	    output_to_genfiles = True,
	    implementation = _gen_mlir_op_impl,
	)

	def _gen_mlir_op(name, type, unranked):
	    """Declares the target that instantiates the MLIR template of an op.

	    The template is looked up under `op_definitions/`; for unranked kernels
	    the "_unranked" part of `name` is removed from the template name.

	    Args:
	      name: Name of the op; used in the target and output file names.
	      type: Element type to instantiate the template for.
	      unranked: Whether an unranked kernel is generated.
	    """
	    tmpl_name = name.replace("_unranked", "") if unranked else name
	    _gen_mlir_op_rule(
	        name = "generate_{name}_{type}_mlir".format(name = name, type = type),
	        template = "op_definitions/{name}.mlir.tmpl".format(name = tmpl_name),
	        type = type,
	        out = "{name}_{type}.mlir".format(name = name, type = type),
	        unranked = unranked,
	    )

	def gen_ranked_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = []):
	    """Generates a header library with ranked kernels for a tensorflow op.

	    When GPU architectures are configured, an MLIR file and a kernel header
	    target are declared per type. The NAME_kernels cc_library lists the
	    kernel headers if GPU is configured.

	    Args:
	      name: The name of the tensorflow op.
	      types: The types ("f16", "f32", "f64") for which a kernel should be generated.
	      tile_size: The tiling specification, e.g. "16x16".
	      tags: The tags which should be added to the library.
	      unroll_factors: The unrolling specification, e.g. "4,4"
	      extra_args: Extra arguments to pass to the generator tool.
	    """

	    if cuda_gpu_architectures() or rocm_gpu_architectures():
	        for type in types:
	            _gen_mlir_op(
	                name = name,
	                type = type,
	                unranked = False,
	            )
	            _gen_kernel_image_hdr(
	                name = "{name}_{type}_kernel".format(name = name, type = type),
	                mlir_op = "{name}_{type}.mlir".format(name = name, type = type),
	                gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
	                tile_size = tile_size,
	                unroll_factors = unroll_factors,
	                extra_args = extra_args,
	            )

	    native.cc_library(
	        name = name + "_kernels",
	        hdrs = if_gpu_is_configured([":{name}_{type}_kernel".format(name = name, type = type) for type in types]),
	        tags = tags,
	    )

	################################################################################
	# Unranked kernels build rules.
	################################################################################

	def if_mlir_unranked_kernels_enabled(if_true, if_false = []):
	    """Selects a value depending on whether unranked MLIR kernels are in use.

	    Args:
	      if_true: Value used when the `mlir_use_unranked_kernels` config setting
	        matches.
	      if_false: Value used under `//conditions:default`.

	    Returns:
	      A `select()` over the two values.
	    """
	    return select({
	        "//tensorflow/core/kernels/mlir_generated:mlir_use_unranked_kernels": if_true,
	        "//conditions:default": if_false,
	    })

	def _gen_unranked_kernel_fatbin_impl(ctx):
	    """Compiles `mlir_op` into the single `output` file for all architectures.

	    Runs the `_tool` executable once, passing every entry of `gpu_archs` as a
	    comma-separated `--arch` value.

	    Args:
	      ctx: Rule context.
	    """

	    # Optional flags come first, followed by the mandatory ones.
	    cmd_args = []
	    if ctx.attr.unroll_factors:
	        cmd_args.append("--unroll_factors=%s" % ctx.attr.unroll_factors)
	    if ctx.attr.extra_args:
	        cmd_args.extend(ctx.attr.extra_args)
	    tile_sizes = ctx.attr.tile_size.replace("x", ",")
	    arch_flag = ",".join(ctx.attr.gpu_archs)
	    gpu_bin = ctx.outputs.output
	    ctx.actions.run(
	        inputs = [ctx.file.mlir_op, ctx.file._tfso],
	        outputs = [gpu_bin],
	        executable = ctx.executable._tool,
	        arguments = cmd_args + [
	            "--tile_sizes=%s" % tile_sizes,
	            "--arch=%s" % arch_flag,
	            "--input=%s" % ctx.file.mlir_op.path,
	            "--output=%s" % gpu_bin.path,
	        ],
	        mnemonic = "compile",
	    )

	# Rule compiling an MLIR op definition into a single output file covering
	# all requested GPU architectures.
	_gen_unranked_kernel_fatbin_rule = rule(
	    implementation = _gen_unranked_kernel_fatbin_impl,
	    output_to_genfiles = True,
	    attrs = {
	        # Attributes set by the calling macro.
	        "mlir_op": attr.label(allow_single_file = True, mandatory = True),
	        "output": attr.output(doc = "The generated file", mandatory = True),
	        "gpu_archs": attr.string_list(mandatory = True),
	        "tile_size": attr.string(mandatory = True),
	        "unroll_factors": attr.string(),
	        "extra_args": attr.string_list(),
	        # Implicit dependencies: the generator tool and the library passed
	        # to it as an action input.
	        "_tool": attr.label(
	            default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_kernel"),
	            executable = True,
	            cfg = "host",
	        ),
	        "_tfso": attr.label(
	            default = Label("//tensorflow:libtensorflow_framework.so.2"),
	            allow_single_file = True,
	            cfg = "host",
	        ),
	    },
	)

	def gen_unranked_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = []):
	    """Generates a library with unranked kernels for a specific tensorflow op.

	    When GPU architectures are configured, each type gets an MLIR file, a
	    generated static library wrapped in a cc_import, and an sh_test that is
	    handed the generator tool and the MLIR file. The NAME_kernels cc_library
	    depends on the imports if GPU is configured.

	    Args:
	      name: The name of the tensorflow op.
	      types: The types ("f16", "f32", "f64") for which a kernel should be generated.
	      tile_size: The tiling specification, e.g. "16x16".
	      tags: The tags which should be added to the library.
	      unroll_factors: The unrolling specification, e.g. "4,4"
	      extra_args: Extra arguments to pass to the generator tool.
	    """

	    if cuda_gpu_architectures() or rocm_gpu_architectures():
	        for type in types:
	            _gen_mlir_op(
	                name = name,
	                type = type,
	                unranked = True,
	            )
	            _gen_unranked_kernel_fatbin_rule(
	                name = "{name}_{type}_kernel_generator".format(name = name, type = type),
	                mlir_op = "{name}_{type}.mlir".format(name = name, type = type),
	                output = "{name}_{type}.a".format(name = name, type = type),
	                gpu_archs = rocm_gpu_architectures() if rocm_is_configured() else cuda_gpu_architectures(),
	                tile_size = tile_size,
	                unroll_factors = unroll_factors,
	                extra_args = extra_args,
	            )
	            native.cc_import(
	                name = "{name}_{type}_kernel".format(name = name, type = type),
	                static_library = "{name}_{type}.a".format(name = name, type = type),
	            )

	            # We have to use a sh_test instead of build_test because it doesn't properly find the dependent targets.
	            native.sh_test(
	                name = "{name}_{type}_gen_test".format(name = name, type = type),
	                srcs = ["build_test.sh"],
	                tags = ["no_rocm"],
	                args = [
	                    "$(location //tensorflow/compiler/mlir/tools/kernel_gen:tf_to_kernel)",
	                    "$(location {name}_{type}.mlir)".format(name = name, type = type),
	                ],
	                size = "small",
	                data = [
	                    ":{name}_{type}.mlir".format(name = name, type = type),
	                    "//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_kernel",
	                ],
	            )

	    native.cc_library(
	        name = name + "_kernels",
	        compatible_with = get_compatible_with_cloud(),
	        deps = if_gpu_is_configured([":{name}_{type}_kernel".format(name = name, type = type) for type in types]),
	        linkstatic = 1,
	        tags = tags,
	    )

	def gen_kernel_library(name, types, tile_size, tags = [], unroll_factors = None, extra_args = [], generate_ranked = True, generate_unranked = False):
	    """Generates the ranked and/or unranked kernel libraries for an op.

	    Args:
	      name: The name of the tensorflow op; the unranked library uses
	        `name + "_unranked"`.
	      types: The types for which a kernel should be generated.
	      tile_size: The tiling specification.
	      tags: The tags which should be added to the libraries.
	      unroll_factors: The unrolling specification.
	      extra_args: Extra arguments to pass to the generator tool.
	      generate_ranked: Whether to call gen_ranked_kernel_library.
	      generate_unranked: Whether to call gen_unranked_kernel_library.
	    """
	    if generate_ranked:
	        gen_ranked_kernel_library(
	            name = name,
	            types = types,
	            tile_size = tile_size,
	            tags = tags,
	            unroll_factors = unroll_factors,
	            extra_args = extra_args,
	        )
	    if generate_unranked:
	        gen_unranked_kernel_library(
	            name = name + "_unranked",
	            types = types,
	            tile_size = tile_size,
	            tags = tags,
	            unroll_factors = unroll_factors,
	            extra_args = extra_args,
	        )