tensorflow/compiler/aot/tfcompile.bzl - platform/external/tensorflow - Git at Google

 """Build macro that compiles a TensorFlow graph into a cc_library.

 To use from your BUILD file, add the following line to load the macro:

 load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")

 Then call the macro like this:

 tf_library(
     name = "test_graph_tfmatmul",
     config = "test_graph_tfmatmul.config.pbtxt",
     cpp_class = "MatMulComp",
     graph = ":test_graph_tfmatmul.pb",
 )
 """

 load(
     "//tensorflow:tensorflow.bzl",
     "if_android",
     "tf_cc_test",
     "tf_copts",
 )
 load("//tensorflow:tensorflow.bzl", "tfcompile_target_cpu")

 def tf_library(
         name,
         graph,
         config,
         debug_info = None,
         freeze_checkpoint = None,
         freeze_saver = None,
         cpp_class = None,
         gen_test = True,
         gen_benchmark = True,
         visibility = None,
         testonly = None,
         tfcompile_flags = None,
         tfcompile_tool = "//tensorflow/compiler/aot:tfcompile",
         include_standard_runtime_deps = True,
         enable_xla_hlo_profiling = False,
         enable_tracemes = False,
         mlir_components = "None",
         deps = None,
         tags = []):
     """Runs tfcompile to compile a TensorFlow graph into executable code with fast
     math enabled on cpu.

     Given an invocation of tf_library(name="foo", ...), generates the following
     build targets:
       foo:           A cc_library containing the generated header and
                      computation.
       foo_test:      A cc_test with simple tests and benchmarks. Only created if
                      gen_test=True.
       foo_benchmark: A cc_binary that runs a minimal-dependency benchmark,
                      useful for mobile devices or other platforms that can't
                      compile the full test libraries. Only created if
                      gen_benchmark=True.
     The output header is called <name>.h.

     Args:
       name: The name of the build rule.
       graph: The TensorFlow GraphDef to compile.  If the file ends in '.pbtxt'
         it is expected to be in the human-readable proto text format, otherwise
         it is expected to be in the proto binary format.
       config: File containing tensorflow.tf2xla.Config proto.  If the file ends
         in '.pbtxt' it is expected to be in the human-readable proto text
         format, otherwise it is expected to be in the proto binary format.
       debug_info: If provided, this file is added to the inputs of the
         tfcompile genrules and passed to tfcompile via --debug_info.
       freeze_checkpoint: If provided, run freeze_graph with this checkpoint to
         convert variables into constants.
       freeze_saver: If provided, run freeze_graph with this saver, in SaverDef
         binary form, to convert variables into constants.  Requires
         freeze_checkpoint to be set as well.
       cpp_class: The name of the generated C++ class, wrapping the generated
         function.  The syntax of this flag is
         [[<optional_namespace>::],...]<class_name>.  This mirrors the C++ syntax
         for referring to a class, where multiple namespaces may precede the
         class name, separated by double-colons.  The class will be generated in
         the given namespace(s), or if no namespaces are given, within the global
         namespace.  Required; the macro fails if it is empty.
       gen_test: If True, also generate a cc_test rule that builds a simple
         test and benchmark.
       gen_benchmark: If True, also generate a binary with a simple benchmark.
         Unlike the output of gen_test, this benchmark can be run on android.
       visibility: Bazel build visibility.
       testonly:   Bazel testonly attribute.
       tfcompile_flags: Extra flags to pass to tfcompile to control compilation.
         A string is passed through verbatim; a list of strings has each
         element single-quoted for the shell before being joined with spaces.
       tfcompile_tool: The tfcompile binary. A non-default can be passed to
         use a tfcompile built with extra dependencies.
       include_standard_runtime_deps: If True, the standard list of
         kernel/runtime deps is added to deps.  If False, deps must contain the
         full set of deps needed by the generated library.
       enable_xla_hlo_profiling: Enable XLA HLO profiling in the generated
         program, and emit metadata that lets us pretty-print the gathered
         profile counters.
       enable_tracemes: Tell tfcompile to generate calls to
         TraceMe::Activity{Start|End} around HLO instructions that can be used by
         Xprof to construct profiler timelines.
       mlir_components: When the value is "None", no components use MLIR. When
         the value is "Bridge", use MLIR to translate GraphDef to HLO.
       deps: a list of deps to include on the build rules for the generated
         library, added to the standard deps if include_standard_runtime_deps
         is True.
       tags: tags to apply to subsidiary build rules.
     """
     if not cpp_class:
         fail("cpp_class must be specified")

     tfcompile_graph = graph
     if freeze_checkpoint or freeze_saver:
         if not freeze_checkpoint:
             fail("freeze_checkpoint must be specified when freeze_saver is " +
                  "specified")

         freeze_name = "freeze_" + name
         freeze_file = freeze_name + ".pb"

         # First run tfcompile to generate the list of out_nodes.
         #
         # Here and below, we set CUDA_VISIBLE_DEVICES='' to prevent the code we
         # launch from using any GPUs which might be present.  This is important
         # because builds may run concurrently with tests, and tests need to be
         # able to assume that they have control of the full GPU.
         out_nodes_file = "out_nodes_" + freeze_name
         native.genrule(
             name = ("gen_" + out_nodes_file),
             srcs = [config],
             outs = [out_nodes_file],
             cmd = ("CUDA_VISIBLE_DEVICES='' " +
                    "$(location " + tfcompile_tool + ")" +
                    " --config=$(location " + config + ")" +
                    " --dump_fetch_nodes > $@"),
             tools = [tfcompile_tool],
             # Run tfcompile on the build host, rather than forge, since it's
             # typically way faster on the local machine.
             local = 1,
             tags = tags,
         )

         # Now run freeze_graph to convert variables into constants.  The
         # output node names are read at execution time from the file produced
         # by the genrule above ($$(<file) is a shell command substitution).
         freeze_args = (
             " --input_graph=$(location " + graph + ")" +
             " --checkpoint_version=1" +
             " --input_binary=" + str(not graph.endswith(".pbtxt")) +
             " --input_checkpoint=$(location " + freeze_checkpoint + ")" +
             " --output_graph=$(location " + freeze_file + ")" +
             " --output_node_names=$$(<$(location " + out_nodes_file +
             "))"
         )
         freeze_saver_srcs = []
         if freeze_saver:
             freeze_args += " --input_saver=$(location " + freeze_saver + ")"
             freeze_saver_srcs += [freeze_saver]
         native.genrule(
             name = freeze_name,
             srcs = [
                 graph,
                 freeze_checkpoint,
                 out_nodes_file,
             ] + freeze_saver_srcs,
             outs = [freeze_file],
             cmd = (
                 "CUDA_VISIBLE_DEVICES='' " +
                 "$(location " +
                 "//tensorflow/python/tools:freeze_graph)" +
                 freeze_args
             ),
             tools = ["//tensorflow/python/tools:freeze_graph"],
             tags = tags,
         )

         # From here on tfcompile consumes the frozen graph, not the original.
         tfcompile_graph = freeze_file

     # Rule that runs tfcompile to produce the header and object file.
     header_file = name + ".h"
     metadata_object_file = name + "_tfcompile_metadata.o"
     function_object_file = name + "_tfcompile_function.o"

     # The XLA backends morph kernel name prefix __ that is not in the form of
     # __xla_.
     ep = ("__xla_" + native.package_name() + "__" + name).replace("/", "_")

     # A string is used as-is; a list is shell-quoted element by element (an
     # embedded single quote becomes '\'' so it survives the quoting).
     if type(tfcompile_flags) == type(""):
         flags = tfcompile_flags
     else:
         flags = " ".join([
             "'" + arg.replace("'", "'\\''") + "'"
             for arg in (tfcompile_flags or [])
         ])

     # Inspect the user-supplied flags while `flags` is still known to be a
     # plain string, before anything else is prepended to it.
     need_xla_data_proto = "--gen_program_shape" in flags

     target_cpu = tfcompile_target_cpu()
     extra_flags = "--target_cpu=" + target_cpu + " " if target_cpu else " "
     flags = extra_flags + flags

     if enable_xla_hlo_profiling:
         profiling_flag = "--xla_hlo_profile"
     else:
         profiling_flag = ""

     if enable_tracemes:
         traceme_flag = "--xla_cpu_enable_xprof_traceme=true"
     else:
         traceme_flag = "--xla_cpu_enable_xprof_traceme=false"

     mlir_flag = "--mlir_components=" + mlir_components

     srcs = [tfcompile_graph, config]
     debug_info_flag = ""
     if debug_info:
         srcs.append(debug_info)
         debug_info_flag = " --debug_info=$(location " + debug_info + ")"

     # Fast-math defaults come first; any XLA_FLAGS already present in the
     # action environment are appended after them inside the same value.
     default_fast_math_xla_flags = ("XLA_FLAGS='" +
                                    "--xla_cpu_enable_fast_math=true " +
                                    "--xla_cpu_fast_math_honor_nans=false " +
                                    "--xla_cpu_fast_math_honor_infs=false " +
                                    "--xla_cpu_fast_math_honor_functions=false " +
                                    "--xla_cpu_fast_math_honor_division=false " +
                                    "--xla_cpu_enable_fast_min_max=true " +
                                    "$${XLA_FLAGS:-}' ")

     # Part of the tfcompile command line shared by the two genrules below.
     # Appending target_llvm_triple() turns the value into a `select`, which
     # is why it is only ever concatenated with, never inspected.
     tfcompile_cmd_prefix = (
         default_fast_math_xla_flags +
         "CUDA_VISIBLE_DEVICES='' " +
         "$(location " + tfcompile_tool + ")" +
         " --graph=$(location " + tfcompile_graph + ")" +
         debug_info_flag +
         " --config=$(location " + config + ")" +
         " --entry_point=" + ep +
         " --cpp_class=" + cpp_class +
         " --target_triple=" + target_llvm_triple()
     )

     native.genrule(
         name = ("gen_" + name),
         srcs = srcs,
         outs = [
             header_file,
             metadata_object_file,
             function_object_file,
         ],
         cmd = (
             tfcompile_cmd_prefix +
             " --out_header=$(@D)/" + header_file +
             " --out_metadata_object=$(@D)/" + metadata_object_file +
             " --out_function_object=$(@D)/" + function_object_file +
             " " + flags + " " + profiling_flag + " " + mlir_flag + " " + traceme_flag
         ),
         tools = [tfcompile_tool],
         visibility = visibility,
         testonly = testonly,
         # Run tfcompile on the build host since it's typically faster on the
         # local machine.
         #
         # Note that setting the local=1 attribute on a *test target* causes the
         # test infrastructure to skip that test.  However this is a genrule, not
         # a test target, and runs with --strategy=Genrule=forced_forge, meaning
         # the local=1 attribute is ignored, and the genrule is still run.
         #
         # https://www.bazel.io/versions/master/docs/be/general.html#genrule
         local = 1,
         tags = tags,
     )

     # Rule that runs tfcompile to produce the SessionModule proto, useful for
     # debugging.  TODO(b/64813587): Once the SessionModule proto is
     # deterministic, move this into the main rule above.
     session_module_pb = name + "_session_module.pb"
     native.genrule(
         name = (name + "_session_module"),
         srcs = srcs,
         outs = [
             session_module_pb,
         ],
         cmd = (
             tfcompile_cmd_prefix +
             " --out_session_module=$(@D)/" + session_module_pb +
             " " + flags
         ),
         tools = [tfcompile_tool],
         visibility = visibility,
         testonly = testonly,
         local = 1,
         tags = tags,
     )

     # These deps are required by all tf_library targets even if
     # include_standard_runtime_deps is False.  Without them, the
     # generated code will fail to compile.
     required_deps = [
         "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function",
         "//tensorflow/core:framework_lite",
     ]

     # If we're generating the program shape, we must depend on the proto.
     program_shape_deps = [
         "//tensorflow/compiler/xla:xla_data_proto_cc",
     ] if need_xla_data_proto else []

     profiling_deps = [
         "//tensorflow/compiler/xla/service:hlo_profile_printer_data_cc",
     ] if enable_xla_hlo_profiling else []

     # TODO(cwhipkey): only depend on kernel code that the model actually
     # needed.
     standard_runtime_deps = [
         "//tensorflow/compiler/xla/service/cpu:runtime_conv2d",
         "//tensorflow/compiler/xla/service/cpu:runtime_key_value_sort",
         "//tensorflow/compiler/xla/service/cpu:runtime_matmul",
         "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d",
         "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul",
         "//third_party/eigen3",
     ] if include_standard_runtime_deps else []

     # The cc_library rule packaging up the header and object file, and needed
     # kernel implementations.
     native.cc_library(
         name = name,
         srcs = [function_object_file, metadata_object_file],
         hdrs = [header_file],
         visibility = visibility,
         testonly = testonly,
         deps = (required_deps + program_shape_deps + profiling_deps +
                 standard_runtime_deps + (deps or [])),
         tags = tags,
     )

     # Variables used for gen_test and gen_benchmark.
     #
     # The unqualified class name is whatever follows the last "::".  Taking
     # the last component handles any number of leading namespaces (including
     # none at all).
     no_ns_name = cpp_class.split("::")[-1]
     sed_replace = (
         "-e \"s|{{TFCOMPILE_HEADER}}|$(location " + header_file + ")|g\" " +
         "-e \"s|{{TFCOMPILE_CPP_CLASS}}|" + cpp_class + "|g\" " +
         "-e \"s|{{TFCOMPILE_NAME}}|" + no_ns_name + "|g\" "
     )

     if gen_test:
         test_name = name + "_test"
         test_file = test_name + ".cc"

         # Rule to rewrite test.cc to produce the test_file.
         native.genrule(
             name = ("gen_" + test_name),
             testonly = 1,
             srcs = [
                 "//tensorflow/compiler/aot:test.cc",
                 header_file,
             ],
             outs = [test_file],
             cmd = (
                 "sed " + sed_replace +
                 " $(location //tensorflow/compiler/aot:test.cc) " +
                 "> $(OUTS)"
             ),
             tags = tags,
         )

         # The cc_test rule for the generated code.  To ensure that this works
         # reliably across build configurations, we must use tf_cc_test instead
         # of native.cc_test.  This is related to how we build
         # //tensorflow/core:lib -- see the note in
         # tensorflow/core/BUILD for more details.
         tf_cc_test(
             name = test_name,
             srcs = [test_file],
             deps = [
                 ":" + name,
                 "//tensorflow/compiler/aot:tf_library_test_main",
                 "//tensorflow/compiler/xla:executable_run_options",
                 "//third_party/eigen3",
                 "//tensorflow/core:lib",
                 "//tensorflow/core:test",
             ],
             tags = tags,
         )

     if gen_benchmark:
         benchmark_name = name + "_benchmark"
         benchmark_file = benchmark_name + ".cc"
         benchmark_main = ("//tensorflow/compiler/aot:" +
                           "benchmark_main.template")

         # Rule to rewrite benchmark.cc to produce the benchmark_file.
         native.genrule(
             name = ("gen_" + benchmark_name),
             srcs = [
                 benchmark_main,
                 header_file,
             ],
             testonly = testonly,
             outs = [benchmark_file],
             cmd = ("sed " + sed_replace +
                    " $(location " + benchmark_main + ") " +
                    "> $(OUTS)"),
             tags = tags,
         )

         # The cc_benchmark rule for the generated code.  This does not need the
         # tf_cc_binary since we (by deliberate design) do not depend on
         # //tensorflow/core:lib.
         #
         # Note: to get smaller size on android for comparison, compile with:
         #    --copt=-fvisibility=hidden
         #    --copt=-D_LIBCPP_TYPE_VIS=_LIBCPP_HIDDEN
         #    --copt=-D_LIBCPP_EXCEPTION_ABI=_LIBCPP_HIDDEN
         native.cc_binary(
             name = benchmark_name,
             srcs = [benchmark_file],
             testonly = testonly,
             copts = tf_copts(),
             linkopts = if_android(["-pie", "-s"]),
             deps = [
                 ":" + name,
                 "//tensorflow/compiler/aot:benchmark",
                 "//tensorflow/compiler/xla:executable_run_options",
                 "//third_party/eigen3",
             ] + if_android([
                 "//tensorflow/compiler/aot:benchmark_extra_android",
             ]),
             tags = tags,
         )

 def target_llvm_triple():
     """Picks the LLVM target triple that tfcompile should generate code for.

     Returns:
       A `select` keyed on the //tensorflow platform config settings whose
       values are LLVM triple strings; configurations that match none of the
       listed settings fall back to "x86_64-pc-linux".
     """

     # TODO(toddw): Add target_triple for other targets.  For details see:
     # http://llvm.org/docs/doxygen/html/Triple_8h_source.html
     triple_by_config = {
         "//tensorflow:android_armeabi": "armv5-none-android",
         "//tensorflow:android_arm": "armv7-none-android",
         "//tensorflow:android_arm64": "aarch64-none-android",
         "//tensorflow:android_x86": "i686-none-android",
         "//tensorflow:ios": "arm64-none-ios",
         "//tensorflow:ios_x86_64": "x86_64-apple-ios",
         "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu",
         "//tensorflow:macos": "x86_64-none-darwin",
         "//tensorflow:windows": "x86_64-none-windows",
         "//tensorflow:linux_s390x": "systemz-none-linux-gnu",
         "//conditions:default": "x86_64-pc-linux",
     }
     return select(triple_by_config)
	"""Build macro that compiles a TensorFlow graph into a cc_library.

	To use from your BUILD file, add the following line to load the macro:

	load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")

	Then call the macro like this:

	tf_library(
	    name = "test_graph_tfmatmul",
	    config = "test_graph_tfmatmul.config.pbtxt",
	    cpp_class = "MatMulComp",
	    graph = ":test_graph_tfmatmul.pb",
	)
	"""

	load(
	"//tensorflow:tensorflow.bzl",
	"if_android",
	"tf_cc_test",
	"tf_copts",
	)
	load("//tensorflow:tensorflow.bzl", "tfcompile_target_cpu")

	def tf_library(
	        name,
	        graph,
	        config,
	        debug_info = None,
	        freeze_checkpoint = None,
	        freeze_saver = None,
	        cpp_class = None,
	        gen_test = True,
	        gen_benchmark = True,
	        visibility = None,
	        testonly = None,
	        tfcompile_flags = None,
	        tfcompile_tool = "//tensorflow/compiler/aot:tfcompile",
	        include_standard_runtime_deps = True,
	        enable_xla_hlo_profiling = False,
	        enable_tracemes = False,
	        mlir_components = "None",
	        deps = None,
	        tags = []):
	    """Runs tfcompile to compile a TensorFlow graph into executable code with fast
	    math enabled on cpu.

	    Given an invocation of tf_library(name="foo", ...), generates the following
	    build targets:
	      foo:           A cc_library containing the generated header and
	                     computation.
	      foo_test:      A cc_test with simple tests and benchmarks. Only created if
	                     gen_test=True.
	      foo_benchmark: A cc_binary that runs a minimal-dependency benchmark,
	                     useful for mobile devices or other platforms that can't
	                     compile the full test libraries. Only created if
	                     gen_benchmark=True.
	    The output header is called <name>.h.

	    Args:
	      name: The name of the build rule.
	      graph: The TensorFlow GraphDef to compile.  If the file ends in '.pbtxt'
	        it is expected to be in the human-readable proto text format, otherwise
	        it is expected to be in the proto binary format.
	      config: File containing tensorflow.tf2xla.Config proto.  If the file ends
	        in '.pbtxt' it is expected to be in the human-readable proto text
	        format, otherwise it is expected to be in the proto binary format.
	      debug_info: If provided, this file is added to the inputs of the
	        tfcompile genrules and passed to tfcompile via --debug_info.
	      freeze_checkpoint: If provided, run freeze_graph with this checkpoint to
	        convert variables into constants.
	      freeze_saver: If provided, run freeze_graph with this saver, in SaverDef
	        binary form, to convert variables into constants.  Requires
	        freeze_checkpoint to be set as well.
	      cpp_class: The name of the generated C++ class, wrapping the generated
	        function.  The syntax of this flag is
	        [[<optional_namespace>::],...]<class_name>.  This mirrors the C++ syntax
	        for referring to a class, where multiple namespaces may precede the
	        class name, separated by double-colons.  The class will be generated in
	        the given namespace(s), or if no namespaces are given, within the global
	        namespace.  Required; the macro fails if it is empty.
	      gen_test: If True, also generate a cc_test rule that builds a simple
	        test and benchmark.
	      gen_benchmark: If True, also generate a binary with a simple benchmark.
	        Unlike the output of gen_test, this benchmark can be run on android.
	      visibility: Bazel build visibility.
	      testonly:   Bazel testonly attribute.
	      tfcompile_flags: Extra flags to pass to tfcompile to control compilation.
	        A string is passed through verbatim; a list of strings has each
	        element single-quoted for the shell before being joined with spaces.
	      tfcompile_tool: The tfcompile binary. A non-default can be passed to
	        use a tfcompile built with extra dependencies.
	      include_standard_runtime_deps: If True, the standard list of
	        kernel/runtime deps is added to deps.  If False, deps must contain the
	        full set of deps needed by the generated library.
	      enable_xla_hlo_profiling: Enable XLA HLO profiling in the generated
	        program, and emit metadata that lets us pretty-print the gathered
	        profile counters.
	      enable_tracemes: Tell tfcompile to generate calls to
	        TraceMe::Activity{Start|End} around HLO instructions that can be used by
	        Xprof to construct profiler timelines.
	      mlir_components: When the value is "None", no components use MLIR. When
	        the value is "Bridge", use MLIR to translate GraphDef to HLO.
	      deps: a list of deps to include on the build rules for the generated
	        library, added to the standard deps if include_standard_runtime_deps
	        is True.
	      tags: tags to apply to subsidiary build rules.
	    """
	    if not cpp_class:
	        fail("cpp_class must be specified")

	    tfcompile_graph = graph
	    if freeze_checkpoint or freeze_saver:
	        if not freeze_checkpoint:
	            fail("freeze_checkpoint must be specified when freeze_saver is " +
	                 "specified")

	        freeze_name = "freeze_" + name
	        freeze_file = freeze_name + ".pb"

	        # First run tfcompile to generate the list of out_nodes.
	        #
	        # Here and below, we set CUDA_VISIBLE_DEVICES='' to prevent the code we
	        # launch from using any GPUs which might be present.  This is important
	        # because builds may run concurrently with tests, and tests need to be
	        # able to assume that they have control of the full GPU.
	        out_nodes_file = "out_nodes_" + freeze_name
	        native.genrule(
	            name = ("gen_" + out_nodes_file),
	            srcs = [config],
	            outs = [out_nodes_file],
	            cmd = ("CUDA_VISIBLE_DEVICES='' " +
	                   "$(location " + tfcompile_tool + ")" +
	                   " --config=$(location " + config + ")" +
	                   " --dump_fetch_nodes > $@"),
	            tools = [tfcompile_tool],
	            # Run tfcompile on the build host, rather than forge, since it's
	            # typically way faster on the local machine.
	            local = 1,
	            tags = tags,
	        )

	        # Now run freeze_graph to convert variables into constants.  The
	        # output node names are read at execution time from the file produced
	        # by the genrule above ($$(<file) is a shell command substitution).
	        freeze_args = (
	            " --input_graph=$(location " + graph + ")" +
	            " --checkpoint_version=1" +
	            " --input_binary=" + str(not graph.endswith(".pbtxt")) +
	            " --input_checkpoint=$(location " + freeze_checkpoint + ")" +
	            " --output_graph=$(location " + freeze_file + ")" +
	            " --output_node_names=$$(<$(location " + out_nodes_file +
	            "))"
	        )
	        freeze_saver_srcs = []
	        if freeze_saver:
	            freeze_args += " --input_saver=$(location " + freeze_saver + ")"
	            freeze_saver_srcs += [freeze_saver]
	        native.genrule(
	            name = freeze_name,
	            srcs = [
	                graph,
	                freeze_checkpoint,
	                out_nodes_file,
	            ] + freeze_saver_srcs,
	            outs = [freeze_file],
	            cmd = (
	                "CUDA_VISIBLE_DEVICES='' " +
	                "$(location " +
	                "//tensorflow/python/tools:freeze_graph)" +
	                freeze_args
	            ),
	            tools = ["//tensorflow/python/tools:freeze_graph"],
	            tags = tags,
	        )

	        # From here on tfcompile consumes the frozen graph, not the original.
	        tfcompile_graph = freeze_file

	    # Rule that runs tfcompile to produce the header and object file.
	    header_file = name + ".h"
	    metadata_object_file = name + "_tfcompile_metadata.o"
	    function_object_file = name + "_tfcompile_function.o"

	    # The XLA backends morph kernel name prefix __ that is not in the form of
	    # __xla_.
	    ep = ("__xla_" + native.package_name() + "__" + name).replace("/", "_")

	    # A string is used as-is; a list is shell-quoted element by element (an
	    # embedded single quote becomes '\'' so it survives the quoting).
	    if type(tfcompile_flags) == type(""):
	        flags = tfcompile_flags
	    else:
	        flags = " ".join([
	            "'" + arg.replace("'", "'\\''") + "'"
	            for arg in (tfcompile_flags or [])
	        ])

	    # Inspect the user-supplied flags while `flags` is still known to be a
	    # plain string, before anything else is prepended to it.
	    need_xla_data_proto = "--gen_program_shape" in flags

	    target_cpu = tfcompile_target_cpu()
	    extra_flags = "--target_cpu=" + target_cpu + " " if target_cpu else " "
	    flags = extra_flags + flags

	    if enable_xla_hlo_profiling:
	        profiling_flag = "--xla_hlo_profile"
	    else:
	        profiling_flag = ""

	    if enable_tracemes:
	        traceme_flag = "--xla_cpu_enable_xprof_traceme=true"
	    else:
	        traceme_flag = "--xla_cpu_enable_xprof_traceme=false"

	    mlir_flag = "--mlir_components=" + mlir_components

	    srcs = [tfcompile_graph, config]
	    debug_info_flag = ""
	    if debug_info:
	        srcs.append(debug_info)
	        debug_info_flag = " --debug_info=$(location " + debug_info + ")"

	    # Fast-math defaults come first; any XLA_FLAGS already present in the
	    # action environment are appended after them inside the same value.
	    default_fast_math_xla_flags = ("XLA_FLAGS='" +
	                                   "--xla_cpu_enable_fast_math=true " +
	                                   "--xla_cpu_fast_math_honor_nans=false " +
	                                   "--xla_cpu_fast_math_honor_infs=false " +
	                                   "--xla_cpu_fast_math_honor_functions=false " +
	                                   "--xla_cpu_fast_math_honor_division=false " +
	                                   "--xla_cpu_enable_fast_min_max=true " +
	                                   "$${XLA_FLAGS:-}' ")

	    # Part of the tfcompile command line shared by the two genrules below.
	    # Appending target_llvm_triple() turns the value into a `select`, which
	    # is why it is only ever concatenated with, never inspected.
	    tfcompile_cmd_prefix = (
	        default_fast_math_xla_flags +
	        "CUDA_VISIBLE_DEVICES='' " +
	        "$(location " + tfcompile_tool + ")" +
	        " --graph=$(location " + tfcompile_graph + ")" +
	        debug_info_flag +
	        " --config=$(location " + config + ")" +
	        " --entry_point=" + ep +
	        " --cpp_class=" + cpp_class +
	        " --target_triple=" + target_llvm_triple()
	    )

	    native.genrule(
	        name = ("gen_" + name),
	        srcs = srcs,
	        outs = [
	            header_file,
	            metadata_object_file,
	            function_object_file,
	        ],
	        cmd = (
	            tfcompile_cmd_prefix +
	            " --out_header=$(@D)/" + header_file +
	            " --out_metadata_object=$(@D)/" + metadata_object_file +
	            " --out_function_object=$(@D)/" + function_object_file +
	            " " + flags + " " + profiling_flag + " " + mlir_flag + " " + traceme_flag
	        ),
	        tools = [tfcompile_tool],
	        visibility = visibility,
	        testonly = testonly,
	        # Run tfcompile on the build host since it's typically faster on the
	        # local machine.
	        #
	        # Note that setting the local=1 attribute on a test target causes the
	        # test infrastructure to skip that test.  However this is a genrule, not
	        # a test target, and runs with --strategy=Genrule=forced_forge, meaning
	        # the local=1 attribute is ignored, and the genrule is still run.
	        #
	        # https://www.bazel.io/versions/master/docs/be/general.html#genrule
	        local = 1,
	        tags = tags,
	    )

	    # Rule that runs tfcompile to produce the SessionModule proto, useful for
	    # debugging.  TODO(b/64813587): Once the SessionModule proto is
	    # deterministic, move this into the main rule above.
	    session_module_pb = name + "_session_module.pb"
	    native.genrule(
	        name = (name + "_session_module"),
	        srcs = srcs,
	        outs = [
	            session_module_pb,
	        ],
	        cmd = (
	            tfcompile_cmd_prefix +
	            " --out_session_module=$(@D)/" + session_module_pb +
	            " " + flags
	        ),
	        tools = [tfcompile_tool],
	        visibility = visibility,
	        testonly = testonly,
	        local = 1,
	        tags = tags,
	    )

	    # These deps are required by all tf_library targets even if
	    # include_standard_runtime_deps is False.  Without them, the
	    # generated code will fail to compile.
	    required_deps = [
	        "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function",
	        "//tensorflow/core:framework_lite",
	    ]

	    # If we're generating the program shape, we must depend on the proto.
	    program_shape_deps = [
	        "//tensorflow/compiler/xla:xla_data_proto_cc",
	    ] if need_xla_data_proto else []

	    profiling_deps = [
	        "//tensorflow/compiler/xla/service:hlo_profile_printer_data_cc",
	    ] if enable_xla_hlo_profiling else []

	    # TODO(cwhipkey): only depend on kernel code that the model actually
	    # needed.
	    standard_runtime_deps = [
	        "//tensorflow/compiler/xla/service/cpu:runtime_conv2d",
	        "//tensorflow/compiler/xla/service/cpu:runtime_key_value_sort",
	        "//tensorflow/compiler/xla/service/cpu:runtime_matmul",
	        "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_conv2d",
	        "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul",
	        "//third_party/eigen3",
	    ] if include_standard_runtime_deps else []

	    # The cc_library rule packaging up the header and object file, and needed
	    # kernel implementations.
	    native.cc_library(
	        name = name,
	        srcs = [function_object_file, metadata_object_file],
	        hdrs = [header_file],
	        visibility = visibility,
	        testonly = testonly,
	        deps = (required_deps + program_shape_deps + profiling_deps +
	                standard_runtime_deps + (deps or [])),
	        tags = tags,
	    )

	    # Variables used for gen_test and gen_benchmark.
	    #
	    # The unqualified class name is whatever follows the last "::".  Taking
	    # the last component handles any number of leading namespaces (including
	    # none at all).
	    no_ns_name = cpp_class.split("::")[-1]

	    # "|" is the sed delimiter here; it must appear unescaped.
	    sed_replace = (
	        "-e \"s|{{TFCOMPILE_HEADER}}|$(location " + header_file + ")|g\" " +
	        "-e \"s|{{TFCOMPILE_CPP_CLASS}}|" + cpp_class + "|g\" " +
	        "-e \"s|{{TFCOMPILE_NAME}}|" + no_ns_name + "|g\" "
	    )

	    if gen_test:
	        test_name = name + "_test"
	        test_file = test_name + ".cc"

	        # Rule to rewrite test.cc to produce the test_file.
	        native.genrule(
	            name = ("gen_" + test_name),
	            testonly = 1,
	            srcs = [
	                "//tensorflow/compiler/aot:test.cc",
	                header_file,
	            ],
	            outs = [test_file],
	            cmd = (
	                "sed " + sed_replace +
	                " $(location //tensorflow/compiler/aot:test.cc) " +
	                "> $(OUTS)"
	            ),
	            tags = tags,
	        )

	        # The cc_test rule for the generated code.  To ensure that this works
	        # reliably across build configurations, we must use tf_cc_test instead
	        # of native.cc_test.  This is related to how we build
	        # //tensorflow/core:lib -- see the note in
	        # tensorflow/core/BUILD for more details.
	        tf_cc_test(
	            name = test_name,
	            srcs = [test_file],
	            deps = [
	                ":" + name,
	                "//tensorflow/compiler/aot:tf_library_test_main",
	                "//tensorflow/compiler/xla:executable_run_options",
	                "//third_party/eigen3",
	                "//tensorflow/core:lib",
	                "//tensorflow/core:test",
	            ],
	            tags = tags,
	        )

	    if gen_benchmark:
	        benchmark_name = name + "_benchmark"
	        benchmark_file = benchmark_name + ".cc"
	        benchmark_main = ("//tensorflow/compiler/aot:" +
	                          "benchmark_main.template")

	        # Rule to rewrite benchmark.cc to produce the benchmark_file.
	        native.genrule(
	            name = ("gen_" + benchmark_name),
	            srcs = [
	                benchmark_main,
	                header_file,
	            ],
	            testonly = testonly,
	            outs = [benchmark_file],
	            cmd = ("sed " + sed_replace +
	                   " $(location " + benchmark_main + ") " +
	                   "> $(OUTS)"),
	            tags = tags,
	        )

	        # The cc_benchmark rule for the generated code.  This does not need the
	        # tf_cc_binary since we (by deliberate design) do not depend on
	        # //tensorflow/core:lib.
	        #
	        # Note: to get smaller size on android for comparison, compile with:
	        #    --copt=-fvisibility=hidden
	        #    --copt=-D_LIBCPP_TYPE_VIS=_LIBCPP_HIDDEN
	        #    --copt=-D_LIBCPP_EXCEPTION_ABI=_LIBCPP_HIDDEN
	        native.cc_binary(
	            name = benchmark_name,
	            srcs = [benchmark_file],
	            testonly = testonly,
	            copts = tf_copts(),
	            linkopts = if_android(["-pie", "-s"]),
	            deps = [
	                ":" + name,
	                "//tensorflow/compiler/aot:benchmark",
	                "//tensorflow/compiler/xla:executable_run_options",
	                "//third_party/eigen3",
	            ] + if_android([
	                "//tensorflow/compiler/aot:benchmark_extra_android",
	            ]),
	            tags = tags,
	        )

	def target_llvm_triple():
	    """Picks the LLVM target triple that tfcompile should generate code for.

	    Returns:
	      A `select` keyed on the //tensorflow platform config settings whose
	      values are LLVM triple strings; configurations that match none of the
	      listed settings fall back to "x86_64-pc-linux".
	    """

	    # TODO(toddw): Add target_triple for other targets. For details see:
	    # http://llvm.org/docs/doxygen/html/Triple_8h_source.html
	    triple_by_config = {
	        "//tensorflow:android_armeabi": "armv5-none-android",
	        "//tensorflow:android_arm": "armv7-none-android",
	        "//tensorflow:android_arm64": "aarch64-none-android",
	        "//tensorflow:android_x86": "i686-none-android",
	        "//tensorflow:ios": "arm64-none-ios",
	        "//tensorflow:ios_x86_64": "x86_64-apple-ios",
	        "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu",
	        "//tensorflow:macos": "x86_64-none-darwin",
	        "//tensorflow:windows": "x86_64-none-windows",
	        "//tensorflow:linux_s390x": "systemz-none-linux-gnu",
	        "//conditions:default": "x86_64-pc-linux",
	    }
	    return select(triple_by_config)