blob: f2679ac5ff6fd5bdc8f7092d2e79271eb077df06 [file] [log] [blame]
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
include "mlir/Pass/PassBase.td"
// TF dialect passes.
// Ad-hoc GPU-specific fusion pass, registered as `tf-gpu-op-fusion` and run on
// `mlir::func::FuncOp`.
def TensorflowGPUFusion : Pass<"tf-gpu-op-fusion", "mlir::func::FuncOp"> {
  let summary = "Fusion optimization for GPU targets";
  let constructor = "TF::CreateGpuOpFusionPass()";
  let description = [{
This pass is performing fusion specific to GPU targets. This is an ad-hoc
pass for now, but should be integrated with some notion of "target" in the
MLIR pipeline in the future.
}];
}
// Function-level pass (`tf-batch-matmul-to-tf-einsum`) that rewrites TF
// BatchMatMul ops as TF Einsum ops.
def BatchMatMulToEinsumPass
    : Pass<"tf-batch-matmul-to-tf-einsum", "mlir::func::FuncOp"> {
  let constructor = "TF::CreateBatchMatMulToEinsumPass()";
  let summary = "Replace TF BatchMatMul op by TF Einsum op.";
}
// Function-level pass that rewrites the legacy `_tpu_replicate` attribute into
// the unified `_replication_info` / `_xla_compile_device_type` attributes.
def CanonicalizeCompileAndReplicateAttributesPass : Pass<"tf-canonicalize-compile-and-replicate-attributes", "mlir::func::FuncOp"> {
  let summary = "Canonicalize compilation and replication attributes.";
  let description = [{
A pass that converts existing compilation and replication attributes into
unified attributes. For example, `_tpu_replicate="cluster"` in the
following code
```mlir
%control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", allow_soft_placement = false, computation_shape = [], device = "", device_assignment = [], host_compute_core = [], name = "TPUReplicateMetadata", num_cores_per_replica = 1 : i64, num_replicas = 1 : i64, step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", use_tpu = true, use_spmd_for_xla_partitioning = false} : () -> ()
```
will be replaced by `_replication_info="cluster"` and `_xla_compile_device_type="TPU"`.
```mlir
%control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_replication_info = "cluster", _xla_compile_device_type = "TPU", allow_soft_placement = false, computation_shape = [], device = "", device_assignment = [], host_compute_core = [], name = "TPUReplicateMetadata", num_cores_per_replica = 1 : i64, num_replicas = 1 : i64, step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", use_spmd_for_xla_partitioning = false, use_tpu = true} : () -> ()
```
}];
  let constructor = "TFTPU::CreateCanonicalizeCompileAndReplicateAttributesPass()";
}
// Module-level pass (`cluster-tf-ops-by-host`) that groups TensorFlow ops by
// the host they are placed on.
def ClusterTFOpsByHostPass : Pass<"cluster-tf-ops-by-host", "ModuleOp"> {
  let constructor = "TF::CreateClusterTFOpsByHostPass()";
  let summary =
      "Cluster the TensorFlow ops by host so that each function only contains ops placed on the same host";
}
// Function-level pass that converts the unified `_replication_info` /
// `_xla_compile_device_type` attributes back into the legacy `_tpu_replicate`
// attribute.
def ConvertToLegacyCompileAndReplicateAttributesPass : Pass<"tf-convert-to-legacy-compile-and-replicate-attributes", "mlir::func::FuncOp"> {
  let summary = "Convert unified compilation and replication attributes back to legacy attributes.";
  let description = [{
This transformation pass converts unified compilation and replication
attributes (`_replication_info` and `_xla_compile_device_type`) into legacy
attributes. This ensures the unified attributes do not get exposed outside
of the MLIR bridge with V1 pipeline in some cases. The pass expects to have
either none or both of the unified attributes present in an op for the
conversion to happen. Otherwise it will fail.
For example, `_replication_info="cluster"` and
`_xla_compile_device_type="TPU"` in the following code
```mlir
%control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_replication_info = "cluster", _xla_compile_device_type = "TPU", allow_soft_placement = false, computation_shape = [], device = "", device_assignment = [], host_compute_core = [], name = "TPUReplicateMetadata", num_cores_per_replica = 1 : i64, num_replicas = 1 : i64, step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", use_spmd_for_xla_partitioning = false, use_tpu = true} : () -> ()
```
will be replaced by `_tpu_replicate="cluster"` as follows,
```mlir
%control = tf_executor.island wraps "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", allow_soft_placement = false, computation_shape = [], device = "", device_assignment = [], host_compute_core = [], name = "TPUReplicateMetadata", num_cores_per_replica = 1 : i64, num_replicas = 1 : i64, step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", use_tpu = true, use_spmd_for_xla_partitioning = false} : () -> ()
```
}];
  let constructor = "TFTPU::CreateConvertToLegacyCompileAndReplicateAttributesPass()";
}
// Module-level pass (`tf-resource-device-inference`) that propagates resource
// device assignments within functions and from callers to callees.
def ResourceDeviceInferencePass
    : Pass<"tf-resource-device-inference", "ModuleOp"> {
  let constructor = "TF::CreateResourceDeviceInferencePass()";
  let summary =
      "Propagates the device attribute on resources from callers to callees.";
  let description = [{
A pass that propagates device assignment of resources on a module. It
performs in-function propagation, as well as cross-function propagation from
callers to callees.
This pass changes the module by adding "tf.device" attribute to function
arguments and adding "device" attribute to TF ops.
For example, given the function
```mlir
!tf_res = type tensor<*x!tf_type.resource<tensor<32xf32>>>
func @test(%arg0: !tf_res {tf.device = "/TPU:0"}) {
tf_executor.graph {
%control = tf_executor.island {
%id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res
tf_executor.yield
}
tf_executor.fetch %control : !tf_executor.control
}
return
}
```
Observe how the op inside the island obtains a `/TPU:0` device assignment:
```mlir
!tf_res = type tensor<*x!tf_type.resource<tensor<32xf32>>>
func @test(%arg0: !tf_res {tf.device = "/TPU:0"}) {
tf_executor.graph {
%control = tf_executor.island {
%0 = "tf.Identity"(%arg0) {device = "/TPU:0"} : (!tf_res) -> !tf_res
tf_executor.yield
}
tf_executor.fetch %control : !tf_executor.control
}
return
}
```
}];
}
// Module-level pass (`tf-stack-ops-decomposition`) that lowers stack ops to
// tensor ops plus reads/assigns on local variables.
def StackOpsDecompositionPass : Pass<"tf-stack-ops-decomposition", "ModuleOp"> {
  let summary = "Decompose stack operations into local variable operations. Needs "
                "static shapes.";
  let constructor = "TF::CreateStackOpsDecompositionPass()";
  let description = [{
A pass that converts stack operations to tensor operations and read/assign
ops on local variables. A later resource lifting pass can further remove the
local variables.
This pass requires that the full shape of the stack can be inferred: 1) the
maximum size needs to be a constant and 2) a push op can be found with a
known shape, and all push ops need to have the same shape.
A stack creation op "tf.StackV2" will be turned into two zero-initialized
variables, for the buffer and current size. Each push will be turned into
```mlir
%old_val = "tf.ReadVariableOp"(%buffer)
%old_size = "tf.ReadVariableOp"(%size)
%offsets = "tf.ConcatV2"(%old_size, %other_dims_0s, %const0)
%new_val = "tf.XlaDynamicUpdateSlice"(%old_val, %push_val, %offsets)
"tf.AssignVariableOp"(%buffer, %new_val)
%new_size = "tf.AddV2"(%old_size, %const1)
"tf.AssignVariableOp"(%size, %new_size)
```
and each pop will be turned into
```mlir
%old_val = "tf.ReadVariableOp"(%buffer)
%old_size = "tf.ReadVariableOp"(%size)
%new_size = "tf.Sub"(%old_size, %const1)
%offsets = "tf.ConcatV2"(%new_size, %other_dims_0s, %const0)
%slice = "tf.Slice"(%old_val, %offsets, %slice_size_const)
%pop_result = "tf.Reshape"(%slice, %elem_size_const)
"tf.AssignVariableOp"(%size, %new_size)
```
The pass also works across control flow and functional calls.
}];
}
// Module-level pass (`tf-tpu-variable-runtime-reformatting`) that inserts
// TPUReshardVariablesOps around a replicated training loop.
def TPUVariableRuntimeReformattingPass : Pass<"tf-tpu-variable-runtime-reformatting", "ModuleOp"> {
  let summary = "Adds device variable formatting op to allow compilation-guided "
                "variable formatting.";
  let constructor = "TFTPU::CreateTPUVariableRuntimeReformattingPass()";
  let description = [{
A pass that takes advantage of a loop to add ops that allow the execution to
avoid repeatedly formatting variables back and forth. The desired formatting
is determined by TPU program compilation, so this pass does not include how
to reformat the variables, but only inserts general TPUReshardVariablesOps in
proper places, and TPUReshardVariablesOps interpret the compilation.
The core idea of this optimization is to keep track of the formatting state
of variables, and when the next desired state does not change, it can avoid
reformatting. We associate a set of variables on a device with a formatting
state, and TPUReshardVariablesOps compares the current state with a desired
state (which can be the compilation result). If they mismatch,
TPUReshardVariablesOp reformats the variables to the desired state; if they
match, TPUReshardVariablesOp is a no-op.
A major use of this pass is weight-update sharding in data parallelism, so we
require there is a tf_device.replicate in the loop.
For example, suppose we have a training loop (for simplicity we write the
loop body inline):
```mlir
%var0 = ...
%var1 = ...
tf.while (..., %var0, %var1) {
tf_device.replicate ([%var0, %var1] as %rvar) {
%compile:2 = "tf._TPUCompileMlir"()
tf.TPUExecuteAndUpdateVariablesOp(%rvar, %compile#1)
}
}
```
This pass will transform it into
```mlir
%var0 = ...
%var1 = ...
%state_var0 = ...
%state_var1 = ...
tf.while (..., %var0, %var1, %state_var0, %state_var1) {
tf_device.replicate ([%var0, %var1] as %rvar,
[%state_var0, %state_var1] as %rstate) {
%compile:2 = "tf._TPUCompileMlir"()
tf.TPUReshardVariablesOp(%rvar, %compile#1, %rstate)
tf.TPUExecuteAndUpdateVariablesOp(%rvar, %compile#1)
}
}
%default_format = tf.constant()
tf_device.replicate ([%var0, %var1] as %rvar,
[%state_var0, %state_var1] as %rstate) {
tf.TPUReshardVariablesOp(%rvar, %default_format, %rstate)
}
```
}];
}
// Module-level pass (`tf-tpu-sharding-identification`) that lifts sharding
// configuration found inside a `cluster_func` callee onto the `cluster_func`
// op and the callee's signature.
def TPUShardingIdentificationPass : Pass<"tf-tpu-sharding-identification", "ModuleOp"> {
  let summary = "Identifies and handles inputs/outputs of TPU computation that is "
                "sharded across logical cores.";
  let constructor = "TFTPU::CreateTPUShardingIdentificationPass()";
  let description = [{
Bubbles up sharding configuration from `cluster_func` regions into
the attributes of `cluster_func`. This is done by parsing the
`XlaSharding` / `TPUPartitionedOutput` / `TPUPartitionedInput` ops inside
`cluster_func`.
For example, given the following `cluster_func` wrapping `func`:
```mlir
func @test(%arg0: tensor<*xi32>) {
"tf_device.cluster_func"(%arg0) {
func = @func,
step_marker_location = ""} : (tensor<*xi32>) -> tensor<*xi32>
return
}
func @func(%arg0: tensor<*xi32>) -> tensor<*xi32> {
%0 = "tf.XlaSharding"(%arg0) {_XlaSharding = "\01\02\03",
sharding = "\01\02\03"} : (tensor<*xi32>) -> tensor<*xi32>
%1 = "tf.A"(%0) : (tensor<*xi32>) -> (tensor<*xi32>)
return %1 : tensor<*xi32>
}
```
Now, cluster_func receives the following `*_sharding_configuration`
attributes, and `func` receives the mhlo.sharding attribute:
```mlir
func @test(%arg0: tensor<*xi32>) {
%0 = "tf_device.cluster_func"(%arg0) {
func = @func,
input_sharding_configuration = ["\01\02\03"],
output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"],
step_marker_location = ""} : (tensor<*xi32>) -> tensor<*xi32>
return
}
func @func(%arg0: tensor<*xi32> {mhlo.sharding = "\01\02\03"}) ->
(tensor<*xi32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}) {
%0 = "tf.XlaSharding"(%arg0) {_XlaSharding = "\01\02\03", sharding = "\01\02\03"} : (tensor<*xi32>) -> tensor<*xi32>
%1 = "tf.A"(%0) : (tensor<*xi32>) -> tensor<*xi32>
return %1 : tensor<*xi32>
}
```
}];
}
// Function-level pass (`tf-unroll-batch-matmul`) that expands TF BatchMatMul
// into Reshape, Slice, MatMul and Pack ops.
def UnrollBatchMatMulPass
    : Pass<"tf-unroll-batch-matmul", "mlir::func::FuncOp"> {
  let constructor = "TF::CreateUnrollBatchMatMulPassPass()";
  let summary = "Unroll TF BatchMatMul op into Reshape, Slice, MatMul, Pack ops.";
}
// Function-level pass (`tf-device-cluster-formation`) that wraps ops sharing a
// device assignment in a `tf_device.launch` op.
def ClusterFormationPass : Pass<"tf-device-cluster-formation", "mlir::func::FuncOp"> {
  let summary = "Form clusters from instructions assigned to same device";
  let constructor = "TFDevice::CreateClusterFormationPass()";
  let dependentDialects = ["tf_device::TensorFlowDeviceDialect"];
  let description = [{
Clusters operations with the same device assignment id. For each
cluster, creates a "tf_device.launch" op with a Region containing the
ops in each cluster and replaces the ops with the new launch op.
For example, given the following program:
```mlir
%2 = "tf.A"(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
%3 = "tf.B"(%2) {device = "tpu0"} : (tensor<?xi32>) -> tensor<?xi32>
%4 = "tf.C"(%2, %3) {device = "tpu0"} : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
%5 = "tf.D"(%4) : (tensor<?xi32>) -> tensor<?xi32>
```
After the pass, we will have:
```mlir
%0 = "tf.A"(%arg0) : (tensor<?xi32>) -> tensor<?xi32>
%1 = "tf_device.launch"() ( {
%3 = "tf.B"(%0) : (tensor<?xi32>) -> tensor<?xi32>
%4 = "tf.C"(%0, %3) : (tensor<?xi32>, tensor<?xi32>) -> tensor<?xi32>
tf_device.return %4 : tensor<?xi32>
}) {device = "tpu0"} : () -> tensor<?xi32>
%2 = "tf.D"(%1) : (tensor<?xi32>) -> tensor<?xi32>
return %2 : tensor<?xi32>
```
}];
}
// Function-level pass (`tf-materialize-passthrough-op`) that expands
// MlirPassthrough ops from their `mlir_module` string attribute.
def MaterializePassthroughOp
    : Pass<"tf-materialize-passthrough-op", "mlir::func::FuncOp"> {
  let constructor = "TF::CreateMaterializePassthroughOpPass()";
  let summary =
      "Materialize the MlirPassthroughOp by replacing it with the MLIR module attached as an attribute";
  let description = [{
A pass that replaces MlirPassthrough ops with the code they have in
their `mlir_module` string attribute.
}];
}
// Function-level pass (`tf-einsum`) that rewrites supported Einsum variants
// into other TF ops.
def TransformEinsumPass : Pass<"tf-einsum", "mlir::func::FuncOp"> {
  let constructor = "TF::CreateTransformEinsumPass()";
  let summary = "Transform Einsum to other TF Ops for the supported variants";
}
// Function-level pass (`tf-legalize-hlo`) that legalizes HLO ops to the TF
// dialect; declares the TF dialect as a dependent dialect.
def LegalizeHloToTfPass : Pass<"tf-legalize-hlo", "mlir::func::FuncOp"> {
  let constructor = "TF::CreateLegalizeHloToTfPass()";
  let dependentDialects = ["TF::TensorFlowDialect"];
  let summary = "Legalize from HLO to the TF dialect";
}
// Module-level pass (`tfe-legalize-tfg`) that legalizes TFG to the TFE dialect.
def LegalizeTFGToTFPass : Pass<"tfe-legalize-tfg", "ModuleOp"> {
  let constructor = "TF::CreateLegalizeTFGToTFEPass()";
  let summary = "Legalize from TFG to the TFE dialect";
}
// Function-level pass (`tf-replicate-to-island`) that lowers device replicate
// ops to executor islands.
def ReplicateToIslandPass
    : Pass<"tf-replicate-to-island", "mlir::func::FuncOp"> {
  let constructor = "TFDevice::CreateReplicateToIslandPass()";
  let summary = "Lowers device replicate to executor islands";
}
// Function-level pass (`tf-replica-id-to-device-ordinal`) that derives the
// device ordinal attribute of ops from their replica id attribute.
def ReplicaIDToDeviceOrdinalPass : Pass<"tf-replica-id-to-device-ordinal", "mlir::func::FuncOp"> {
  let summary = "Set device ordinal with replica id";
  let constructor = "TFDevice::CreateReplicaIDToDeviceOrdinalPass()";
  let description = [{
This pass sets the device ordinal attribute of the ops using the replica id
attribute. This is run immediately after the `tf-replicate-to-island` pass,
which sets the replica id attribute of these ops. Note that for the single
chip use case, the pass will check if there is one op and set the device
ordinal attribute to zero.
}];
}
// Function-level pass (`tf-readonly-references-to-resources`) that converts
// readonly reference variables to resource variables.
def ConvertReadonlyReferenceVariablesToResourceVariablesPass
    : Pass<"tf-readonly-references-to-resources", "mlir::func::FuncOp"> {
  let constructor = "TF::CreateConvertReadonlyReferenceVariablesToResourceVariablesPass()";
  let summary = "Convert readonly reference variables to resource variables.";
}
// Module-level pass (`tf-shape-inference`) performing fixed-point shape
// refinement; the iteration cap is exposed as the `max-iterations` option.
def TensorFlowShapeInferencePass : Pass<"tf-shape-inference", "ModuleOp"> {
  let constructor = "TF::CreateTFShapeInferencePass()";
  let summary = "Shape inference on TF dialect and ops implementing InferTypeOpInterface";
  let options = [
    Option<"max_iterations_", "max-iterations", "int64_t", /*default=*/"10",
           "Maximum shape inference iterations">
  ];
  let description = [{
Fixed point shape refinement pass that utilizes the shape functions
registered on ops using the InferTypeOpInterface as well as by bridging to
the TensorFlow op registry's shape functions. This is an interprocedural
pass that propagates information across function calls/control flow
operations where possible (the GuaranteeAllFuncsOneUsePass is often run
before this pass to enable more propagation opportunities). It refines
both the outermost element type of tensors as well as the nested component
type (e.g., for tensor lists).
During shape refinement this pass may insert additional cast operations as
well as fold some constant shape computations to enable more exact shape
inference. Therefore it does do some mutation of the graph. Constant folding
required to produce more exact shapes is also performed but these values
are only kept in the context rather than the ops folded/IR mutated.
}];
}
// Module-level pass (`tf-strip-noinline-attribute`) that removes the
// tf._noinline attribute from top-level functions.
def StripNoinlineAttributePass
    : Pass<"tf-strip-noinline-attribute", "ModuleOp"> {
  let constructor = "TF::CreateStripNoinlineAttributePass()";
  let summary = "Strip the tf._noinline attribute from top-level functions.";
}
// Module-level pass (`tf-executor-convert-control-to-data-outputs`) that turns
// the control outputs of a while loop body into chained data outputs.
def ExecutorConvertControlToDataOutputsPass
    : Pass<"tf-executor-convert-control-to-data-outputs", "ModuleOp"> {
  let constructor = "tf_executor::CreateTFExecutorConvertControlToDataOutputsPass()";
  let summary = "Chain control outputs of while loop body";
  let description = [{
This pass converts the control outputs of a while loop body function to data
outputs. Thus, inter iteration control dependencies are transformed to
data dependencies. Since data dependencies can express which particular
operations in the while loop body are dependent on which inputs, it captures
inter iteration parallelism in while loop. Control dependencies on the other
hand create a barrier at the end of while loop body thus blocking any
parallelism across iterations.
For example, the following while loop body has a `%barrier` at the end.
Although there is no data/control dependency between `tf.AssignVariableOp`
for `%arg0` to `tf.AssignVariableOp` for `%arg1` across any iteration, the
while loop body has a control barrier (`%barrier`) at the end which forces
a dependency and the two assign variable ops must wait for each other to
complete before starting the next iteration. Transforming these control
outputs to data outputs removes the dependency between the two assign
variable ops, thus allowing them to run in parallel across iterations.
Before:
```mlir
!tf_res = type tensor<!tf_type.resource<tensor<f32>>>
func @while_body(%arg0: !tf_res, %arg1: !tf_res, %arg2: tensor<f32>, %arg3: tensor<f32>) -> (!tf_res, !tf_res, tensor<f32>, tensor<f32>) {
%graph:4 = tf_executor.graph {
%assign_0_control = tf_executor.island wraps "tf.AssignVariableOp"(%arg0, %arg2) : (!tf_res, tensor<f32>) -> ()
%assign_1_control = tf_executor.island wraps "tf.AssignVariableOp"(%arg1, %arg3) : (!tf_res, tensor<f32>) -> ()
%add_out, %add_control = tf_executor.island wraps "tf.Add"(%arg2, %arg3) : (tensor<f32>, tensor<f32>) -> tensor<f32>
%mul_out, %mul_control = tf_executor.island wraps "tf.Mul"(%arg2, %arg3) : (tensor<f32>, tensor<f32>) -> tensor<f32>
%barrier = tf_executor.island(%assign_0_control, %assign_1_control, %add_control, %mul_control) wraps "tf.NoOp"() : () -> ()
tf_executor.fetch %arg0, %arg1, %add_out, %mul_out, %barrier : !tf_res, !tf_res, tensor<f32>, tensor<f32>, !tf_executor.control
}
return %graph#0, %graph#1, %graph#2, %graph#3 : !tf_res, !tf_res, tensor<f32>, tensor<f32>
}
```
After:
```mlir
func @while_body(%arg0: !tf_res, %arg1: !tf_res, %arg2: tensor<f32>, %arg3: tensor<f32>, %chain_0: tensor<i32>, %chain_1: tensor<i32>) -> (!tf_res, !tf_res, tensor<f32>, tensor<f32>, tensor<i32>, tensor<i32>) {
%graph:6 = tf_executor.graph {
%_, %chain_0_src = tf_executor.island wraps "tf.Identity"(%chain_0) : (tensor<i32>) -> tensor<i32>
%_, %chain_1_src = tf_executor.island wraps "tf.Identity"(%chain_1) : (tensor<i32>) -> tensor<i32>
%assign_0_control = tf_executor.island(%chain_0_src) wraps "tf.AssignVariableOp"(%arg0, %arg2) : (!tf_res, tensor<f32>) -> ()
%assign_1_control = tf_executor.island(%chain_1_src) wraps "tf.AssignVariableOp"(%arg1, %arg3) : (!tf_res, tensor<f32>) -> ()
%add_out, %add_control = tf_executor.island wraps "tf.Add"(%arg2, %arg3) : (tensor<f32>, tensor<f32>) -> tensor<f32>
%mul_out, %mul_control = tf_executor.island wraps "tf.Mul"(%arg2, %arg3) : (tensor<f32>, tensor<f32>) -> tensor<f32>
%chain_0_sink, %_ = tf_executor.island(%assign_0_control) wraps "tf.Identity"(%chain_0) : (tensor<i32>) -> tensor<i32>
%chain_1_sink, %_ = tf_executor.island(%assign_1_control) wraps "tf.Identity"(%chain_1) : (tensor<i32>) -> tensor<i32>
tf_executor.fetch %arg0, %arg1, %add_out, %mul_out, %chain_0_sink, %chain_1_sink : !tf_res, !tf_res, tensor<f32>, tensor<f32>, tensor<i32>, tensor<i32>
}
return %graph#0, %graph#1, %graph#2, %graph#3, %graph#4, %graph#5 : !tf_res, !tf_res, tensor<f32>, tensor<f32>, tensor<i32>, tensor<i32>
}
```
}];
}
// Function-level pass (`tf-executor-graph-pruning`) that drops ops in a
// `tf_executor.graph` not reachable from its fetch; the `ops-to-preserve`
// option lists ops that are always kept.
def ExecutorGraphPruningPass : Pass<"tf-executor-graph-pruning", "mlir::func::FuncOp"> {
  let summary = "Prunes unreachable ops in a tf_executor.graph";
  let description = [{
This pass removes ops from a `tf_executor.graph` that are not transitively, via
data or control dependencies, connected to the associated `tf_executor.fetch`
op. The order of ops will be preserved. Functions named `main` with no
`tf.entry_function` attribute will not be pruned, as such graphs/functions may
have been imported from a V1 TensorFlow graph, where feeds/fetches/targets are
not provided at certain stages of IR transformation (e.g. pre-placement).
Option `ops-to-preserve` allows specifying ops that should not be pruned,
regardless of their reachability.
For example, the following:
```mlir
func @graph(%arg0: tensor<i32>, %arg1: tensor<i32>) -> tensor<i32> {
%graph = tf_executor.graph {
%transitive_reachable_data:2 = tf_executor.island wraps "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
%reachable_data:2 = tf_executor.island wraps "tf.Identity"(%transitive_reachable_data#0) : (tensor<i32>) -> tensor<i32>
%unreachable_data:2 = tf_executor.island wraps "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
%transitive_reachable_control = tf_executor.island wraps "tf.NoOp"() : () -> ()
%reachable_control = tf_executor.island(%transitive_reachable_control) wraps "tf.NoOp"() : () -> ()
%unreachable_control = tf_executor.island wraps "tf.NoOp"() : () -> ()
tf_executor.fetch %reachable_data#0, %reachable_control : tensor<i32>, !tf_executor.control
}
return %graph : tensor<i32>
}
```
will be transformed into:
```mlir
func @graph(%arg0: tensor<i32>, %arg1: tensor<i32>) -> tensor<i32> {
%graph = tf_executor.graph {
%transitive_reachable_data:2 = tf_executor.island wraps "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
%reachable_data:2 = tf_executor.island wraps "tf.Identity"(%transitive_reachable_data#0) : (tensor<i32>) -> tensor<i32>
%transitive_reachable_control = tf_executor.island wraps "tf.NoOp"() : () -> ()
%reachable_control = tf_executor.island(%transitive_reachable_control) wraps "tf.NoOp"() : () -> ()
tf_executor.fetch %reachable_data#0, %reachable_control : tensor<i32>, !tf_executor.control
}
return %graph : tensor<i32>
}
```
}];
  let constructor = "tf_executor::CreateTFExecutorGraphPruningPass()";
  let options = [
    ListOption<"ops_to_preserve_", "ops-to-preserve", "std::string",
               "Comma separated list of ops that should not be pruned "
               "regardless of reachability">
  ];
}
// Function-level pass (`tf-executor-to-functional-conversion`) that lifts the
// ops wrapped by `tf_executor.island`s out of their `tf_executor.graph`.
def ExecutorDialectToFunctionalPass : Pass<"tf-executor-to-functional-conversion", "mlir::func::FuncOp"> {
  let summary = "Lifts tf_executor.island inner ops from a tf_executor.graph";
  let description = [{
This pass converts tf_executor.graphs consisting of only tf_executor.islands and
a tf_executor.fetch into a sea of nodes consisting of TensorFlow Dialect ops by
lifting such ops out of a tf_executor.graph's tf_executor.islands. If V1 control
flow ops are present in a tf_executor.graph, an error will be returned.
For example, the following:
```mlir
func @my_fn(%arg0: tensor<i32>, %arg1: tensor<i32>) -> (tensor<i32>, tensor<i32>) {
%graph_results:2 = tf_executor.graph {
%island_0_result, %island_0_control = tf_executor.island {
%identity = "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
tf_executor.yield %identity : tensor<i32>
}
%island_1_result, %island_1_control = tf_executor.island {
%identity_n:2 = "tf.IdentityN"(%arg1, %island_0_result) : (tensor<i32>, tensor<i32>) -> (tensor<i32>, tensor<i32>)
tf_executor.yield %identity_n#0 : tensor<i32>
}
tf_executor.fetch %island_0_result, %island_1_result : tensor<i32>, tensor<i32>
}
return %graph_results#0, %graph_results#1 : tensor<i32>, tensor<i32>
}
```
will be transformed into:
```mlir
func @my_fn(%arg0: tensor<i32>, %arg1: tensor<i32>) -> (tensor<i32>, tensor<i32>) {
%identity = "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
%identity_n:2 = "tf.IdentityN"(%arg1, %identity) : (tensor<i32>, tensor<i32>) -> (tensor<i32>, tensor<i32>)
return %identity, %identity_n#0 : tensor<i32>, tensor<i32>
}
```
}];
  let constructor = "CreateExecutorDialectToFunctionalConversionPass()";
}
// Function-level pass (`tf-executor-island-coarsening`) that analyzes a
// tf_executor::GraphOp and merges its IslandOps.
def ExecutorIslandCoarseningPass
    : Pass<"tf-executor-island-coarsening", "mlir::func::FuncOp"> {
  let constructor = "tf_executor::CreateTFExecutorIslandCoarseningPass()";
  let summary = "Walks tf_executor::GraphOp and merges individual tf_executor::IslandOps.";
  let description = [{
This pass performs whole graph analysis for a graph encapsulated into tf_executor::GraphOp.
The analysis identifies all IslandOps within the graph which could be merged together.
The goal is to merge as many islands as possible.
Once analysis is completed, the pass merges all IslandOps in a single scan.
For example given the following program with two disjunct islands:
```mlir
func @test(%arg0 : tensor<i1>) -> tensor<f32> {
%0 = tf_executor.graph {
%1:2 = tf_executor.island {
%3 = "tf.opA"(%arg0) : (tensor<i1>) -> tensor<i1>
tf_executor.yield %3 : tensor<i1>
}
%2:2 = tf_executor.island(%1#1) {
%4 = "tf.opB"() : () -> tensor<f32>
tf_executor.yield %4 : tensor<f32>
}
tf_executor.fetch %2#0 : tensor<f32>
}
return %0 : tensor<f32>
}
```
After running this pass, the two islands are merged:
```mlir
func @test(%arg0: tensor<i1>) -> tensor<f32> {
%0 = tf_executor.graph {
%outputs, %control = tf_executor.island {
%1 = "tf.opA"(%arg0) : (tensor<i1>) -> tensor<i1>
%2 = "tf.opB"() : () -> tensor<f32>
tf_executor.yield %2 : tensor<f32>
}
tf_executor.fetch %outputs : tensor<f32>
}
return %0 : tensor<f32>
}
```
}];
}
// Module-level pass (`tf-executor-tpu-v1-island-coarsening`): the TPU-only
// variant of island coarsening used for V1 compatibility mode.
def TpuV1BridgeExecutorIslandCoarseningPass
    : Pass<"tf-executor-tpu-v1-island-coarsening", "ModuleOp"> {
  let constructor = "tf_executor::CreateTFExecutorTPUV1IslandCoarseningPass()";
  let summary = "Merges TPU clusters IslandOps, intended for V1 compatibility mode";
  let description = [{
This pass is a variant of ExecutorIslandCoarseningPass that is limited to
TPU-annotated operations and intended to preserve backward compatibility with
TFv1.
}];
}
// Module-level pass (`tf-executor-tpu-v1-island-outlining`) that moves islands
// holding a TPU cluster computation into functions of a nested module.
def TPUBridgeExecutorIslandOutliningPass : Pass<"tf-executor-tpu-v1-island-outlining", "ModuleOp"> {
  let summary = "Outline TPU clusters from island into a nested module, so it can "
                "be processed like a V2 module, intended for V1 compatibility mode";
  let constructor = "tf_executor::CreateTFExecutorTPUV1IslandOutliningPass()";
  let description = [{
Extract the islands containing a TPU cluster computation into an outlined
function in a nested module. This will allow running the usual bridge on this
nested module which now exhibits a more friendly "V2-like" structure.
This is only intended for V1 compatibility mode where the bridge runs without
feed/fetches on session create/extend.
So given e.g.
```mlir
func @test() -> tensor<i32> {
%0 = tf_executor.graph {
%output, %control = tf_executor.island {
...
tf_executor.yield %result : tensor<i32>
}
tf_executor.fetch %output : tensor<i32>
}
return %0 : tensor<i32>
}
```
This pass will create an additional function containing the code in
tf_executor.island:
```mlir
func nested @_tpu_v1_compat_outlined_func0() -> tensor<i32> {
...
}
```
and will then replace the island with the wrapped call:
```mlir
func @test() -> tensor<i32> {
%0 = tf_executor.graph {
%outputs, %control = tf_executor.island wraps "tf.PartitionedCall"() {
f = @_tpu_v1_compat_outlined::@_tpu_v1_compat_outlined_func0
} : () -> tensor<i32>
tf_executor.fetch %outputs : tensor<i32>
}
return %0 : tensor<i32>
}
```
}];
}
// Module-level pass (`tf-executor-tpu-v1-island-inlining`) that inlines calls
// into the outlined nested TPU module, undoing the outlining pass.
def ExecutorTPUV1IslandInliningPass
    : Pass<"tf-executor-tpu-v1-island-inlining", "ModuleOp"> {
  let constructor = "tf_executor::CreateTFExecutorTPUV1IslandInliningPass()";
  let summary = "Inline calls to the nested TPU module.";
  let description = [{
This pass inlines the islands calling into the nested module that was
outlined, thus reversing the effect of the
`-tf-executor-tpu-v1-island-outlining` pass.
For example, the following:
```mlir
module {
func @foo(%arg0: tensor<f32>) -> tensor<f32> {
%0 = tf_executor.graph {
%outputs, %control = tf_executor.island wraps "tf.PartitionedCall"(%arg0) {f = @_tpu_v1_compat_outlined::@bar} : (tensor<f32>) -> tensor<f32>
tf_executor.fetch %outputs : tensor<f32>
}
return %0 : tensor<f32>
}
module @_tpu_v1_compat_outlined {
func nested @bar(%arg0: tensor<f32>) -> tensor<f32> {
%0 = "tf.opA"(%arg0) : (tensor<f32>) -> tensor<f32>
return %0 : tensor<f32>
}
}
}
```
will be transformed into:
```mlir
module {
func @foo(%arg0: tensor<f32>) -> tensor<f32> {
%0 = tf_executor.graph {
%outputs, %control = tf_executor.island {
%1 = "tf.opA"(%arg0) : (tensor<f32>) -> tensor<f32>
tf_executor.yield %1 : tensor<f32>
}
tf_executor.fetch %outputs : tensor<f32>
}
return %0 : tensor<f32>
}
}
```
}];
}
def TPUClusterFormationPass : Pass<"tf-tpu-cluster-formation", "ModuleOp"> {
let summary = "Forms clusters from operations assigned to the same TPU computation";
let description = [{
TPU computations from the frontend are composed of a `tf.TPUReplicateMetadata`
op, a subgraph of ops (TensorFlow Dialect) each with a matching
`_replication_info` attribute relative to the associated
`tf.TPUReplicateMetadata` op, and optionally `tf.TPUReplicatedInput` and
`tf.TPUReplicatedOutput` ops feeding in inputs and outputs to and from a
replicated TPU computation. The number of times a TPU computation is
replicated is defined in the `tf.TPUReplicateMetadata` op (`num_replicas`
attribute) and operand and result sizes of `tf.TPUReplicatedInput` and
`tf.TPUReplicatedOutput` respectively must match, excluding packed tensors.
It is also assumed ops of the same TPU computation do not have ops outside
of the TPU computation that are both inputs and outputs to the same TPU
computation. Furthermore, we assume that every node has either none or both
of `_replication_info` and `_xla_compile_device_type` attributes defined.
This pass takes the TPU computation subgraph, moves them into a
`tf_device.cluster`, and copies over attributes from the associated
`tf.TPUReplicateMetadata` op to the newly created `tf_device.cluster`. If the
computation is replicated (`num_replicas` > 1), the `num_replicas` attribute is
not copied over but instead the `tf_device.cluster` is further wrapped with a
`tf_device.replicate`, and associated `tf.TPUReplicatedInput` and
`tf.TPUReplicatedOutput` ops are replaced as the `tf_device.replicate` operands
and results. Otherwise, the single operands and results of the associated
`tf.TPUReplicatedInput` and `tf.TPUReplicatedOutput` ops are simply forwarded to
the `tf_device.cluster`.
For example, the following non replicated computation:
```mlir
func @tpu_computation(%arg0: tensor<i32>) -> tensor<i32> {
// Metadata op for cluster `cluster` with 1 replica, 1 core per replica and
// with topology `<topology>`.
"tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "cluster", num_relicas = 1, num_cores_per_replica = 1, topology = "<topology>", device_assignment = [], padding_map = []} : () -> ()
%replicated_input = "tf.TPUReplicatedInput"(%arg0) : (tensor<i32>) -> tensor<i32>
%identity = "tf.Identity"(%replicated_input) {_xla_compile_device_type = "TPU", _replication_info = "cluster"} : (tensor<i32>) -> tensor<i32>
%replicated_output = "tf.TPUReplicatedOutput(%identity) : (tensor<i32>) -> tensor<i32>
return %replicated_output : tensor<i32>
}
```
will be transformed into:
```mlir
func @tpu_computation(%arg0: tensor<i32>) -> tensor<i32> {
%cluster = "tf_device.cluster"() ( {
%identity = "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
tf_device.return %identity : tensor<i32>
}) {_xla_compile_device_type = "TPU", _replication_info = "cluster", num_cores_per_replica = 1, topology = "<topology>", device_assignment = [], padding_map = []} : () -> (tensor<i32>)
return %cluster : tensor<i32>
}
```
The following replicated computation:
```mlir
func @tpu_computation(%arg0: tensor<i32>, %arg1: tensor<i32>) -> (tensor<i32>, tensor<i32>) {
"tf.TPUReplicateMetadata"() {_xla_compile_device_type = "TPU", _replication_info = "cluster", num_replicas = 2, num_cores_per_replica = 1, topology = "topology", device_assignment = [], padding_map = []} : () -> ()
%replicated_input = "tf.TPUReplicatedInput"(%arg0, %arg1) : (tensor<i32>, tensor<i32>) -> tensor<i32>
%identity = "tf.Identity"(%replicated_input) {_xla_compile_device_type = "TPU", _replication_info = "cluster"} : (tensor<i32>) -> tensor<i32>
%replicated_output:2 = "tf.TPUReplicatedOutput"(%identity) : (tensor<i32>) -> (tensor<i32>, tensor<i32>)
return %replicated_output#0, %replicated_output#1 : tensor<i32>, tensor<i32>
}
```
will be transformed into:
```mlir
func @tpu_computation(%arg0: tensor<i32>, %arg1: tensor<i32>) -> (tensor<i32>, tensor<i32>) {
%replicate:2 = tf_device.replicate([%arg0, %arg1] as %replicated_input) {n = 2 : i32} {
%cluster = "tf_device.cluster"() ( {
%identity = "tf.Identity"(%replicated_input) : (tensor<i32>) -> tensor<i32>
tf_device.return %identity : tensor<i32>
}) {_xla_compile_device_type = "TPU", _replication_info = "cluster", num_cores_per_replica = 1, topology = "topology", device_assignment = [], padding_map = []} : () -> (tensor<i32>)
tf_device.return %cluster : tensor<i32>
}
return %replicate#0, %replicate#1 : tensor<i32>, tensor<i32>
}
```
}];
let constructor = "TFTPU::CreateTPUClusterFormationPass()";
}
// Function-level pass `tf-device-constant-sinking`: sinks `tf.Const` ops that
// a `tf_device.cluster` region captures implicitly into that region.
// Instantiated through TFDevice::CreateClusterConstantSinkingPass().
def ClusterConstantSinkingPass : Pass<"tf-device-constant-sinking", "mlir::func::FuncOp"> {
let summary = "Sinks constants implicitly captured in a tf_device.cluster region.";
let description = [{
This pass sinks implicitly captured constants (`tf.Const` ops) used by and into
a `tf_device.cluster` region. Performing this prior to outlining will reduce the
number of arguments of the outlined function.
For example, the following:
```mlir
func @cluster() -> tensor<i32> {
%const = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
%cluster = "tf_device.cluster"() ( {
%identity = "tf.Identity"(%const) : (tensor<i32>) -> tensor<i32>
tf_device.return %identity : tensor<i32>
}) : () -> (tensor<i32>)
return %cluster : tensor<i32>
}
```
will be transformed into:
```mlir
func @cluster() -> tensor<i32> {
%cluster = "tf_device.cluster"() ( {
%const = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
%identity = "tf.Identity"(%const) : (tensor<i32>) -> tensor<i32>
tf_device.return %identity : tensor<i32>
}) : () -> (tensor<i32>)
return %cluster : tensor<i32>
}
```
}];
let constructor = "TFDevice::CreateClusterConstantSinkingPass()";
}
// Module-level pass `tf-device-cluster-outlining`: outlines each
// `tf_device.cluster` body into a function referenced by a
// `tf_device.cluster_func` op.
// Instantiated through TFDevice::CreateClusterOutliningPass().
def ClusterOutliningPass : Pass<"tf-device-cluster-outlining", "ModuleOp"> {
let summary = "Outlines regions of tf_device.cluster operations";
let description = [{
This pass outlines the body of a `tf_device.cluster` into a function and
replaces the `tf_device.cluster` op with an equivalent `tf_device.cluster_func`
op. Implicit operands will be captured and materialized as explicit arguments to
the newly created functions and associated `tf_device.cluster_func` ops.
For example, the following:
```mlir
func @computation(%arg0: tensor<i32>) -> tensor<i32> {
%cluster = "tf_device.cluster"() ( {
%identity = "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
tf_device.return %identity : tensor<i32>
}) : () -> (tensor<i32>)
return %cluster : tensor<i32>
}
```
will be transformed into:
```mlir
func @computation(%arg0: tensor<i32>) -> tensor<i32> {
%cluster = "tf_device.cluster_func"(%arg0) {func = @_func} : (tensor<i32>) -> tensor<i32>
return %cluster : tensor<i32>
}
func @_func(%arg0: tensor<i32>) -> tensor<i32> {
%identity = "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
return %identity : tensor<i32>
}
```
}];
let constructor = "TFDevice::CreateClusterOutliningPass()";
}
// Module-level pass `convert-tf-control-flow-to-scf`: lowers TensorFlow
// dialect control flow to the SCF dialect.
// Lists the SCF and Tensor dialects as dependent dialects (presumably because
// the lowering creates ops from them -- confirm against the implementation).
// Instantiated through TF::createConvertTfControlFlowToScfPass().
def ConvertTfControlFlowToScfPass : Pass<"convert-tf-control-flow-to-scf", "ModuleOp"> {
let summary = "Convert TensorFlow control flow to SCF.";
let description = [{
This pass can be used for all direct control flow lowerings from the TensorFlow
dialect to the SCF dialect.
}];
let dependentDialects = ["scf::SCFDialect", "tensor::TensorDialect"];
let constructor = "TF::createConvertTfControlFlowToScfPass()";
}
// Module-level pass `tf-device-launch-outlining`: outlines each
// `tf_device.launch` body into a function referenced by a
// `tf_device.launch_func` op, carrying over the `device` attribute.
// Instantiated through TFDevice::CreateLaunchOutliningPass().
def LaunchOutliningPass : Pass<"tf-device-launch-outlining", "ModuleOp"> {
let summary = "Outlines regions of tf_device.launch operations";
let description = [{
This pass outlines the body of a `tf_device.launch` into a function and
replaces the `tf_device.launch` op with an equivalent `tf_device.launch_func`
op. Implicit operands will be captured and materialized as explicit arguments to
the newly created functions and associated `tf_device.launch_func` ops. The
`device` attribute from the `launch` op is transferred to `launch_func`.
For example, the following:
```mlir
func @computation(%arg0: tensor<i32>) -> tensor<i32> {
%launch = "tf_device.launch"() ( {
%identity = "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
tf_device.return %identity : tensor<i32>
}) {device = "some_device"} : () -> (tensor<i32>)
return %launch : tensor<i32>
}
```
will be transformed into:
```mlir
func @computation(%arg0: tensor<i32>) -> tensor<i32> {
%launch = "tf_device.launch_func"(%arg0) {device = "some_device", func = @_func} : (tensor<i32>) -> tensor<i32>
return %launch : tensor<i32>
}
func @_func(%arg0: tensor<i32>) -> tensor<i32> {
%identity = "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
return %identity : tensor<i32>
}
```
}];
let constructor = "TFDevice::CreateLaunchOutliningPass()";
}
// Module-level pass `tf-device-convert-launch-func-to-tf-call`: rewrites
// tf_device::LaunchFuncOp into TF::PartitionedCallOp for GraphDef export.
// Instantiated through TFDevice::CreateConvertLaunchFuncToTFCallPass().
def ConvertLaunchFuncToTFCallPass : Pass<"tf-device-convert-launch-func-to-tf-call", "ModuleOp"> {
let summary = "Rewrites tf_device::LaunchFuncOp to TF::PartitionedCallOp";
let description = [{
This pass converts tf_device::LaunchFuncOp into an equivalent
TF::PartitionedCallOp so that it can be exported to TensorFlow GraphDef.
}];
let constructor = "TFDevice::CreateConvertLaunchFuncToTFCallPass()";
}
// Module-level pass `tf-device-mark-input-output-aliases`: tags device cluster
// input-output pairs that access the same resource with the
// `tf.aliasing_output` attribute.
// Instantiated through TFDevice::CreateMarkInputOutputAliasesPass().
def MarkInputOutputAliasesPass : Pass<"tf-device-mark-input-output-aliases", "ModuleOp"> {
let summary = "Marks device cluster input-output pairs that read/write to the same variable as aliases";
let description = [{
This pass analyzes the inputs and outputs to device cluster and marks those
input-output pairs as aliases (using `tf.aliasing_output` attribute) which read
and write to the same resource. This aliasing information can then be propagated
to XLA compiler for input/output buffer space optimizations.
}];
let constructor = "TFDevice::CreateMarkInputOutputAliasesPass()";
}
// Module-level pass `tf-tpu-resource-read-for-write`: adds a
// `tf.ReadVariableOp` input to an outlined TPU computation for every resource
// variable that is only written, never read.
// Instantiated through TFTPU::CreateTPUResourceReadForWritePass().
def TPUResourceReadForWritePass : Pass<"tf-tpu-resource-read-for-write", "ModuleOp"> {
let summary = "Inserts tf.ReadVariableOp inputs to a TPU cluster for resource writes with no reads";
let description = [{
This pass materializes `tf.ReadVariableOp` inputs to an outlined TPU computation
for resource variables where only writes are present so later in the pipeline
such resource variables can be fused with generated `tf.TPUExecute` ops, which
only support resource variable read or read + write. For all TPU computations,
resource variables are required to be initialized prior to execution. Write only
resource variable uses can be generated currently via packed tensor uses.
For example, the following:
```mlir
func @write_only_resource(%value: tensor<i32>, %resource: tensor<*x!tf_type.resource<tensor<i32>>>) {
%0 = "tf_device.cluster_func"(%value) {func = @cluster} : (tensor<i32>) -> tensor<i32>
"tf.AssignVariableOp"(%resource, %0) : (tensor<*x!tf_type.resource<tensor<i32>>>, tensor<i32>) -> ()
return
}
func @cluster(%arg0: tensor<i32>) -> tensor<i32> {
%identity = "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
return %identity : tensor<i32>
}
```
will be transformed into:
```mlir
func @write_only_resource(%value: tensor<i32>, %resource: tensor<*x!tf_type.resource<tensor<i32>>>) {
%resource_read = "tf.ReadVariableOp"(%resource) : (tensor<*x!tf_type.resource<tensor<i32>>>) -> tensor<i32>
%0 = "tf_device.cluster_func"(%value, %resource_read) {func = @cluster} : (tensor<i32>, tensor<i32>) -> tensor<i32>
"tf.AssignVariableOp"(%resource, %0) : (tensor<*x!tf_type.resource<tensor<i32>>>, tensor<i32>) -> ()
return
}
func @cluster(%arg0: tensor<i32>, %arg1: tensor<i32>) -> tensor<i32> {
%identity = "tf.Identity"(%arg0) : (tensor<i32>) -> tensor<i32>
return %identity : tensor<i32>
}
```
}];
let constructor = "TFTPU::CreateTPUResourceReadForWritePass()";
}
// Module-level pass `tf-tpu-extract-outside-compilation`: moves ops annotated
// with `_xla_outside_compilation` out of a TPU cluster into their own
// `tf_device.parallel_execute` region and adds host/device communication ops.
// Instantiated through TFTPU::CreateTPUExtractOutsideCompilationPass().
def TPUExtractOutsideCompilationPass : Pass<"tf-tpu-extract-outside-compilation", "ModuleOp"> {
let summary = "Extracts TPU outside compilation computation to a separate tf_device.parallel_execute region.";
let description = [{
This pass extracts a CPU computation cluster with `_xla_outside_compilation`
annotation, which denotes ops that should be run on CPU/host, from a TPU cluster.
Each outside compilation cluster is moved to
a tf_device.parallel_execute region. The TPU cluster is also moved to a
tf_device.parallel_execute region. Communication ops between device and host are
added to pass inputs/outputs to/from the outside compiled region.
For example, the following tf_device.cluster with an op marked for `_xla_outside_compilation`:
```mlir
func @outside_compilation() -> tensor<f32> {
%0 = "tf_device.cluster"() ( {
%1 = "tf.Const"() {_xla_outside_compilation = "0", value = dense<1.0> : tensor<f32>} : () -> (tensor<f32>)
%2 = "tf.Identity"(%1) {_xla_outside_compilation = "0"} : (tensor<f32>) -> (tensor<f32>)
%3 = "tf.AddV2"(%1, %2) : (tensor<f32>, tensor<f32>) -> (tensor<f32>)
tf_device.return %3 : tensor<f32>
}) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor<f32>
return %0 : tensor<f32>
}
```
will become a tf_device.parallel_execute op with a CPU/host region and
a tf_device.cluster with communication ops to send data to/from device/host:
```mlir
func @outside_compilation() -> tensor<f32> {
%0 = "tf_device.parallel_execute"() ( {
"tf_device.launch"() ( {
%1 = "tf._TPUCompileMlirPlaceholderProgramKey"() : () -> tensor<3x!tf_type.string>
%2 = "tf._XlaRecvAtHost"(%1) {device_ordinal = 0 : i64, key = "host_compute_channel_0_0_args"} : (tensor<3x!tf_type.string>) -> tensor<f32>
%3 = "tf.Identity"(%2) : (tensor<f32>) -> tensor<f32>
"tf._XlaSendFromHost"(%3, %1) {device_ordinal = 0 : i64, key = "host_compute_channel_0_0_retvals"} : (tensor<f32>, tensor<3x!tf_type.string>) -> ()
tf_device.return
}) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> ()
tf_device.return
}, {
%1 = "tf_device.cluster"() ( {
%2 = "tf.Const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32>
%3 = "tf._XlaHostComputeMlir"(%2) {recv_key = "host_compute_channel_0_0_retvals", send_key = "host_compute_channel_0_0_args", tpu_core = 0 : i64} : (tensor<f32>) -> tensor<f32>
%4 = "tf.AddV2"(%2, %3) : (tensor<f32>, tensor<f32>) -> tensor<f32>
tf_device.return %4 : tensor<f32>
}) {device_assignment = [], num_cores_per_replica = 1 : i64, topology = ""} : () -> tensor<f32>
tf_device.return %1 : tensor<f32>
}) : () -> tensor<f32>
return %0 : tensor<f32>
}
```
}];
let constructor = "TFTPU::CreateTPUExtractOutsideCompilationPass()";
}
// Function-level pass `tf-hoist-replicate-invariant-resource-writes`: moves
// writes to non-replicated resource variables out of `tf_device.replicate`,
// keeping a single write that uses the first replica's value.
// Instantiated through TF::CreateHoistReplicateInvariantResourceWritesPass().
def HoistReplicateInvariantResourceWritesPass : Pass<"tf-hoist-replicate-invariant-resource-writes", "mlir::func::FuncOp"> {
let summary = "Hoists writes to replicate invariant resource variables.";
let description = [{
This pass hoists replicate invariant resource variable writes outside
tf_device.replicate op. These may have been inserted by other passes such as
resource op lifting. However, if the resource variable is not replicated, writes
to such variables for each replica are redundant and can be replaced by writing
a single value from first replica.
The benefit of this optimization is reduced memory requirement on host. For
multiple writes (one from each replica) to such variables, the host would
allocate buffer space to receive the device output from all replicas, which is
not required. We can use the output of first replica in such cases.
}];
let constructor = "TF::CreateHoistReplicateInvariantResourceWritesPass()";
}
// Module-level pass `tf-mark-ops-for-outside-compilation`: tags ops in a
// device cluster that the device cannot run with the
// `_xla_outside_compilation` attribute so they execute on the host.
// Instantiated through TFDevice::CreateMarkOpsForOutsideCompilationPass().
def MarkOpsForOutsideCompilationPass : Pass<"tf-mark-ops-for-outside-compilation", "ModuleOp"> {
let summary = "Marks ops in device cluster for outside compilation if they are unsupported on device.";
let description = [{
This pass marks unsupported ops in a device cluster with
`_xla_outside_compilation` attribute so the operations will run on the host
instead of the device. Unsupported ops are ops that can not be code
generated to run on the device for the cluster including:
1. String operations on TPUs.
2. Operations that don't have a kernel defined for the device.
This pass is conservative in that it will mark all ops for outside compilation
that can not be compiled for the device. Exceptions for this are added for ops
that will be rewritten or decomposed before compiling on device.
For example, tf_device.cluster op with an unsupported op, tf.UnsupportedOp:
```mlir
func @unsupported_op() -> tensor<i32> {
%0 = "tf_device.cluster"() ( {
%1 = "tf.UnsupportedOp"() : () -> tensor<i32>
%2 = "tf.Identity"(%1) : (tensor<i32>) -> tensor<i32>
tf_device.return %2 : tensor<i32>
}) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor<i32>
return %0 : tensor<i32>
}
```
will mark tf.UnsupportedOp with `_xla_outside_compilation` attribute:
```mlir
func @unsupported_op() -> tensor<i32> {
%0 = "tf_device.cluster"() ( {
%1 = "tf.UnsupportedOp"() {_xla_outside_compilation = "auto0"} : () -> tensor<i32>
%2 = "tf.Identity"(%1) : (tensor<i32>) -> tensor<i32>
tf_device.return %2 : tensor<i32>
}) {allow_soft_placement = true, device_assignment = [], num_cores_per_replica = 1 : i64, topology = ""} : () -> tensor<i32>
return %0 : tensor<i32>
}
```
}];
let constructor = "TFDevice::CreateMarkOpsForOutsideCompilationPass()";
}
// Module-level pass `tf-functional-control-flow-to-regions`: converts
// functional control flow (`tf.If`, `tf.While`) to the region-based forms
// (`tf.IfRegion`, `tf.WhileRegion`).
// Instantiated through TF::CreateTFFunctionalControlFlowToRegions().
def FunctionalControlFlowToRegionsPass : Pass<"tf-functional-control-flow-to-regions", "ModuleOp"> {
let summary = "Transforms functional control flow operations to their region-based counterparts";
let description = [{
This pass transforms functional control flow operations in the TensorFlow
dialect to their region-based counterparts, i.e., `tf.If` is transformed to
`tf.IfRegion` and `tf.While` is transformed to `tf.WhileRegion`.
For example, this functional operation
```mlir
%0 = "tf.If"(%arg0, %arg1) {
then_branch = @then_branch_func, else_branch = @else_branch_func, is_stateless = false
} : (tensor<i1>, tensor<*xf32>) -> tensor<*xf32>
```
will be transformed into this region-based operation
```mlir
%0 = "tf.IfRegion"(%arg0) ( {
%1 = call @then_branch_func(%arg1) : (tensor<*xf32>) -> tensor<*xf32>
"tf.Yield"(%1) : (tensor<*xf32>) -> ()
}, {
%1 = call @else_branch_func(%arg1) : (tensor<*xf32>) -> tensor<*xf32>
"tf.Yield"(%1) : (tensor<*xf32>) -> ()
}) {is_stateless = false} : (tensor<i1>) -> tensor<*xf32>
```
}];
let constructor = "TF::CreateTFFunctionalControlFlowToRegions()";
}
// Module-level pass `tf-region-control-flow-to-functional`: converts
// region-based control flow (`tf.IfRegion`, `tf.WhileRegion`) back to the
// functional forms (`tf.If`, `tf.While`).
// Instantiated through TF::CreateTFRegionControlFlowToFunctional().
def RegionControlFlowToFunctionalPass : Pass<"tf-region-control-flow-to-functional", "ModuleOp"> {
let summary = "Transforms region-based control flow operations to their functional counterparts";
let description = [{
This pass transforms region-based control flow operations in the TensorFlow
dialect to their functional counterparts, i.e., `tf.IfRegion` is transformed to
`tf.If` and `tf.WhileRegion` is transformed to `tf.While`.
For example, this region-based operation
```mlir
%0 = "tf.IfRegion"(%arg0) ( {
%1 = call @then_branch_func(%arg1) : (tensor<*xf32>) -> tensor<*xf32>
"tf.Yield"(%1) : (tensor<*xf32>) -> ()
}, {
%1 = call @else_branch_func(%arg1) : (tensor<*xf32>) -> tensor<*xf32>
"tf.Yield"(%1) : (tensor<*xf32>) -> ()
}) {is_stateless = false} : (tensor<i1>) -> tensor<*xf32>
```
will be transformed into this functional operation
```mlir
%0 = "tf.If"(%arg0, %arg1) {
then_branch = @then_branch_func, else_branch = @else_branch_func, is_stateless = false
} : (tensor<i1>, tensor<*xf32>) -> tensor<*xf32>
```
}];
let constructor = "TF::CreateTFRegionControlFlowToFunctional()";
}
// Function-level pass `tf-tpu-reorder-replicate-partitioned-inputs`: swaps the
// nesting of `tf.TPUPartitionedInput` and `tf.TPUReplicatedInput` so that
// replicated inputs feed a single partitioned input.
// Instantiated through TFTPU::CreateTPUReorderReplicateAndPartitionedInputsPass().
def TPUReorderReplicateAndPartitionedInputsPass : Pass<"tf-tpu-reorder-replicate-partitioned-inputs", "mlir::func::FuncOp"> {
let summary = "Reorder replicated and partitioned input ops.";
let description = [{
This pass rewrites how data parallelism and model parallelism are expressed for
inputs. It reorders `tf.TPUPartitionedInput` (model parallelism) and
`tf.TPUReplicatedInput` (data parallelism) ops. It transforms a DAG where
multiple `tf.TPUPartitionedInput` ops are feeding into a single
`tf.TPUReplicatedInput` into a DAG where multiple `tf.TPUReplicatedInput` ops
are feeding into a single `tf.TPUPartitionedInput`. Transforming the IR in such
a manner will allow subsequent cluster formation pass to handle IR with both
data and model parallelism in an easier manner.
For example, the following:
```mlir
!rtype = type tensor<!tf_type.resource<tensor<10x3xf32>>>
func @data_and_model_parallelism(%arg0: !rtype, %arg1: !rtype, %arg2: !rtype, %arg3: !rtype) -> !rtype {
%pi_0 = "tf.TPUPartitionedInput"(%arg0, %arg1) {_XlaSharding = "", device = "", partition_dim = -1 : i64} : (!rtype, !rtype) -> !rtype
%pi_1 = "tf.TPUPartitionedInput"(%arg2, %arg3) {_XlaSharding = "", device = "", partition_dim = -1 : i64} : (!rtype, !rtype) -> !rtype
%ri = "tf.TPUReplicatedInput"(%pi_0, %pi_1) : (!rtype, !rtype) -> !rtype
return %ri : !rtype
}
```
will be transformed into:
```mlir
!rtype = type tensor<!tf_type.resource<tensor<10x3xf32>>>
func @data_and_model_parallelism(%arg0: !rtype, %arg1: !rtype, %arg2: !rtype, %arg3: !rtype) -> !rtype {
%ri_0 = "tf.TPUReplicatedInput"(%arg0, %arg2) : (!rtype, !rtype) -> !rtype
%ri_1 = "tf.TPUReplicatedInput"(%arg1, %arg3) : (!rtype, !rtype) -> !rtype
%pi = "tf.TPUPartitionedInput"(%ri_0, %ri_1) {_XlaSharding = "", device = "", partition_dim = -1 : i64} : (!rtype, !rtype) -> !rtype
return %pi : !rtype
}
```
}];
let constructor = "TFTPU::CreateTPUReorderReplicateAndPartitionedInputsPass()";
}
// Function-level pass `tf-tpu-resource-partition`: replaces reads/writes of an
// unpartitioned resource produced by `tf.TPUPartitionedInput` with reads/writes
// of the individual partitioned resource operands.
// Instantiated through TFTPU::CreateTPUResourceReadsWritesPartitioningPass().
def TPUResourceReadsWritesPartitioningPass : Pass<"tf-tpu-resource-partition", "mlir::func::FuncOp"> {
let summary = "Partitions unpartitioned resource read/write to partitioned resource variables.";
let description = [{
This pass creates individual resource reads/writes from the unpartitioned
resource variable (from `tf.TPUPartitionedInput`) to individual partitioned
resource variables (`tf.TPUPartitionedInput` operands). As resource op
decomposition/lifting occurs with the unpartitioned resource variables,
transforming the IR in such a manner will allow for subsequent passes to operate
on individual resource variable handles per core/device.
For example, the following:
```mlir
func @cluster(%arg0: tensor<!tf_type.resource<tensor<i32>>>, %arg1: tensor<!tf_type.resource<tensor<i32>>>) {
%partitioned_variable = "tf.TPUPartitionedInput"(%arg0, %arg1) {N = 2 : i64, _XlaSharding = "", partition_dim = -1 : i64} : (tensor<!tf_type.resource<tensor<i32>>>, tensor<!tf_type.resource<tensor<i32>>>) -> tensor<!tf_type.resource<tensor<i32>>>
%read = "tf.ReadVariableOp"(%partitioned_variable) : (tensor<!tf_type.resource<tensor<i32>>>) -> tensor<i32>
%computation = "tf_device.cluster_func"(%read) {func = @computation, use_spmd_for_xla_partitioning = true} : (tensor<i32>) -> tensor<i32>
"tf.AssignVariableOp"(%partitioned_variable, %computation) : (tensor<!tf_type.resource<tensor<i32>>>, tensor<i32>) -> ()
return
}
func @computation(%arg0: tensor<i32>) -> tensor<i32> {
return %arg0: tensor<i32>
}
```
will be transformed into:
```mlir
func @cluster(%arg0: tensor<!tf_type.resource<tensor<i32>>>, %arg1: tensor<!tf_type.resource<tensor<i32>>>) {
%read0 = "tf.ReadVariableOp"(%arg0) : (tensor<!tf_type.resource<tensor<i32>>>) -> tensor<i32>
%read1 = "tf.ReadVariableOp"(%arg1) : (tensor<!tf_type.resource<tensor<i32>>>) -> tensor<i32>
%partitioned_input = "tf.TPUPartitionedInput"(%read0, %read1) {N = 2 : i64, _XlaSharding = "", partition_dim = -1 : i64} : (tensor<i32>, tensor<i32>) -> tensor<i32>
%computation = "tf_device.cluster_func"(%partitioned_input) {func = @computation, use_spmd_for_xla_partitioning = true} : (tensor<i32>) -> tensor<i32>
%partitioned_output:2 = "tf.TPUPartitionedOutput"(%computation) {N = 2 : i64, _XlaSharding = "", partition_dim = -1 : i64} : (tensor<i32>) -> (tensor<i32>, tensor<i32>)
"tf.AssignVariableOp"(%arg0, %partitioned_output#0) : (tensor<!tf_type.resource<tensor<i32>>>, tensor<i32>) -> ()
"tf.AssignVariableOp"(%arg1, %partitioned_output#1) : (tensor<!tf_type.resource<tensor<i32>>>, tensor<i32>) -> ()
return
}
func @computation(%arg0: tensor<i32>) -> tensor<i32> {
return %arg0: tensor<i32>
}
```
}];
let constructor = "TFTPU::CreateTPUResourceReadsWritesPartitioningPass()";
}
// Module-level pass `tf-tpu-rewrite`: rewrites each `tf_device.cluster_func`
// into `tf._TPUCompileMlir` and `tf.TPUExecute` ops wrapped in
// `tf_device.launch` ops.
// Option `tpu-compile-metadata-debug` (bool, default false) selects whether the
// compile metadata is serialized as a proto debug string.
// Instantiated through TFTPU::CreateTPURewritePass().
def TPURewritePass : Pass<"tf-tpu-rewrite", "ModuleOp"> {
let summary = "Rewrites a `tf_device.cluster_func` on TPUs into TPU runtime operations.";
let description = [{
This pass rewrites a `tf_device.cluster_func` operation into a sequence of `tf._TPUCompileMlir`
and `tf.TPUExecute` operations. `tf._TPUCompileMlir` contains an MLIR module that is
functionally equivalent to the function referenced by `tf_device.cluster_func`.
This lets the module be jit-compiled and executed on TPU.
If it is not possible to rewrite the operation or device assignment fails,
a failure will be returned.
Note, many parameters to the `tf_device.cluster_func` are omitted in this
and following examples.
For example, a non replicated `tf_device.cluster_func`:
```mlir
func @tf_tpu_rewrite(%arg0: tensor<i8>) {
%0 = "tf_device.cluster_func"(%arg0) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @func} : (tensor<i8>) -> tensor<i8>
return
}
```
will be rewritten as:
```mlir
func @tf_tpu_rewrite(%arg0: tensor<i8>) {
%0:2 = "tf_device.launch"() ( {
%compilation_status, %program = "tf._TPUCompileMlir"() {mlir_module = "<serialized func>"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
tf_device.return %compilation_status, %program : tensor<!tf_type.string>, tensor<3x!tf_type.string>
}) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
"tf_device.launch"() ( {
"tf.TPUCompileSucceededAssert"(%0#0) : (tensor<!tf_type.string>) -> ()
tf_device.return
}) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> ()
%1 = "tf_device.launch"() ( {
%2 = "tf.TPUExecute"(%arg0, %0#1) : (tensor<i8>, tensor<3x!tf_type.string>) -> tensor<i8>
tf_device.return %2 : tensor<i8>
}) {device = "/job:worker/replica:0/task:0/device:TPU:0"} : () -> tensor<i8>
return
}
```
A replicated `tf_device.cluster_func`:
```mlir
func @tf_tpu_rewrite(%arg0: tensor<i8>, %arg1: tensor<i8>) {
%0:2 = tf_device.replicate([%arg0, %arg1] as %ri: tensor<i8>) {n = 2 : i32} {
%1 = "tf_device.cluster_func"(%ri) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @func} : (tensor<i8>) -> tensor<i8>
tf_device.return %1 : tensor<i8>
}
return
}
```
will be rewritten as:
```mlir
func @tf_tpu_rewrite(%arg0: tensor<i8>, %arg1: tensor<i8>) {
%0:2 = tf_device.replicate([%arg0, %arg1] as %arg2: tensor<i8>) {devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"], TPU_REPLICATED_HOST = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:0"]}, n = 2 : i32} {
%1:2 = "tf_device.launch"() ( {
%compilation_status, %program = "tf._TPUCompileMlir"() {mlir_module = "<serialized func>"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
tf_device.return %compilation_status, %program : tensor<!tf_type.string>, tensor<3x!tf_type.string>
}) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
"tf_device.launch"() ( {
"tf.TPUCompileSucceededAssert"(%1#0) : (tensor<!tf_type.string>) -> ()
tf_device.return
}) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> ()
%2 = "tf_device.launch"() ( {
%3 = "tf.TPUExecute"(%arg2, %1#1) : (tensor<i8>, tensor<3x!tf_type.string>) -> tensor<i8>
tf_device.return %3 : tensor<i8>
}) {device = "TPU_REPLICATED_CORE_0"} : () -> tensor<i8>
tf_device.return %2 : tensor<i8>
}
return
}
```
A non replicated `tf_device.cluster_func` with the model parallelism:
```mlir
func @tf_tpu_rewrite(%arg0: tensor<8xi32>) -> tensor<8xi32> {
%0 = "tf_device.cluster_func"(%arg0) {_xla_compile_device_type = "TPU", _replication_info = "cluster0", func = @func, num_cores_per_replica = 2, input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor<8xi32>) -> tensor<8xi32>
return %0 : tensor<8xi32>
}
```
will be rewritten as:
```mlir
func @tf_tpu_rewrite(%arg0: tensor<8xi32>) -> tensor<8xi32> {
%0:3 = "tf_device.launch"() ( {
%compilation_status, %program:2 = "tf._TPUCompileMlir"() {mlir_module = "<serialized func>"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>, tensor<3x!tf_type.string>)
tf_device.return %compilation_status, %program#0, %program#1 : tensor<!tf_type.string>, tensor<3x!tf_type.string>, tensor<3x!tf_type.string>
}) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>, tensor<3x!tf_type.string>)
"tf_device.launch"() ( {
"tf.TPUCompileSucceededAssert"(%0#0) : (tensor<!tf_type.string>) -> ()
tf_device.return
}) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : () -> ()
%1 = "tf_device.parallel_execute"() ( {
%2 = "tf_device.launch"() ( {
%3 = "tf.TPUExecute"(%arg0, %0#1) : (tensor<8xi32>, tensor<3x!tf_type.string>) -> tensor<8xi32>
tf_device.return %3 : tensor<8xi32>
}) {device = "/job:localhost/replica:0/task:0/device:TPU:0"} : () -> tensor<8xi32>
tf_device.return %2 : tensor<8xi32>
}, {
"tf_device.launch"() ( {
"tf.TPUExecute"(%0#2) : (tensor<3x!tf_type.string>) -> ()
tf_device.return
}) {device = "/job:localhost/replica:0/task:0/device:TPU:1"} : () -> ()
tf_device.return
}) : () -> tensor<8xi32>
return %1 : tensor<8xi32>
}
```
}];
let options = [
Option<"tpu_compile_metadata_debug_", "tpu-compile-metadata-debug", "bool", /*default=*/"false",
"Whether to serialize TPUCompileMetadataProto metadata in 'tf._TPUCompileMlir' op as a proto debug string">
];
let constructor = "TFTPU::CreateTPURewritePass()";
}
// Module-level pass `tf-verify-for-export`: checks that every function holds a
// single `tf_executor.graph` whose islands each wrap exactly one op.
// Instantiated through TF::CreateVerifySuitableForExportPass().
def VerifySuitableForExportPass : Pass<"tf-verify-for-export", "ModuleOp"> {
let summary = "Verify module is suitable for export back to TF Graph";
let description = [{
Verifies whether all functions in module are of single tf_executor.graph and
each tf_executor.island in tf_executor.graph only has a single op.
}];
let constructor = "TF::CreateVerifySuitableForExportPass()";
}
// Module-level pass `prepare-tpu-computation-for-tf-export`: makes the TPU
// computation attached to `_TPUCompileMlir` legal for TensorFlow graph export.
// Instantiated through TF::CreatePrepareTpuComputationForTfExportPass().
def PrepareTpuComputationForTfExportPass : Pass<"prepare-tpu-computation-for-tf-export", "ModuleOp"> {
let summary = "Prepare TPU computation to be legal for export to TensorFlow";
let description = [{
Prepares TPU computation module attached to _TPUCompileMlir op for
TensorFlow graph export by making transformation such as replacing or
removing MLIR or XLA specific attributes that are not legal in TensorFlow
graph.
}];
let constructor = "TF::CreatePrepareTpuComputationForTfExportPass()";
}
// Function-level pass `tf-tensor-device-copy`: folds `tf.Identity` and
// `tf.IdentityN` ops that share a device with their operand.
// Instantiated through TF::CreateTensorDeviceCopyConversionPass().
def TensorDeviceCopyConversionPass : Pass<"tf-tensor-device-copy", "mlir::func::FuncOp"> {
let summary = "Fold the tf.Identity op and the tf.IdentityN op if the op has the same device as its operand";
let constructor = "TF::CreateTensorDeviceCopyConversionPass()";
}
// Module-level pass `tf-tensor-array-ops-decomposition`: rewrites tensor array
// ops as tensor ops plus read/assign ops on local variables.
// Lists the Tensor dialect as a dependent dialect.
// Instantiated through TF::CreateTensorArrayOpsDecompositionPass().
def TensorArrayOpsDecompositionPass : Pass<"tf-tensor-array-ops-decomposition", "ModuleOp"> {
let summary = "Decompose tensor array operations into local variable operations.";
let description = [{
A pass that converts tensor array operations to tensor operations and
read/assign ops on local variables. A later resource lifting pass can further
remove the local variables.
This pass requires that the full shape of the tensor array can be inferred:
1) the size needs to be a constant, 2) it specifies the full element shape,
or that can be inferred from a later write, and 3) all elements have the same
shape.
}];
let dependentDialects = ["tensor::TensorDialect"];
let constructor = "TF::CreateTensorArrayOpsDecompositionPass()";
}
// Function-level pass `tf-optimize`; no long description is provided here.
// Instantiated through TF::CreateTFOptimizePass().
def TensorFlowOptimizePass : Pass<"tf-optimize", "mlir::func::FuncOp"> {
let summary = "Optimize TensorFlow module";
let constructor = "TF::CreateTFOptimizePass()";
}
// Module-level pass `tf-promote-resources-to-args`: turns resource reads and
// writes inside the selected functions into function inputs and outputs.
// List option `functions` names the functions to process.
// Instantiated through TF::CreatePromoteResourcesToArgsPass().
def PromoteResourcesToArgsPass : Pass<"tf-promote-resources-to-args", "ModuleOp"> {
let summary = "Promote resources reads/writes to function inputs/outputs.";
let description = [{
This pass promotes resource accesses in function(s) (by default, the main)
to input arguments and outputs of the function(s).
Two types of resources are supported:
(1) A function argument of TF::ResourceType type (this pass).
(2) A VarHandleOp in the function (tf-promote-var-handles-to-args).
After the pass,
. The function will have an input argument for each resource that is
already provided as an input argument or is read. The type of the input
argument will become the shape of the value represented by the resource.
. The function will have an output for each resource that is written. The
type of the output will become the shape of the resource.
The information of variable identification and input-output aliasing is
recorded as named attributes of the input argument or output:
. 'tf.resource_name' matches 'shared_name' of VarHandleOp, which represents
the identifier of the corresponding resource. This attribute is added to
an input argument if the initial value of the resource is read, or to the
output if the initial value is not read.
. 'tf.aliasing_output' is the index of the function output that is an alias
of the input argument. This attribute is added only to the input argument
when the initial value of the corresponding resource is read, and the
resource is written later.
Assumption of this pass:
. Compound resource operations have already been decomposed.
. Dead functions have already been removed, as resource arguments in dead
functions can cause the pass to fail.
}];
let constructor = "TF::CreatePromoteResourcesToArgsPass()";
let options = [
ListOption<"functions_", "functions", "std::string",
"Comma separated list of functions whose resources "
"read/writes should be promoted to function inputs/outputs.">
];
}
// Module-level pass `tf-promote-var-handles-to-args`: promotes
// `tf.VarHandleOp`s to function arguments. The detailed behaviour is described
// jointly with the `tf-promote-resources-to-args` pass.
// Instantiated through TF::CreatePromoteVarHandlesToArgsPass().
def PromoteVarHandlesToArgsPass : Pass<"tf-promote-var-handles-to-args", "ModuleOp"> {
let summary = "Promote tf.VarHandleOps to function arguments.";
let description = [{See joint description in promote resources to args.}];
let constructor = "TF::CreatePromoteVarHandlesToArgsPass()";
}
// Module-level pass `tf-tpu-merge-variables-with-execute`: merges on-device
// resource variable reads/updates around a `tf.TPUExecute` op into a single
// `tf.TPUExecuteAndUpdateVariables` op.
// Instantiated through TFTPU::CreateTPUMergeVariablesWithExecutePass().
def TPUMergeVariablesWithExecutePass : Pass<"tf-tpu-merge-variables-with-execute", "ModuleOp"> {
let summary = "Merges device variable reads and updates into TPU execute ops";
let description = [{
This pass finds on-device resource variable reads and updates surrounding a
`tf.TPUExecute` op and merges them into a `tf.TPUExecuteAndUpdateVariables`
op. This allows the TPU execution to perform more efficient in-place
variable updates.
For example,
```mlir
%0 = "tf.ReadVariableOp"(%arg0)
%1 = "tf.ReadVariableOp"(%arg1)
%2 = "tf.TPUExecute"(%0, %1, %compile)
%3 = "tf.AssignVariableOp"(%arg0, %2)
```
will be transformed into
```mlir
%2 = "tf.TPUExecuteAndUpdateVariables"(%arg0, %arg1, %compile)
{ device_var_reads_indices = [0, 1],
device_var_updates_indices = [0, -1] }
```
The transformation happens only for on-device variables. The above
transformation requires `%arg0`, `%arg1` to have the same device assignment
as the `TPUExecute` op.
}];
let constructor = "TFTPU::CreateTPUMergeVariablesWithExecutePass()";
}
// Pass `tf-replicate-invariant-op-hoisting`, declared on mlir::func::FuncOp.
// Hoists replicate-invariant ops out of `tf_device.replicate` regions and
// rewrites `tf.Shape` of replicated arguments; examples are in the description.
def ReplicateInvariantOpHoistingPass : Pass<"tf-replicate-invariant-op-hoisting", "mlir::func::FuncOp"> {
let summary = "Hoists replicate invariant operations out of replicate";
let description = [{
This pass looks for replicate invariant ops in a `tf_device.replicate` op
region and hoists them out. It also makes `tf.Shape` ops replicate invariant
if possible. This currently updates or replaces `tf.Shape` ops of replicated
arguments, either tensors or resources.
For example, the following
```mlir
tf_device.replicate([%0, %1] as %ri: tensor<*xi32>) {n = 2 : i32} {
%2 = "tf.Shape"(%ri) : (tensor<*xi32>) -> tensor<?xi32>
tf_device.return
}
```
gets converted to
```mlir
tf_device.replicate([%0, %1] as %ri: tensor<*xi32>) {n = 2 : i32} {
%2 = "tf.Shape"(%0) : (tensor<*xi32>) -> tensor<?xi32>
tf_device.return
}
```
and for resource variables the following
```mlir
tf_device.replicate([%0, %1] as %ri: tensor<*x!tf_type.resource>) {n = 2 : i32} {
%2 = "tf.ReadVariableOp"(%ri) : (tensor<*x!tf_type.resource>) -> tensor<*xi32>
%3 = "tf.Shape"(%2) : (tensor<*xi32>) -> tensor<?xi32>
tf_device.return
}
```
gets converted to
```mlir
tf_device.replicate([%0, %1] as %ri: tensor<*x!tf_type.resource>) {n = 2 : i32} {
%2 = "tf.ReadVariableOp"(%ri) : (tensor<*x!tf_type.resource>) -> tensor<*xi32>
%3 = "tf.VariableShape"(%0) : (tensor<*x!tf_type.resource>) -> tensor<?xi32>
tf_device.return
}
```
}];
let constructor = "TFDevice::CreateReplicateInvariantOpHoistingPass()";
}
// Pass `tf-lower-quantized`, declared on mlir::func::FuncOp. Per its
// description, TF_DequantizeOp is currently the only op it lowers.
def LowerQuantizedPass
    : Pass<"tf-lower-quantized", "mlir::func::FuncOp"> {
let summary = "Lowers ops that require quantized input or output.";
let constructor = "TF::CreateLowerQuantizedPass()";
let description = [{
This pass rewrites all ops that have at least one input or output that must
be a quantized type to ops whose inputs and outputs allow non-quantized
types. Examples of quantized types are TF_Qint8 or TF_Quint8.
An example is TF_DequantizeOp, which converts a quantized type to a float.
This op is rewritten to generic ops that perform the scale and shift
and can operate on non-quantized types.
Currently, TF_DequantizeOp is the only op with a lowering that falls
in this category. When more lowerings are added (e.g. QuantizeV2Op),
they should be added to this pass.
}];
}
// Pass `tf-outside-compiled-to-host-launch`, declared on ModuleOp. Wraps ops
// carrying the `_xla_outside_compilation` attribute in `tf_device.launch` ops
// with a host device assignment; see the before/after example below.
def OutsideCompiledToHostLaunchPass : Pass<"tf-outside-compiled-to-host-launch", "ModuleOp"> {
let summary = "Wraps each op with the _xla_outside_compilation attribute in a separate tf_device.launch on replicated host device.";
let description = [{
This pass wraps ops with the same `_xla_outside_compilation`
attribute value in a tf_device.launch op with host device assignment.
A simple example:
```mlir
"tf_device.cluster"() ( {
"tf.A"()
"tf.B"() {_xla_outside_compilation = "cluster1"}
"tf.C"()
tf_device.return
}) {num_cores_per_replica = 1, topology = "", device_assignment = []}
```
Would become the following ops (unimportant attributes and types are omitted):
```mlir
"tf_device.cluster"() ( {
"tf.A"()
"tf_device.launch"() {
"tf.B"() {_xla_outside_compilation = "cluster1"}
tf_device.return
} {device = "TPU_REPLICATED_HOST"} : () -> ()
"tf.C"()
tf_device.return
}) {num_cores_per_replica = 1, topology = "", device_assignment = []}
```
}];
let constructor = "TFTPU::CreateOutsideCompiledToHostLaunchPass()";
}
// Pass `tf-tpu-host-computation-expansion`, declared on mlir::func::FuncOp.
def TPUHostComputationExpansionPass
    : Pass<"tf-tpu-host-computation-expansion", "mlir::func::FuncOp"> {
let summary = "Expands host computation before and after TPU computation.";
let constructor = "TFTPU::CreateTPUHostComputationExpansionPass()";
let description = [{
This pass expands outside compilation attributes to Identity/Cast ops
at the head of TPU computation if it's only used by outside compiled ops.
}];
}
// Pass `tf-tpu-update-embedding-enqueue-op-inputs`, declared on
// mlir::func::FuncOp. The summary is split into adjacent string literals,
// which TableGen concatenates into the same single string.
def TPUUpdateEmbeddingEnqueueOpInputsPass
    : Pass<"tf-tpu-update-embedding-enqueue-op-inputs", "mlir::func::FuncOp"> {
let summary = "Updates inputs to TPU embedding enqueue ops depending on whether "
"graph is in training mode or in evaluation mode.";
let constructor = "TFTPU::CreateTPUUpdateEmbeddingEnqueueOpInputsPass()";
let description = [{
Updates inputs to TPU embedding enqueue ops depending on whether graph
is in training mode or in evaluation mode.
}];
}
// Pass `tf-drop-while-shape-invariant`, declared on mlir::func::FuncOp.
// Removes the `shape_invariant` attribute from tf.While / tf.WhileRegion; the
// description notes this is only safe when compiling to XLA.
def DropWhileShapeInvariantPass : Pass<"tf-drop-while-shape-invariant", "mlir::func::FuncOp"> {
let summary = "Drop `shape_invariant` attribute from While/WhileRegion ops.";
let description = [{
Drop `shape_invariant` attribute from tf.While and tf.WhileRegion op. This
would allow shape inference pass to further refine operand/result shapes of
these ops. This is only safe to do when compiling to XLA.
}];
let constructor = "TF::CreateDropWhileShapeInvariantPass()";
}
// Pass `tf-drop-while-shape-invariant-in-device-cluster`, declared on
// mlir::func::FuncOp. Like the plain drop-while-shape-invariant pass, but
// restricted to While/WhileRegion ops inside a device cluster.
def DropWhileShapeInvariantInDeviceClusterPass : Pass<"tf-drop-while-shape-invariant-in-device-cluster", "mlir::func::FuncOp"> {
let summary = "Drop `shape_invariant` attribute from While/WhileRegion ops inside device cluster.";
let description = [{
Drop `shape_invariant` attribute from tf.While and tf.WhileRegion op only
inside device cluster. This would allow shape inference pass to further
refine operand/result shapes of these ops. This is only safe to do when
compiling to XLA.
}];
let constructor = "TF::CreateDropWhileShapeInvariantInDeviceClusterPass()";
}
// Pass `tf-tpu-cleanup-cluster-attributes`, declared on ModuleOp.
// NOTE(review): the constructor is spelled `...TPUClusterCleanupAttributes...`
// while the def is `TPUCleanupClusterAttributes...`; kept as-is because the
// C++ symbol is defined outside this file.
def TPUCleanupClusterAttributesPass : Pass<"tf-tpu-cleanup-cluster-attributes", "ModuleOp"> {
let summary = "Eliminate _replication_info and other attributes from ops in a cluster";
let description = [{
This pass eliminates the `_replication_info` and `device` attributes on
operations that are contained in a tf_device.cluster op.
}];
let constructor = "TFTPU::CreateTPUClusterCleanupAttributesPass()";
}
// Pass `tf-tpu-extract-head-tail-outside-compilation`, declared on ModuleOp.
// The description shows head/tail `_xla_outside_compilation` ops being moved
// into separate `tf_device.launch` ops around the cluster.
def TPUExtractHeadTailOutsideCompilationPass
    : Pass<"tf-tpu-extract-head-tail-outside-compilation", "ModuleOp"> {
let summary = "Extracts TPU head or tail outside compilation to separate host "
"launches before/after device cluster.";
let constructor = "TFTPU::CreateTPUExtractHeadTailOutsideCompilationPass()";
let description = [{
This pass extracts a CPU computation cluster with `_xla_outside_compilation`
annotation from the head or tail of a TPU cluster.
For example:
```mlir
%cluster = "tf_device.cluster"() ( {
%a = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
%b = "tf.B"(%a) : (tensor<i32>) -> tensor<i32>
%c = "tf.C"(%b) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> tensor<i32>
tf_device.return %c : tensor<i32>
}) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> tensor<i32>
return %cluster : tensor<i32>
```
becomes:
```mlir
%0 = "tf_device.launch"() ( {
%3 = "tf.A"(%arg0) : (tensor<i32>) -> tensor<i32>
tf_device.return %3 : tensor<i32>
}) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> tensor<i32>
%1 = "tf_device.cluster"() ( {
%3 = "tf.B"(%0) : (tensor<i32>) -> tensor<i32>
tf_device.return %3 : tensor<i32>
}) {device_assignment = [], num_cores_per_replica = 1 : i64, padding_map = [], step_marker_location = "", topology = ""} : () -> tensor<i32>
%2 = "tf_device.launch"() ( {
%3 = "tf.C"(%1) : (tensor<i32>) -> tensor<i32>
tf_device.return %3 : tensor<i32>
}) {device = "/job:worker/replica:0/task:0/device:CPU:0"} : () -> tensor<i32>
return %2 : tensor<i32>
```
}];
}
// Pass `tf-tpu-space-to-depth-pass`, declared on ModuleOp. Inserts a
// `tf.SpaceToDepth` on the host input and rewrites the first convolution's
// filter accordingly; the listings in the description are abbreviated sketches.
def TPUSpaceToDepthPass : Pass<"tf-tpu-space-to-depth-pass", "ModuleOp"> {
let summary = "Applies automatic space to depth transform for the first or frontier convolutions that consume host inputs on TPU.";
let description = [{
Automatic space to depth transform is done by adding space to depth transform op after host input
and applying space to depth transform for the first convolution and its backprop filter on TPU.
For example, original program:
```mlir
module {
func @while_body {
%input = "tf.IteratorGetNext"(...) {device = "/CPU:0"}: -> tensor<2x224x224x3xf32>
%device_launch = "tf_device.cluster_func"(%input,...) {func = @_func,...)
return ...
}
func @_func(%input: tensor<2x224x224x3xf32>, %filter: tensor<7x7x3x64xf32>) {
%6 = "tf.Conv2D"(%input, %filter) {strides = [1, 2, 2, 1]}: (tensor<2x230x230x3xf32>, tensor<7x7x3x64xf32>) -> tensor<2x112x112x64xf32>
}
}
```
The program will be transformed into:
```mlir
module {
func @while_body {
%input = "tf.IteratorGetNext"(...) {device = "/CPU:0"} -> tensor<2x224x224x3xf32>
%space_to_depth = "tf.SpaceToDepth"(%input) {block_size = 2, ...}: (tensor<2x224x224x3xf32>) -> tensor<2x112x112x12xf32>
%device_launch = "tf_device.cluster_func"(%space_to_depth,...) {func = @_func,...)
return ...
}
func @_func(%input: tensor<2x112x112x12xf32>, %filter: tensor<7x7x3x64xf32>) {
%filter_transform = "tf.Pad/tf.Transpose/tf.Reshape"(%filter): (tensor<7x7x3x64xf32>) -> tensor<4x4x12x64xf32>
%conv = "tf.Conv2D"(%input, %filter_transform) {strides = [1, 1, 1, 1]}: (tensor<2x112x112x12xf32>, tensor<4x4x12x64xf32>) -> tensor<2x112x112x64xf32>
}
}
```
This way, the first convolution with 3 feature dimension will be transformed
to 12 feature dimension, which has better performance on TPU.
}];
let constructor = "TFTPU::CreateTPUSpaceToDepthPass()";
}
// Pass `tf-merge-control-flow`, declared on ModuleOp. The description shows
// two `tf.IfRegion` ops on the same predicate being merged into one.
def MergeControlFlowPass
    : Pass<"tf-merge-control-flow", "ModuleOp"> {
let summary = "Merges IfRegion ops together with a common predicate.";
let constructor = "TFDevice::CreateMergeControlFlowPass()";
let description = [{
This pass merges IfRegion ops together if they have the same predicate and it
is safe to do so (there are no intermediate dependencies, they are in the
same block, etc).
For example:
```mlir
"tf.IfRegion"(%0) ( {
%2 = "tf.A"() : () -> (tensor<f32>)
"tf.Yield"() : () -> ()
}, {
"tf.Yield"() : () -> ()
}) { is_stateless = true } : (tensor<i1>) -> ()
"tf.IfRegion"(%0) ( {
%2 = "tf.B"() : () -> (tensor<f32>)
"tf.Yield"() : () -> ()
}, {
"tf.Yield"() : () -> ()
}) { is_stateless = true } : (tensor<i1>) -> ()
```
Would be transformed to:
```mlir
"tf.IfRegion"(%0) ( {
%2 = "tf.A"() : () -> (tensor<f32>)
%3 = "tf.B"() : () -> (tensor<f32>)
"tf.Yield"() : () -> ()
}, {
"tf.Yield"() : () -> ()
}) { is_stateless = true } : (tensor<i1>) -> ()
```
}];
}
// Pass `tf-tensor-list-ops-decomposition`, declared on ModuleOp. Rewrites
// TensorList ops into operations on a single statically shaped buffer tensor;
// the description walks through an EmptyTensorList + PushBack example.
def TensorListOpsDecompositionPass : Pass<"tf-tensor-list-ops-decomposition", "ModuleOp"> {
let summary = "Decomposes TensorList operations into generic operations on tensors.";
let description = [{
This pass rewrites TensorList operations into generic and non-mutating
operations on tensors. This results in operations that can be legalized to XLA.
The list is converted to a single large tensor that includes all list elements,
with a new first dimension for the list index. List update operations are
converted to operations that create a new tensor representing the list.
In the current implementation, the resulting operations are statically shaped,
which means it must be possible to infer a bound on the full shape of the
TensorList. That is, the `element_shape` and `num_elements` arguments to a
tensor list creation op are constant.
A tensor list creation op `tf.EmptyTensorList`/`tf.TensorListReserve` will be
turned into a zero-initialized buffer, and the size is initialized to 0
for `tf.EmptyTensorList` or the specified size for `tf.TensorListReserve`.
Each push will be turned into `tf.XlaDynamicUpdateSlice` with the incremented
size, and each pop will be turned into a `tf.Slice` and a copy of the buffer
with decremented size. Each `tf.TensorListSetItem` will be turned into a
`tf.XlaDynamicUpdateSlice` with unchanged size, and each `tf.TensorListGetItem`
will be rewritten to a `tf.Slice`.
The pass also works across control flow and functional calls.
For example, the TensorList ops in the following function:
```mlir
func @main(%arg0: tensor<8x4xf32>) {
%elem_shape = "tf.Const"() {value = dense<[8, 4]> : tensor<2xi32>} : () -> tensor<2xi32>
%max_size = "tf.Const"() {value = dense<10> : tensor<i32>} : () -> tensor<i32>
%tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<2xi32>, tensor<i32>) -> tensor<!tf_type.variant<tensor<8x4xf32>>>
%push = "tf.TensorListPushBack"(%tl, %arg0) : (tensor<!tf_type.variant<tensor<8x4xf32>>>, tensor<8x4xf32>) -> tensor<!tf_type.variant<tensor<8x4xf32>>>
return
}
```
will be transformed to:
```mlir
func @main(%arg0: tensor<8x4xf32>) {
// EmptyTensorList lowering
%emptyi = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
%emptyf = "tf.Cast"(%emptyi) : (tensor<i32>) -> tensor<f32>
%size_shape = "tf.Const"() {value = dense<[10, 8, 4]> : tensor<3xi32>} : () -> tensor<3xi32>
%tl = "tf.BroadcastTo"(%emptyf, %size_shape) : (tensor<f32>, tensor<3xi32>) -> tensor<10x8x4xf32>
// TensorListPushBack lowering
%index_in_list = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32>
%arg0_shape = "tf.Const"() {value = dense<[1, 8, 4]> : tensor<3xi32>} : () -> tensor<3xi32>
%arg0_reshaped = "tf.Reshape"(%arg0, %arg0_shape) : (tensor<8x4xf32>, tensor<3xi32>) -> tensor<1x8x4xf32>
%zeroi2 = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32>
%axis = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
%start_indices = "tf.ConcatV2"(%index_in_list, %zeroi2, %axis) : (tensor<1xi32>, tensor<2xi32>, tensor<i32>) -> tensor<3xi32>
%push = "tf.XlaDynamicUpdateSlice"(%tl, %arg0_reshaped, %start_indices) : (tensor<10x8x4xf32>, tensor<1x8x4xf32>, tensor<3xi32>) -> tensor<10x8x4xf32>
%one = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32>
%next_index_in_list = "tf.AddV2"(%index_in_list, %one) : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32>
return
}
```
}];
let constructor = "TF::CreateTensorListOpsDecompositionPass()";
}
// Pass `tf-tpu-parallel-execute-sink-resource-write`, declared on
// mlir::func::FuncOp. No long-form description is provided.
def TPUParallelExecuteSinkResourceWritePass
    : Pass<"tf-tpu-parallel-execute-sink-resource-write",
           "mlir::func::FuncOp"> {
let constructor = "TFTPU::CreateTPUParallelExecuteSinkResourceWritePass()";
let summary = "Moves tf.AssignVariableOp consumers of "
"tf_device.parallel_execute into tf_device.parallel_execute regions";
}
// Pass `tf-rewrite-tpu-embedding-ops`, declared on mlir::func::FuncOp.
def RewriteTPUEmbeddingOpsPass
    : Pass<"tf-rewrite-tpu-embedding-ops", "mlir::func::FuncOp"> {
let constructor = "TF::CreateRewriteTPUEmbeddingOpsPass()";
let summary = "Rewrites TPU embedding send/recv ops by adding "
"TPU embedding deduplication data";
}
// Pass `tf-init-text-file-to-import`, declared on mlir::func::FuncOp. Exposes
// one string option, `tf-saved-model-dir`, whose default is the empty string.
def InitTextFileToImportPass
    : Pass<"tf-init-text-file-to-import", "mlir::func::FuncOp"> {
let summary = "convert InitializeTableFromTextFileV2 ops to "
"LookupTableImportV2Op to remove the dependency on asset files";
let constructor = "TF::CreateInitTextFileToImportPass()";
let options = [
  Option<"saved_model_dir_", "tf-saved-model-dir", "std::string",
    /*default=*/"",
    "Directory containing the model exported as a TensorFlow SavedModel. "
    "If your model is not based on the TensorFlow SavedModel, use an "
    "empty value.">
];
}
// Pass `tf-fused-kernel-matcher`, declared on mlir::func::FuncOp.
def FusedKernelMatcherPass
    : Pass<"tf-fused-kernel-matcher", "mlir::func::FuncOp"> {
let constructor = "TF::CreateFusedKernelMatcherPass()";
let summary = "Matches computations corresponding to optimized fused kernels";
}
// Pass `tf-data-optimization`, declared on mlir::func::FuncOp.
def TFDataOptimizationPass
    : Pass<"tf-data-optimization", "mlir::func::FuncOp"> {
let constructor = "TF::CreateTFDataOptimizationPass()";
let summary = "Performs tf.data optimizations";
}
// Pass `tf-functional-control-flow-to-cfg`, declared on mlir::func::FuncOp.
// Lists the tensor dialect as a dependent dialect.
def FunctionalControlFlowToCFGPass : Pass<"tf-functional-control-flow-to-cfg", "mlir::func::FuncOp"> {
let summary = "Transform functional control flow Ops to MLIR Control Flow Graph "
"(CFG) form";
let constructor = "TF::CreateTFFunctionalControlFlowToCFG()";
let dependentDialects = ["tensor::TensorDialect"];
}
// Pass `tf-executor-break-up-islands`, declared on mlir::func::FuncOp. Lists
// the TensorFlow executor dialect as a dependent dialect.
def BreakUpIslandsPass
    : Pass<"tf-executor-break-up-islands", "mlir::func::FuncOp"> {
let summary = "Transform from TF control dialect to TF executor dialect.";
let dependentDialects = ["mlir::tf_executor::TensorFlowExecutorDialect"];
let constructor = "CreateBreakUpIslandsPass()";
}
// Pass `tf-functional-to-executor-conversion`, declared on mlir::func::FuncOp.
// Lists the TensorFlow executor dialect as a dependent dialect.
def FunctionalToExecutorDialectConversionPass
    : Pass<"tf-functional-to-executor-conversion", "mlir::func::FuncOp"> {
let summary = "Transform from func op to TF executor dialect.";
let dependentDialects = ["mlir::tf_executor::TensorFlowExecutorDialect"];
let constructor = "CreateFunctionalToExecutorDialectConversionPass()";
}
// Pass `tf-simple-device-assignment`, declared on mlir::func::FuncOp. Exposes
// one string option, `default-device`, whose default value is "cpu".
def SimpleTFDeviceAssignmentPass : Pass<"tf-simple-device-assignment", "mlir::func::FuncOp"> {
let summary = "Simple device assignment in TF dialect.";
let constructor = "TF::CreateSimpleTFDeviceAssignmentPass()";
let options = [
Option<"default_device_", "default-device", "std::string", /*default=*/"\"cpu\"",
"The default device to assign.">
];
let description = [{
Assigns the default device to all ops that have an empty (or
nonexistent) device attribute.
For example, if we have the code
```mlir
%0 = "tf.Const"() {value = dense<[[42.0]]> : tensor<1x1xf32>} : () -> tensor<1x1xf32>
%1 = "tf.Const"() {device = "", value = dense<[[42.0]]> : tensor<1x1xf32>} : () -> tensor<1x1xf32>
%2 = "tf.Const"() {device = "baz", value = dense<[[42.0]]> : tensor<1x1xf32>} : () -> tensor<1x1xf32>
```
then running this pass with 'default-device=foobar', we get:
```mlir
%0 = "tf.Const"() {device = "foobar", value = dense<[[42.0]]> : tensor<1x1xf32>} : () -> tensor<1x1xf32>
%1 = "tf.Const"() {device = "foobar", value = dense<[[42.0]]> : tensor<1x1xf32>} : () -> tensor<1x1xf32>
%2 = "tf.Const"() {device = "baz", value = dense<[[42.0]]> : tensor<1x1xf32>} : () -> tensor<1x1xf32>
```
}];
}
// Pass `tf-layout-assignment`, declared on mlir::func::FuncOp. Exposes one
// string option, `force-data-format`, whose default is the empty string.
def LayoutAssignmentPass
    : Pass<"tf-layout-assignment", "mlir::func::FuncOp"> {
let summary = "Layout assignment pass.";
let constructor = "TF::CreateLayoutAssignmentPass()";
let options = [
  Option<"force_data_format_", "force-data-format", "std::string",
    /*default=*/"",
    "Force data format for all layout sensitive ops.">
];
}
// Pass `tf-move-transposes`, declared on mlir::func::FuncOp. Two options:
// `fold-transpose-in-ops` (bool, default true) and `direction` (an enum with
// the command-line values "begin" and "end", default kBegin).
def MoveTransposesPass
    : Pass<"tf-move-transposes", "mlir::func::FuncOp"> {
let summary = "Move transposes pass.";
let constructor = "TF::CreateMoveTransposesPass()";
let options = [
  Option<"fold_transpose_in_ops_", "fold-transpose-in-ops", "bool",
    /*default=*/"true",
    "Whether to fold transposes in ops which can support folding.">,
  Option<"direction_", "direction", "enum MoveTransposeDirection",
    /*default=*/"MoveTransposeDirection::kBegin",
    "Move transposes to the beginning or the end of the block where they "
    "are defined.",
    "llvm::cl::values("
    "clEnumValN(MoveTransposeDirection::kBegin, \"begin\", \"beginning of the block\"),"
    "clEnumValN(MoveTransposeDirection::kEnd, \"end\", \"end of the block\"))">
];
}
// Pass `tf-broadcast-fold`, declared on mlir::func::FuncOp.
def BroadcastFoldPass
    : Pass<"tf-broadcast-fold", "mlir::func::FuncOp"> {
let constructor = "TF::CreateBroadcastFoldPass()";
let summary = "Fold explicit broadcasts into the following operations "
"if they support implicit broadcasting on their operand.";
}
// Pass `tf-parallel-execute-to-islands`, declared on mlir::func::FuncOp.
def ParallelExecuteToIslandsPass
    : Pass<"tf-parallel-execute-to-islands", "mlir::func::FuncOp"> {
let constructor = "TFDevice::CreateParallelExecuteToIslandsPass()";
let summary = "Lowers device parallel_execute to executor islands";
}
// Pass `constant-op-device-assignment`, declared on ModuleOp.
// NOTE(review): unlike the neighbouring passes, the argument has no `tf-`
// prefix; left untouched because it is the externally visible pass name.
def ConstantOpDeviceAssignmentPass
    : Pass<"constant-op-device-assignment", "ModuleOp"> {
let constructor = "TF::CreateConstantOpDeviceAssignmentPass()";
let summary = "Assign device for tf.Const ops";
}
// Pass `tf-guarantee-all-funcs-one-use`, declared on ModuleOp.
def GuaranteeAllFuncsOneUsePass
    : Pass<"tf-guarantee-all-funcs-one-use", "ModuleOp"> {
let constructor = "TF::CreateGuaranteeAllFuncsOneUsePass()";
let summary = "Guarantee all FuncOp's have only a single use.";
}
// Pass `tf-tpu-colocate-composite-resource-ops`, declared on
// mlir::func::FuncOp. The description shows a `tf.ReadVariableOp` being
// wrapped in a `tf_device.launch`.
def TPUColocateCompositeResourceOpsPass
    : Pass<"tf-tpu-colocate-composite-resource-ops", "mlir::func::FuncOp"> {
let summary = "Colocate resource with composite device assignment to TPU device.";
let description = [{
Pass that co-locates resource ops that use composite device resources
(packed tensors) with the underlying physical TPU device.
So for example, if we have a function that does (inside a `tf_device.replicate`):
```mlir
%0 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf_type.resource<tensor<4xf32>>>) -> tensor<4xf32>
```
Then said `ReadVariableOp` is going to get replaced by:
```mlir
%0 = "tf_device.launch"() ( {
%2 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf_type.resource<tensor<4xf32>>>) -> tensor<4xf32>
tf_device.return %2 : tensor<4xf32>
}) {...} : () -> tensor<4xf32>
```
}];
let constructor = "TFTPU::CreateTPUColocateCompositeResourceOps()";
}
// Pass `tf-tpu-device-propagation`, declared on mlir::func::FuncOp.
def TPUDevicePropagationPass
    : Pass<"tf-tpu-device-propagation", "mlir::func::FuncOp"> {
let constructor = "TFTPU::CreateTPUDevicePropagationPass()";
let summary = "Propagates TPU devices from ops to users";
}
// Pass `tf-tpu-identity-pruning`, declared on ModuleOp.
def TPUIdentityPruningPass
    : Pass<"tf-tpu-identity-pruning", "ModuleOp"> {
let constructor = "TFTPU::CreateTPUIdentityPruningPass()";
let summary = "Removes Identity/IdentityN ops from the TPU computation";
}
// Pass `tf-tpu-dynamic-layout-pass`, declared on ModuleOp. The description
// shows `tf.TPUGetLayoutOp` / `tf.TPUCopyWithLayout` being inserted between a
// host input and `tf.TPUExecute`.
def TPUDynamicLayoutPass
    : Pass<"tf-tpu-dynamic-layout-pass", "ModuleOp"> {
let summary = "Inserts TPU layout ops to determine layout at run time.";
let description = [{
A pass that allows TPU input layout to be determined after JIT compilation.
This is done by adding run-time ops that interpret compilation result and
copy the input to device with that layout.
Example: original program:
```mlir
%input = "tf.IteratorGetNext"(...) {device = "/CPU:0"}
%compile:2 = "tf._TPUCompileMlir"(...)
%execute = "tf.TPUExecute"(%input, ..., %compile#1) {device = "/TPU:0"}
```
Without this pass, later TF graph partitioning passes will insert send/recv
between %input and %execute and data will be copied to device in a fixed
layout. With this pass, the program will be transformed into:
```mlir
%input = "tf.IteratorGetNext"(...) {device = "/CPU:0"}
%compile:2 = "tf._TPUCompileMlir"(...)
%get_layout = "tf.TPUGetLayoutOp"(%compile#1) {...}
%copy_to_device = "tf.TPUCopyWithLayout"(%input, %get_layout)
{device = "/TPU:0"}
%execute = "tf.TPUExecute"(%copy_to_device, ..., %compile#1)
{device = "/TPU:0"}
```
This way, %compile will determine the layout, which will be respected by
%copy_to_device. There will not be send/recv ops added by later passes,
because tf.TPUCopyWithLayout accepts a host input and produces a device
output.
}];
let constructor = "TFTPU::CreateTPUDynamicLayoutPass()";
}
// Pass `tf-device-index-selector`, declared on mlir::func::FuncOp.
def DeviceIndexSelectorPass
    : Pass<"tf-device-index-selector", "mlir::func::FuncOp"> {
let constructor = "TF::CreateDeviceIndexSelectorPass()";
let summary = "Fold tf.DeviceIndex to constant.";
}