tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc - platform/external/tensorflow - Git at Google

 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

 #include <iterator>
 #include <memory>
 #include <string>
 #include <tuple>
 #include <utility>

 #include "absl/strings/str_cat.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/Attributes.h"  // from @llvm-project
 #include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/Location.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "mlir/IR/Operation.h"  // from @llvm-project
 #include "mlir/IR/TypeUtilities.h"  // from @llvm-project
 #include "mlir/IR/Types.h"  // from @llvm-project
 #include "mlir/IR/Value.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 #include "mlir/Pass/PassRegistry.h"  // from @llvm-project
 #include "mlir/Transforms/RegionUtils.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes_detail.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/platform/random.h"
 #include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h"

 namespace mlir {
 namespace TFTPU {

 namespace {

 // Attribute name under which `CreateStateVars` records the device of each
 // state `VarHandleOp` it creates.
 constexpr char kDeviceAttr[] = "device";
 // NOTE(review): not referenced in the visible part of this file; presumably
 // the function-level device attribute name -- confirm before removing.
 constexpr char kFuncDeviceAttr[] = "tf.device";
 // Empty string written into every element of the default key tensor that
 // `HandleReplicateOp` passes to the unformat `TPUReshardVariablesOp`.
 constexpr char kDefaultShardingValue[] = "";
 // Name of the `ArrayAttr` on the replicate op whose integer entries are the
 // block-argument indices of the mirrored variables.
 constexpr char kMirroredVariableIndicesAttr[] = "_mirrored_variable_indices";

 // Produces a shared name for a formatting-state variable: the fixed prefix
 // "VariablesFormatState_" followed by a freshly drawn random 64-bit number.
 std::string GetRandomStateVariableName() {
   const auto random_suffix = tensorflow::random::New64();
   return absl::StrCat("VariablesFormatState_", random_suffix);
 }

 // Pass declaration. The transformation itself is implemented in
 // `runOnOperation`, which is defined further down in this file.
 class TPUVariableRuntimeReformattingPass
     : public TF::TPUVariableRuntimeReformattingPassBase<
           TPUVariableRuntimeReformattingPass> {
  public:
   void runOnOperation() final;
 };

 // Walks backwards through a chain of `TF::IdentityOp` / `TF::IdentityNOp`
 // results starting at `v` and returns the first value that is not produced by
 // such an op. Unless `allow_other_use` is set, the walk also stops at any
 // value that does not have exactly one use. Every identity op stepped over is
 // inserted into `skipped` when that pointer is non-null.
 Value SkipIdentity(Value v, bool allow_other_use,
                    llvm::SmallPtrSet<Operation*, 4>* skipped = nullptr) {
   for (;;) {
     auto op_result = v.dyn_cast<OpResult>();
     // Block arguments (and null values) end the chain.
     if (!op_result) return v;
     if (!allow_other_use && !v.hasOneUse()) return v;

     Operation* producer = op_result.getDefiningOp();
     if (!llvm::isa<TF::IdentityOp, TF::IdentityNOp>(producer)) return v;

     // Identity ops forward operand i to result i.
     v = producer->getOperand(op_result.getResultNumber());
     if (skipped != nullptr) skipped->insert(producer);
   }
 }

 // Finds the formattable arguments of `execute` and annotates the metadata of
 // `compile` to record these arguments. In addition, it returns a mapping from
 // the formattable arguments of `execute` to the corresponding operands of
 // `replicate`. The entries in the mapping are sorted in the order of operands
 // of `execute`.
 //
 // Each mapping entry is (execute operand index, per-replica outer values); the
 // value list has `replicate.n()` entries for a replicated block argument and
 // one entry for a packed block argument.
 //
 // The mapping is empty, and the compile op is left untouched, when `replicate`
 // has no `_mirrored_variable_indices` attribute, when no operand of `execute`
 // is a single-use resource block argument of `replicate`, or when the compile
 // metadata cannot be parsed. Otherwise the "metadata" attribute of the compile
 // op is rewritten from the updated proto.
 llvm::SmallVector<std::pair<int64_t, llvm::SmallVector<Value, 4>>, 4>
 AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping(
     TF::WhileRegionOp while_op, tf_device::ReplicateOp replicate,
     TF::TPUExecuteAndUpdateVariablesOp execute,
     tf_device::LaunchOp compile_launch) {
   Region& body = while_op.body();
   Region& cond = while_op.cond();

   llvm::SmallVector<std::pair<int64_t, llvm::SmallVector<Value, 4>>, 4> mapping;
   auto mirrored_variable_indices_attr =
       replicate->getAttrOfType<ArrayAttr>(kMirroredVariableIndicesAttr);
   if (!mirrored_variable_indices_attr) return mapping;

   // Finds the mapping from a replicate argument to an execute operand.
   llvm::SmallDenseMap<int64_t, int64_t, 8> replicate_arg_to_execute_arg;
   for (auto index_and_arg : llvm::enumerate(execute.args())) {
     auto arg = SkipIdentity(index_and_arg.value(), /*allow_other_use=*/false);
     if (!arg.hasOneUse() ||
         !getElementTypeOrSelf(arg.getType()).isa<TF::ResourceType>()) {
       continue;
     }
     auto block_arg = arg.dyn_cast<BlockArgument>();
     if (!block_arg || block_arg.getOwner() != &replicate.GetBody()) continue;
     assert(replicate_arg_to_execute_arg.count(block_arg.getArgNumber()) == 0 &&
            "Found duplicate use of a resource in the execute op.");
     replicate_arg_to_execute_arg[block_arg.getArgNumber()] =
         index_and_arg.index();
   }
   if (replicate_arg_to_execute_arg.empty()) return mapping;

   // Parse the original compile metadata.
   Operation& compile = compile_launch.GetBody().front();
   auto metadata_str = compile.getAttrOfType<StringAttr>("metadata");
   assert(metadata_str && "Missing compilation metadata");
   tensorflow::tpu::TPUCompileMetadataProto metadata;
   // A metadata string that fails to parse cannot be trusted to contain the
   // `args` entries indexed below, so give up without touching the compile op.
   if (!metadata.ParseFromString(std::string(metadata_str.getValue())))
     return mapping;
   int64_t num_replicas = replicate.n();
   // Find the formattable operands of `execute`, which must be mirrored
   // variables (arguments of `replicate`), and must be pass-throughs from while
   // operands.
   for (const auto& mirrored_index : mirrored_variable_indices_attr) {
     int64_t replicate_arg = mirrored_index.cast<IntegerAttr>().getInt();
     // Check if the mirrored variable is an input to `execute`.
     auto it = replicate_arg_to_execute_arg.find(replicate_arg);
     if (it == replicate_arg_to_execute_arg.end()) continue;
     // Get the data type of the resource.
     auto subtypes = getElementTypeOrSelf(execute.getOperand(it->second))
                         .cast<TF::ResourceType>()
                         .getSubtypes();
     if (subtypes.size() != 1) continue;
     auto data_type = getElementTypeOrSelf(subtypes[0]);
     // `getIntOrFloatBitWidth` is only defined for integer and float types, so
     // variables of any other element type are left unformatted. In addition,
     // the XLA backend does not yet support formatting 64-bit data types.
     if (!data_type.isIntOrFloat() ||
         data_type.getIntOrFloatBitWidth() == 64)
       continue;

     const auto& block_arg = replicate.GetBody().getArgument(replicate_arg);

     // A replicated block argument has one operand per replica; a packed one
     // has a single operand shared by all replicas.
     int64_t num_inputs = 0;
     if (replicate.IsReplicatedBlockArgument(block_arg)) {
       num_inputs = num_replicas;
     } else {
       num_inputs = 1;
     }

     // We have found a mirrored variable which is an input to the replicated
     // `execute`. Now find if this mirrored variable is a pass-through of while
     // arguments.
     llvm::SmallVector<Value, 4> replicate_args;
     for (int64_t i = 0; i < num_inputs; ++i) {
       llvm::SmallPtrSet<Operation*, 4> skipped_identities;

       auto replicate_operand = SkipIdentity(
           replicate.GetReplicaOperandForBlockArgument(block_arg, i),
           /*allow_other_use=*/false, &skipped_identities);
       // For region based control flow, the resource operand for the replicate
       // should be a region capture. If this has any use other than the
       // replicate op (within the body of the while) or the skipped identities,
       // then do not apply the transformation to this variable.
       bool is_region_capture =
           replicate_operand.getParentRegion()->isProperAncestor(&body);
       bool has_other_use_in_body =
           llvm::any_of(replicate_operand.getUsers(), [&](Operation* user) {
             // Ignore uses that are not in the while body or condition.
             if (!body.isAncestor(user->getParentRegion()) &&
                 !cond.isAncestor(user->getParentRegion()))
               return false;
             // Within the body or cond, only uses in replicate and the skipped
             // identities are allowed.
             return user != replicate && skipped_identities.count(user) == 0;
           });

       // One disqualified replica disqualifies the whole variable.
       if (!is_region_capture || has_other_use_in_body) {
         replicate_args.clear();
         break;
       }
       replicate_args.push_back(replicate_operand);
     }
     if (replicate_args.empty()) continue;
     // Now set the enable_xla_sharding field in the metadata to inform the
     // compile op.
     auto metadata_arg = metadata.mutable_args(it->second);
     metadata_arg->set_enable_xla_sharding(
         ::tensorflow::tpu::TPUCompileMetadataProto_Arg::ALLOWED);
     mapping.emplace_back(it->second, std::move(replicate_args));
   }
   // Sort the mapping according to execute operand order.
   llvm::sort(mapping, llvm::less_first());
   // Populate the `retval_index_for_sharding` field of the argument metadata.
   // Entry k of `device_var_reads_indices` is paired with entry k of
   // `device_var_updates_indices`.
   for (auto entry : llvm::enumerate(execute.device_var_reads_indices())) {
     int64_t arg_index = entry.value().cast<IntegerAttr>().getInt();
     auto arg_metadata = metadata.mutable_args(arg_index);
     if (arg_metadata->enable_xla_sharding() ==
         ::tensorflow::tpu::TPUCompileMetadataProto_Arg::ALLOWED) {
       int64_t ret_index = execute.device_var_updates_indices()
                               .getValue()[entry.index()]
                               .cast<IntegerAttr>()
                               .getInt();
       arg_metadata->set_retval_index_for_sharding(ret_index);
     }
   }
   // Update the metadata of the compile op.
   compile.setAttr("metadata", StringAttr::get(compile.getContext(),
                                               metadata.SerializeAsString()));
   return mapping;
 }

 // Adds a new replicated input to the replicate op.
 //
 // Builds a replacement `tf_device.replicate` right before `replicate` that has
 // all of its existing replicated and packed inputs plus one extra replicated
 // input made of the resources of `new_inputs` (one per replica). The body ops
 // and all uses of the old op are transferred to the new op, the old op is
 // erased, and the new op is returned. `devices` is forwarded unchanged to the
 // new op's builder.
 tf_device::ReplicateOp AddInputsToReplicateOp(
     tf_device::ReplicateOp replicate,
     MutableArrayRef<TF::VarHandleOp> new_inputs,
     const llvm::SmallDenseMap<llvm::StringRef, llvm::SmallVector<StringRef, 4>>&
         devices) {
   int64_t num_replicas = replicate.n();
   assert(new_inputs.size() == num_replicas);

   // As model parallelism is not yet supported, we assume that all ops are
   // placed in logical core 0.
   // TODO(b/148913020): Remove this constraint once model parallelism is
   // supported.
   // NOTE(review): the iterator returned by `find` is dereferenced without an
   // `end()` check, so `devices` must contain the logical core 0 alias.
   assert(devices.find(tensorflow::GetDeviceAliasForLogicalCore(0))
              ->getSecond()
              .size() == num_replicas);

   llvm::SmallVector<std::pair<ValueRange, Type>, 8> new_replicated_inputs;
   llvm::SmallVector<Value, 8> new_packed_inputs;
   // Backing storage for the `ValueRange`s placed in `new_replicated_inputs`.
   // NOTE(review): the up-front reserve presumably keeps the inner vectors from
   // being relocated while those ranges are alive -- keep it before the loop.
   llvm::SmallVector<llvm::SmallVector<Value, 8>, 8> replicated_inputs;
   replicated_inputs.reserve(replicate.GetNumReplicatedBlockArguments());
   new_packed_inputs.reserve(replicate.GetNumPackedBlockArguments());
   // Collect the per-replica operands of every existing replicated argument.
   for (const auto& arg : replicate.GetReplicatedBlockArguments()) {
     replicated_inputs.emplace_back();
     for (int64_t i = 0; i < num_replicas; ++i) {
       replicated_inputs.back().push_back(
           replicate.GetReplicaOperandForBlockArgument(arg, i));
     }
     new_replicated_inputs.emplace_back(replicated_inputs.back(), arg.getType());
   }
   // Packed arguments are looked up through replica 0 only.
   for (const auto& arg : replicate.GetPackedBlockArguments()) {
     new_packed_inputs.emplace_back(
         replicate.GetReplicaOperandForBlockArgument(arg, /*replica=*/0));
   }
   // Append the new variables as the last replicated input.
   SmallVector<Value, 4> new_input_values;
   new_input_values.reserve(new_inputs.size());
   for (auto var : new_inputs) new_input_values.push_back(var.resource());
   new_replicated_inputs.emplace_back(new_input_values,
                                      new_input_values.front().getType());
   // The replacement op is created immediately before the original one.
   OpBuilder builder(replicate);
   auto new_replicate = builder.create<tf_device::ReplicateOp>(
       replicate.getLoc(), num_replicas, devices, new_replicated_inputs,
       new_packed_inputs,
       replicate.GetBody().getTerminator()->getOperandTypes());
   // Redirect uses of the old block arguments to the new block's arguments.
   for (auto arg : replicate.GetBody().getArguments()) {
     if (replicate.IsReplicatedBlockArgument(arg)) {
       arg.replaceAllUsesWith(
           new_replicate.GetBody().getArgument(arg.getArgNumber()));
     } else {
       // There is a new added replicated state variable between replicated args
       // and packed args.
       arg.replaceAllUsesWith(
           new_replicate.GetBody().getArgument(arg.getArgNumber() + 1));
     }
   }
   // Move every op of the old body (early-inc range, because each op is
   // unlinked from the block being iterated) to the end of the new body.
   for (auto& op : llvm::make_early_inc_range(replicate.GetBody())) {
     op.moveBefore(&new_replicate.GetBody(), new_replicate.GetBody().end());
   }
   replicate.replaceAllUsesWith(new_replicate);
   replicate.erase();
   return new_replicate;
 }

 // Creates one `TF::VarHandleOp` per device listed under the logical core 0
 // alias of `devices`; each handle represents the formatting state of that
 // device. Every op is built at `builder`'s current insertion point with
 // location `loc`, a scalar resource result type whose subtype is `key_type`,
 // an empty "container", a random "shared_name" and the device name in its
 // "device" attribute. The handles are returned in device-list order.
 llvm::SmallVector<TF::VarHandleOp, 4> CreateStateVars(
     const llvm::SmallDenseMap<llvm::StringRef, llvm::SmallVector<StringRef, 4>>&
         devices,
     Location loc, RankedTensorType key_type, OpBuilder* builder) {
   // TODO(b/148913020): Remove this constraint once model parallelism is
   // supported.
   const llvm::SmallVector<StringRef, 4>& core0_devices =
       devices.find(tensorflow::GetDeviceAliasForLogicalCore(0))->getSecond();

   // The result type is the same for every device, so build it once.
   const Type handle_type = RankedTensorType::get(
       {}, TF::ResourceType::get(llvm::ArrayRef<TensorType>{key_type},
                                 builder->getContext()));

   llvm::SmallVector<TF::VarHandleOp, 4> handles;
   for (llvm::StringRef device_name : core0_devices) {
     const std::string shared_name = GetRandomStateVariableName();
     auto handle = builder->create<TF::VarHandleOp>(
         loc, llvm::ArrayRef<Type>{handle_type}, llvm::ArrayRef<Value>{},
         llvm::ArrayRef<NamedAttribute>{
             builder->getNamedAttr(kDeviceAttr,
                                   builder->getStringAttr(device_name)),
             builder->getNamedAttr("container", builder->getStringAttr("")),
             builder->getNamedAttr("shared_name",
                                   builder->getStringAttr(shared_name))});
     handles.push_back(handle);
   }
   return handles;
 }

 // Wraps single op in `tf_device.launch` for explicit device assignment.
 //
 // A launch op for `device` is created at `builder`'s current insertion point
 // with the result types of `op`; its body returns the results of `op`, and
 // `op` is moved in front of that return. The builder's insertion point is the
 // same on exit as on entry.
 void WrapOpInLaunch(OpBuilder* builder, Location loc, Operation* op,
                     llvm::StringRef device) {
   // Puts the caller's insertion point back when this scope ends.
   OpBuilder::InsertionGuard restore_insertion_point(*builder);

   auto device_attr = builder->getStringAttr(device);
   auto launch = builder->create<tf_device::LaunchOp>(loc, device_attr,
                                                      op->getResultTypes());
   launch.body().push_back(new Block);

   Block& launch_body = launch.GetBody();
   builder->setInsertionPointToEnd(&launch_body);
   builder->create<tf_device::ReturnOp>(loc, op->getResults());

   // Relocate the wrapped op so that it precedes the return.
   op->moveBefore(launch_body.getTerminator());
 }

 // Performs the transformation for a replicate op inside a while loop.
 //
 // Nothing is changed unless `replicate` has more than one replica, contains
 // exactly one launch wrapping a `TPUExecuteAndUpdateVariablesOp` whose key
 // comes (through identities) from a launch wrapping a `_TPUCompileMlirOp`, has
 // a `devices` attribute with one logical core 0 device per replica, and has at
 // least one formattable variable. When all of that holds, this function:
 //   1. creates one formatting-state variable per device before `while_op` and
 //      adds them to `replicate` as a new replicated input;
 //   2. inserts a launched `TPUReshardVariablesOp` right before the execute
 //      launch, fed with the formattable execute operands, the second result
 //      of the compile launch and the state variable;
 //   3. builds, after `while_op`, a second replicate op containing a launched
 //      `TPUReshardVariablesOp` that receives a default (all-empty) key.
 //
 // The device checks deliberately run before the compile op is annotated, so
 // the compile metadata is never marked as sharding-enabled on a path that then
 // returns without inserting the matching reshard ops.
 void HandleReplicateOp(TF::WhileRegionOp while_op,
                        tf_device::ReplicateOp replicate) {
   int64_t num_replicas = replicate.n();
   if (num_replicas == 1) return;

   // Set execute_launch when there is exactly one LaunchOp with a
   // TPUExecuteAndUpdateVariablesOp. More than one means there is model
   // parallelism, which is not supported with TPUReshardVariables. None
   // means there is no TPU computation.
   tf_device::LaunchOp execute_launch;
   replicate.walk([&](tf_device::LaunchOp execute_launch_op) {
     if (!execute_launch_op.WrapsSingleOp() ||
         !llvm::isa<TF::TPUExecuteAndUpdateVariablesOp>(
             execute_launch_op.GetBody().front()))
       return WalkResult::advance();
     if (execute_launch == nullptr) {
       execute_launch = execute_launch_op;
       return WalkResult::advance();
     }
     execute_launch = nullptr;
     return WalkResult::interrupt();
   });
   if (!execute_launch) return;

   auto execute = llvm::cast<TF::TPUExecuteAndUpdateVariablesOp>(
       execute_launch.GetBody().front());
   auto compile =
       SkipIdentity(execute.key(), /*allow_other_use=*/true).getDefiningOp();
   if (!compile) return;
   auto compile_launch = llvm::dyn_cast<tf_device::LaunchOp>(compile);
   if (!compile_launch || !compile_launch.WrapsSingleOp() ||
       !llvm::isa<TF::_TPUCompileMlirOp>(compile_launch.GetBody().front()))
     return;

   // Extract the replicated devices. This has no side effects, so it is done
   // before the compile op is annotated below.
   auto devices_attr = replicate.devices();
   if (!devices_attr) return;

   auto device_map = devices_attr.getValue();
   llvm::SmallDenseMap<llvm::StringRef, llvm::SmallVector<StringRef, 4>> devices;
   devices.reserve(device_map.size());

   for (auto it : device_map) {
     auto device_alias = it.getName().strref();
     auto device_list = it.getValue().cast<ArrayAttr>();
     llvm::SmallVector<StringRef, 4> device_list_for_alias;
     device_list_for_alias.reserve(device_list.size());

     for (auto device : device_list)
       device_list_for_alias.emplace_back(device.cast<StringAttr>().getValue());

     devices.insert({device_alias, device_list_for_alias});
   }

   // `CreateStateVars` and `AddInputsToReplicateOp` both look up the logical
   // core 0 alias and expect one device per replica. Bail out here, before any
   // IR is modified, if that does not hold.
   const auto core0_alias = tensorflow::GetDeviceAliasForLogicalCore(0);
   auto core0_devices = devices.find(core0_alias);
   if (core0_devices == devices.end() ||
       static_cast<int64_t>(core0_devices->getSecond().size()) != num_replicas)
     return;

   // Analyze the formattable inputs. This rewrites the compile metadata.
   auto execute_arg_to_outer_args =
       AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping(
           while_op, replicate, execute, compile_launch);
   if (execute_arg_to_outer_args.empty()) return;

   OpBuilder builder(replicate);
   builder.setInsertionPoint(while_op);
   // Create per-device variables for formatting state, and add them to the while
   // loop.
   auto key_type =
       RankedTensorType::get({2}, TF::StringType::get(builder.getContext()));
   auto state_vars =
       CreateStateVars(devices, while_op.getLoc(), key_type, &builder);
   // The old replicate op is erased by this call; use the returned op from
   // here on.
   replicate = AddInputsToReplicateOp(replicate, state_vars, devices);
   // Build the reformat according to the compilation. Build it inside
   // `replicate`.
   llvm::SmallVector<Value, 8> reformat_operands;
   for (const auto& entry : execute_arg_to_outer_args) {
     reformat_operands.push_back(execute.args()[entry.first]);
   }
   reformat_operands.push_back(compile_launch.getResult(1));
   // The state variable was appended as the last replicated block argument.
   reformat_operands.push_back(replicate.GetBody().getArgument(
       replicate.GetNumReplicatedBlockArguments() - 1));
   builder.setInsertionPoint(execute_launch);
   auto reformat_op = builder.create<TF::TPUReshardVariablesOp>(
       execute_launch.getLoc(), llvm::ArrayRef<Type>{}, reformat_operands);
   WrapOpInLaunch(&builder, execute_launch.getLoc(), reformat_op,
                  execute_launch.device());

   // Build the replicated unformat op after the loop. First prepare building the
   // replicate op.
   llvm::SmallVector<std::pair<ValueRange, Type>, 8> unformat_replicate_operands;
   llvm::SmallVector<Value, 8> unformat_packed_operands;
   for (const auto& entry : execute_arg_to_outer_args) {
     // More than one outer value means one value per replica (replicated
     // input); a single value is shared by all replicas (packed input).
     if (entry.second.size() > 1) {
       unformat_replicate_operands.emplace_back(entry.second,
                                                entry.second.front().getType());
     } else {
       unformat_packed_operands.emplace_back(entry.second.front());
     }
   }
   llvm::SmallVector<Value, 4> state_var_vals(state_vars.size());
   for (const auto& entry : llvm::enumerate(state_vars)) {
     state_var_vals[entry.index()] = entry.value().resource();
   }
   // Add the replicated state var to the end of the replicate operands.
   unformat_replicate_operands.emplace_back(state_var_vals,
                                            state_var_vals.front().getType());
   // Build a constant default key to specify that the unformatting should
   // transform the variables to the original format.
   builder.setInsertionPointAfter(while_op);
   tensorflow::Tensor default_key_tensor(tensorflow::DT_STRING, {3});
   default_key_tensor.vec<tensorflow::tstring>()(0) = kDefaultShardingValue;
   default_key_tensor.vec<tensorflow::tstring>()(1) = kDefaultShardingValue;
   default_key_tensor.vec<tensorflow::tstring>()(2) = kDefaultShardingValue;
   auto default_state_key = builder.create<TF::ConstOp>(
       while_op.getLoc(),
       tensorflow::ConvertTensor(default_key_tensor, &builder).ValueOrDie());
   // With all replicated inputs, now build the replicate op.
   auto unformat_replicate = builder.create<tf_device::ReplicateOp>(
       while_op.getLoc(), num_replicas, devices, unformat_replicate_operands,
       unformat_packed_operands, TypeRange{});
   // Then build the unformat op in the replicate op.
   builder.setInsertionPointToEnd(&unformat_replicate.GetBody());
   llvm::SmallVector<Value, 8> unformat_operands;
   // Add the replicated state var (the last replicated operand of the
   // ReplicateOp) as the last operand of TPUReshardVariablesOp.
   BlockArgument state = unformat_replicate.GetReplicatedBlockArguments().back();
   auto replicated_block_args =
       unformat_replicate.GetReplicatedBlockArguments().drop_back(1);
   auto packed_block_args = unformat_replicate.GetPackedBlockArguments();
   unformat_operands.append(replicated_block_args.begin(),
                            replicated_block_args.end());
   unformat_operands.append(packed_block_args.begin(), packed_block_args.end());
   unformat_operands.push_back(state);

   // Insert the default key as the second last operand.
   unformat_operands.insert(
       unformat_operands.begin() + unformat_operands.size() - 1,
       default_state_key.getResult());
   // Unformat op.
   auto unformat_op = builder.create<TF::TPUReshardVariablesOp>(
       while_op.getLoc(), llvm::ArrayRef<Type>{}, unformat_operands);
   WrapOpInLaunch(&builder, execute_launch.getLoc(), unformat_op,
                  execute_launch.device());
   // Terminate the body of the unformat replicate op.
   builder.create<tf_device::ReturnOp>(while_op.getLoc(), ArrayRef<Value>{});
 }

 void TPUVariableRuntimeReformattingPass::runOnOperation() {
   auto module = getOperation();
   module.walk([&](TF::WhileRegionOp while_op) {
     tf_device::ReplicateOp replicate;
     while_op.body().walk([&](tf_device::ReplicateOp replicate_op) {
       if (replicate == nullptr) {
         replicate = replicate_op;
         return WalkResult::advance();
       }
       // We do not handle loops with multiple replicate ops.
       replicate = nullptr;
       return WalkResult::interrupt();
     });
     if (replicate) HandleReplicateOp(while_op, replicate);
   });
 }

 }  // namespace

 // Factory: returns a newly allocated `TPUVariableRuntimeReformattingPass`,
 // owned by the caller.
 std::unique_ptr<OperationPass<ModuleOp>>
 CreateTPUVariableRuntimeReformattingPass() {
   auto pass = std::make_unique<TPUVariableRuntimeReformattingPass>();
   return pass;
 }

 }  // namespace TFTPU
 }  // namespace mlir
	/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==============================================================================*/

	#include <iterator>
	#include <memory>
	#include <string>
	#include <tuple>
	#include <utility>

	#include "absl/strings/str_cat.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/Support/Casting.h"
	#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project
	#include "mlir/IR/Attributes.h" // from @llvm-project
	#include "mlir/IR/Builders.h" // from @llvm-project
	#include "mlir/IR/BuiltinOps.h" // from @llvm-project
	#include "mlir/IR/Location.h" // from @llvm-project
	#include "mlir/IR/MLIRContext.h" // from @llvm-project
	#include "mlir/IR/Operation.h" // from @llvm-project
	#include "mlir/IR/TypeUtilities.h" // from @llvm-project
	#include "mlir/IR/Types.h" // from @llvm-project
	#include "mlir/IR/Value.h" // from @llvm-project
	#include "mlir/Pass/Pass.h" // from @llvm-project
	#include "mlir/Pass/PassRegistry.h" // from @llvm-project
	#include "mlir/Transforms/RegionUtils.h" // from @llvm-project
	#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
	#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
	#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
	#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h"
	#include "tensorflow/compiler/mlir/tensorflow/transforms/passes_detail.h"
	#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h"
	#include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h"
	#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h"
	#include "tensorflow/core/framework/tensor.h"
	#include "tensorflow/core/framework/tensor_shape.pb.h"
	#include "tensorflow/core/framework/types.pb.h"
	#include "tensorflow/core/platform/random.h"
	#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h"

	namespace mlir {
	namespace TFTPU {

	namespace {

	// Attribute name for an op's device assignment.
	// NOTE(review): presumably set on the state variable handles; its use is not
	// visible in this part of the file -- confirm.
	constexpr char kDeviceAttr[] = "device";
	// NOTE(review): not referenced in the visible part of this file; presumably
	// the function-level device attribute name -- confirm before removing.
	constexpr char kFuncDeviceAttr[] = "tf.device";
	// Empty string. NOTE(review): presumably the placeholder for a default
	// sharding key; its use is not visible in this part of the file -- confirm.
	constexpr char kDefaultShardingValue[] = "";
	// Name of the `ArrayAttr` on the replicate op whose integer entries are the
	// block-argument indices of the mirrored variables.
	constexpr char kMirroredVariableIndicesAttr[] = "_mirrored_variable_indices";

	// Produces a shared name for a formatting-state variable: the fixed prefix
	// "VariablesFormatState_" followed by a freshly drawn random 64-bit number.
	std::string GetRandomStateVariableName() {
	  const auto random_suffix = tensorflow::random::New64();
	  return absl::StrCat("VariablesFormatState_", random_suffix);
	}

	// Pass declaration; `runOnOperation` is only declared here, its definition
	// lives outside this class body.
	class TPUVariableRuntimeReformattingPass
	    : public TF::TPUVariableRuntimeReformattingPassBase<
	          TPUVariableRuntimeReformattingPass> {
	 public:
	  void runOnOperation() final;
	};

	// Returns the earlier value of which `v` is an identity, i.e. the first value
	// reached by walking backwards through `TF::IdentityOp` / `TF::IdentityNOp`
	// results. Unless `allow_other_use` is set, the walk stops at any value that
	// does not have exactly one use. If `skipped` is provided, it will be used to
	// store the identity nodes skipped.
	Value SkipIdentity(Value v, bool allow_other_use,
	                   llvm::SmallPtrSet<Operation*, 4>* skipped = nullptr) {
	  while (auto result = v.dyn_cast<OpResult>()) {
	    if (!(allow_other_use || v.hasOneUse())) break;
	    auto op = result.getDefiningOp();
	    if (!llvm::isa<TF::IdentityOp, TF::IdentityNOp>(op)) {
	      break;
	    }
	    // Identity ops forward operand i to result i.
	    v = op->getOperand(result.getResultNumber());
	    if (skipped) skipped->insert(op);
	  }
	  return v;
	}

	// Finds the formattable arguments of `execute` and annotates the metadata of
	// `compile` to record these arguments. In addition, it returns a mapping from
	// the formattable arguments of `execute` to the corresponding operands of
	// `replicate`. The entries in the mapping are sorted in the order of operands
	// of `execute`.
	//
	// Each mapping entry is (execute operand index, per-replica outer values); the
	// value list has `replicate.n()` entries for a replicated block argument and
	// one entry for a packed block argument.
	//
	// The mapping is empty, and the compile op is left untouched, when `replicate`
	// has no `_mirrored_variable_indices` attribute, when no operand of `execute`
	// is a single-use resource block argument of `replicate`, or when the compile
	// metadata cannot be parsed. Otherwise the "metadata" attribute of the compile
	// op is rewritten from the updated proto.
	llvm::SmallVector<std::pair<int64_t, llvm::SmallVector<Value, 4>>, 4>
	AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping(
	    TF::WhileRegionOp while_op, tf_device::ReplicateOp replicate,
	    TF::TPUExecuteAndUpdateVariablesOp execute,
	    tf_device::LaunchOp compile_launch) {
	  Region& body = while_op.body();
	  Region& cond = while_op.cond();

	  llvm::SmallVector<std::pair<int64_t, llvm::SmallVector<Value, 4>>, 4> mapping;
	  auto mirrored_variable_indices_attr =
	      replicate->getAttrOfType<ArrayAttr>(kMirroredVariableIndicesAttr);
	  if (!mirrored_variable_indices_attr) return mapping;

	  // Finds the mapping from a replicate argument to an execute operand.
	  llvm::SmallDenseMap<int64_t, int64_t, 8> replicate_arg_to_execute_arg;
	  for (auto index_and_arg : llvm::enumerate(execute.args())) {
	    auto arg = SkipIdentity(index_and_arg.value(), /*allow_other_use=*/false);
	    if (!arg.hasOneUse() ||
	        !getElementTypeOrSelf(arg.getType()).isa<TF::ResourceType>()) {
	      continue;
	    }
	    auto block_arg = arg.dyn_cast<BlockArgument>();
	    if (!block_arg || block_arg.getOwner() != &replicate.GetBody()) continue;
	    assert(replicate_arg_to_execute_arg.count(block_arg.getArgNumber()) == 0 &&
	           "Found duplicate use of a resource in the execute op.");
	    replicate_arg_to_execute_arg[block_arg.getArgNumber()] =
	        index_and_arg.index();
	  }
	  if (replicate_arg_to_execute_arg.empty()) return mapping;

	  // Parse the original compile metadata.
	  Operation& compile = compile_launch.GetBody().front();
	  auto metadata_str = compile.getAttrOfType<StringAttr>("metadata");
	  assert(metadata_str && "Missing compilation metadata");
	  tensorflow::tpu::TPUCompileMetadataProto metadata;
	  // A metadata string that fails to parse cannot be trusted to contain the
	  // `args` entries indexed below, so give up without touching the compile op.
	  if (!metadata.ParseFromString(std::string(metadata_str.getValue())))
	    return mapping;
	  int64_t num_replicas = replicate.n();
	  // Find the formattable operands of `execute`, which must be mirrored
	  // variables (arguments of `replicate`), and must be pass-throughs from while
	  // operands.
	  for (const auto& mirrored_index : mirrored_variable_indices_attr) {
	    int64_t replicate_arg = mirrored_index.cast<IntegerAttr>().getInt();
	    // Check if the mirrored variable is an input to `execute`.
	    auto it = replicate_arg_to_execute_arg.find(replicate_arg);
	    if (it == replicate_arg_to_execute_arg.end()) continue;
	    // Get the data type of the resource.
	    auto subtypes = getElementTypeOrSelf(execute.getOperand(it->second))
	                        .cast<TF::ResourceType>()
	                        .getSubtypes();
	    if (subtypes.size() != 1) continue;
	    auto data_type = getElementTypeOrSelf(subtypes[0]);
	    // `getIntOrFloatBitWidth` is only defined for integer and float types, so
	    // variables of any other element type are left unformatted. In addition,
	    // the XLA backend does not yet support formatting 64-bit data types.
	    if (!data_type.isIntOrFloat() ||
	        data_type.getIntOrFloatBitWidth() == 64)
	      continue;

	    const auto& block_arg = replicate.GetBody().getArgument(replicate_arg);

	    // A replicated block argument has one operand per replica; a packed one
	    // has a single operand shared by all replicas.
	    int64_t num_inputs = 0;
	    if (replicate.IsReplicatedBlockArgument(block_arg)) {
	      num_inputs = num_replicas;
	    } else {
	      num_inputs = 1;
	    }

	    // We have found a mirrored variable which is an input to the replicated
	    // `execute`. Now find if this mirrored variable is a pass-through of while
	    // arguments.
	    llvm::SmallVector<Value, 4> replicate_args;
	    for (int64_t i = 0; i < num_inputs; ++i) {
	      llvm::SmallPtrSet<Operation*, 4> skipped_identities;

	      auto replicate_operand = SkipIdentity(
	          replicate.GetReplicaOperandForBlockArgument(block_arg, i),
	          /*allow_other_use=*/false, &skipped_identities);
	      // For region based control flow, the resource operand for the replicate
	      // should be a region capture. If this has any use other than the
	      // replicate op (within the body of the while) or the skipped identities,
	      // then do not apply the transformation to this variable.
	      bool is_region_capture =
	          replicate_operand.getParentRegion()->isProperAncestor(&body);
	      bool has_other_use_in_body =
	          llvm::any_of(replicate_operand.getUsers(), [&](Operation* user) {
	            // Ignore uses that are not in the while body or condition.
	            if (!body.isAncestor(user->getParentRegion()) &&
	                !cond.isAncestor(user->getParentRegion()))
	              return false;
	            // Within the body or cond, only uses in replicate and the skipped
	            // identities are allowed.
	            return user != replicate && skipped_identities.count(user) == 0;
	          });

	      // One disqualified replica disqualifies the whole variable.
	      if (!is_region_capture || has_other_use_in_body) {
	        replicate_args.clear();
	        break;
	      }
	      replicate_args.push_back(replicate_operand);
	    }
	    if (replicate_args.empty()) continue;
	    // Now set the enable_xla_sharding field in the metadata to inform the
	    // compile op.
	    auto metadata_arg = metadata.mutable_args(it->second);
	    metadata_arg->set_enable_xla_sharding(
	        ::tensorflow::tpu::TPUCompileMetadataProto_Arg::ALLOWED);
	    mapping.emplace_back(it->second, std::move(replicate_args));
	  }
	  // Sort the mapping according to execute operand order.
	  llvm::sort(mapping, llvm::less_first());
	  // Populate the `retval_index_for_sharding` field of the argument metadata.
	  // Entry k of `device_var_reads_indices` is paired with entry k of
	  // `device_var_updates_indices`.
	  for (auto entry : llvm::enumerate(execute.device_var_reads_indices())) {
	    int64_t arg_index = entry.value().cast<IntegerAttr>().getInt();
	    auto arg_metadata = metadata.mutable_args(arg_index);
	    if (arg_metadata->enable_xla_sharding() ==
	        ::tensorflow::tpu::TPUCompileMetadataProto_Arg::ALLOWED) {
	      int64_t ret_index = execute.device_var_updates_indices()
	                              .getValue()[entry.index()]
	                              .cast<IntegerAttr>()
	                              .getInt();
	      arg_metadata->set_retval_index_for_sharding(ret_index);
	    }
	  }
	  // Update the metadata of the compile op.
	  compile.setAttr("metadata", StringAttr::get(compile.getContext(),
	                                              metadata.SerializeAsString()));
	  return mapping;
	}

	// Adds a new replicated input to the replicate op.
	//
	// A replacement `tf_device.replicate` is created right before `replicate`. It
	// carries all existing replicated and packed inputs of `replicate`, plus one
	// extra replicated input built from the `resource()` results of `new_inputs`
	// (one per replica). The extra input is placed after the existing replicated
	// inputs and before the packed inputs. All uses of the old block arguments and
	// results are redirected to the new op, the body ops are moved over, and the
	// old op is erased.
	//
	// Args:
	//   replicate:  the op to replace; it is erased before returning.
	//   new_inputs: one variable handle per replica (asserted).
	//   devices:    device alias -> device list, forwarded to the new op. The list
	//               for logical core 0 is asserted to have one entry per replica.
	//
	// Returns the newly created replicate op.
	tf_device::ReplicateOp AddInputsToReplicateOp(
	    tf_device::ReplicateOp replicate,
	    MutableArrayRef<TF::VarHandleOp> new_inputs,
	    const llvm::SmallDenseMap<llvm::StringRef, llvm::SmallVector<StringRef, 4>>&
	        devices) {
	  int64_t num_replicas = replicate.n();
	  assert(new_inputs.size() == num_replicas);

	  // As model parallelism is not yet supported, we assume that all ops are
	  // placed in logical core 0.
	  // TODO(b/148913020): Remove this constraint once model parallelism is
	  // supported.
	  assert(devices.find(tensorflow::GetDeviceAliasForLogicalCore(0))
	             ->getSecond()
	             .size() == num_replicas);

	  llvm::SmallVector<std::pair<ValueRange, Type>, 8> new_replicated_inputs;
	  llvm::SmallVector<Value, 8> new_packed_inputs;
	  llvm::SmallVector<llvm::SmallVector<Value, 8>, 8> replicated_inputs;
	  // The `ValueRange`s stored in `new_replicated_inputs` are non-owning views
	  // into the inner vectors of `replicated_inputs`. Reserving up front keeps
	  // the outer vector from reallocating (and relocating the inner vectors)
	  // while those views are being collected.
	  replicated_inputs.reserve(replicate.GetNumReplicatedBlockArguments());
	  new_packed_inputs.reserve(replicate.GetNumPackedBlockArguments());

	  // Gather the per-replica operands of every existing replicated argument.
	  for (const auto& arg : replicate.GetReplicatedBlockArguments()) {
	    replicated_inputs.emplace_back();
	    for (int64_t i = 0; i < num_replicas; ++i) {
	      replicated_inputs.back().push_back(
	          replicate.GetReplicaOperandForBlockArgument(arg, i));
	    }
	    new_replicated_inputs.emplace_back(replicated_inputs.back(), arg.getType());
	  }

	  // Packed arguments have a single operand; query it through replica 0.
	  for (const auto& arg : replicate.GetPackedBlockArguments()) {
	    new_packed_inputs.emplace_back(
	        replicate.GetReplicaOperandForBlockArgument(arg, /*replica=*/0));
	  }

	  // Append the new replicated input as the last replicated operand group.
	  SmallVector<Value, 4> new_input_values;
	  new_input_values.reserve(new_inputs.size());
	  for (auto var : new_inputs) new_input_values.push_back(var.resource());
	  new_replicated_inputs.emplace_back(new_input_values,
	                                     new_input_values.front().getType());

	  OpBuilder builder(replicate);
	  auto new_replicate = builder.create<tf_device::ReplicateOp>(
	      replicate.getLoc(), num_replicas, devices, new_replicated_inputs,
	      new_packed_inputs,
	      replicate.GetBody().getTerminator()->getOperandTypes());

	  // Redirect the old block arguments to their counterparts in the new body.
	  for (auto arg : replicate.GetBody().getArguments()) {
	    if (replicate.IsReplicatedBlockArgument(arg)) {
	      arg.replaceAllUsesWith(
	          new_replicate.GetBody().getArgument(arg.getArgNumber()));
	    } else {
	      // There is a new added replicated state variable between replicated args
	      // and packed args.
	      arg.replaceAllUsesWith(
	          new_replicate.GetBody().getArgument(arg.getArgNumber() + 1));
	    }
	  }

	  // Move every op (terminator included) from the old body to the new body.
	  // The early-increment range keeps iteration valid while ops are unlinked.
	  for (auto& op : llvm::make_early_inc_range(replicate.GetBody())) {
	    op.moveBefore(&new_replicate.GetBody(), new_replicate.GetBody().end());
	  }
	  replicate.replaceAllUsesWith(new_replicate);
	  replicate.erase();
	  return new_replicate;
	}

	// Creates the per-device variables that represent the formatting state of each
	// device.
	//
	// One `TF::VarHandleOp` is built at the current insertion point of `builder`
	// for every device listed under the logical-core-0 alias in `devices`. Each
	// handle is a scalar resource tensor whose subtype is `key_type`, is assigned
	// to its device through `kDeviceAttr`, uses an empty "container", and gets its
	// "shared_name" from `GetRandomStateVariableName()`.
	//
	// Returns the created ops in the order of the device list.
	llvm::SmallVector<TF::VarHandleOp, 4> CreateStateVars(
	    const llvm::SmallDenseMap<llvm::StringRef, llvm::SmallVector<StringRef, 4>>&
	        devices,
	    Location loc, RankedTensorType key_type, OpBuilder* builder) {
	  // TODO(b/148913020): Remove this constraint once model parallelism is
	  // supported.
	  const auto& core0_devices =
	      devices.find(tensorflow::GetDeviceAliasForLogicalCore(0))->getSecond();

	  llvm::SmallVector<TF::VarHandleOp, 4> handles;
	  handles.reserve(core0_devices.size());

	  // Create the state variable for each device.
	  for (llvm::StringRef device_name : core0_devices) {
	    Type handle_type = RankedTensorType::get(
	        {}, TF::ResourceType::get(llvm::ArrayRef<TensorType>{key_type},
	                                  builder->getContext()));
	    NamedAttribute device_attr = builder->getNamedAttr(
	        kDeviceAttr, builder->getStringAttr(device_name));
	    NamedAttribute container_attr =
	        builder->getNamedAttr("container", builder->getStringAttr(""));
	    NamedAttribute shared_name_attr = builder->getNamedAttr(
	        "shared_name", builder->getStringAttr(GetRandomStateVariableName()));

	    handles.push_back(builder->create<TF::VarHandleOp>(
	        loc, llvm::ArrayRef<Type>{handle_type}, llvm::ArrayRef<Value>{},
	        llvm::ArrayRef<NamedAttribute>{device_attr, container_attr,
	                                       shared_name_attr}));
	  }
	  return handles;
	}

	// Wraps single op in `tf_device.launch` for explicit device assignment.
	//
	// A launch op for `device` is created at the current insertion point of
	// `builder`, with the result types of `op`. Its body receives one block that
	// holds `op` followed by a `tf_device.return` of `op`'s results. The
	// insertion point of `builder` is the same on exit as it was on entry.
	void WrapOpInLaunch(OpBuilder* builder, Location loc, Operation* op,
	                    llvm::StringRef device) {
	  // Puts the caller's insertion point back when this scope ends.
	  OpBuilder::InsertionGuard restore_insertion_point(*builder);

	  auto wrapper = builder->create<tf_device::LaunchOp>(
	      loc, builder->getStringAttr(device), op->getResultTypes());
	  wrapper.body().push_back(new Block);

	  // Terminate the launch body by returning the wrapped op's results.
	  builder->setInsertionPointToEnd(&wrapper.GetBody());
	  builder->create<tf_device::ReturnOp>(loc, op->getResults());

	  // Move op inside launch.
	  Operation* terminator = wrapper.GetBody().getTerminator();
	  op->moveBefore(terminator);
	}

	// Performs the transformation for a replicate op inside a while loop.
	//
	// The transformation is skipped (IR left untouched) when any of these hold:
	//   * `replicate` has a single replica;
	//   * `replicate` does not contain exactly one launch wrapping a
	//     `TPUExecuteAndUpdateVariablesOp`;
	//   * the execute key is not produced (through identities) by a launch that
	//     wraps a single `_TPUCompileMlirOp`;
	//   * `replicate` has no `devices` attribute.
	// The `devices` check is done before the analysis step because that step
	// rewrites the "metadata" attribute of the compile op; bailing out afterwards
	// would leave the compile op annotated without any reshard op in place.
	//
	// Otherwise:
	//   1. per-device state variables are created before `while_op` and added to
	//      `replicate` as an extra replicated input;
	//   2. a launch-wrapped `TPUReshardVariablesOp` is inserted right before the
	//      execute launch, taking the formattable execute args, result #1 of the
	//      compile launch, and the state variable;
	//   3. after `while_op`, a new replicate op is built that holds a
	//      launch-wrapped `TPUReshardVariablesOp` taking the outer variables, a
	//      constant default key, and the state variable.
	void HandleReplicateOp(TF::WhileRegionOp while_op,
	                       tf_device::ReplicateOp replicate) {
	  int64_t num_replicas = replicate.n();
	  if (num_replicas == 1) return;

	  // Set execute_launch when there is exactly one LaunchOp with a
	  // TPUExecuteAndUpdateVariablesOp. More than one means there is model
	  // parallelism, which is not supported with TPUReshardVariables. None
	  // means there is no TPU computation.
	  tf_device::LaunchOp execute_launch;
	  replicate.walk([&](tf_device::LaunchOp execute_launch_op) {
	    if (!execute_launch_op.WrapsSingleOp() ||
	        !llvm::isa<TF::TPUExecuteAndUpdateVariablesOp>(
	            execute_launch_op.GetBody().front()))
	      return WalkResult::advance();
	    if (execute_launch == nullptr) {
	      execute_launch = execute_launch_op;
	      return WalkResult::advance();
	    }
	    // A second matching launch was found: give up.
	    execute_launch = nullptr;
	    return WalkResult::interrupt();
	  });
	  if (!execute_launch) return;

	  auto execute = llvm::cast<TF::TPUExecuteAndUpdateVariablesOp>(
	      execute_launch.GetBody().front());
	  auto compile =
	      SkipIdentity(execute.key(), /*allow_other_use=*/true).getDefiningOp();
	  if (!compile) return;
	  auto compile_launch = llvm::dyn_cast<tf_device::LaunchOp>(compile);
	  if (!compile_launch || !compile_launch.WrapsSingleOp() ||
	      !llvm::isa<TF::_TPUCompileMlirOp>(compile_launch.GetBody().front()))
	    return;

	  // Check for the replicated devices before running the analysis below, since
	  // the analysis mutates the compile op.
	  auto devices_attr = replicate.devices();
	  if (!devices_attr) return;

	  // Analyze the formattable inputs.
	  auto execute_arg_to_outer_args =
	      AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping(
	          while_op, replicate, execute, compile_launch);
	  if (execute_arg_to_outer_args.empty()) return;

	  // Extract the replicated devices.
	  auto device_map = devices_attr.getValue();
	  llvm::SmallDenseMap<llvm::StringRef, llvm::SmallVector<StringRef, 4>> devices;
	  devices.reserve(device_map.size());

	  for (auto it : device_map) {
	    auto device_alias = it.getName().strref();
	    auto device_list = it.getValue().cast<ArrayAttr>();
	    llvm::SmallVector<StringRef, 4> device_list_for_alias;
	    device_list_for_alias.reserve(device_list.size());

	    for (auto device : device_list)
	      device_list_for_alias.emplace_back(device.cast<StringAttr>().getValue());

	    devices.insert({device_alias, std::move(device_list_for_alias)});
	  }

	  OpBuilder builder(replicate);
	  builder.setInsertionPoint(while_op);
	  // Create per-device variables for formatting state, and add them to the while
	  // loop.
	  // NOTE(review): the state variable subtype is declared with shape {2} while
	  // the default key built further below has 3 elements; confirm whether these
	  // are meant to agree.
	  auto key_type =
	      RankedTensorType::get({2}, TF::StringType::get(builder.getContext()));
	  auto state_vars =
	      CreateStateVars(devices, while_op.getLoc(), key_type, &builder);
	  // From here on `replicate` refers to the replacement op; the original one
	  // has been erased.
	  replicate = AddInputsToReplicateOp(replicate, state_vars, devices);

	  // Build the reformat according to the compilation. Build it inside
	  // `replicate`.
	  llvm::SmallVector<Value, 8> reformat_operands;
	  for (const auto& entry : execute_arg_to_outer_args) {
	    reformat_operands.push_back(execute.args()[entry.first]);
	  }
	  reformat_operands.push_back(compile_launch.getResult(1));
	  // The state variable is the last replicated block argument.
	  reformat_operands.push_back(replicate.GetBody().getArgument(
	      replicate.GetNumReplicatedBlockArguments() - 1));
	  builder.setInsertionPoint(execute_launch);
	  auto reformat_op = builder.create<TF::TPUReshardVariablesOp>(
	      execute_launch.getLoc(), llvm::ArrayRef<Type>{}, reformat_operands);
	  WrapOpInLaunch(&builder, execute_launch.getLoc(), reformat_op,
	                 execute_launch.device());

	  // Build the replicated unformat op after the loop. First prepare building the
	  // replicate op.
	  llvm::SmallVector<std::pair<ValueRange, Type>, 8> unformat_replicate_operands;
	  llvm::SmallVector<Value, 8> unformat_packed_operands;
	  for (const auto& entry : execute_arg_to_outer_args) {
	    // More than one outer value means one per replica; a single value is
	    // passed as a packed operand.
	    if (entry.second.size() > 1) {
	      unformat_replicate_operands.emplace_back(entry.second,
	                                               entry.second.front().getType());
	    } else {
	      unformat_packed_operands.emplace_back(entry.second.front());
	    }
	  }
	  llvm::SmallVector<Value, 4> state_var_vals(state_vars.size());
	  for (const auto& entry : llvm::enumerate(state_vars)) {
	    state_var_vals[entry.index()] = entry.value().resource();
	  }
	  // Add the replicated state var to the end of the replicate operands.
	  unformat_replicate_operands.emplace_back(state_var_vals,
	                                           state_var_vals.front().getType());
	  // Build a constant default key to specify that the unformatting should
	  // transform the variables to the original format.
	  builder.setInsertionPointAfter(while_op);
	  tensorflow::Tensor default_key_tensor(tensorflow::DT_STRING, {3});
	  default_key_tensor.vec<tensorflow::tstring>()(0) = kDefaultShardingValue;
	  default_key_tensor.vec<tensorflow::tstring>()(1) = kDefaultShardingValue;
	  default_key_tensor.vec<tensorflow::tstring>()(2) = kDefaultShardingValue;
	  auto default_state_key = builder.create<TF::ConstOp>(
	      while_op.getLoc(),
	      tensorflow::ConvertTensor(default_key_tensor, &builder).ValueOrDie());
	  // With all replicated inputs, now build the replicate op.
	  auto unformat_replicate = builder.create<tf_device::ReplicateOp>(
	      while_op.getLoc(), num_replicas, devices, unformat_replicate_operands,
	      unformat_packed_operands, TypeRange{});
	  // Then build the unformat op in the replicate op.
	  builder.setInsertionPointToEnd(&unformat_replicate.GetBody());
	  llvm::SmallVector<Value, 8> unformat_operands;
	  // Add the replicated state var (the last replicated operand of the
	  // ReplicateOp) as the last operand of TPUReshardVariablesOp.
	  BlockArgument state = unformat_replicate.GetReplicatedBlockArguments().back();
	  auto replicated_block_args =
	      unformat_replicate.GetReplicatedBlockArguments().drop_back(1);
	  auto packed_block_args = unformat_replicate.GetPackedBlockArguments();
	  unformat_operands.append(replicated_block_args.begin(),
	                           replicated_block_args.end());
	  unformat_operands.append(packed_block_args.begin(), packed_block_args.end());
	  unformat_operands.push_back(state);

	  // Insert the default key as the second last operand.
	  unformat_operands.insert(
	      unformat_operands.begin() + unformat_operands.size() - 1,
	      default_state_key.getResult());
	  // Unformat op.
	  auto unformat_op = builder.create<TF::TPUReshardVariablesOp>(
	      while_op.getLoc(), llvm::ArrayRef<Type>{}, unformat_operands);
	  WrapOpInLaunch(&builder, execute_launch.getLoc(), unformat_op,
	                 execute_launch.device());
	  // Terminate the body of the unformat replicate op (it has no results).
	  builder.create<tf_device::ReturnOp>(while_op.getLoc(), ArrayRef<Value>{});
	}

	void TPUVariableRuntimeReformattingPass::runOnOperation() {
	auto module = getOperation();
	module.walk([&](TF::WhileRegionOp while_op) {
	tf_device::ReplicateOp replicate;
	while_op.body().walk([&](tf_device::ReplicateOp replicate_op) {
	if (replicate == nullptr) {
	replicate = replicate_op;
	return WalkResult::advance();
	}
	// We do not handle loops with multiple replicate ops.
	replicate = nullptr;
	return WalkResult::interrupt();
	});
	if (replicate) HandleReplicateOp(while_op, replicate);
	});
	}

	} // namespace

	// Factory for the pass: returns a newly allocated
	// `TPUVariableRuntimeReformattingPass` that operates on a `ModuleOp`.
	std::unique_ptr<OperationPass<ModuleOp>>
	CreateTPUVariableRuntimeReformattingPass() {
	  auto pass = std::make_unique<TPUVariableRuntimeReformattingPass>();
	  return pass;
	}

	} // namespace TFTPU
	} // namespace mlir