tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc - platform/external/tensorflow - Git at Google

 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

 // Currently, this test only passes when TensorFlow passes with CUDA, because
 // otherwise the optimizer will not turn clearlist nodes to float16. When
 // looking at clearlist nodes, this optimizer checks if the nodes have a float16
 // GPU OpKernel, but without CUDA there are no GPU OpKernels at all.
 #if GOOGLE_CUDA

 #include "tensorflow/core/grappler/optimizers/auto_mixed_precision.h"

 #include <utility>
 #include <vector>

 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
 #include "tensorflow/cc/ops/list_ops.h"
 #include "tensorflow/cc/ops/math_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/clusters/single_machine.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/random.h"

 // TODO(benbarsdell): Improve the numerical checks in these tests. The tests
 // were originally written only to check the graph coloring, so the graphs do
 // not have particularly realistic numerical behavior.

 namespace tensorflow {
 namespace grappler {
 namespace {

 template <DataType DTYPE>
 Tensor GenerateIdentityMatrix(int64 height, int64 width) {
   typedef typename EnumToDataType<DTYPE>::Type T;
   Tensor tensor(DTYPE, TensorShape{height, width});
   for (int64 i = 0; i < height; ++i) {
     for (int64 j = 0; j < width; ++j) {
       tensor.matrix<T>()(i, j) = i == j;
     }
   }
   return tensor;
 }

 template <DataType DTYPE>
 Tensor GenerateRandomTensorInRange(const TensorShape& shape, double minval,
                                    double maxval) {
   typedef typename EnumToDataType<DTYPE>::Type T;
   Tensor tensor(DTYPE, shape);
   for (auto i = 0; i < tensor.NumElements(); i++)
     tensor.flat<T>()(i) =
         (random::New64() % 65536 / 65536.0) * (maxval - minval) + minval;
   return tensor;
 }

 const std::pair<int, int> kMinGPUArch = {7, 0};

 class AutoMixedPrecisionTest : public GrapplerTest {
  protected:
   void SetUp() override {
     int num_gpus = GetNumAvailableGPUs();
     // If GPUs are available, require that they all satisfy the min arch.
     gpu_available_ =
         num_gpus > 0 && num_gpus == GetNumAvailableGPUs(kMinGPUArch);

     if (gpu_available_) {
       virtual_cluster_.reset(new SingleMachine(/* timeout_s = */ 10, 1, 1));
     } else {
       DeviceProperties device_properties;
       device_properties.set_type("GPU");
       device_properties.mutable_environment()->insert({"architecture", "7"});
       device_properties.mutable_environment()->insert({"cuda", "9010"});
       virtual_cluster_.reset(
           new VirtualCluster({{"/GPU:1", device_properties}}));
     }
     TF_CHECK_OK(virtual_cluster_->Provision());
   }

   void TearDown() override { TF_CHECK_OK(virtual_cluster_->Shutdown()); }

   NodeDef* AddSimpleNode(const string& name, const string& op,
                          const std::vector<string>& inputs,
                          GraphDef* graph) const {
     std::vector<std::pair<string, AttrValue>> attributes;
     if (op == "AddN" || op == "ShapeN") {
       AttrValue num_inputs;
       num_inputs.set_i(inputs.size());
       attributes.emplace_back("N", num_inputs);
     }
     if (op == "ShapeN") {
       AttrValue out_type;
       out_type.set_type(DT_INT32);
       attributes.emplace_back("out_type", out_type);
     }
     AttrValue type;
     type.set_type(DT_FLOAT);
     if (op == "Const" || op == "Placeholder" || op == "VariableV2" ||
         op == "VarHandleOp" || op == "ReadVariableOp") {
       attributes.emplace_back("dtype", type);
     } else if (op == "SparseMatMul") {
       attributes.emplace_back("Ta", type);
       attributes.emplace_back("Tb", type);
     } else if (op == "IdentityN") {
       AttrValue type_list;
       for (int i = 0; i < static_cast<int>(inputs.size()); ++i) {
         type_list.mutable_list()->add_type(DT_FLOAT);
       }
       attributes.emplace_back("T", type_list);
     } else if (op == "StackV2" || op == "StackPopV2") {
       attributes.emplace_back("elem_type", type);
     } else if (op == "Cast") {
       attributes.emplace_back("SrcT", type);
       attributes.emplace_back("DstT", type);
     } else {
       attributes.emplace_back("T", type);
     }
     return AddNode(name, op, inputs, attributes, graph);
   }

   void TestSimpleUnaryGrayOp(
       double input_min, double input_max, double atol, double rtol,
       const std::function<Output(const tensorflow::Scope&, Output)>&
           test_op_factory) {
     int size = 128;
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();
     Output eye = ops::Const(s.WithOpName("eye"),
                             GenerateIdentityMatrix<DT_FLOAT>(size, size));
     Output input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
     Output wht1 = ops::MatMul(s.WithOpName("wht1"), input, eye);
     Output gry1 = test_op_factory(s.WithOpName("gry1"), wht1);
     Output wht2 = ops::MatMul(s.WithOpName("wht2"), gry1, eye);
     Output fetch1 = ops::Identity(s.WithOpName("fetch1"), wht2);
     GrapplerItem item;
     item.fetch = {"fetch1"};
     TF_CHECK_OK(s.ToGraphDef(&item.graph));
     auto input_tensor = GenerateRandomTensorInRange<DT_FLOAT>(
         TensorShape({size, size}), input_min, input_max);
     std::vector<std::pair<string, Tensor>> feed = {{"input", input_tensor}};
     auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);

     AutoMixedPrecision optimizer;
     GraphDef output;
     TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

     VLOG(1) << output.DebugString();

     GraphView output_view(&output);
     EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(),
               DT_FLOAT);
     EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
     EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_HALF);
     EXPECT_EQ(output_view.GetNode("wht2")->attr().at("T").type(), DT_HALF);

     auto tensors = EvaluateNodes(output, item.fetch, feed);
     EXPECT_EQ(tensors.size(), tensors_expected.size());
     EXPECT_EQ(tensors.size(), item.fetch.size());
     for (int i = 0; i < item.fetch.size(); ++i) {
       test::ExpectClose(tensors_expected[i], tensors[i], atol, rtol);
     }
   }

   std::unique_ptr<Cluster> virtual_cluster_;
   bool gpu_available_;
 };

 void VerifyGraphsEquivalent(const GraphDef& original_graph,
                             const GraphDef& optimized_graph,
                             const string& func) {
   EXPECT_EQ(original_graph.node_size(), optimized_graph.node_size()) << func;
   GraphView optimized_view(&optimized_graph);
   for (int i = 0; i < original_graph.node_size(); ++i) {
     const NodeDef& original = original_graph.node(i);
     const NodeDef& optimized = *optimized_view.GetNode(original.name());
     EXPECT_EQ(original.name(), optimized.name()) << func;
     EXPECT_EQ(original.op(), optimized.op()) << func;
     EXPECT_EQ(original.input_size(), optimized.input_size()) << func;
     if (original.input_size() == optimized.input_size()) {
       for (int j = 0; j < original.input_size(); ++j) {
         EXPECT_EQ(original.input(j), optimized.input(j)) << func;
       }
     }
   }
 }

 TEST_F(AutoMixedPrecisionTest, NoOp) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.234f, {32});
   Output blk1 = ops::Exp(s.WithOpName("blk1"), input);
   Output clr1 = ops::Relu(s.WithOpName("clr1"), blk1);
   Output gry1 = ops::Sqrt(s.WithOpName("gry1"), clr1);
   Output clr2 = ops::Relu(s.WithOpName("clr2"), gry1);
   Output fetch = ops::Identity(s.WithOpName("fetch"), clr2);

   GrapplerItem item;
   item.fetch = {"fetch"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   VerifyGraphsEquivalent(item.graph, output, __FUNCTION__);

   GraphView output_view(&output);
   EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("blk1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr2")->attr().at("T").type(), DT_FLOAT);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-6);
   }
 }

 TEST_F(AutoMixedPrecisionTest, AlreadyFp16) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f, {32, 32});
   Output cst1 = ops::Cast(s.WithOpName("cst1"), input, DT_HALF);
   Output wht1 = ops::MatMul(s.WithOpName("wht1"), cst1, cst1);
   Output clr1 = ops::Relu(s.WithOpName("clr1"), wht1);
   Output cst2 = ops::Cast(s.WithOpName("cst2"), clr1, DT_FLOAT);
   Output clr2 = ops::Relu(s.WithOpName("clr2"), cst2);
   Output fetch = ops::Identity(s.WithOpName("fetch"), clr2);

   GrapplerItem item;
   item.fetch = {"fetch"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));
   VLOG(1) << output.DebugString();

   VerifyGraphsEquivalent(item.graph, output, __FUNCTION__);
   GraphView output_view(&output);
   EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("cst1")->attr().at("DstT").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("clr1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("cst2")->attr().at("SrcT").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("cst2")->attr().at("DstT").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr2")->attr().at("T").type(), DT_FLOAT);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-6);
   }
 }

 TEST_F(AutoMixedPrecisionTest, Simple) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output blk1 = ops::Exp(s.WithOpName("blk1"), input);
   Output clr1 = ops::Relu(s.WithOpName("clr1"), blk1);
   Output gry1 = ops::Sqrt(s.WithOpName("gry1"), clr1);
   Output clr2 = ops::Relu(s.WithOpName("clr2"), gry1);
   Output wht1 = ops::MatMul(s.WithOpName("wht1"), clr2, clr2);
   Output clr3 = ops::Relu(s.WithOpName("clr3"), wht1);
   Output blk2 = ops::Log(s.WithOpName("blk2"), clr3);
   Output clr4 = ops::Relu(s.WithOpName("clr4"), blk2);
   Output blk3 = ops::SparseMatMul(s.WithOpName("blk3"), clr4, clr4);
   Output clr5 = ops::Relu(s.WithOpName("clr5"), blk3);
   Output fetch = ops::Identity(s.WithOpName("fetch"), clr5);

   GrapplerItem item;
   item.fetch = {"fetch"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
   EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("blk1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr2")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("clr3")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("blk2")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr4")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("blk3")->attr().at("Ta").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("blk3")->attr().at("Tb").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr5")->attr().at("T").type(), DT_FLOAT);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectClose(tensors_expected[i], tensors[i], -1, 5e-4);
   }
 }

 TEST_F(AutoMixedPrecisionTest, BidirectionalClearChain) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output clr1 = ops::Relu(s.WithOpName("clr1"), input);
   Output clr2 = ops::Relu(s.WithOpName("clr2"), input);
   Output wht1 = ops::MatMul(s.WithOpName("wht1"), clr1, clr1);
   auto clr3 = ops::ShapeN(s.WithOpName("clr3"), {clr1, clr2});
   Output clr4 = ops::Relu(s.WithOpName("clr4"), clr2);
   Output fetch1 = ops::Identity(s.WithOpName("fetch1"), wht1);
   Output fetch2 = ops::Identity(s.WithOpName("fetch2"), clr4);

   GrapplerItem item;
   item.fetch = {"fetch1", "fetch2"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   EXPECT_EQ(output.node_size(), item.graph.node_size() + 3);
   EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("clr2")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("clr3")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("clr4")->attr().at("T").type(), DT_HALF);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-6);
   }
 }

 TEST_F(AutoMixedPrecisionTest, PreserveFetches) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output wht1 = ops::MatMul(s.WithOpName("wht1"), input, input);
   Output clr1 = ops::Relu(s.WithOpName("clr1"), wht1);
   Output gry1 = ops::Sqrt(s.WithOpName("gry1"), clr1);
   Output blk1 = ops::Exp(s.WithOpName("blk1"), gry1);
   Output clr2 = ops::Relu(s.WithOpName("clr2"), blk1);
   Output wht2 = ops::MatMul(s.WithOpName("wht2"), clr2, clr2);
   Output clr3 = ops::Relu(s.WithOpName("clr3"), wht2);
   Output blk2 = ops::Exp(s.WithOpName("blk2"), clr3);
   Output clr4 = ops::Relu(s.WithOpName("clr4"), blk2);

   GrapplerItem item;
   item.fetch = {"wht1", "clr2", "clr3"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
   EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("blk1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr2")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("wht2")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("clr3")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("blk2")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr4")->attr().at("T").type(), DT_FLOAT);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectClose(tensors_expected[i], tensors[i], -1, 5e-3);
   }
 }

 TEST_F(AutoMixedPrecisionTest, PreserveCPUNodes) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output clr1 = ops::Relu(s.WithOpName("clr1"), input);
   Output wht1 = ops::MatMul(s.WithOpName("wht1"), clr1, clr1);
   Output gry1 = ops::Tanh(s.WithOpName("gry1"), wht1);
   Output wht2 = ops::MatMul(s.WithOpName("wht2").WithDevice(
                                 "/job:localhost/replica:0/task:0/device:CPU:0"),
                             gry1, gry1);
   Output clr2 = ops::Relu(s.WithOpName("clr2"), wht2);
   Output fetch = ops::Identity(s.WithOpName("fetch"), clr2);

   GrapplerItem item;
   item.fetch = {"fetch"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
   EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("wht2")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr2")->attr().at("T").type(), DT_FLOAT);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-6);
   }
 }

 TEST_F(AutoMixedPrecisionTest, PreserveIdentityAfterVariable) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output var1 = ops::Variable(s.WithOpName("var1"), {32, 32}, DT_FLOAT);
   Output clr1 = ops::Identity(s.WithOpName("clr1"), var1);
   Output wht1 = ops::MatMul(s.WithOpName("wht1"), input, clr1);
   Output input2 = ops::Const(s.WithOpName("input2"), 1.f / 32, {32, 32});
   Output clr2 = ops::Identity(s.WithOpName("clr2"), input2);
   Output wht2 = ops::MatMul(s.WithOpName("wht2"), input, clr2);
   Output fetch1 = ops::Identity(s.WithOpName("fetch1"), wht1);
   Output fetch2 = ops::Identity(s.WithOpName("fetch2"), wht2);

   GrapplerItem item;
   item.fetch = {"fetch1", "fetch2"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto var1_tensor =
       GenerateConstantTensor<DT_FLOAT>(TensorShape({32, 32}), 3.141593f);
   std::vector<std::pair<string, Tensor>> feed = {{"var1", var1_tensor}};
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   EXPECT_EQ(output.node_size(), item.graph.node_size() + 5);
   EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("var1")->attr().at("dtype").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("input2")->attr().at("dtype").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("clr2")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht2")->attr().at("T").type(), DT_HALF);

   auto tensors = EvaluateNodes(output, item.fetch, feed);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectClose(tensors_expected[i], tensors[i], -1, 5e-3);
   }
 }

 TEST_F(AutoMixedPrecisionTest, FusedBatchNorm) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   // Uses NHWC data format because non-GPU execution does not support NCHW.
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {8, 56, 56, 16});
   Output weight = ops::Const(s.WithOpName("weight"), 2.f, {3, 3, 16, 16});
   Output scale = ops::Const(s.WithOpName("scale"), 3.f, {16});
   Output offset = ops::Const(s.WithOpName("offset"), 4.f, {16});
   Output mean = ops::Const(s.WithOpName("mean"), 5.f, {0});
   Output variance = ops::Const(s.WithOpName("variance"), 6.f, {0});
   Output wht1 = ops::Conv2D(s.WithOpName("wht1"), input, weight, {1, 1, 1, 1},
                             "SAME", ops::Conv2D::DataFormat("NHWC"));
   auto fbn1_op =
       ops::FusedBatchNorm(s.WithOpName("fbn1"), wht1, scale, offset, mean,
                           variance, ops::FusedBatchNorm::DataFormat("NHWC"));
   Output fbn1 = fbn1_op.y;
   Output fbn1_rs1 = fbn1_op.reserve_space_1;
   Output fbn1_rs2 = fbn1_op.reserve_space_2;
   Output bng1 = ops::FusedBatchNormGrad(
                     s.WithOpName("bng1"), fbn1, wht1, scale, fbn1_rs1, fbn1_rs2,
                     ops::FusedBatchNormGrad::DataFormat("NHWC"))
                     .x_backprop;
   Output gry1 = ops::Add(s.WithOpName("gry1"), fbn1, bng1);
   Output wht2 = ops::Conv2D(s.WithOpName("wht2"), gry1, weight, {1, 1, 1, 1},
                             "SAME", ops::Conv2D::DataFormat("NHWC"));
   Output fetch = ops::Identity(s.WithOpName("fetch"), wht2);

   GrapplerItem item;
   item.fetch = {"fetch"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   EXPECT_EQ(output.node_size(), item.graph.node_size() + 3);
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("fbn1")->op(), "FusedBatchNormV2");
   EXPECT_EQ(output_view.GetNode("fbn1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("fbn1")->attr().at("U").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("bng1")->op(), "FusedBatchNormGradV2");
   EXPECT_EQ(output_view.GetNode("bng1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("bng1")->attr().at("U").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht2")->attr().at("T").type(), DT_HALF);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectClose(tensors_expected[i], tensors[i], -1, 1e-3);
   }
 }

 TEST_F(AutoMixedPrecisionTest, RepeatedAndListTypeAttrs) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output wht1 = ops::MatMul(s.WithOpName("wht1"), input, input);
   auto clr1_op = ops::IdentityN(s.WithOpName("clr1"), {wht1, wht1, wht1});
   Output gry1 =
       ops::AddN(s.WithOpName("gry1"),
                 {clr1_op.output[0], clr1_op.output[1], clr1_op.output[2]});
   Output wht2 = ops::MatMul(s.WithOpName("wht2"), gry1, gry1);
   Output fetch = ops::Identity(s.WithOpName("fetch"), wht2);

   GrapplerItem item;
   item.fetch = {"fetch"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
   EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
   for (auto type : output_view.GetNode("clr1")->attr().at("T").list().type()) {
     EXPECT_EQ(type, DT_HALF);
   }
   EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht2")->attr().at("T").type(), DT_HALF);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-6);
   }
 }

 TEST_F(AutoMixedPrecisionTest, ExistingCast) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), true, {32, 32});
   Output cst1 = ops::Cast(s.WithOpName("cst1"), input, DT_FLOAT);
   Output wht1 = ops::MatMul(s.WithOpName("wht1"), cst1, cst1);
   Output fetch = ops::Identity(s.WithOpName("fetch"), wht1);

   GrapplerItem item;
   item.fetch = {"fetch"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   EXPECT_EQ(output.node_size(), item.graph.node_size() + 1);
   EXPECT_EQ(output_view.GetNode("cst1")->attr().at("SrcT").type(), DT_BOOL);
   EXPECT_EQ(output_view.GetNode("cst1")->attr().at("DstT").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-6);
   }
 }

 TEST_F(AutoMixedPrecisionTest, RecurrentEdgeColorMismatch) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output blk1 = ops::Exp(s.WithOpName("blk1"), input);
   Output ent1 =
       ops::internal::Enter(s.WithOpName("ent1"), blk1, "loop1").output;
   // Note that the second input is later replaced with "nxt1".
   Output mrg1 = ops::Merge(s.WithOpName("mrg1"), {ent1, ent1}).output;
   // For simplicity, the loop condition is constant false.
   Output con1 = ops::Const(s.WithOpName("con1"), false, {});
   Output lpc1 = ops::LoopCond(s.WithOpName("lpc1"), con1).output;
   auto swt1 = ops::Switch(s.WithOpName("swt1"), mrg1, lpc1);
   Output gry1 = ops::Sqrt(s.WithOpName("gry1"), swt1.output_true);
   Output wht1 = ops::MatMul(s.WithOpName("wht1"), gry1, gry1);
   Output nxt1 = ops::NextIteration(s.WithOpName("nxt1"), wht1);
   Output ext1 = ops::internal::Exit(s.WithOpName("ext1"), swt1.output_false);
   Output fetch = ops::Identity(s.WithOpName("fetch"), ext1);
   // Add a second merge node from the same NextIteration node. This case arises
   // during graph optimization of some models.
   auto mrg2 = ops::Merge(s.WithOpName("mrg2"), {ent1, nxt1});

   GrapplerItem item;
   item.fetch = {"fetch"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   NodeMap node_map_original(&item.graph);
   auto merge_node = node_map_original.GetNode("mrg1");
   // Modify the graph to create a loop.
   merge_node->set_input(1, "nxt1");
   // Add a control edge to ensure the loop condition is inside the frame.
   auto const_node = node_map_original.GetNode("con1");
   const_node->add_input("^mrg1");
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
   // Note that mrg1 gets painted black because it is between blk1 and gry1. This
   // forces nxt1 and mrg2 to be painted black as well (they would otherwise be
   // painted white because they are clear and have a direct path to wht1).
   EXPECT_EQ(output_view.GetNode("blk1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("ent1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("mrg1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("swt1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("nxt1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("ext1")->attr().at("T").type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("mrg2")->attr().at("T").type(), DT_FLOAT);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-6);
   }
 }

 TEST_F(AutoMixedPrecisionTest, TensorListSetGet) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   tensorflow::Input shape = {32, 32};
   auto tl1 = ops::TensorListReserve(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT);
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output idx1 = ops::Const(s.WithOpName("idx1"), 1);
   Output idx2 = ops::Const(s.WithOpName("idx2"), 2);
   Output idx3 = ops::Const(s.WithOpName("idx3"), 3);
   auto tl1w1 =
       ops::TensorListSetItem(s.WithOpName("tl1w1"), tl1.handle, idx1, input);
   Output wht1 = ops::MatMul(s.WithOpName("wht1"), input, input);
   auto tl1w2 =
       ops::TensorListSetItem(s.WithOpName("tl1w2"), tl1.handle, idx2, wht1);
   // Ensure that TensorListResize doesn't cause any problems.
   Output tl1rs =
       ops::TensorListResize(s.WithOpName("tl1rs"), tl1w2.output_handle, 6);
   Output tl1r1 = ops::TensorListGetItem(s.WithOpName("tl1r1"), tl1rs, idx2,
                                         shape, DT_FLOAT)
                      .item;
   Output gry1 = ops::Tanh(s.WithOpName("gry1"), tl1r1);
   Output wht2 = ops::MatMul(s.WithOpName("wht2"), gry1, gry1);
   auto tl1w3 =
       ops::TensorListSetItem(s.WithOpName("tl1w3"), tl1.handle, idx3, wht2);
   Output tl1r2 =
       ops::TensorListGetItem(s.WithOpName("tl1r2"), tl1w3.output_handle, idx3,
                              shape, DT_FLOAT)
           .item;
   auto tl2 = ops::TensorListReserve(s.WithOpName("tl2"), shape, 8, DT_FLOAT);
   auto tl2w1 =
       ops::TensorListSetItem(s.WithOpName("tl2w1"), tl2.handle, idx1, input);
   Output tl2r1 =
       ops::TensorListGetItem(s.WithOpName("tl2r1"), tl2w1.output_handle, idx1,
                              shape, DT_FLOAT)
           .item;
   Output fetch1 = ops::Identity(s.WithOpName("fetch1"), tl1r2);
   Output fetch2 = ops::Identity(s.WithOpName("fetch2"), tl2r1);

   GrapplerItem item;
   item.fetch = {"fetch1", "fetch2"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
   const char* type_key = "element_dtype";
   EXPECT_EQ(output_view.GetNode("tl1")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl1w1")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl1w2")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl1r1")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht2")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl1w3")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl2")->attr().at(type_key).type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("tl2w1")->attr().at(type_key).type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("tl2r1")->attr().at(type_key).type(), DT_FLOAT);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectClose(tensors_expected[i], tensors[i], -1, 5e-4);
   }
 }

 TEST_F(AutoMixedPrecisionTest, TensorListPushPop) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   tensorflow::Input shape = {32, 32};
   auto tl1 = ops::EmptyTensorList(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT);
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   auto tl1w1 =
       ops::TensorListPushBack(s.WithOpName("tl1w1"), tl1.handle, input);
   Output wht1 = ops::MatMul(s.WithOpName("wht1"), input, input);
   auto tl1w2 =
       ops::TensorListPushBack(s.WithOpName("tl1w2"), tl1w1.output_handle, wht1);
   Output tl1r1 = ops::TensorListPopBack(s.WithOpName("tl1r1"),
                                         tl1w2.output_handle, shape, DT_FLOAT)
                      .tensor;
   Output gry1 = ops::Tanh(s.WithOpName("gry1"), tl1r1);
   Output wht2 = ops::MatMul(s.WithOpName("wht2"), gry1, gry1);
   auto tl1w3 = ops::TensorListPushBack(s.WithOpName("tl1w3"), tl1.handle, wht2);
   Output tl1r2 = ops::TensorListPopBack(s.WithOpName("tl1r2"),
                                         tl1w3.output_handle, shape, DT_FLOAT)
                      .tensor;
   auto tl2 = ops::EmptyTensorList(s.WithOpName("tl2"), shape, 8, DT_FLOAT);
   auto tl2w1 =
       ops::TensorListPushBack(s.WithOpName("tl2w1"), tl2.handle, input);
   Output tl2r1 = ops::TensorListPopBack(s.WithOpName("tl2r1"),
                                         tl2w1.output_handle, shape, DT_FLOAT)
                      .tensor;
   Output fetch1 = ops::Identity(s.WithOpName("fetch1"), tl1r2);
   Output fetch2 = ops::Identity(s.WithOpName("fetch2"), tl2r1);

   GrapplerItem item;
   item.fetch = {"fetch1", "fetch2"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
   const char* type_key = "element_dtype";
   EXPECT_EQ(output_view.GetNode("tl1")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl1w1")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl1w2")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl1r1")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht2")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl1w3")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl2")->attr().at(type_key).type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("tl2w1")->attr().at(type_key).type(), DT_FLOAT);
   EXPECT_EQ(output_view.GetNode("tl2r1")->attr().at(type_key).type(), DT_FLOAT);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectClose(tensors_expected[i], tensors[i], -1, 5e-4);
   }
 }

 TEST_F(AutoMixedPrecisionTest, TensorListFromTensor) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   tensorflow::Input shape = {32};
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output wht1 = ops::MatMul(s.WithOpName("wht1"), input, input);
   auto tl1 = ops::TensorListFromTensor(s.WithOpName("tl1"), wht1, shape);
   Output tl1r1 = ops::TensorListStack(s.WithOpName("tl1r1"), tl1.output_handle,
                                       shape, DT_FLOAT)
                      .tensor;
   Output gry1 = ops::Tanh(s.WithOpName("gry1"), tl1r1);
   Output wht2 = ops::MatMul(s.WithOpName("wht2"), gry1, gry1);
   Output fetch1 = ops::Identity(s.WithOpName("fetch1"), wht2);

   // This tests that a white-painted object node (tl2) will force an unpainted
   // client node (tl2w1) to be painted white as well. (Without the force, tl2w1
   // would remain unpainted, producing an invalid graph).
   auto tl2 = ops::TensorListFromTensor(s.WithOpName("tl2"), wht1, shape);
   auto tl2w1 =
       ops::TensorListPushBack(s.WithOpName("tl2w1"), tl2.output_handle, input);

   GrapplerItem item;
   item.fetch = {"fetch1"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
   const char* type_key = "element_dtype";
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl1")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl1r1")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht2")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl2")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl2w1")->attr().at(type_key).type(), DT_HALF);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectClose(tensors_expected[i], tensors[i], -1, 2e-4);
   }
 }

 TEST_F(AutoMixedPrecisionTest, TensorListPushBackBatchAndConcatLists) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   tensorflow::Input shape = {32, 32};
   auto tl1 = ops::EmptyTensorList(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT);
   auto tl2 = ops::EmptyTensorList(s.WithOpName("tl2"), {32, 32}, 8, DT_FLOAT);
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output wht1 = ops::MatMul(s.WithOpName("wht1"), input, input);
   Output tl1_tl2 =
       ops::Stack(s.WithOpName("tl1_tl2"), {tl1.handle, tl2.handle});
   Output wht1_wht1 = ops::Stack(s.WithOpName("wht1_wht1"), {wht1, wht1});
   auto tl12w1 =
       ops::TensorListPushBackBatch(s.WithOpName("tl12w1"), tl1_tl2, wht1_wht1);
   OutputList tl12w1_outputs =
       ops::Split(s.WithOpName("tl12w1_outputs"), 0, tl12w1.output_handles, 2)
           .output;
   Output scalar_shape = ops::Const(s.WithOpName("scalar_shape"), 0, {0});
   Output tl12w1_output0 = ops::Reshape(s.WithOpName("tl12w1_output0"),
                                        tl12w1_outputs[0], scalar_shape);
   Output tl12w1_output1 = ops::Reshape(s.WithOpName("tl12w1_output1"),
                                        tl12w1_outputs[1], scalar_shape);
   Output tl3 = ops::TensorListConcatLists(s.WithOpName("tl3"), tl12w1_output0,
                                           tl12w1_output1, DT_FLOAT);
   Output tl3r1 =
       ops::TensorListPopBack(s.WithOpName("tl3r1"), tl3, shape, DT_FLOAT)
           .tensor;
   Output gry1 = ops::Tanh(s.WithOpName("gry1"), tl3r1);
   Output wht2 = ops::MatMul(s.WithOpName("wht2"), gry1, gry1);
   Output fetch1 = ops::Identity(s.WithOpName("fetch1"), wht2);

   GrapplerItem item;
   item.fetch = {"fetch1"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
   const char* type_key = "element_dtype";
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht2")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl1")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl2")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl3")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl3r1")->attr().at(type_key).type(), DT_HALF);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectClose(tensors_expected[i], tensors[i], -1, 5e-4);
   }
 }

 TEST_F(AutoMixedPrecisionTest, TensorListThroughFunction) {
   // This test passes a tensor list handle through a function with its own
   // Tensor List ops inside to test that the types are not changed to a
   // conflicting state.
   // A separate Tensor List cluster is added to test that it is still changed to
   // DT_HALF.
   FunctionDefLibrary function_lib;
   const Tensor kShape = test::AsTensor<int32>({32, 32});
   FunctionDef func1 = FunctionDefHelper::Define(
       "Func1", {"ihandle: variant", "x: float"},
       {"ohandle: variant", "y: float"}, {},
       {
           {{"tl1w1_handle"},
            "TensorListPushBack",
            {"ihandle", "x"},
            {{"element_dtype", DT_FLOAT}}},
           {{"shape"}, "Const", {}, {{"value", kShape}, {"dtype", DT_INT32}}},
           {{"tl1r1_handle", "tl1r1_data"},
            "TensorListPopBack",
            {"tl1w1_handle", "shape"},
            {{"element_dtype", DT_FLOAT}}},
           {{"ohandle"}, "Identity", {"tl1r1_handle"}, {{"T", DT_VARIANT}}},
           {{"y"}, "Identity", {"tl1r1_data"}, {{"T", DT_FLOAT}}},
       });
   function_lib.add_function()->Swap(&func1);

   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   TF_CHECK_OK(s.graph()->AddFunctionLibrary(function_lib));
   tensorflow::Input shape = {32, 32};
   Output input = ops::Const(s.WithOpName("input"), 1.f / 32, {32, 32});
   Output wht1 = ops::MatMul(s.WithOpName("wht1"), input, input);
   Output gry1 = ops::Tanh(s.WithOpName("gry1"), wht1);
   auto tl1 = ops::EmptyTensorList(s.WithOpName("tl1"), {32, 32}, 8, DT_FLOAT);
   auto tl1w1 = ops::TensorListPushBack(s.WithOpName("tl1w1"), tl1.handle, gry1);
   auto _gry1 = tensorflow::ops::AsNodeOut(s, gry1);
   auto _tl1w1_handle = tensorflow::ops::AsNodeOut(s, tl1w1.output_handle);
   auto builder =
       tensorflow::NodeBuilder("Func1", "Func1", s.graph()->op_registry());
   tensorflow::Node* func1_op;
   TF_CHECK_OK(
       builder.Input(_tl1w1_handle).Input(_gry1).Finalize(s.graph(), &func1_op));
   Output func1_handle(func1_op, 0);
   Output tl1r1 = ops::TensorListPopBack(s.WithOpName("tl1r1"), func1_handle,
                                         shape, DT_FLOAT)
                      .tensor;
   auto tl2 = ops::EmptyTensorList(s.WithOpName("tl2"), {32, 32}, 8, DT_FLOAT);
   auto tl2w1 = ops::TensorListPushBack(s.WithOpName("tl2w1"), tl2.handle, gry1);
   Output tl2r1 = ops::TensorListPopBack(s.WithOpName("tl2r1"),
                                         tl2w1.output_handle, shape, DT_FLOAT)
                      .tensor;
   Output wht2 = ops::MatMul(s.WithOpName("wht2"), tl1r1, tl2r1);
   Output fetch1 = ops::Identity(s.WithOpName("fetch1"), wht2);

   GrapplerItem item;
   item.fetch = {"fetch1"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   const char* type_key = "element_dtype";
   EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("wht2")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("gry1")->attr().at("T").type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl2")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl2w1")->attr().at(type_key).type(), DT_HALF);
   EXPECT_EQ(output_view.GetNode("tl2r1")->attr().at(type_key).type(), DT_HALF);

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectClose(tensors_expected[i], tensors[i], -1, 5e-4);
   }
 }

 int GetCudaVersion(const Cluster& cluster) {
   auto devices = cluster.GetDevices();
   for (const auto& device : devices) {
     const DeviceProperties& device_properties = device.second;
     if (device_properties.type() == "GPU") {
       const auto& device_env = device_properties.environment();
       auto it = device_env.find("cuda");
       if (it != device_env.end()) {
         string cuda_version_str = it->second;
         return std::stoi(cuda_version_str);
       }
     }
   }
   return 0;
 }

 TEST_F(AutoMixedPrecisionTest, BatchMatMul) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input = ops::Const(s.WithOpName("input"), 1.f / 33, {64, 32, 32});
   Output wht1 = ops::BatchMatMul(s.WithOpName("wht1"), input, input);
   Output fetch1 = ops::Identity(s.WithOpName("fetch1"), wht1);

   GrapplerItem item;
   item.fetch = {"fetch1"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);

   AutoMixedPrecision optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output));

   VLOG(1) << output.DebugString();

   GraphView output_view(&output);
   EXPECT_EQ(output_view.GetNode("input")->attr().at("dtype").type(), DT_FLOAT);
   if (GetCudaVersion(*virtual_cluster_.get()) >= 9010) {
     EXPECT_EQ(output.node_size(), item.graph.node_size() + 2);
     EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_HALF);
   } else {
     EXPECT_EQ(output.node_size(), item.graph.node_size());
     EXPECT_EQ(output_view.GetNode("wht1")->attr().at("T").type(), DT_FLOAT);
   }

   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(tensors.size(), tensors_expected.size());
   EXPECT_EQ(tensors.size(), item.fetch.size());
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectClose(tensors_expected[i], tensors[i], -1, 3.0e-3);
   }
 }

 TEST_F(AutoMixedPrecisionTest, EluOp) {
   TestSimpleUnaryGrayOp(
       -5, 5, 1.0e-3, 1.0e-3,
       [](const tensorflow::Scope& scope, Output input) -> Output {
         return ops::Elu(scope, input);
       });
 }

 TEST_F(AutoMixedPrecisionTest, ErfOp) {
   TestSimpleUnaryGrayOp(
       -5, 5, 1.0e-3, -1,
       [](const tensorflow::Scope& scope, Output input) -> Output {
         return ops::Erf(scope, input);
       });
 }

 TEST_F(AutoMixedPrecisionTest, ErfcOp) {
   TestSimpleUnaryGrayOp(
       -5, 5, 1.0e-3, -1,
       [](const tensorflow::Scope& scope, Output input) -> Output {
         return ops::Erfc(scope, input);
       });
 }

 TEST_F(AutoMixedPrecisionTest, InvOp) {
   TestSimpleUnaryGrayOp(
       0.01, 10, -1, 1.0e-3,
       [](const tensorflow::Scope& scope, Output input) -> Output {
         return ops::Inv(scope, input);
       });
 }

 TEST_F(AutoMixedPrecisionTest, LogOp) {
   TestSimpleUnaryGrayOp(
       0.01, 10, 1.0e-3, 2.0e-3,
       [](const tensorflow::Scope& scope, Output input) -> Output {
         return ops::Log(scope, input);
       });
 }

 TEST_F(AutoMixedPrecisionTest, Log1pOp) {
   TestSimpleUnaryGrayOp(
       -0.99, 9, 1.0e-3, 5.0e-3,
       [](const tensorflow::Scope& scope, Output input) -> Output {
         return ops::Log1p(scope, input);
       });
 }

 TEST_F(AutoMixedPrecisionTest, LogSoftmaxOp) {
   TestSimpleUnaryGrayOp(
       -8, 8, -1, 1.0e-2,
       [](const tensorflow::Scope& scope, Output input) -> Output {
         return ops::LogSoftmax(scope, input);
       });
 }

 TEST_F(AutoMixedPrecisionTest, ReciprocalOp) {
   TestSimpleUnaryGrayOp(
       0.01, 10, -1, 1.0e-3,
       [](const tensorflow::Scope& scope, Output input) -> Output {
         return ops::Reciprocal(scope, input);
       });
 }

 TEST_F(AutoMixedPrecisionTest, SigmoidOp) {
   TestSimpleUnaryGrayOp(
       -5, 5, 1.0e-3, -1,
       [](const tensorflow::Scope& scope, Output input) -> Output {
         return ops::Sigmoid(scope, input);
       });
 }

 TEST_F(AutoMixedPrecisionTest, SoftmaxOp) {
   TestSimpleUnaryGrayOp(
       -8, 8, 2.0e-3, -1,
       [](const tensorflow::Scope& scope, Output input) -> Output {
         return ops::Softmax(scope, input);
       });
 }

 TEST_F(AutoMixedPrecisionTest, SoftplusOp) {
   TestSimpleUnaryGrayOp(
       -5, 5, 1.0e-3, 1.0e-3,
       [](const tensorflow::Scope& scope, Output input) -> Output {
         return ops::Softplus(scope, input);
       });
 }

 TEST_F(AutoMixedPrecisionTest, SqrtOp) {
   TestSimpleUnaryGrayOp(
       0, 10, 1.0e-3, 1.0e-3,
       [](const tensorflow::Scope& scope, Output input) -> Output {
         return ops::Sqrt(scope, input);
       });
 }

 TEST_F(AutoMixedPrecisionTest, TanhOp) {
   TestSimpleUnaryGrayOp(
       -5, 5, 1.0e-3, -1,
       [](const tensorflow::Scope& scope, Output input) -> Output {
         return ops::Tanh(scope, input);
       });
 }

 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow

 #endif  // GOOGLE_CUDA