tensorflow/core/grappler/optimizers/remapper_test.cc - platform/external/tensorflow - Git at Google

 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

 #include "tensorflow/core/grappler/optimizers/remapper.h"

 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"

 #if GOOGLE_CUDA
 #include "third_party/gpus/cudnn/cudnn.h"
 #endif  // GOOGLE_CUDA

 namespace tensorflow {
 namespace grappler {

 // Fixture shared by the Remapper tests in this file.
 class RemapperTest : public GrapplerTest {
  protected:
   // Sets TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT=1 before each test,
   // overwriting any existing value. Fusing FusedBatchNorm + SideInput +
   // Activation requires it.
   void SetUp() override {
     constexpr int kOverwriteExisting = 1;
     setenv("TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT", "1",
            kOverwriteExisting);
   }
 };

 // Checks that running the Remapper over an inference-mode FusedBatchNorm graph
 // leaves the value of the fetched "batch_norm" node unchanged (within 1e-6).
 TEST_F(RemapperTest, FusedBatchNorm) {
   tensorflow::Scope root = tensorflow::Scope::NewRootScope();

   // The input has a default value, so no feeds are needed for evaluation.
   Output default_input =
       ops::Const(root.WithOpName("dflt"), {3.14f, 2.7f}, {2, 1, 1, 1});
   Output x = ops::PlaceholderWithDefault(root.WithOpName("x"), default_input,
                                          {2, 1, 1, 1});

   // Batch-norm parameters, each of shape {1}.
   Output scale = ops::Const(root.WithOpName("scale"), {0.3f}, {1});
   Output offset = ops::Const(root.WithOpName("offset"), {0.123f}, {1});
   Output mean = ops::Const(root.WithOpName("mean"), {7.3f}, {1});
   Output variance = ops::Const(root.WithOpName("variance"), {0.57f}, {1});

   ops::FusedBatchNorm batch_norm(
       root.WithOpName("batch_norm"), x, scale, offset, mean, variance,
       ops::FusedBatchNorm::Attrs().IsTraining(false));

   GrapplerItem item;
   item.fetch = {"batch_norm"};
   TF_ASSERT_OK(root.ToGraphDef(&item.graph));

   // Reference result from the unoptimized graph.
   auto expected = EvaluateNodes(item.graph, item.fetch);
   ASSERT_EQ(expected.size(), 1);

   GraphDef optimized;
   Remapper remapper(RewriterConfig::ON);
   TF_ASSERT_OK(remapper.Optimize(nullptr, item, &optimized));

   // The optimized graph must produce the same value.
   auto actual = EvaluateNodes(optimized, item.fetch);
   ASSERT_EQ(actual.size(), 1);
   test::ExpectTensorNear<float>(actual[0], expected[0], 1e-6);
 }

 // Runs the Remapper over an NCHW inference-mode FusedBatchNorm assigned to
 // "/device:GPU:0". When a GPU is available, also checks that the fetched
 // output is unchanged (within 1e-3). Skipped when neither CUDA nor ROCm is
 // enabled in the build.
 TEST_F(RemapperTest, FusedBatchNormNCHW) {
 #if !(GOOGLE_CUDA || TENSORFLOW_USE_ROCM)
   GTEST_SKIP() << "Neither CUDA nor ROCm is enabled";
 #endif  // !(GOOGLE_CUDA || TENSORFLOW_USE_ROCM)
   tensorflow::Scope root = tensorflow::Scope::NewRootScope();

   Output default_input = ops::Const(
       root.WithOpName("dflt"), {3.14f, 2.7f, 1.0f, 2.0f, 3.0f, 100.0f},
       {1, 3, 1, 2});
   Output x = ops::PlaceholderWithDefault(root.WithOpName("x"), default_input,
                                          {1, 3, 1, 2});

   // Batch-norm parameters, each of shape {3}.
   Output scale =
       ops::Const(root.WithOpName("scale"), {0.3f, 7.0f, 123.0f}, {3});
   Output offset =
       ops::Const(root.WithOpName("offset"), {0.123f, 2.1f, 0.55f}, {3});
   Output mean = ops::Const(root.WithOpName("mean"), {7.3f, 8.3f, 3.1f}, {3});
   Output variance =
       ops::Const(root.WithOpName("variance"), {0.57f, 1.0f, 2.0f}, {3});

   ops::FusedBatchNorm batch_norm(
       root.WithOpName("batch_norm").WithDevice("/device:GPU:0"), x, scale,
       offset, mean, variance,
       ops::FusedBatchNorm::Attrs().IsTraining(false).DataFormat("NCHW"));

   GrapplerItem item;
   item.fetch = {"batch_norm"};
   TF_ASSERT_OK(root.ToGraphDef(&item.graph));

   GraphDef optimized;
   Remapper remapper(RewriterConfig::ON);
   TF_ASSERT_OK(remapper.Optimize(nullptr, item, &optimized));

   // NCHW batch norm is only supported on GPU, so the numerical comparison is
   // performed only when one is present.
   if (GetNumAvailableGPUs() <= 0) return;

   auto expected = EvaluateNodes(item.graph, item.fetch);
   ASSERT_EQ(expected.size(), 1);
   auto actual = EvaluateNodes(optimized, item.fetch);
   ASSERT_EQ(actual.size(), 1);
   test::ExpectTensorNear<float>(actual[0], expected[0], 1e-3);
 }

 // Checks that FusedBatchNormV3 followed by Relu, with every node placed on
 // "/device:GPU:0", is rewritten into a _FusedBatchNormEx node with
 // activation_mode "Relu" and no side inputs, and that "relu" becomes an
 // Identity of the fused node. Runs once per is_training value; iterations
 // whose build requirements are not met (see the #if blocks) are skipped.
 // When a GPU is available the fetched result is also compared against the
 // unoptimized graph.
 TEST_F(RemapperTest, FuseBatchNormWithRelu) {
   using ::tensorflow::ops::Placeholder;

   for (bool is_training : {true, false}) {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();

 #if !defined(GOOGLE_CUDA) || !(CUDNN_VERSION >= 7402)
     if (is_training) {
       LOG(INFO) << "Skip FuseBatchNormWithRelu"
                 << "[is_training=" << is_training << "] "
                 << "test. It requires CUDNN_VERSION >= 7402.";
       continue;
     }
 #endif

 #if !defined(GOOGLE_CUDA)
     if (!is_training) {
       LOG(INFO) << "Skip FuseBatchNormWithRelu"
                 << "[is_training=" << is_training << "]";
       continue;
     }
 #endif

     const int num_channels = 24;

     TensorShape input_shape({2, 8, 8, num_channels});
     TensorShape channel_shape({num_channels});
     TensorShape empty_shape({0});

     // The float input is cast to half before it reaches the batch norm.
     auto input = Placeholder(s.WithOpName("input"), DT_FLOAT,
                              ops::Placeholder::Shape(input_shape));
     auto input_cast = ops::Cast(s.WithOpName("input_cast"), input, DT_HALF);
     auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT);
     auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT);
     auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT);
     auto var = Placeholder(s.WithOpName("var"), DT_FLOAT);

     float epsilon = 0.1f;
     auto fbn = ops::FusedBatchNormV3(
         s.WithOpName("fused_batch_norm"), input_cast, scale, offset, mean, var,
         ops::FusedBatchNormV3::IsTraining(is_training)
             .Epsilon(epsilon)
             .DataFormat("NHWC"));
     auto relu = ops::Relu(s.WithOpName("relu"), fbn.y);
     auto fetch = ops::Identity(s.WithOpName("fetch"), relu);

     // In training mode mean/var are fed as empty tensors; in inference mode
     // they are fed with one value per channel.
     auto input_t = GenerateRandomTensor<DT_FLOAT>(input_shape);
     auto scale_t = GenerateRandomTensor<DT_FLOAT>(channel_shape);
     auto offset_t = GenerateRandomTensor<DT_FLOAT>(channel_shape);
     auto mean_t = GenerateRandomTensor<DT_FLOAT>(is_training ? empty_shape
                                                              : channel_shape);
     auto var_t = GenerateRandomTensor<DT_FLOAT>(is_training ? empty_shape
                                                             : channel_shape);

     GrapplerItem item;
     item.fetch = {"fetch"};
     item.feed = {{"input", input_t},
                  {"scale", scale_t},
                  {"offset", offset_t},
                  {"mean", mean_t},
                  {"var", var_t}};
     TF_ASSERT_OK(s.ToGraphDef(&item.graph));

     // Place all nodes on GPU.
     for (int i = 0; i < item.graph.node_size(); ++i) {
       item.graph.mutable_node(i)->set_device("/device:GPU:0");
     }

     Remapper optimizer(RewriterConfig::AGGRESSIVE);  // trust placeholders shape
     GraphDef output;
     TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));

     int found = 0;
     for (const NodeDef& node : output.node()) {
       if (node.name() == "relu") {
         EXPECT_EQ(node.op(), "Identity");
         ASSERT_EQ(node.input_size(), 1);
         EXPECT_EQ(node.input(0), "fused_batch_norm");
         found++;
       }
       if (node.name() == "fused_batch_norm") {
         EXPECT_EQ(node.op(), "_FusedBatchNormEx");
         ASSERT_EQ(node.input_size(), 5);
         EXPECT_EQ(node.input(0), "input_cast");
         EXPECT_EQ(node.input(1), "scale");
         EXPECT_EQ(node.input(2), "offset");
         EXPECT_EQ(node.input(3), "mean");
         EXPECT_EQ(node.input(4), "var");

         // Read attributes with at() on the node's own map. operator[] on a
         // copy would default-insert a missing attribute, letting the
         // `num_side_inputs == 0` check pass even if the attribute was never
         // set.
         const auto& attr = node.attr();
         EXPECT_EQ(attr.at("num_side_inputs").i(), 0);
         EXPECT_EQ(attr.at("activation_mode").s(), "Relu");
         found++;
       }
     }
     EXPECT_EQ(found, 2);

     if (GetNumAvailableGPUs() > 0) {
       auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
       ASSERT_EQ(tensors_expected.size(), 1);
       auto tensors = EvaluateNodes(output, item.fetch, item.feed);
       ASSERT_EQ(tensors.size(), 1);
       test::ExpectClose(tensors[0], tensors_expected[0], 1e-2, /*rtol=*/1e-2);
     }
   }
 }

 // Checks that FusedBatchNormV3 -> Add(side input) -> Relu, with every node
 // placed on "/device:GPU:0", is rewritten into a _FusedBatchNormEx node with
 // one side input and activation_mode "Relu", and that "relu" becomes an
 // Identity of the fused node. Runs once per is_training value; iterations
 // whose build requirements are not met (see the #if blocks) are skipped.
 // When a GPU is available the fetched result is also compared against the
 // unoptimized graph.
 TEST_F(RemapperTest, FuseBatchNormWithAddAndRelu) {
   using ::tensorflow::ops::Placeholder;

   for (bool is_training : {true, false}) {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();

 #if !defined(GOOGLE_CUDA) || !(CUDNN_VERSION >= 7402)
     if (is_training) {
       LOG(INFO) << "Skip FuseBatchNormWithAddAndRelu"
                 << "[is_training=" << is_training << "] "
                 << "test. It requires CUDNN_VERSION >= 7402.";
       continue;
     }
 #endif

 #if !defined(GOOGLE_CUDA)
     if (!is_training) {
       LOG(INFO) << "Skip FuseBatchNormWithAddAndRelu"
                 << "[is_training=" << is_training << "]";
       continue;
     }
 #endif

     const int num_channels = 24;

     TensorShape input_shape({2, 8, 8, num_channels});
     TensorShape channel_shape({num_channels});
     TensorShape empty_shape({0});

     // Both the main input and the side input are fed as float and cast to
     // half before use.
     auto input = Placeholder(s.WithOpName("input"), DT_FLOAT,
                              ops::Placeholder::Shape(input_shape));
     auto input_cast = ops::Cast(s.WithOpName("input_cast"), input, DT_HALF);
     auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT);
     auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT);
     auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT);
     auto var = Placeholder(s.WithOpName("var"), DT_FLOAT);
     auto side_input = Placeholder(s.WithOpName("side_input"), DT_FLOAT,
                                   ops::Placeholder::Shape(input_shape));
     auto side_input_cast =
         ops::Cast(s.WithOpName("side_input_cast"), side_input, DT_HALF);

     float epsilon = 0.1f;
     auto fbn = ops::FusedBatchNormV3(
         s.WithOpName("fused_batch_norm"), input_cast, scale, offset, mean, var,
         ops::FusedBatchNormV3::IsTraining(is_training)
             .Epsilon(epsilon)
             .DataFormat("NHWC"));
     auto add = ops::Add(s.WithOpName("add"), fbn.y, side_input_cast);
     auto relu = ops::Relu(s.WithOpName("relu"), add);
     auto fetch = ops::Identity(s.WithOpName("fetch"), relu);

     // In training mode mean/var are fed as empty tensors; in inference mode
     // they are fed with one value per channel.
     auto input_t = GenerateRandomTensor<DT_FLOAT>(input_shape);
     auto scale_t = GenerateRandomTensor<DT_FLOAT>(channel_shape);
     auto offset_t = GenerateRandomTensor<DT_FLOAT>(channel_shape);
     auto mean_t = GenerateRandomTensor<DT_FLOAT>(is_training ? empty_shape
                                                              : channel_shape);
     auto var_t = GenerateRandomTensor<DT_FLOAT>(is_training ? empty_shape
                                                             : channel_shape);
     // The side input has the same shape as the main input.
     auto side_input_t = GenerateRandomTensor<DT_FLOAT>(input_shape);

     GrapplerItem item;
     item.fetch = {"fetch"};
     item.feed = {{"input", input_t},   {"scale", scale_t},
                  {"offset", offset_t}, {"mean", mean_t},
                  {"var", var_t},       {"side_input", side_input_t}};
     TF_ASSERT_OK(s.ToGraphDef(&item.graph));

     // Place all nodes on GPU.
     for (int i = 0; i < item.graph.node_size(); ++i) {
       item.graph.mutable_node(i)->set_device("/device:GPU:0");
     }

     Remapper optimizer(RewriterConfig::AGGRESSIVE);  // trust placeholders shape
     GraphDef output;
     TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));

     int found = 0;
     for (const NodeDef& node : output.node()) {
       if (node.name() == "relu") {
         EXPECT_EQ(node.op(), "Identity");
         ASSERT_EQ(node.input_size(), 1);
         EXPECT_EQ(node.input(0), "fused_batch_norm");
         found++;
       }
       if (node.name() == "fused_batch_norm") {
         EXPECT_EQ(node.op(), "_FusedBatchNormEx");
         ASSERT_EQ(node.input_size(), 6);
         EXPECT_EQ(node.input(0), "input_cast");
         EXPECT_EQ(node.input(1), "scale");
         EXPECT_EQ(node.input(2), "offset");
         EXPECT_EQ(node.input(3), "mean");
         EXPECT_EQ(node.input(4), "var");
         EXPECT_EQ(node.input(5), "side_input_cast");

         // Read attributes with at() on the node's own map rather than
         // operator[] on a copy, so a missing attribute is never silently
         // default-inserted.
         const auto& attr = node.attr();
         EXPECT_EQ(attr.at("num_side_inputs").i(), 1);
         EXPECT_EQ(attr.at("activation_mode").s(), "Relu");
         found++;
       }
     }
     EXPECT_EQ(found, 2);

     if (GetNumAvailableGPUs() > 0) {
       auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
       ASSERT_EQ(tensors_expected.size(), 1);
       auto tensors = EvaluateNodes(output, item.fetch, item.feed);
       ASSERT_EQ(tensors.size(), 1);
       test::ExpectClose(tensors[0], tensors_expected[0], 1e-2, /*rtol=*/1e-2);
     }
   }
 }

 // Checks that Conv2D followed by BiasAdd, with every node placed on
 // "/device:CPU:0", is rewritten into one _FusedConv2D node named "bias_add"
 // whose fused_ops list is {"BiasAdd"}, and that the fetched result matches the
 // unoptimized graph within 1e-6.
 TEST_F(RemapperTest, FuseConv2DWithBias) {
   using ::tensorflow::ops::Placeholder;

   tensorflow::Scope root = tensorflow::Scope::NewRootScope();

   auto input = Placeholder(root.WithOpName("input"), DT_FLOAT,
                            ops::Placeholder::Shape({8, 32, 32, 3}));
   auto filter = Placeholder(root.WithOpName("filter"), DT_FLOAT,
                             ops::Placeholder::Shape({1, 1, 3, 128}));
   auto bias = Placeholder(root.WithOpName("bias"), DT_FLOAT,
                           ops::Placeholder::Shape({128}));

   const std::vector<int> unit_strides = {1, 1, 1, 1};
   auto conv = ops::Conv2D(root.WithOpName("conv"), input, filter, unit_strides,
                           "SAME");
   auto bias_add = ops::BiasAdd(root.WithOpName("bias_add"), conv, bias);
   auto fetch = ops::Identity(root.WithOpName("fetch"), bias_add);

   // Random values for each placeholder, matching the declared shapes.
   auto input_value = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
   auto filter_value = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
   auto bias_value = GenerateRandomTensor<DT_FLOAT>({128});

   GrapplerItem item;
   item.fetch = {"fetch"};
   item.feed = {{"input", input_value},
                {"filter", filter_value},
                {"bias", bias_value}};
   TF_ASSERT_OK(root.ToGraphDef(&item.graph));

   // Place all nodes on CPU.
   for (NodeDef& node : *item.graph.mutable_node()) {
     node.set_device("/device:CPU:0");
   }

   GraphDef optimized;
   Remapper remapper(RewriterConfig::ON);
   TF_ASSERT_OK(remapper.Optimize(nullptr, item, &optimized));

   // Exactly one node must carry the fused op.
   int num_fused = 0;
   for (const NodeDef& node : optimized.node()) {
     if (node.name() != "bias_add") continue;

     EXPECT_EQ(node.op(), "_FusedConv2D");
     ASSERT_GE(node.input_size(), 3);
     EXPECT_EQ(node.input(0), "input");
     EXPECT_EQ(node.input(1), "filter");

     EXPECT_EQ(node.attr().at("num_args").i(), 1);
     EXPECT_EQ(node.input(2), "bias");

     const auto& fused_ops = node.attr().at("fused_ops").list().s();
     ASSERT_EQ(fused_ops.size(), 1);
     EXPECT_EQ(fused_ops[0], "BiasAdd");
     ++num_fused;
   }
   EXPECT_EQ(num_fused, 1);

   auto expected = EvaluateNodes(item.graph, item.fetch, item.feed);
   ASSERT_EQ(expected.size(), 1);
   auto actual = EvaluateNodes(optimized, item.fetch, item.feed);
   ASSERT_EQ(actual.size(), 1);
   test::ExpectTensorNear<float>(actual[0], expected[0], 1e-6);
 }

 // Checks that MatMul followed by BiasAdd, with every node placed on
 // "/device:CPU:0", is rewritten into one _FusedMatMul node named "bias_add"
 // whose fused_ops list is {"BiasAdd"}, and that the fetched result matches the
 // unoptimized graph within 1e-6.
 TEST_F(RemapperTest, FuseMatMulWithBias) {
   using ::tensorflow::ops::Placeholder;

   tensorflow::Scope root = tensorflow::Scope::NewRootScope();

   auto lhs = Placeholder(root.WithOpName("lhs"), DT_FLOAT,
                          ops::Placeholder::Shape({8, 32}));
   auto rhs = Placeholder(root.WithOpName("rhs"), DT_FLOAT,
                          ops::Placeholder::Shape({32, 64}));
   auto bias = Placeholder(root.WithOpName("bias"), DT_FLOAT,
                           ops::Placeholder::Shape({64}));

   auto matmul = ops::MatMul(root.WithOpName("matmul"), lhs, rhs);
   auto bias_add = ops::BiasAdd(root.WithOpName("bias_add"), matmul, bias);
   auto fetch = ops::Identity(root.WithOpName("fetch"), bias_add);

   // Random values for each placeholder, matching the declared shapes.
   auto lhs_value = GenerateRandomTensor<DT_FLOAT>({8, 32});
   auto rhs_value = GenerateRandomTensor<DT_FLOAT>({32, 64});
   auto bias_value = GenerateRandomTensor<DT_FLOAT>({64});

   GrapplerItem item;
   item.fetch = {"fetch"};
   item.feed = {{"lhs", lhs_value}, {"rhs", rhs_value}, {"bias", bias_value}};
   TF_ASSERT_OK(root.ToGraphDef(&item.graph));

   // Place all nodes on CPU.
   for (NodeDef& node : *item.graph.mutable_node()) {
     node.set_device("/device:CPU:0");
   }

   GraphDef optimized;
   Remapper remapper(RewriterConfig::ON);
   TF_ASSERT_OK(remapper.Optimize(nullptr, item, &optimized));

   // Exactly one node must carry the fused op.
   int num_fused = 0;
   for (const NodeDef& node : optimized.node()) {
     if (node.name() != "bias_add") continue;

     EXPECT_EQ(node.op(), "_FusedMatMul");
     ASSERT_GE(node.input_size(), 3);
     EXPECT_EQ(node.input(0), "lhs");
     EXPECT_EQ(node.input(1), "rhs");

     EXPECT_EQ(node.attr().at("num_args").i(), 1);
     EXPECT_EQ(node.input(2), "bias");

     const auto& fused_ops = node.attr().at("fused_ops").list().s();
     ASSERT_EQ(fused_ops.size(), 1);
     EXPECT_EQ(fused_ops[0], "BiasAdd");
     ++num_fused;
   }
   EXPECT_EQ(1, num_fused);

   auto expected = EvaluateNodes(item.graph, item.fetch, item.feed);
   ASSERT_EQ(expected.size(), 1);
   auto actual = EvaluateNodes(optimized, item.fetch, item.feed);
   ASSERT_EQ(actual.size(), 1);
   test::ExpectTensorNear<float>(actual[0], expected[0], 1e-6);
 }

 // For each of Relu, Relu6 and Elu, checks that Conv2D -> BiasAdd -> activation
 // (all nodes placed on "/device:CPU:0") is rewritten into one _FusedConv2D
 // node named "activation" whose fused_ops list is {"BiasAdd", <activation>},
 // and that the fetched result matches the unoptimized graph within 1e-6.
 TEST_F(RemapperTest, FuseConv2DWithBiasAndActivation) {
   using ::tensorflow::ops::Placeholder;

   for (const string& activation : {"Relu", "Relu6", "Elu"}) {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();

     auto input_shape = Placeholder::Shape({8, 32, 32, 3});
     auto filter_shape = Placeholder::Shape({1, 1, 3, 128});
     auto bias_shape = Placeholder::Shape({128});

     auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
     auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
     auto bias = Placeholder(s.WithOpName("bias"), DT_FLOAT, bias_shape);

     std::vector<int> strides = {1, 1, 1, 1};
     auto conv =
         ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
     auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), conv, bias);

     // Builds the activation named by `activation` on top of bias_add and
     // wraps it in the "fetch" Identity.
     ops::Identity fetch = [&]() -> ops::Identity {
       auto activate = s.WithOpName("activation");
       auto fetch = s.WithOpName("fetch");

       if (activation == "Relu") {
         return ops::Identity(fetch, ops::Relu(activate, bias_add));
       } else if (activation == "Relu6") {
         return ops::Identity(fetch, ops::Relu6(activate, bias_add));
       } else if (activation == "Elu") {
         return ops::Identity(fetch, ops::Elu(activate, bias_add));
       }

       // Not reached for the activations listed above. Falls back to the
       // un-activated BiasAdd output so the fetched value still depends on the
       // Conv2D/BiasAdd chain, mirroring the batch-norm variant of this test.
       return ops::Identity(fetch, bias_add);
     }();

     auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
     auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
     auto bias_t = GenerateRandomTensor<DT_FLOAT>({128});

     GrapplerItem item;
     item.fetch = {"fetch"};
     item.feed = {{"input", input_t}, {"filter", filter_t}, {"bias", bias_t}};
     TF_ASSERT_OK(s.ToGraphDef(&item.graph));

     // Place all nodes on CPU.
     for (int i = 0; i < item.graph.node_size(); ++i) {
       item.graph.mutable_node(i)->set_device("/device:CPU:0");
     }

     Remapper optimizer(RewriterConfig::ON);
     GraphDef output;
     TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));

     // The fused node is expected to keep the name of the activation node.
     int found = 0;
     for (const NodeDef& node : output.node()) {
       if (node.name() == "activation") {
         EXPECT_EQ(node.op(), "_FusedConv2D");
         ASSERT_GE(node.input_size(), 3);
         EXPECT_EQ(node.input(0), "input");
         EXPECT_EQ(node.input(1), "filter");

         EXPECT_EQ(node.attr().at("num_args").i(), 1);
         EXPECT_EQ(node.input(2), "bias");

         const auto fused_ops = node.attr().at("fused_ops").list().s();
         ASSERT_EQ(fused_ops.size(), 2);
         EXPECT_EQ(fused_ops[0], "BiasAdd");
         EXPECT_EQ(fused_ops[1], activation);
         found++;
       }
     }
     EXPECT_EQ(found, 1);

     auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
     ASSERT_EQ(tensors_expected.size(), 1);
     auto tensors = EvaluateNodes(output, item.fetch, item.feed);
     ASSERT_EQ(tensors.size(), 1);
     test::ExpectTensorNear<float>(tensors[0], tensors_expected[0], 1e-6);
   }
 }

 // For each of Relu, Relu6 and Elu, checks that MatMul -> BiasAdd -> activation
 // (all nodes placed on "/device:CPU:0") is rewritten into one _FusedMatMul
 // node named "activation" whose fused_ops list is {"BiasAdd", <activation>},
 // and that the fetched result matches the unoptimized graph within 1e-6.
 TEST_F(RemapperTest, FuseMatMulWithBiasAndActivation) {
   using ::tensorflow::ops::Placeholder;

   for (const string& activation : {"Relu", "Relu6", "Elu"}) {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();

     auto lhs_shape = ops::Placeholder::Shape({8, 32});
     auto rhs_shape = ops::Placeholder::Shape({32, 64});
     auto bias_shape = ops::Placeholder::Shape({64});

     auto lhs = Placeholder(s.WithOpName("lhs"), DT_FLOAT, lhs_shape);
     auto rhs = Placeholder(s.WithOpName("rhs"), DT_FLOAT, rhs_shape);
     auto bias = Placeholder(s.WithOpName("bias"), DT_FLOAT, bias_shape);

     auto matmul = ops::MatMul(s.WithOpName("matmul"), lhs, rhs);
     auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), matmul, bias);

     // Builds the activation named by `activation` on top of bias_add and
     // wraps it in the "fetch" Identity.
     ops::Identity fetch = [&]() -> ops::Identity {
       auto activate = s.WithOpName("activation");
       auto fetch = s.WithOpName("fetch");

       if (activation == "Relu") {
         return ops::Identity(fetch, ops::Relu(activate, bias_add));
       } else if (activation == "Relu6") {
         return ops::Identity(fetch, ops::Relu6(activate, bias_add));
       } else if (activation == "Elu") {
         return ops::Identity(fetch, ops::Elu(activate, bias_add));
       }

       // Not reached for the activations listed above. Falls back to the
       // un-activated BiasAdd output so the fetched value still depends on the
       // MatMul/BiasAdd chain, mirroring the batch-norm variant of this test.
       return ops::Identity(fetch, bias_add);
     }();

     auto lhs_t = GenerateRandomTensor<DT_FLOAT>({8, 32});
     auto rhs_t = GenerateRandomTensor<DT_FLOAT>({32, 64});
     auto bias_t = GenerateRandomTensor<DT_FLOAT>({64});

     GrapplerItem item;
     item.fetch = {"fetch"};
     item.feed = {{"lhs", lhs_t}, {"rhs", rhs_t}, {"bias", bias_t}};
     TF_ASSERT_OK(s.ToGraphDef(&item.graph));

     // Place all nodes on CPU.
     for (int i = 0; i < item.graph.node_size(); ++i) {
       item.graph.mutable_node(i)->set_device("/device:CPU:0");
     }

     Remapper optimizer(RewriterConfig::ON);
     GraphDef output;
     TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));

     // The fused node is expected to keep the name of the activation node.
     int found = 0;
     for (const NodeDef& node : output.node()) {
       if (node.name() == "activation") {
         EXPECT_EQ(node.op(), "_FusedMatMul");
         ASSERT_GE(node.input_size(), 3);
         EXPECT_EQ(node.input(0), "lhs");
         EXPECT_EQ(node.input(1), "rhs");

         EXPECT_EQ(node.attr().at("num_args").i(), 1);
         EXPECT_EQ(node.input(2), "bias");

         const auto fused_ops = node.attr().at("fused_ops").list().s();
         ASSERT_EQ(fused_ops.size(), 2);
         EXPECT_EQ(fused_ops[0], "BiasAdd");
         EXPECT_EQ(fused_ops[1], activation);
         found++;
       }
     }
     EXPECT_EQ(1, found);

     auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
     ASSERT_EQ(tensors_expected.size(), 1);
     auto tensors = EvaluateNodes(output, item.fetch, item.feed);
     ASSERT_EQ(tensors.size(), 1);
     test::ExpectTensorNear<float>(tensors[0], tensors_expected[0], 1e-6);
   }
 }

 // Checks that an explicitly padded Conv2D followed by an inference-mode
 // FusedBatchNorm, with every node placed on "/device:CPU:0", is rewritten into
 // one _FusedConv2D node named "batch_norm" that takes the four batch-norm
 // parameters as extra inputs, and that the fetched result matches the
 // unoptimized graph within 1e-6.
 TEST_F(RemapperTest, FuseConv2DWithBatchNorm) {
   using ops::Placeholder;

   tensorflow::Scope root = tensorflow::Scope::NewRootScope();

   // scale, offset, mean and variance all share one per-channel shape.
   auto per_channel_shape = ops::Placeholder::Shape({128});

   auto input = Placeholder(root.WithOpName("input"), DT_FLOAT,
                            ops::Placeholder::Shape({8, 32, 32, 3}));
   auto filter = Placeholder(root.WithOpName("filter"), DT_FLOAT,
                             ops::Placeholder::Shape({1, 1, 3, 128}));
   auto scale =
       Placeholder(root.WithOpName("scale"), DT_FLOAT, per_channel_shape);
   auto offset =
       Placeholder(root.WithOpName("offset"), DT_FLOAT, per_channel_shape);
   auto mean = Placeholder(root.WithOpName("mean"), DT_FLOAT, per_channel_shape);
   auto variance =
       Placeholder(root.WithOpName("variance"), DT_FLOAT, per_channel_shape);

   const std::vector<int> unit_strides = {1, 1, 1, 1};
   auto conv = ops::Conv2D(
       root.WithOpName("conv"), input, filter, unit_strides, "EXPLICIT",
       ops::Conv2D::Attrs().ExplicitPaddings({0, 0, 1, 2, 3, 4, 0, 0}));
   auto batch_norm = ops::FusedBatchNorm(
       root.WithOpName("batch_norm"), conv, scale, offset, mean, variance,
       ops::FusedBatchNorm::Attrs().IsTraining(false));
   auto fetch = ops::Identity(root.WithOpName("fetch"), batch_norm.y);

   // Random values for each placeholder, matching the declared shapes.
   auto input_value = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
   auto filter_value = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
   auto scale_value = GenerateRandomTensor<DT_FLOAT>({128});
   auto offset_value = GenerateRandomTensor<DT_FLOAT>({128});
   auto mean_value = GenerateRandomTensor<DT_FLOAT>({128});
   auto variance_value = GenerateRandomTensor<DT_FLOAT>({128});

   GrapplerItem item;
   item.fetch = {"fetch"};
   item.feed = {{"input", input_value}, {"filter", filter_value},
                {"scale", scale_value}, {"offset", offset_value},
                {"mean", mean_value},   {"variance", variance_value}};
   TF_ASSERT_OK(root.ToGraphDef(&item.graph));

   // Place all nodes on CPU.
   for (NodeDef& node : *item.graph.mutable_node()) {
     node.set_device("/device:CPU:0");
   }

   GraphDef optimized;
   Remapper remapper(RewriterConfig::ON);
   TF_ASSERT_OK(remapper.Optimize(nullptr, item, &optimized));

   // Exactly one node must carry the fused op.
   int num_fused = 0;
   for (const NodeDef& node : optimized.node()) {
     if (node.name() != "batch_norm") continue;

     EXPECT_EQ(node.op(), "_FusedConv2D");
     ASSERT_GE(node.input_size(), 6);
     EXPECT_EQ(node.input(0), "input");
     EXPECT_EQ(node.input(1), "filter");

     EXPECT_EQ(node.attr().at("num_args").i(), 4);
     EXPECT_EQ(node.input(2), "scale");
     EXPECT_EQ(node.input(3), "offset");
     EXPECT_EQ(node.input(4), "mean");
     EXPECT_EQ(node.input(5), "variance");

     const auto& fused_ops = node.attr().at("fused_ops").list().s();
     ASSERT_EQ(fused_ops.size(), 1);
     EXPECT_EQ(fused_ops[0], "FusedBatchNorm");
     ++num_fused;
   }
   EXPECT_EQ(num_fused, 1);

   auto expected = EvaluateNodes(item.graph, item.fetch, item.feed);
   ASSERT_EQ(expected.size(), 1);
   auto actual = EvaluateNodes(optimized, item.fetch, item.feed);
   ASSERT_EQ(actual.size(), 1);
   test::ExpectTensorNear<float>(actual[0], expected[0], 1e-6);
 }

 // For each of Relu, Relu6 and Elu, checks that Conv2D -> inference-mode
 // FusedBatchNorm -> activation (all nodes placed on "/device:CPU:0") is
 // rewritten into one _FusedConv2D node named "activation" whose fused_ops list
 // is {"FusedBatchNorm", <activation>}, and that the fetched result matches the
 // unoptimized graph within 1e-6.
 TEST_F(RemapperTest, FuseConv2DWithBatchNormAndActivation) {
   using ops::Placeholder;

   for (const string& activation : {"Relu", "Relu6", "Elu"}) {
     tensorflow::Scope root = tensorflow::Scope::NewRootScope();

     // scale, offset, mean and variance all share one per-channel shape.
     auto per_channel_shape = ops::Placeholder::Shape({128});

     auto input = Placeholder(root.WithOpName("input"), DT_FLOAT,
                              ops::Placeholder::Shape({8, 32, 32, 3}));
     auto filter = Placeholder(root.WithOpName("filter"), DT_FLOAT,
                               ops::Placeholder::Shape({1, 1, 3, 128}));
     auto scale =
         Placeholder(root.WithOpName("scale"), DT_FLOAT, per_channel_shape);
     auto offset =
         Placeholder(root.WithOpName("offset"), DT_FLOAT, per_channel_shape);
     auto mean =
         Placeholder(root.WithOpName("mean"), DT_FLOAT, per_channel_shape);
     auto variance =
         Placeholder(root.WithOpName("variance"), DT_FLOAT, per_channel_shape);

     const std::vector<int> unit_strides = {1, 1, 1, 1};
     auto conv = ops::Conv2D(root.WithOpName("conv"), input, filter,
                             unit_strides, "SAME");
     auto batch_norm = ops::FusedBatchNorm(
         root.WithOpName("batch_norm"), conv, scale, offset, mean, variance,
         ops::FusedBatchNorm::Attrs().IsTraining(false));

     // Builds the activation named by `activation` on top of the batch-norm
     // output and wraps it in the "fetch" Identity. For any other name the
     // batch-norm output is fetched directly.
     ops::Identity fetch = [&]() -> ops::Identity {
       auto activation_scope = root.WithOpName("activation");
       auto fetch_scope = root.WithOpName("fetch");

       if (activation == "Relu") {
         return ops::Identity(fetch_scope,
                              ops::Relu(activation_scope, batch_norm.y));
       }
       if (activation == "Relu6") {
         return ops::Identity(fetch_scope,
                              ops::Relu6(activation_scope, batch_norm.y));
       }
       if (activation == "Elu") {
         return ops::Identity(fetch_scope,
                              ops::Elu(activation_scope, batch_norm.y));
       }
       return ops::Identity(fetch_scope, batch_norm.y);
     }();

     // Random values for each placeholder, matching the declared shapes.
     auto input_value = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
     auto filter_value = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
     auto scale_value = GenerateRandomTensor<DT_FLOAT>({128});
     auto offset_value = GenerateRandomTensor<DT_FLOAT>({128});
     auto mean_value = GenerateRandomTensor<DT_FLOAT>({128});
     auto variance_value = GenerateRandomTensor<DT_FLOAT>({128});

     GrapplerItem item;
     item.fetch = {"fetch"};
     item.feed = {{"input", input_value}, {"filter", filter_value},
                  {"scale", scale_value}, {"offset", offset_value},
                  {"mean", mean_value},   {"variance", variance_value}};
     TF_ASSERT_OK(root.ToGraphDef(&item.graph));

     // Place all nodes on CPU.
     for (NodeDef& node : *item.graph.mutable_node()) {
       node.set_device("/device:CPU:0");
     }

     GraphDef optimized;
     Remapper remapper(RewriterConfig::ON);
     TF_ASSERT_OK(remapper.Optimize(nullptr, item, &optimized));

     // Exactly one node must carry the fused op.
     int num_fused = 0;
     for (const NodeDef& node : optimized.node()) {
       if (node.name() != "activation") continue;

       EXPECT_EQ(node.op(), "_FusedConv2D");
       ASSERT_GE(node.input_size(), 6);
       EXPECT_EQ(node.input(0), "input");
       EXPECT_EQ(node.input(1), "filter");

       EXPECT_EQ(node.attr().at("num_args").i(), 4);
       EXPECT_EQ(node.input(2), "scale");
       EXPECT_EQ(node.input(3), "offset");
       EXPECT_EQ(node.input(4), "mean");
       EXPECT_EQ(node.input(5), "variance");

       const auto& fused_ops = node.attr().at("fused_ops").list().s();
       ASSERT_EQ(fused_ops.size(), 2);
       EXPECT_EQ(fused_ops[0], "FusedBatchNorm");
       EXPECT_EQ(fused_ops[1], activation);
       ++num_fused;
     }
     EXPECT_EQ(num_fused, 1);

     auto expected = EvaluateNodes(item.graph, item.fetch, item.feed);
     ASSERT_EQ(expected.size(), 1);
     auto actual = EvaluateNodes(optimized, item.fetch, item.feed);
     ASSERT_EQ(actual.size(), 1);
     test::ExpectTensorNear<float>(actual[0], expected[0], 1e-6);
   }
 }

 // Checks the Conv2D -> Squeeze -> BiasAdd pattern with every node placed on
 // "/device:CPU:0". After optimization the node named "conv" is expected to be
 // a _FusedConv2D with fused_ops {"BiasAdd"}, and the node named "bias_add" is
 // expected to be a Squeeze reading from "conv". The fetched result must match
 // the unoptimized graph within 1e-6.
 TEST_F(RemapperTest, FuseConv2DWithSqueezeAndBias) {
   using ops::Placeholder;

   tensorflow::Scope root = tensorflow::Scope::NewRootScope();

   auto input = Placeholder(root.WithOpName("input"), DT_FLOAT,
                            ops::Placeholder::Shape({8, 32, 1, 3}));
   auto filter = Placeholder(root.WithOpName("filter"), DT_FLOAT,
                             ops::Placeholder::Shape({1, 1, 3, 128}));
   auto bias = Placeholder(root.WithOpName("bias"), DT_FLOAT,
                           ops::Placeholder::Shape({128}));

   const std::vector<int> unit_strides = {1, 1, 1, 1};
   auto conv = ops::Conv2D(root.WithOpName("conv"), input, filter, unit_strides,
                           "SAME");

   // Squeeze axis 2 of the convolution output before adding the bias.
   auto squeeze = ops::Squeeze(root.WithOpName("squeeze"), conv,
                               ops::Squeeze::Attrs().Axis({2}));

   auto bias_add = ops::BiasAdd(root.WithOpName("bias_add"), squeeze, bias);
   auto fetch = ops::Identity(root.WithOpName("fetch"), bias_add);

   // Random values for each placeholder, matching the declared shapes.
   auto input_value = GenerateRandomTensor<DT_FLOAT>({8, 32, 1, 3});
   auto filter_value = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
   auto bias_value = GenerateRandomTensor<DT_FLOAT>({128});

   GrapplerItem item;
   item.fetch = {"fetch"};
   item.feed = {{"input", input_value},
                {"filter", filter_value},
                {"bias", bias_value}};
   TF_ASSERT_OK(root.ToGraphDef(&item.graph));

   // Place all nodes on CPU.
   for (NodeDef& node : *item.graph.mutable_node()) {
     node.set_device("/device:CPU:0");
   }

   GraphDef optimized;
   Remapper remapper(RewriterConfig::ON);
   TF_ASSERT_OK(remapper.Optimize(nullptr, item, &optimized));

   // Both rewritten nodes must be present.
   int num_matched = 0;
   for (const NodeDef& node : optimized.node()) {
     const bool is_conv = node.name() == "conv";
     const bool is_bias_add = !is_conv && node.name() == "bias_add";

     if (is_conv) {
       EXPECT_EQ(node.op(), "_FusedConv2D");
       ASSERT_GE(node.input_size(), 3);
       EXPECT_EQ(node.input(0), "input");
       EXPECT_EQ(node.input(1), "filter");

       EXPECT_EQ(node.attr().at("num_args").i(), 1);
       EXPECT_EQ(node.input(2), "bias");

       const auto& fused_ops = node.attr().at("fused_ops").list().s();
       ASSERT_EQ(fused_ops.size(), 1);
       EXPECT_EQ(fused_ops[0], "BiasAdd");
       ++num_matched;
     } else if (is_bias_add) {
       EXPECT_EQ(node.op(), "Squeeze");
       ASSERT_GE(node.input_size(), 1);
       EXPECT_EQ(node.input(0), "conv");
       ++num_matched;
     }
   }
   EXPECT_EQ(num_matched, 2);

   auto expected = EvaluateNodes(item.graph, item.fetch, item.feed);
   ASSERT_EQ(expected.size(), 1);
   auto actual = EvaluateNodes(optimized, item.fetch, item.feed);
   ASSERT_EQ(actual.size(), 1);
   test::ExpectTensorNear<float>(actual[0], expected[0], 1e-6);
 }

 }  // namespace grappler
 }  // namespace tensorflow