/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "absl/algorithm/container.h"
#include "tensorflow/cc/ops/nn_ops_internal.h"
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/core/protobuf/rewriter_config.pb.h"
#include "tensorflow/core/public/session.h"
namespace tensorflow {
namespace {
template <typename T>
class FusedMatMulOpTest : public OpsTestBase {
protected:
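// Callback that runs a MatMul+BiasAdd graph (fused or unfused) on the given
// LHS, RHS and bias tensors and writes the result into `out`.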
using BiasAddGraphRunner =
std::function<void(const Tensor& lhs_data, const Tensor& rhs_data,
const Tensor& bias_data, Tensor* out)>;
// Runs a TensorFlow graph defined by the root scope and fetches the result
// of the 'fetch' node into the output Tensor. The optional `fetch_node`
// parameter allows defining the fetch node directly from a NodeDef, for ops
// that are not supported by the C++ API.
void RunAndFetch(const tensorflow::Scope& root, const string& fetch,
Tensor* output, bool allow_gpu_device,
const NodeDef* fetch_node = nullptr) {
tensorflow::GraphDef graph;
TF_ASSERT_OK(root.ToGraphDef(&graph));
if (fetch_node) {
*graph.add_node() = *fetch_node;
}
// We really want to make sure that the graph is executed exactly as we
// passed it to the session, so we disable various optimizations.
tensorflow::SessionOptions session_options;
// Disable common runtime constant folding.
session_options.config.mutable_graph_options()
->mutable_optimizer_options()
->set_opt_level(OptimizerOptions::L0);
// Disable Grappler optimizations for tests.
tensorflow::RewriterConfig* cfg =
session_options.config.mutable_graph_options()
->mutable_rewrite_options();
cfg->set_constant_folding(tensorflow::RewriterConfig::OFF);
cfg->set_layout_optimizer(tensorflow::RewriterConfig::OFF);
cfg->set_remapping(tensorflow::RewriterConfig::OFF);
std::unique_ptr<tensorflow::Session> session(
tensorflow::NewSession(session_options));
std::vector<DeviceAttributes> available_devices;
TF_ASSERT_OK(session->ListDevices(&available_devices))
<< "Failed to get available session devices";
// Check whether the session has an available GPU device.
const bool has_gpu_device =
absl::c_any_of(available_devices, [](const DeviceAttributes& device) {
return device.device_type() == DEVICE_GPU;
});
// If the fused computation is implemented only for CPU, we don't want to
// compare GPU vs CPU numbers in this test, so we place all nodes on CPU.
const bool place_all_on_gpu = allow_gpu_device && has_gpu_device;
const string device = place_all_on_gpu ? "/device:GPU:0" : "/device:CPU:0";
for (NodeDef& mutable_node : *graph.mutable_node()) {
mutable_node.set_device(device);
}
TF_ASSERT_OK(session->Create(graph));
std::vector<Tensor> unfused_tensors;
TF_ASSERT_OK(session->Run({}, {fetch}, {}, &unfused_tensors));
*output = unfused_tensors[0];
}
void RunMatMulWithBias(const Tensor& lhs_data, const Tensor& rhs_data,
const Tensor& bias_data, bool transpose_a,
bool transpose_b, Tensor* output,
bool allow_gpu_device = false) {
Scope root = tensorflow::Scope::NewRootScope();
ops::MatMul matmul = ops::MatMul(
root.WithOpName("matmul"),
ops::Const(root.WithOpName("lhs"), Input::Initializer(lhs_data)),
ops::Const(root.WithOpName("rhs"), Input::Initializer(rhs_data)),
ops::MatMul::Attrs().TransposeA(transpose_a).TransposeB(transpose_b));
ops::BiasAdd with_bias = ops::BiasAdd(
root.WithOpName("with_bias"), matmul,
ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
RunAndFetch(root, "with_bias", output, allow_gpu_device);
}
void RunMatMulWithBiasAndActivation(
const Tensor& lhs_data, const Tensor& rhs_data, const Tensor& bias_data,
bool transpose_a, bool transpose_b, const string& activation_type,
Tensor* output, bool allow_gpu_device = false) {
Scope root = tensorflow::Scope::NewRootScope();
ops::MatMul matmul = ops::MatMul(
root.WithOpName("matmul"),
ops::Const(root.WithOpName("lhs"), Input::Initializer(lhs_data)),
ops::Const(root.WithOpName("rhs"), Input::Initializer(rhs_data)),
ops::MatMul::Attrs().TransposeA(transpose_a).TransposeB(transpose_b));
ops::BiasAdd with_bias = ops::BiasAdd(
root.WithOpName("with_bias"), matmul,
ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
if (activation_type == "Relu") {
ops::Relu(root.WithOpName("with_activation"), with_bias);
} else if (activation_type == "Relu6") {
ops::Relu6(root.WithOpName("with_activation"), with_bias);
} else if (activation_type == "Elu") {
ops::Elu(root.WithOpName("with_activation"), with_bias);
} else if (activation_type == "LeakyRelu") {
ops::internal::LeakyRelu(root.WithOpName("with_activation"), with_bias);
} else {
ops::Identity(root.WithOpName("with_activation"), with_bias);
}
RunAndFetch(root, "with_activation", output, allow_gpu_device);
}
void RunFusedMatMulOp(const Tensor& lhs_data, const Tensor& rhs_data,
const std::vector<Tensor>& args_data,
const std::vector<string>& fused_ops, bool transpose_a,
bool transpose_b, Tensor* output,
bool allow_gpu_device = false) {
Scope root = tensorflow::Scope::NewRootScope();
DataType dtype = DataTypeToEnum<T>::v();
int num_args = static_cast<int>(args_data.size());
Output lhs =
ops::Const(root.WithOpName("lhs"), Input::Initializer(lhs_data));
Output rhs =
ops::Const(root.WithOpName("rhs"), Input::Initializer(rhs_data));
std::vector<NodeDefBuilder::NodeOut> args;
for (int i = 0; i < num_args; ++i) {
Output arg = ops::Const(root.WithOpName(absl::StrCat("arg", i)),
Input::Initializer(args_data[i]));
args.emplace_back(arg.name(), 0, dtype);
}
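// The _FusedMatMul node is built manually with NodeDefBuilder (it is not
// exposed through the C++ ops API; see RunAndFetch above) and passed to
// RunAndFetch as `fetch_node`.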
NodeDef fused_matmul;
TF_EXPECT_OK(NodeDefBuilder("fused_matmul", "_FusedMatMul")
.Input({lhs.name(), 0, dtype})
.Input({rhs.name(), 0, dtype})
.Input(args)
.Attr("num_args", num_args)
.Attr("T", dtype)
.Attr("fused_ops", fused_ops)
.Attr("transpose_a", transpose_a)
.Attr("transpose_b", transpose_b)
.Finalize(&fused_matmul));
RunAndFetch(root, fused_matmul.name(), output, allow_gpu_device,
&fused_matmul);
}
void VerifyBiasAddTensorsNear(int m, int k, int n,
const BiasAddGraphRunner& run_default,
const BiasAddGraphRunner& run_fused) {
DataType dtype = DataTypeToEnum<T>::v();
Tensor lhs(dtype, {m, k});
lhs.flat<T>() = lhs.flat<T>().setRandom();
// Shift the RHS to include some negative values to properly test Relu.
Tensor rhs(dtype, {k, n});
rhs.flat<T>() = rhs.flat<T>().setRandom();
rhs.flat<T>() -= rhs.flat<T>().constant(static_cast<T>(0.5f));
// The bias is added along the inner (N) dimension.
const int bias_size = n;
Tensor bias(dtype, {bias_size});
bias.flat<T>() = bias.flat<T>().setRandom();
bias.flat<T>() += bias.flat<T>().constant(static_cast<T>(0.5f));
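// Run the reference (unfused) graph and the fused graph on the same inputs.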
Tensor matmul;
Tensor fused_matmul;
run_default(lhs, rhs, bias, &matmul);
run_fused(lhs, rhs, bias, &fused_matmul);
ASSERT_EQ(matmul.dtype(), fused_matmul.dtype());
ASSERT_EQ(matmul.shape(), fused_matmul.shape());
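// The fused and unfused graphs may run different kernels, so compare with a
// small absolute tolerance rather than exact equality.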
test::ExpectClose(matmul, fused_matmul, /*atol=*/1e-5);
}
// Verifies that computing MatMul+BiasAdd in a graph is identical to
// FusedMatMul.
void VerifyMatMulWithBias(int m, int k, int n, bool transpose_a,
bool transpose_b) {
const BiasAddGraphRunner run_default =
[&](const Tensor& input_data, const Tensor& filter_data,
const Tensor& bias_data, Tensor* out) {
RunMatMulWithBias(input_data, filter_data, bias_data, transpose_a,
transpose_b, out);
};
const BiasAddGraphRunner run_fused =
[&](const Tensor& input_data, const Tensor& filter_data,
const Tensor& bias_data, Tensor* out) {
RunFusedMatMulOp(input_data, filter_data, {bias_data}, {"BiasAdd"},
transpose_a, transpose_b, out);
};
VerifyBiasAddTensorsNear(m, k, n, run_default, run_fused);
}
// Verifies that computing MatMul+BiasAdd+{Activation} in a graph is identical
// to FusedMatMul.
void VerifyMatMulWithBiasAndActivation(int m, int k, int n, bool transpose_a,
                                       bool transpose_b,
                                       const string& activation) {
const BiasAddGraphRunner run_default = [&](const Tensor& input_data,
const Tensor& filter_data,
const Tensor& bias_data,
Tensor* out) {
RunMatMulWithBiasAndActivation(input_data, filter_data, bias_data,
transpose_a, transpose_b, activation, out);
};
const BiasAddGraphRunner run_fused = [&](const Tensor& input_data,
const Tensor& filter_data,
const Tensor& bias_data,
Tensor* out) {
RunFusedMatMulOp(input_data, filter_data, {bias_data},
{"BiasAdd", activation}, transpose_a, transpose_b, out);
};
VerifyBiasAddTensorsNear(m, k, n, run_default, run_fused);
}
};
// The fused MatMul with bias tests currently run only with `T=float`; see
// the FusedBiasAddDataTypes list below.
template <typename T>
class FusedMatMulWithBiasOpTest : public FusedMatMulOpTest<T> {};
TYPED_TEST_SUITE_P(FusedMatMulWithBiasOpTest);
// -------------------------------------------------------------------------- //
// MatMul + BiasAdd + {Activation} //
// -------------------------------------------------------------------------- //
TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul256x256x256) {
this->VerifyMatMulWithBias(256, 256, 256, false, false);
this->VerifyMatMulWithBias(256, 256, 256, true, false);
this->VerifyMatMulWithBias(256, 256, 256, false, true);
this->VerifyMatMulWithBias(256, 256, 256, true, true);
}
TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul1x256x256) {
this->VerifyMatMulWithBias(1, 256, 256, false, false);
}
TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul256x256x1) {
this->VerifyMatMulWithBias(256, 256, 1, false, false);
}
TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul1x256x1) {
this->VerifyMatMulWithBias(1, 256, 1, false, false);
}
TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul256x256x256WithActivation) {
for (const string& activation : {"Relu", "Relu6", "Elu", "LeakyRelu"}) {
this->VerifyMatMulWithBiasAndActivation(256, 256, 256, false, false,
                                        activation);
this->VerifyMatMulWithBiasAndActivation(256, 256, 256, true, false,
                                        activation);
this->VerifyMatMulWithBiasAndActivation(256, 256, 256, false, true,
                                        activation);
this->VerifyMatMulWithBiasAndActivation(256, 256, 256, true, true,
                                        activation);
}
}
TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul1x256x256WithActivation) {
for (const string& activation : {"Relu", "Relu6", "Elu", "LeakyRelu"}) {
this->VerifyMatMulWithBiasAndActivation(1, 256, 256, false, false,
                                        activation);
}
}
TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul256x256x1WithActivation) {
for (const string& activation : {"Relu", "Relu6", "Elu", "LeakyRelu"}) {
this->VerifyMatMulWithBiasAndActivation(256, 256, 1, false, false,
                                        activation);
}
}
TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul1x256x1WithActivation) {
for (const string& activation : {"Relu", "Relu6", "Elu", "LeakyRelu"}) {
this->VerifyMatMulWithBiasAndActivation(1, 256, 1, false, false,
                                        activation);
}
}
REGISTER_TYPED_TEST_SUITE_P(FusedMatMulWithBiasOpTest, //
MatMul256x256x256, //
MatMul1x256x256, //
MatMul256x256x1, //
MatMul1x256x1, //
MatMul256x256x256WithActivation, //
MatMul1x256x256WithActivation, //
MatMul256x256x1WithActivation, //
MatMul1x256x1WithActivation);
// TODO(ezhulenev): Add support for more data types.
using FusedBiasAddDataTypes = ::testing::Types<float>;
INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedMatMulWithBiasOpTest,
FusedBiasAddDataTypes);
//----------------------------------------------------------------------------//
// Performance benchmarks are below. //
//----------------------------------------------------------------------------//
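// Builds a graph with a single MatMul of random constant inputs.
// `transpose_a`/`transpose_b` select the stored layout of each operand.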
template <typename T>
static Graph* Matmul(int m, int k, int n, bool transpose_a, bool transpose_b,
DataType type) {
Graph* g = new Graph(OpRegistry::Global());
Tensor in0(type, transpose_a ? TensorShape({k, m}) : TensorShape({m, k}));
in0.flat<T>().setRandom();
Tensor in1(type, transpose_b ? TensorShape({n, k}) : TensorShape({k, n}));
in1.flat<T>().setRandom();
test::graph::Matmul(g, test::graph::Constant(g, in0),
test::graph::Constant(g, in1), transpose_a, transpose_b);
return g;
}
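// M, K, N are the matmul dimensions, TA/TB the transpose flags, T the C++
// scalar type, TFTYPE the TensorFlow dtype, and DEVICE the placement (cpu or
// gpu). Each iteration is counted as 2 * M * K * N processed items (one
// multiply and one add per output element).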
#define BM_MatmulDev(M, K, N, TA, TB, T, TFTYPE, DEVICE) \
static void BM_Matmul##_##M##_##K##_##N##_##TA##_##TB##_##TFTYPE##_##DEVICE( \
int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * M * K * N * 2); \
test::Benchmark(#DEVICE, Matmul<T>(M, K, N, TA, TB, TFTYPE)).Run(iters); \
} \
BENCHMARK(BM_Matmul##_##M##_##K##_##N##_##TA##_##TB##_##TFTYPE##_##DEVICE);
#ifdef GOOGLE_CUDA
#define BM_Matmul(M, K, N, TA, TB) \
BM_MatmulDev(M, K, N, TA, TB, float, DT_FLOAT, cpu); \
BM_MatmulDev(M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64, cpu); \
BM_MatmulDev(M, K, N, TA, TB, float, DT_FLOAT, gpu); \
BM_MatmulDev(M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64, gpu); \
/* Uncomment to enable benchmarks for double/complex128: */ \
// BM_MatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, cpu); \
// BM_MatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu); \
// BM_MatmulDev(M, K, N, TA, TB, double, DT_DOUBLE, gpu); \
// BM_MatmulDev(M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, gpu);
#else
#define BM_Matmul(M, K, N, TA, TB) \
BM_MatmulDev(M, K, N, TA, TB, float, DT_FLOAT, cpu); \
BM_MatmulDev(M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64, cpu);
#endif // GOOGLE_CUDA
// Batch size of 1 included for inference.
// Typical fully connected layers
BM_Matmul(1, 512, 512, false, false);
BM_Matmul(8, 512, 512, false, false);
BM_Matmul(16, 512, 512, false, false);
BM_Matmul(128, 512, 512, false, false);
BM_Matmul(1, 1024, 1024, false, false);
BM_Matmul(8, 1024, 1024, false, false);
BM_Matmul(16, 1024, 1024, false, false);
BM_Matmul(128, 1024, 1024, false, false);
BM_Matmul(4096, 4096, 4096, false, false);
// Backward for fully connected layers
BM_Matmul(1, 1024, 1024, false, true);
BM_Matmul(8, 1024, 1024, false, true);
BM_Matmul(16, 1024, 1024, false, true);
BM_Matmul(128, 1024, 1024, false, true);
// Forward softmax with large output size
BM_Matmul(1, 200, 10000, false, false);
BM_Matmul(8, 200, 10000, false, false);
BM_Matmul(20, 200, 10000, false, false);
BM_Matmul(20, 200, 20000, false, false);
// Backward softmax with large output size
BM_Matmul(1, 10000, 200, false, true);
BM_Matmul(1, 10000, 200, false, false);
BM_Matmul(8, 10000, 200, false, true);
BM_Matmul(20, 10000, 200, false, true);
BM_Matmul(20, 20000, 200, false, true);
// Test some matrix-vector multiplies.
BM_Matmul(50, 50, 1, false, false);
BM_Matmul(50, 50, 1, true, false);
BM_Matmul(50, 50, 1, false, true);
BM_Matmul(50, 50, 1, true, true);
BM_Matmul(500, 500, 1, false, false);
BM_Matmul(500, 500, 1, true, false);
BM_Matmul(500, 500, 1, false, true);
BM_Matmul(500, 500, 1, true, true);
BM_Matmul(2000, 2000, 1, false, false);
BM_Matmul(2000, 2000, 1, true, false);
BM_Matmul(2000, 2000, 1, false, true);
BM_Matmul(2000, 2000, 1, true, true);
// Test some vector-matrix multiplies.
BM_Matmul(1, 50, 50, false, false);
BM_Matmul(1, 50, 50, true, false);
BM_Matmul(1, 50, 50, false, true);
BM_Matmul(1, 50, 50, true, true);
BM_Matmul(1, 500, 500, false, false);
BM_Matmul(1, 500, 500, true, false);
BM_Matmul(1, 500, 500, false, true);
BM_Matmul(1, 500, 500, true, true);
BM_Matmul(1, 2000, 2000, false, false);
BM_Matmul(1, 2000, 2000, true, false);
BM_Matmul(1, 2000, 2000, false, true);
BM_Matmul(1, 2000, 2000, true, true);
// Test some rank-one products.
BM_Matmul(50, 1, 50, false, false);
BM_Matmul(50, 1, 50, true, false);
BM_Matmul(50, 1, 50, false, true);
BM_Matmul(50, 1, 50, true, true);
BM_Matmul(500, 1, 500, false, false);
BM_Matmul(500, 1, 500, true, false);
BM_Matmul(500, 1, 500, false, true);
BM_Matmul(500, 1, 500, true, true);
BM_Matmul(2000, 1, 2000, false, false);
BM_Matmul(2000, 1, 2000, true, false);
BM_Matmul(2000, 1, 2000, false, true);
BM_Matmul(2000, 1, 2000, true, true);
// Benchmarks for batched matmul with broadcasting.
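// Adds a BroadcastTo node that broadcasts `input` to the given `shape`.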
Node* BroadcastTo(Graph* g, Node* input, Node* shape) {
Node* ret;
TF_CHECK_OK(NodeBuilder(g->NewName("n"), "BroadcastTo")
.Input(input)
.Input(shape)
.Finalize(g, &ret));
return ret;
}
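// Adds a BatchMatMulV2 node multiplying `in0` and `in1` with the given
// adjoint flags.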
Node* BatchMatmulV2(Graph* g, Node* in0, Node* in1, bool adj_x, bool adj_y) {
Node* ret;
TF_CHECK_OK(NodeBuilder(g->NewName("n"), "BatchMatMulV2")
.Input(in0)
.Input(in1)
.Attr("adj_x", adj_x)
.Attr("adj_y", adj_y)
.Finalize(g, &ret));
return ret;
}
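// Builds a graph with a single BatchMatMul over `b` batches of [m, k] x
// [k, n] matrices, with optional adjoints.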
template <typename T>
static Graph* BatchMatmul(int b, int m, int k, int n, bool adjoint_a,
bool adjoint_b, DataType type) {
Graph* g = new Graph(OpRegistry::Global());
Tensor in0(type, adjoint_a ? TensorShape({b, k, m}) : TensorShape({b, m, k}));
in0.flat<T>().setRandom();
Tensor in1(type, adjoint_b ? TensorShape({b, n, k}) : TensorShape({b, k, n}));
in1.flat<T>().setRandom();
test::graph::BatchMatmul(g, test::graph::Constant(g, in0),
test::graph::Constant(g, in1), adjoint_a, adjoint_b);
return g;
}
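// Builds a BatchMatMulV2 graph where the LHS batch size is `b0` and the RHS
// batch size is `b1`. With `manual_broadcast` both operands are expanded to
// the common batch size with explicit BroadcastTo nodes; otherwise
// BatchMatMulV2 broadcasts the batch dimension implicitly.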
template <typename T>
static Graph* BatchMatmulWithBroadcast(int b0, int b1, int m, int k, int n,
bool manual_broadcast, DataType type) {
Graph* g = new Graph(OpRegistry::Global());
Tensor in0(type, TensorShape({b0, m, k}));
in0.flat<T>().setRandom();
Tensor in1(type, TensorShape({b1, k, n}));
in1.flat<T>().setRandom();
Tensor broadcasted_in0_shape(DT_INT64, TensorShape({3}));
Tensor broadcasted_in1_shape(DT_INT64, TensorShape({3}));
Node* in0_node = nullptr;
Node* in1_node = nullptr;
if (manual_broadcast) {
for (int i = 0; i < 3; ++i) {
auto vec0 = broadcasted_in0_shape.vec<int64>();
auto vec1 = broadcasted_in1_shape.vec<int64>();
vec0(i) = (i == 0 ? std::max(b0, b1) : in0.shape().dim_size(i));
vec1(i) = (i == 0 ? std::max(b0, b1) : in1.shape().dim_size(i));
}
in0_node = BroadcastTo(g, test::graph::Constant(g, in0),
test::graph::Constant(g, broadcasted_in0_shape));
in1_node = BroadcastTo(g, test::graph::Constant(g, in1),
test::graph::Constant(g, broadcasted_in1_shape));
} else {
in0_node = test::graph::Constant(g, in0);
in1_node = test::graph::Constant(g, in1);
}
BatchMatmulV2(g, in0_node, in1_node, false, false);
return g;
}
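// Same as BM_MatmulDev above, with an additional batch dimension B.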
#define BM_BatchMatmulDev(B, M, K, N, TA, TB, T, TFTYPE, DEVICE) \
static void \
BM_BatchMatmul##_##B##_##M##_##K##_##N##_##TA##_##TB##_##TFTYPE##_##DEVICE( \
int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * B * M * K * N * 2); \
test::Benchmark(#DEVICE, BatchMatmul<T>(B, M, K, N, TA, TB, TFTYPE)) \
.Run(iters); \
} \
BENCHMARK( \
BM_BatchMatmul##_##B##_##M##_##K##_##N##_##TA##_##TB##_##TFTYPE##_##DEVICE);
#define BM_BatchMatmul(B, M, K, N, TA, TB) \
BM_BatchMatmulDev(B, M, K, N, TA, TB, float, DT_FLOAT, cpu);
// BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64, cpu);
// BM_BatchMatmulDev(B, M, K, N, TA, TB, float, DT_FLOAT, gpu);
/* Uncomment to enable benchmarks for double & complex types: */
// BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<float>, DT_COMPLEX64, gpu);
// BM_BatchMatmulDev(B, M, K, N, TA, TB, double, DT_DOUBLE, cpu);
// BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, cpu);
// BM_BatchMatmulDev(B, M, K, N, TA, TB, double, DT_DOUBLE, gpu);
// BM_BatchMatmulDev(B, M, K, N, TA, TB, std::complex<double>, DT_COMPLEX128, gpu);
// Macro argument names: -------------------------------------------------- //
// B1: batch size of LHS
// B2: batch size of RHS
// M: outer dimension of LHS
// K: inner dimensions of LHS and RHS
// N: outer dimension of RHS
// MB: boolean indicating whether to use manual broadcasting
// T: C++ type of scalars (e.g. float, std::complex<float>)
// TT: TensorFlow type of scalars (e.g. DT_FLOAT, DT_COMPLEX128)
// D: Device (e.g. cpu, gpu)
#define BM_BatchMatmulBCastDev(B1, B2, M, K, N, MB, T, TT, D) \
static void \
BM_BatchMatmulBCast##_##B1##_##B2##_##M##_##K##_##N##_##MB##_##TT##_##D( \
int iters) { \
testing::UseRealTime(); \
testing::ItemsProcessed(static_cast<int64>(iters) * std::max(B1, B2) * M * \
K * N * 2); \
test::Benchmark(#D, BatchMatmulWithBroadcast<T>(B1, B2, M, K, N, MB, TT)) \
.Run(iters); \
} \
BENCHMARK( \
BM_BatchMatmulBCast##_##B1##_##B2##_##M##_##K##_##N##_##MB##_##TT##_##D);
#define BM_BatchMatmulBCast(B1, B2, M, K, N, MB) \
BM_BatchMatmulBCastDev(B1, B2, M, K, N, MB, float, DT_FLOAT, cpu);
// Typical fully connected layers
BM_BatchMatmulBCast(1, 128, 1, 1024, 1024, true);
BM_BatchMatmulBCast(1, 128, 1, 1024, 1024, false);
BM_BatchMatmulBCast(128, 1, 1, 1024, 1024, true);
BM_BatchMatmulBCast(128, 1, 1, 1024, 1024, false);
BM_BatchMatmulBCast(1, 128, 128, 1024, 1024, true);
BM_BatchMatmulBCast(1, 128, 128, 1024, 1024, false);
BM_BatchMatmulBCast(128, 1, 128, 1024, 1024, true);
BM_BatchMatmulBCast(128, 1, 128, 1024, 1024, false);
// Square matmul.
BM_BatchMatmulBCast(1, 128, 512, 512, 512, true);
BM_BatchMatmulBCast(1, 128, 512, 512, 512, false);
BM_BatchMatmulBCast(128, 1, 512, 512, 512, true);
BM_BatchMatmulBCast(128, 1, 512, 512, 512, false);
BM_BatchMatmulBCast(1, 128, 1024, 1024, 1024, true);
BM_BatchMatmulBCast(1, 128, 1024, 1024, 1024, false);
BM_BatchMatmulBCast(128, 1, 1024, 1024, 1024, true);
BM_BatchMatmulBCast(128, 1, 1024, 1024, 1024, false);
// Matrix-vector multiplies.
BM_BatchMatmulBCast(1, 128, 10000, 200, 1, true);
BM_BatchMatmulBCast(1, 128, 10000, 200, 1, false);
BM_BatchMatmulBCast(128, 1, 10000, 200, 1, true);
BM_BatchMatmulBCast(128, 1, 10000, 200, 1, false);
// Vector-matrix multiplies.
BM_BatchMatmulBCast(1, 128, 1, 200, 10000, true);
BM_BatchMatmulBCast(1, 128, 1, 200, 10000, false);
BM_BatchMatmulBCast(128, 1, 1, 200, 10000, true);
BM_BatchMatmulBCast(128, 1, 1, 200, 10000, false);
// Typical fully connected layers
BM_BatchMatmul(1, 1, 1024, 1024, false, false);
BM_BatchMatmul(1, 8, 1024, 1024, false, false);
BM_BatchMatmul(1, 16, 1024, 1024, false, false);
BM_BatchMatmul(1, 128, 1024, 1024, false, false);
BM_BatchMatmul(2, 1, 1024, 1024, false, false);
BM_BatchMatmul(2, 8, 1024, 1024, false, false);
BM_BatchMatmul(2, 16, 1024, 1024, false, false);
BM_BatchMatmul(2, 128, 1024, 1024, false, false);
BM_BatchMatmul(8, 1, 1024, 1024, false, false);
BM_BatchMatmul(8, 8, 1024, 1024, false, false);
BM_BatchMatmul(8, 16, 1024, 1024, false, false);
BM_BatchMatmul(8, 128, 1024, 1024, false, false);
BM_BatchMatmul(32, 1, 1024, 1024, false, false);
BM_BatchMatmul(32, 8, 1024, 1024, false, false);
BM_BatchMatmul(32, 16, 1024, 1024, false, false);
BM_BatchMatmul(32, 128, 1024, 1024, false, false);
// Square matmul.
BM_BatchMatmul(1, 32, 32, 32, false, false);
BM_BatchMatmul(1, 128, 128, 128, false, false);
BM_BatchMatmul(1, 256, 256, 256, false, false);
BM_BatchMatmul(1, 1024, 1024, 1024, false, false);
BM_BatchMatmul(1, 2048, 2048, 2048, false, false);
BM_BatchMatmul(2, 32, 32, 32, false, false);
BM_BatchMatmul(2, 128, 128, 128, false, false);
BM_BatchMatmul(2, 256, 256, 256, false, false);
BM_BatchMatmul(2, 1024, 1024, 1024, false, false);
BM_BatchMatmul(2, 2048, 2048, 2048, false, false);
BM_BatchMatmul(4, 32, 32, 32, false, false);
BM_BatchMatmul(4, 128, 128, 128, false, false);
BM_BatchMatmul(4, 256, 256, 256, false, false);
BM_BatchMatmul(4, 1024, 1024, 1024, false, false);
BM_BatchMatmul(4, 2048, 2048, 2048, false, false);
BM_BatchMatmul(8, 32, 32, 32, false, false);
BM_BatchMatmul(8, 128, 128, 128, false, false);
BM_BatchMatmul(8, 256, 256, 256, false, false);
BM_BatchMatmul(8, 1024, 1024, 1024, false, false);
BM_BatchMatmul(8, 2048, 2048, 2048, false, false);
BM_BatchMatmul(32, 32, 32, 32, false, false);
BM_BatchMatmul(32, 128, 128, 128, false, false);
BM_BatchMatmul(32, 256, 256, 256, false, false);
BM_BatchMatmul(32, 1024, 1024, 1024, false, false);
BM_BatchMatmul(32, 2048, 2048, 2048, false, false);
// Matrix-vector multiplies.
BM_BatchMatmul(1, 10000, 200, 1, false, false);
BM_BatchMatmul(8, 10000, 200, 1, false, false);
BM_BatchMatmul(32, 10000, 200, 1, false, false);
BM_BatchMatmul(1, 10000, 200, 1, true, false);
BM_BatchMatmul(8, 10000, 200, 1, true, false);
BM_BatchMatmul(32, 10000, 200, 1, true, false);
BM_BatchMatmul(1, 10000, 200, 1, false, true);
BM_BatchMatmul(8, 10000, 200, 1, false, true);
BM_BatchMatmul(32, 10000, 200, 1, false, true);
BM_BatchMatmul(1, 10000, 200, 1, true, true);
BM_BatchMatmul(8, 10000, 200, 1, true, true);
BM_BatchMatmul(32, 10000, 200, 1, true, true);
// Vector-matrix multiplies.
BM_BatchMatmul(1, 1, 200, 10000, false, false);
BM_BatchMatmul(8, 1, 200, 10000, false, false);
BM_BatchMatmul(32, 1, 200, 10000, false, false);
BM_BatchMatmul(1, 1, 200, 10000, true, false);
BM_BatchMatmul(8, 1, 200, 10000, true, false);
BM_BatchMatmul(32, 1, 200, 10000, true, false);
BM_BatchMatmul(1, 1, 200, 10000, false, true);
BM_BatchMatmul(8, 1, 200, 10000, false, true);
BM_BatchMatmul(32, 1, 200, 10000, false, true);
BM_BatchMatmul(1, 1, 200, 10000, true, true);
BM_BatchMatmul(8, 1, 200, 10000, true, true);
BM_BatchMatmul(32, 1, 200, 10000, true, true);
} // namespace
} // namespace tensorflow