tensorflow/core/kernels/mkl_conv_ops_test.cc - platform/external/tensorflow - Git at Google

 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/nn_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
 #include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"

 #if defined(INTEL_MKL_DNN_ONLY)
 #include "third_party/intel_mkl_dnn/include/mkldnn.h"
 #include "tensorflow/core/util/mkl_util.h"
 #endif

 // TODO(ezhulenev): Add numerical tests that will compare results of default
 // (aka Eigen) convolutions with MKL convolutions.

 // -------------------------------------------------------------------------- //
 // Performance Benchmarks.                                                    //
 // -------------------------------------------------------------------------- //

 // Compare performance of default Tensorflow convolution kernels (Eigen) with
 // MKL kernels on CPU.

 // Before running these benchmarks configure OpenMP environment variables:
 //   export KMP_BLOCKTIME=0
 //   export OMP_NUM_THREADS=${num_threads}

 namespace tensorflow {

 // Bundle of integer extents describing one Conv2D problem: the input tensor
 // (batches, height, width, depth) and the filter bank (count, height, width).
 struct Conv2DDimensions {
   // Positional arguments: input batches, input height, input width, input
   // depth, filter count, filter height, filter width.
   Conv2DDimensions(int batches, int height, int width, int depth,
                    int num_filters, int kernel_height, int kernel_width)
       : input_batches{batches},
         input_height{height},
         input_width{width},
         input_depth{depth},
         filter_count{num_filters},
         filter_height{kernel_height},
         filter_width{kernel_width} {}

   // Input tensor extents.
   int input_batches;
   int input_height;
   int input_width;
   int input_depth;
   // Filter bank extents.
   int filter_count;
   int filter_height;
   int filter_width;
 };

 // Returns a DT_FLOAT Tensor of the requested shape filled with random values.
 static Tensor GetRandomTensor(const TensorShape& shape) {
   // `shape` is already a TensorShape, so it is passed through directly rather
   // than copy-constructing a temporary.
   Tensor tensor(DT_FLOAT, shape);
   // setRandom() fills the flat view in place; assigning the result back onto
   // the same view would only copy the buffer onto itself.
   tensor.flat<float>().setRandom();
   return tensor;
 }

 // Returns a random float Tensor with the Conv2D input shape, ordered as
 // {batches, height, width, depth}.
 static Tensor GetRandomInputTensor(const Conv2DDimensions& dims) {
   const TensorShape input_shape({dims.input_batches, dims.input_height,
                                  dims.input_width, dims.input_depth});
   return GetRandomTensor(input_shape);
 }

 // Returns a random float Tensor with the Conv2D filter shape, ordered as
 // {filter height, filter width, input depth, filter count}.
 static Tensor GetRandomFilterTensor(const Conv2DDimensions& dims) {
   const TensorShape filter_shape({dims.filter_height, dims.filter_width,
                                   dims.input_depth, dims.filter_count});
   return GetRandomTensor(filter_shape);
 }

 // Returns a random float Tensor with the Conv2D output shape, assuming SAME
 // padding: the input's batch/height/width with filter_count as the last
 // dimension.
 static Tensor GetRandomOutputTensor(const Conv2DDimensions& dims) {
   const TensorShape output_shape({dims.input_batches, dims.input_height,
                                   dims.input_width, dims.filter_count});
   return GetRandomTensor(output_shape);
 }

 // Returns an int32 Tensor whose four elements are the Conv2D input extents:
 // batches, height, width, depth.
 static Tensor GetInputSizesTensor(const Conv2DDimensions& dims) {
   const int32 batches = dims.input_batches;
   const int32 height = dims.input_height;
   const int32 width = dims.input_width;
   const int32 depth = dims.input_depth;
   return test::AsTensor<int32>({batches, height, width, depth});
 }

 // Returns an int32 Tensor whose four elements are the Conv2D filter extents:
 // filter height, filter width, input depth, filter count.
 static Tensor GetFilterSizesTensor(const Conv2DDimensions& dims) {
   const int32 height = dims.filter_height;
   const int32 width = dims.filter_width;
   const int32 in_depth = dims.input_depth;
   const int32 count = dims.filter_count;
   return test::AsTensor<int32>({height, width, in_depth, count});
 }

 #if defined(INTEL_MKL_DNN_ONLY)
 // Returns a DT_UINT8 Tensor holding the serialized form of an MklDnnShape on
 // which SetMklTensor(false) has been called.
 static Tensor NonMklTensor() {
   MklDnnShape shape_meta;
   shape_meta.SetMklTensor(false);

   // One-dimensional byte buffer sized to what the shape reports it needs.
   const int64 num_bytes =
       static_cast<int64>(shape_meta.GetSerializeBufferSize());
   Tensor serialized(DT_UINT8, {num_bytes});

   uint8* const buffer = serialized.flat<uint8>().data();
   shape_meta.SerializeMklDnnShape(buffer, num_bytes * sizeof(uint8));
   return serialized;
 }
 #endif

 // Builds a newly allocated graph containing one float "Conv2D" node (unit
 // strides, SAME padding) whose inputs are constant random input and filter
 // tensors sized from `dims`.
 static Graph* DefaultConv2D(const Conv2DDimensions& dims) {
   Graph* const g = new Graph(OpRegistry::Global());

   Node* const input_node =
       test::graph::Constant(g, GetRandomInputTensor(dims), "input");
   Node* const filter_node =
       test::graph::Constant(g, GetRandomFilterTensor(dims), "filter");

   Node* conv_node;
   NodeBuilder builder(g->NewName("conv_2d"), "Conv2D");
   builder.Input(input_node)
       .Input(filter_node)
       .Attr("T", DT_FLOAT)
       .Attr("strides", {1, 1, 1, 1})
       .Attr("padding", "SAME");
   TF_CHECK_OK(builder.Finalize(g, &conv_node));

   return g;
 }

 #if defined(INTEL_MKL_DNN_ONLY)
 // Builds a newly allocated graph containing one float "_MklConv2D" node (unit
 // strides, SAME padding, "_kernel" = "MklOp"). The two data inputs are
 // constant random tensors; they are followed by two references to a constant
 // holding the serialized non-MKL shape.
 // NOTE(review): presumably one shape input per data input - confirm against
 // the op definition.
 static Graph* MklConv2D(const Conv2DDimensions& dims) {
   Graph* const g = new Graph(OpRegistry::Global());

   Node* const input_node =
       test::graph::Constant(g, GetRandomInputTensor(dims), "input");
   Node* const filter_node =
       test::graph::Constant(g, GetRandomFilterTensor(dims), "filter");
   Node* const non_mkl_node =
       test::graph::Constant(g, NonMklTensor(), "not_mkl");

   Node* conv_node;
   NodeBuilder builder(g->NewName("mkl_conv_2d"), "_MklConv2D");
   builder.Input(input_node)
       .Input(filter_node)
       .Input(non_mkl_node)
       .Input(non_mkl_node)
       .Attr("T", DT_FLOAT)
       .Attr("strides", {1, 1, 1, 1})
       .Attr("padding", "SAME")
       .Attr("_kernel", "MklOp");
   TF_CHECK_OK(builder.Finalize(g, &conv_node));

   return g;
 }
 #endif

 // Builds a newly allocated graph containing one float "Conv2DBackpropInput"
 // node (unit strides, SAME padding). Its inputs are constants: the input
 // sizes vector, a random filter, and a random output gradient shaped like the
 // forward output (which assumes SAME padding).
 static Graph* DefaultConv2DBwdInput(const Conv2DDimensions& dims) {
   Graph* const g = new Graph(OpRegistry::Global());

   Node* const sizes_node =
       test::graph::Constant(g, GetInputSizesTensor(dims), "input_sizes");
   Node* const filter_node =
       test::graph::Constant(g, GetRandomFilterTensor(dims), "filter");
   Node* const grad_node =
       test::graph::Constant(g, GetRandomOutputTensor(dims), "out_backprop");

   Node* bwd_input_node;
   NodeBuilder builder(g->NewName("conv_2d_bwd_input"), "Conv2DBackpropInput");
   builder.Input(sizes_node)
       .Input(filter_node)
       .Input(grad_node)
       .Attr("T", DT_FLOAT)
       .Attr("strides", {1, 1, 1, 1})
       .Attr("padding", "SAME");
   TF_CHECK_OK(builder.Finalize(g, &bwd_input_node));

   return g;
 }

 #if defined(INTEL_MKL_DNN_ONLY)
 // Builds a newly allocated graph containing one float
 // "_MklConv2DBackpropInput" node (unit strides, SAME padding,
 // "_kernel" = "MklOp"). The three data inputs (input sizes, random filter,
 // random output gradient) are followed by three references to a constant
 // holding the serialized non-MKL shape.
 // NOTE(review): presumably one shape input per data input - confirm against
 // the op definition.
 static Graph* MklConv2DBwdInput(const Conv2DDimensions& dims) {
   Graph* const g = new Graph(OpRegistry::Global());

   Node* const sizes_node =
       test::graph::Constant(g, GetInputSizesTensor(dims), "input_sizes");
   Node* const filter_node =
       test::graph::Constant(g, GetRandomFilterTensor(dims), "filter");
   // Gradient shape matches the forward output under SAME padding.
   Node* const grad_node =
       test::graph::Constant(g, GetRandomOutputTensor(dims), "out_backprop");
   Node* const non_mkl_node =
       test::graph::Constant(g, NonMklTensor(), "not_mkl");

   Node* bwd_input_node;
   NodeBuilder builder(g->NewName("conv_2d_bwd_input"),
                       "_MklConv2DBackpropInput");
   builder.Input(sizes_node)
       .Input(filter_node)
       .Input(grad_node)
       .Input(non_mkl_node)
       .Input(non_mkl_node)
       .Input(non_mkl_node)
       .Attr("T", DT_FLOAT)
       .Attr("strides", {1, 1, 1, 1})
       .Attr("padding", "SAME")
       .Attr("_kernel", "MklOp");
   TF_CHECK_OK(builder.Finalize(g, &bwd_input_node));

   return g;
 }
 #endif

 // Builds a newly allocated graph containing one float "Conv2DBackpropFilter"
 // node (unit strides, SAME padding). Its inputs are constants: a random
 // input, the filter sizes vector, and a random output gradient. Only the
 // filter's sizes are needed here, so no filter tensor is generated.
 static Graph* DefaultConv2DBwdFilter(const Conv2DDimensions& dims) {
   auto* graph = new Graph(OpRegistry::Global());

   Tensor input_t = GetRandomInputTensor(dims);
   Tensor filter_sizes_t = GetFilterSizesTensor(dims);
   Tensor out_backprop_t = GetRandomOutputTensor(dims);  // assuming SAME padding

   Node* input = test::graph::Constant(graph, input_t, "input");
   Node* filter_sizes =
       test::graph::Constant(graph, filter_sizes_t, "filter_sizes");
   Node* out_backprop =
       test::graph::Constant(graph, out_backprop_t, "out_backprop");

   Node* conv2d_bwd_filter;
   TF_CHECK_OK(
       NodeBuilder(graph->NewName("conv_2d_bwd_filter"), "Conv2DBackpropFilter")
           .Input(input)
           .Input(filter_sizes)
           .Input(out_backprop)
           .Attr("T", DT_FLOAT)
           .Attr("strides", {1, 1, 1, 1})
           .Attr("padding", "SAME")
           .Finalize(graph, &conv2d_bwd_filter));

   return graph;
 }

 #if defined(INTEL_MKL_DNN_ONLY)
 // Builds a newly allocated graph containing one float
 // "_MklConv2DBackpropFilter" node (unit strides, SAME padding,
 // "_kernel" = "MklOp"). The three data inputs (random input, filter sizes,
 // random output gradient) are followed by three references to a constant
 // holding the serialized non-MKL shape. Only the filter's sizes are needed
 // here, so no filter tensor is generated.
 // NOTE(review): presumably one shape input per data input - confirm against
 // the op definition.
 static Graph* MklConv2DBwdFilter(const Conv2DDimensions& dims) {
   Graph* graph = new Graph(OpRegistry::Global());

   Tensor input_t = GetRandomInputTensor(dims);
   Tensor filter_sizes_t = GetFilterSizesTensor(dims);
   Tensor out_backprop_t = GetRandomOutputTensor(dims);  // assuming SAME padding

   Node* input = test::graph::Constant(graph, input_t, "input");
   Node* filter_sizes =
       test::graph::Constant(graph, filter_sizes_t, "filter_sizes");
   Node* out_backprop =
       test::graph::Constant(graph, out_backprop_t, "out_backprop");

   Node* not_mkl_shape = test::graph::Constant(graph, NonMklTensor(), "not_mkl");

   Node* conv2d_bwd_filter;
   TF_CHECK_OK(NodeBuilder(graph->NewName("conv_2d_bwd_filter"),
                           "_MklConv2DBackpropFilter")
                   .Input(input)
                   .Input(filter_sizes)
                   .Input(out_backprop)
                   .Input(not_mkl_shape)
                   .Input(not_mkl_shape)
                   .Input(not_mkl_shape)
                   .Attr("T", DT_FLOAT)
                   .Attr("strides", {1, 1, 1, 1})
                   .Attr("padding", "SAME")
                   .Attr("_kernel", "MklOp")
                   .Finalize(graph, &conv2d_bwd_filter));

   return graph;
 }
 #endif

 // Macro arguments names: --------------------------------------------------- //
 //    N: batch size
 //    H: height
 //    W: width
 //    C: channels
 //   FC: filter count
 //   FH: filter height
 //   FW: filter width

 // Pastes two tokens into a single identifier.
 #define BM_CONCAT(a, b) a##b

 // Builds the benchmark function identifier
 //   BM_<p>_<type>_in_<N>_<H>_<W>_<C>_f_<FC>_<FH>_<FW>
 // by pasting the prefix `p`, the `type` token and the shape arguments. The
 // name is assembled in two halves that are joined through BM_CONCAT.
 #define BM_NAME(p, type, N, H, W, C, FC, FH, FW) \
   BM_CONCAT(BM_##p##_##type##_in_##N##_##H##_##W##_##C, _f_##FC##_##FH##_##FW)

 // The flops computation in these benchmarks is the same as in
 // eigen_benchmark_cpu_test.cc.

 // Defines and registers one forward Conv2D benchmark function.
 //   kind:  prefix of the graph factory that is called as kind##Conv2D(dims).
 //   type:  token that is stringified and passed as the first argument of
 //          test::Benchmark; it is also part of the function name.
 //   LABEL: string passed to testing::SetLabel.
 // Items processed per iteration are estimated as
 // (N * H * W * FC) * (C * FH * FW). The products are widened to int64 before
 // multiplying so larger shapes cannot overflow `int`. FH and FW are handed to
 // Conv2DDimensions in the order its constructor takes them (filter height,
 // then filter width).
 #define BM_Conv2DT(kind, N, H, W, C, FC, FH, FW, type, LABEL)            \
   static void BM_NAME(Conv2D_##kind, type, N, H, W, C, FC, FH,           \
                       FW)(int iters) {                                   \
     testing::SetLabel(LABEL);                                            \
                                                                          \
     int64 num_computed_elements =                                        \
         static_cast<int64>(N) * (H) * (W) * (FC);                        \
     int64 flops_per_iter =                                               \
         num_computed_elements * (static_cast<int64>(C) * (FH) * (FW));   \
     testing::ItemsProcessed(static_cast<int64>(iters) * flops_per_iter); \
                                                                          \
     Conv2DDimensions dims(N, H, W, C, FC, FH, FW);                       \
     test::Benchmark(#type, BM_CONCAT(kind, Conv2D)(dims)).Run(iters);    \
   }                                                                      \
   BENCHMARK(BM_NAME(Conv2D_##kind, type, N, H, W, C, FC, FH, FW))

 // BM_Conv2D registers the forward Conv2D benchmark for the Default graph and,
 // when INTEL_MKL_DNN_ONLY is defined, also for the Mkl graph with the same
 // shape arguments and label.
 #if defined(INTEL_MKL_DNN_ONLY)
 #define BM_Conv2D(N, H, W, C, FC, FH, FW, type, LABEL)      \
   BM_Conv2DT(Default, N, H, W, C, FC, FH, FW, type, LABEL); \
   BM_Conv2DT(Mkl, N, H, W, C, FC, FH, FW, type, LABEL);
 #else
 #define BM_Conv2D(N, H, W, C, FC, FH, FW, type, LABEL) \
   BM_Conv2DT(Default, N, H, W, C, FC, FH, FW, type, LABEL);
 #endif

 // Defines and registers one Conv2D backprop-input benchmark function.
 //   kind:  prefix of the graph factory called as kind##Conv2DBwdInput(dims).
 //   type:  token that is stringified and passed as the first argument of
 //          test::Benchmark; it is also part of the function name.
 //   LABEL: string passed to testing::SetLabel.
 // Items processed per iteration are estimated as
 // (N * H * W * C) * (C * FH * FW). The products are widened to int64 before
 // multiplying so larger shapes cannot overflow `int`. FH and FW are handed to
 // Conv2DDimensions in the order its constructor takes them (filter height,
 // then filter width).
 // NOTE(review): the per-element factor uses C where FC * FH * FW might be
 // expected for a backprop-input element; it is kept unchanged because this
 // file states the formulas mirror eigen_benchmark_cpu_test.cc - confirm.
 #define BM_Conv2DBwdInputT(kind, N, H, W, C, FC, FH, FW, type, LABEL)         \
   static void BM_NAME(Conv2DBwdInput_##kind, type, N, H, W, C, FC, FH,        \
                       FW)(int iters) {                                        \
     testing::SetLabel(LABEL);                                                 \
                                                                               \
     int64 num_computed_elements =                                             \
         static_cast<int64>(N) * (H) * (W) * (C);                              \
     int64 flops_per_iter =                                                    \
         num_computed_elements * (static_cast<int64>(C) * (FH) * (FW));        \
     testing::ItemsProcessed(static_cast<int64>(iters) * flops_per_iter);      \
                                                                               \
     Conv2DDimensions dims(N, H, W, C, FC, FH, FW);                            \
     test::Benchmark(#type, BM_CONCAT(kind, Conv2DBwdInput)(dims)).Run(iters); \
   }                                                                           \
   BENCHMARK(BM_NAME(Conv2DBwdInput_##kind, type, N, H, W, C, FC, FH, FW))

 // BM_Conv2DBwdInput registers the backprop-input benchmark for the Default
 // graph and, when INTEL_MKL_DNN_ONLY is defined, also for the Mkl graph with
 // the same shape arguments and label.
 #if defined(INTEL_MKL_DNN_ONLY)
 #define BM_Conv2DBwdInput(N, H, W, C, FC, FH, FW, type, LABEL)      \
   BM_Conv2DBwdInputT(Default, N, H, W, C, FC, FH, FW, type, LABEL); \
   BM_Conv2DBwdInputT(Mkl, N, H, W, C, FC, FH, FW, type, LABEL);
 #else
 #define BM_Conv2DBwdInput(N, H, W, C, FC, FH, FW, type, LABEL) \
   BM_Conv2DBwdInputT(Default, N, H, W, C, FC, FH, FW, type, LABEL);
 #endif

 // Defines and registers one Conv2D backprop-filter benchmark function.
 //   kind:  prefix of the graph factory called as kind##Conv2DBwdFilter(dims).
 //   type:  token that is stringified and passed as the first argument of
 //          test::Benchmark; it is also part of the function name.
 //   LABEL: string passed to testing::SetLabel.
 // Items processed per iteration are estimated as
 // (FH * FW * C * FC) * (N * H * W). The products are widened to int64 before
 // multiplying so larger shapes cannot overflow `int`. FH and FW are handed to
 // Conv2DDimensions in the order its constructor takes them (filter height,
 // then filter width).
 #define BM_Conv2DBwdFilterT(kind, N, H, W, C, FC, FH, FW, type, LABEL)         \
   static void BM_NAME(Conv2DBwdFilter_##kind, type, N, H, W, C, FC, FH,        \
                       FW)(int iters) {                                         \
     testing::SetLabel(LABEL);                                                  \
                                                                                \
     int64 num_computed_elements =                                              \
         static_cast<int64>(FH) * (FW) * (C) * (FC);                            \
     int64 flops_per_iter =                                                     \
         num_computed_elements * (static_cast<int64>(N) * (H) * (W));           \
     testing::ItemsProcessed(static_cast<int64>(iters) * flops_per_iter);       \
                                                                                \
     Conv2DDimensions dims(N, H, W, C, FC, FH, FW);                             \
     test::Benchmark(#type, BM_CONCAT(kind, Conv2DBwdFilter)(dims)).Run(iters); \
   }                                                                            \
   BENCHMARK(BM_NAME(Conv2DBwdFilter_##kind, type, N, H, W, C, FC, FH, FW))

 // BM_Conv2DBwdFilter registers the backprop-filter benchmark for the Default
 // graph and, when INTEL_MKL_DNN_ONLY is defined, also for the Mkl graph with
 // the same shape arguments and label.
 #if defined(INTEL_MKL_DNN_ONLY)
 #define BM_Conv2DBwdFilter(N, H, W, C, FC, FH, FW, type, LABEL)      \
   BM_Conv2DBwdFilterT(Default, N, H, W, C, FC, FH, FW, type, LABEL); \
   BM_Conv2DBwdFilterT(Mkl, N, H, W, C, FC, FH, FW, type, LABEL);
 #else
 #define BM_Conv2DBwdFilter(N, H, W, C, FC, FH, FW, type, LABEL) \
   BM_Conv2DBwdFilterT(Default, N, H, W, C, FC, FH, FW, type, LABEL);
 #endif

 // ImageNet Convolutions ---------------------------------------------------- //
 // Argument order for every registration below:
 //   N, H, W, C, FC, FH, FW, type, label.
 // The same seven shapes are registered for the forward pass, the
 // backprop-input pass and the backprop-filter pass.

 // Forward convolution.
 BM_Conv2D(32, 28, 28, 96, 128, 3, 3, cpu, "conv3a_00_3x3");
 BM_Conv2D(32, 28, 28, 16, 32, 5, 5, cpu, "conv3a_00_5x5");
 BM_Conv2D(32, 28, 28, 128, 192, 3, 3, cpu, "conv3_00_3x3");
 BM_Conv2D(32, 28, 28, 32, 96, 5, 5, cpu, "conv3_00_5x5");
 BM_Conv2D(32, 14, 14, 96, 204, 3, 3, cpu, "conv4a_00_3x3");
 BM_Conv2D(32, 14, 14, 16, 48, 5, 5, cpu, "conv4a_00_5x5");
 BM_Conv2D(32, 14, 14, 112, 224, 3, 3, cpu, "conv4b_00_3x3");

 // Gradient with respect to the input.
 BM_Conv2DBwdInput(32, 28, 28, 96, 128, 3, 3, cpu, "conv3a_00_3x3");
 BM_Conv2DBwdInput(32, 28, 28, 16, 32, 5, 5, cpu, "conv3a_00_5x5");
 BM_Conv2DBwdInput(32, 28, 28, 128, 192, 3, 3, cpu, "conv3_00_3x3");
 BM_Conv2DBwdInput(32, 28, 28, 32, 96, 5, 5, cpu, "conv3_00_5x5");
 BM_Conv2DBwdInput(32, 14, 14, 96, 204, 3, 3, cpu, "conv4a_00_3x3");
 BM_Conv2DBwdInput(32, 14, 14, 16, 48, 5, 5, cpu, "conv4a_00_5x5");
 BM_Conv2DBwdInput(32, 14, 14, 112, 224, 3, 3, cpu, "conv4b_00_3x3");

 // Gradient with respect to the filter.
 BM_Conv2DBwdFilter(32, 28, 28, 96, 128, 3, 3, cpu, "conv3a_00_3x3");
 BM_Conv2DBwdFilter(32, 28, 28, 16, 32, 5, 5, cpu, "conv3a_00_5x5");
 BM_Conv2DBwdFilter(32, 28, 28, 128, 192, 3, 3, cpu, "conv3_00_3x3");
 BM_Conv2DBwdFilter(32, 28, 28, 32, 96, 5, 5, cpu, "conv3_00_5x5");
 BM_Conv2DBwdFilter(32, 14, 14, 96, 204, 3, 3, cpu, "conv4a_00_3x3");
 BM_Conv2DBwdFilter(32, 14, 14, 16, 48, 5, 5, cpu, "conv4a_00_5x5");
 BM_Conv2DBwdFilter(32, 14, 14, 112, 224, 3, 3, cpu, "conv4b_00_3x3");

 }  // namespace tensorflow
	/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

	Licensed under the Apache License, Version 2.0 (the "License");
	you may not use this file except in compliance with the License.
	You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==============================================================================*/

	#include "tensorflow/cc/ops/const_op.h"
	#include "tensorflow/cc/ops/nn_ops.h"
	#include "tensorflow/cc/ops/standard_ops.h"
	#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
	#include "tensorflow/core/framework/fake_input.h"
	#include "tensorflow/core/framework/node_def_builder.h"
	#include "tensorflow/core/framework/tensor.h"
	#include "tensorflow/core/framework/types.pb.h"
	#include "tensorflow/core/kernels/ops_testutil.h"
	#include "tensorflow/core/kernels/ops_util.h"
	#include "tensorflow/core/platform/test.h"
	#include "tensorflow/core/platform/test_benchmark.h"
	#include "tensorflow/core/public/session.h"

	#if defined(INTEL_MKL_DNN_ONLY)
	#include "third_party/intel_mkl_dnn/include/mkldnn.h"
	#include "tensorflow/core/util/mkl_util.h"
	#endif

	// TODO(ezhulenev): Add numerical tests that will compare results of default
	// (aka Eigen) convolutions with MKL convolutions.

	// -------------------------------------------------------------------------- //
	// Performance Benchmarks. //
	// -------------------------------------------------------------------------- //

	// Compare performance of default Tensorflow convolution kernels (Eigen) with
	// MKL kernels on CPU.

	// Before running these benchmarks configure OpenMP environment variables:
	// export KMP_BLOCKTIME=0
	// export OMP_NUM_THREADS=${num_threads}

	namespace tensorflow {

	// Bundle of integer extents describing one Conv2D problem: the input tensor
	// (batches, height, width, depth) and the filter bank (count, height, width).
	struct Conv2DDimensions {
	  // Positional arguments: input batches, input height, input width, input
	  // depth, filter count, filter height, filter width.
	  Conv2DDimensions(int batches, int height, int width, int depth,
	                   int num_filters, int kernel_height, int kernel_width)
	      : input_batches{batches},
	        input_height{height},
	        input_width{width},
	        input_depth{depth},
	        filter_count{num_filters},
	        filter_height{kernel_height},
	        filter_width{kernel_width} {}

	  // Input tensor extents.
	  int input_batches;
	  int input_height;
	  int input_width;
	  int input_depth;
	  // Filter bank extents.
	  int filter_count;
	  int filter_height;
	  int filter_width;
	};

	// Returns a DT_FLOAT Tensor of the requested shape filled with random values.
	static Tensor GetRandomTensor(const TensorShape& shape) {
	  // `shape` is already a TensorShape, so it is passed through directly rather
	  // than copy-constructing a temporary.
	  Tensor tensor(DT_FLOAT, shape);
	  // setRandom() fills the flat view in place; assigning the result back onto
	  // the same view would only copy the buffer onto itself.
	  tensor.flat<float>().setRandom();
	  return tensor;
	}

	// Returns a random float Tensor with the Conv2D input shape, ordered as
	// {batches, height, width, depth}.
	static Tensor GetRandomInputTensor(const Conv2DDimensions& dims) {
	  const TensorShape input_shape({dims.input_batches, dims.input_height,
	                                 dims.input_width, dims.input_depth});
	  return GetRandomTensor(input_shape);
	}

	// Returns a random float Tensor with the Conv2D filter shape, ordered as
	// {filter height, filter width, input depth, filter count}.
	static Tensor GetRandomFilterTensor(const Conv2DDimensions& dims) {
	  const TensorShape filter_shape({dims.filter_height, dims.filter_width,
	                                  dims.input_depth, dims.filter_count});
	  return GetRandomTensor(filter_shape);
	}

	// Returns a random float Tensor with the Conv2D output shape, assuming SAME
	// padding: the input's batch/height/width with filter_count as the last
	// dimension.
	static Tensor GetRandomOutputTensor(const Conv2DDimensions& dims) {
	  const TensorShape output_shape({dims.input_batches, dims.input_height,
	                                  dims.input_width, dims.filter_count});
	  return GetRandomTensor(output_shape);
	}

	// Returns an int32 Tensor whose four elements are the Conv2D input extents:
	// batches, height, width, depth.
	static Tensor GetInputSizesTensor(const Conv2DDimensions& dims) {
	  const int32 batches = dims.input_batches;
	  const int32 height = dims.input_height;
	  const int32 width = dims.input_width;
	  const int32 depth = dims.input_depth;
	  return test::AsTensor<int32>({batches, height, width, depth});
	}

	// Returns an int32 Tensor whose four elements are the Conv2D filter extents:
	// filter height, filter width, input depth, filter count.
	static Tensor GetFilterSizesTensor(const Conv2DDimensions& dims) {
	  const int32 height = dims.filter_height;
	  const int32 width = dims.filter_width;
	  const int32 in_depth = dims.input_depth;
	  const int32 count = dims.filter_count;
	  return test::AsTensor<int32>({height, width, in_depth, count});
	}

	#if defined(INTEL_MKL_DNN_ONLY)
	// Returns a DT_UINT8 Tensor holding the serialized form of an MklDnnShape on
	// which SetMklTensor(false) has been called.
	static Tensor NonMklTensor() {
	  MklDnnShape shape_meta;
	  shape_meta.SetMklTensor(false);

	  // One-dimensional byte buffer sized to what the shape reports it needs.
	  const int64 num_bytes =
	      static_cast<int64>(shape_meta.GetSerializeBufferSize());
	  Tensor serialized(DT_UINT8, {num_bytes});

	  uint8* const buffer = serialized.flat<uint8>().data();
	  shape_meta.SerializeMklDnnShape(buffer, num_bytes * sizeof(uint8));
	  return serialized;
	}
	#endif

	// Builds a newly allocated graph containing one float "Conv2D" node (unit
	// strides, SAME padding) whose inputs are constant random input and filter
	// tensors sized from `dims`.
	static Graph* DefaultConv2D(const Conv2DDimensions& dims) {
	  Graph* const g = new Graph(OpRegistry::Global());

	  Node* const input_node =
	      test::graph::Constant(g, GetRandomInputTensor(dims), "input");
	  Node* const filter_node =
	      test::graph::Constant(g, GetRandomFilterTensor(dims), "filter");

	  Node* conv_node;
	  NodeBuilder builder(g->NewName("conv_2d"), "Conv2D");
	  builder.Input(input_node)
	      .Input(filter_node)
	      .Attr("T", DT_FLOAT)
	      .Attr("strides", {1, 1, 1, 1})
	      .Attr("padding", "SAME");
	  TF_CHECK_OK(builder.Finalize(g, &conv_node));

	  return g;
	}

	#if defined(INTEL_MKL_DNN_ONLY)
	// Builds a newly allocated graph containing one float "_MklConv2D" node (unit
	// strides, SAME padding, "_kernel" = "MklOp"). The two data inputs are
	// constant random tensors; they are followed by two references to a constant
	// holding the serialized non-MKL shape.
	// NOTE(review): presumably one shape input per data input - confirm against
	// the op definition.
	static Graph* MklConv2D(const Conv2DDimensions& dims) {
	  Graph* const g = new Graph(OpRegistry::Global());

	  Node* const input_node =
	      test::graph::Constant(g, GetRandomInputTensor(dims), "input");
	  Node* const filter_node =
	      test::graph::Constant(g, GetRandomFilterTensor(dims), "filter");
	  Node* const non_mkl_node =
	      test::graph::Constant(g, NonMklTensor(), "not_mkl");

	  Node* conv_node;
	  NodeBuilder builder(g->NewName("mkl_conv_2d"), "_MklConv2D");
	  builder.Input(input_node)
	      .Input(filter_node)
	      .Input(non_mkl_node)
	      .Input(non_mkl_node)
	      .Attr("T", DT_FLOAT)
	      .Attr("strides", {1, 1, 1, 1})
	      .Attr("padding", "SAME")
	      .Attr("_kernel", "MklOp");
	  TF_CHECK_OK(builder.Finalize(g, &conv_node));

	  return g;
	}
	#endif

	// Builds a newly allocated graph containing one float "Conv2DBackpropInput"
	// node (unit strides, SAME padding). Its inputs are constants: the input
	// sizes vector, a random filter, and a random output gradient shaped like the
	// forward output (which assumes SAME padding).
	static Graph* DefaultConv2DBwdInput(const Conv2DDimensions& dims) {
	  Graph* const g = new Graph(OpRegistry::Global());

	  Node* const sizes_node =
	      test::graph::Constant(g, GetInputSizesTensor(dims), "input_sizes");
	  Node* const filter_node =
	      test::graph::Constant(g, GetRandomFilterTensor(dims), "filter");
	  Node* const grad_node =
	      test::graph::Constant(g, GetRandomOutputTensor(dims), "out_backprop");

	  Node* bwd_input_node;
	  NodeBuilder builder(g->NewName("conv_2d_bwd_input"), "Conv2DBackpropInput");
	  builder.Input(sizes_node)
	      .Input(filter_node)
	      .Input(grad_node)
	      .Attr("T", DT_FLOAT)
	      .Attr("strides", {1, 1, 1, 1})
	      .Attr("padding", "SAME");
	  TF_CHECK_OK(builder.Finalize(g, &bwd_input_node));

	  return g;
	}

	#if defined(INTEL_MKL_DNN_ONLY)
	// Builds a newly allocated graph containing one float
	// "_MklConv2DBackpropInput" node (unit strides, SAME padding,
	// "_kernel" = "MklOp"). The three data inputs (input sizes, random filter,
	// random output gradient) are followed by three references to a constant
	// holding the serialized non-MKL shape.
	// NOTE(review): presumably one shape input per data input - confirm against
	// the op definition.
	static Graph* MklConv2DBwdInput(const Conv2DDimensions& dims) {
	  Graph* const g = new Graph(OpRegistry::Global());

	  Node* const sizes_node =
	      test::graph::Constant(g, GetInputSizesTensor(dims), "input_sizes");
	  Node* const filter_node =
	      test::graph::Constant(g, GetRandomFilterTensor(dims), "filter");
	  // Gradient shape matches the forward output under SAME padding.
	  Node* const grad_node =
	      test::graph::Constant(g, GetRandomOutputTensor(dims), "out_backprop");
	  Node* const non_mkl_node =
	      test::graph::Constant(g, NonMklTensor(), "not_mkl");

	  Node* bwd_input_node;
	  NodeBuilder builder(g->NewName("conv_2d_bwd_input"),
	                      "_MklConv2DBackpropInput");
	  builder.Input(sizes_node)
	      .Input(filter_node)
	      .Input(grad_node)
	      .Input(non_mkl_node)
	      .Input(non_mkl_node)
	      .Input(non_mkl_node)
	      .Attr("T", DT_FLOAT)
	      .Attr("strides", {1, 1, 1, 1})
	      .Attr("padding", "SAME")
	      .Attr("_kernel", "MklOp");
	  TF_CHECK_OK(builder.Finalize(g, &bwd_input_node));

	  return g;
	}
	#endif

	// Builds a newly allocated graph containing one float "Conv2DBackpropFilter"
	// node (unit strides, SAME padding). Its inputs are constants: a random
	// input, the filter sizes vector, and a random output gradient. Only the
	// filter's sizes are needed here, so no filter tensor is generated.
	static Graph* DefaultConv2DBwdFilter(const Conv2DDimensions& dims) {
	  auto* graph = new Graph(OpRegistry::Global());

	  Tensor input_t = GetRandomInputTensor(dims);
	  Tensor filter_sizes_t = GetFilterSizesTensor(dims);
	  Tensor out_backprop_t = GetRandomOutputTensor(dims);  // assuming SAME padding

	  Node* input = test::graph::Constant(graph, input_t, "input");
	  Node* filter_sizes =
	      test::graph::Constant(graph, filter_sizes_t, "filter_sizes");
	  Node* out_backprop =
	      test::graph::Constant(graph, out_backprop_t, "out_backprop");

	  Node* conv2d_bwd_filter;
	  TF_CHECK_OK(
	      NodeBuilder(graph->NewName("conv_2d_bwd_filter"), "Conv2DBackpropFilter")
	          .Input(input)
	          .Input(filter_sizes)
	          .Input(out_backprop)
	          .Attr("T", DT_FLOAT)
	          .Attr("strides", {1, 1, 1, 1})
	          .Attr("padding", "SAME")
	          .Finalize(graph, &conv2d_bwd_filter));

	  return graph;
	}

	#if defined(INTEL_MKL_DNN_ONLY)
	// Builds a newly allocated graph containing one float
	// "_MklConv2DBackpropFilter" node (unit strides, SAME padding,
	// "_kernel" = "MklOp"). The three data inputs (random input, filter sizes,
	// random output gradient) are followed by three references to a constant
	// holding the serialized non-MKL shape. Only the filter's sizes are needed
	// here, so no filter tensor is generated.
	// NOTE(review): presumably one shape input per data input - confirm against
	// the op definition.
	static Graph* MklConv2DBwdFilter(const Conv2DDimensions& dims) {
	  Graph* graph = new Graph(OpRegistry::Global());

	  Tensor input_t = GetRandomInputTensor(dims);
	  Tensor filter_sizes_t = GetFilterSizesTensor(dims);
	  Tensor out_backprop_t = GetRandomOutputTensor(dims);  // assuming SAME padding

	  Node* input = test::graph::Constant(graph, input_t, "input");
	  Node* filter_sizes =
	      test::graph::Constant(graph, filter_sizes_t, "filter_sizes");
	  Node* out_backprop =
	      test::graph::Constant(graph, out_backprop_t, "out_backprop");

	  Node* not_mkl_shape = test::graph::Constant(graph, NonMklTensor(), "not_mkl");

	  Node* conv2d_bwd_filter;
	  TF_CHECK_OK(NodeBuilder(graph->NewName("conv_2d_bwd_filter"),
	                          "_MklConv2DBackpropFilter")
	                  .Input(input)
	                  .Input(filter_sizes)
	                  .Input(out_backprop)
	                  .Input(not_mkl_shape)
	                  .Input(not_mkl_shape)
	                  .Input(not_mkl_shape)
	                  .Attr("T", DT_FLOAT)
	                  .Attr("strides", {1, 1, 1, 1})
	                  .Attr("padding", "SAME")
	                  .Attr("_kernel", "MklOp")
	                  .Finalize(graph, &conv2d_bwd_filter));

	  return graph;
	}
	#endif

	// Macro arguments names: --------------------------------------------------- //
	// N: batch size
	// H: height
	// W: width
	// C: channels
	// FC: filter count
	// FH: filter height
	// FW: filter width

	// Pastes two tokens into a single identifier.
	#define BM_CONCAT(a, b) a##b

	// Builds the benchmark function identifier
	//   BM_<p>_<type>_in_<N>_<H>_<W>_<C>_f_<FC>_<FH>_<FW>
	// by pasting the prefix `p`, the `type` token and the shape arguments. The
	// name is assembled in two halves that are joined through BM_CONCAT.
	#define BM_NAME(p, type, N, H, W, C, FC, FH, FW) \
	BM_CONCAT(BM_##p##_##type##_in_##N##_##H##_##W##_##C, _f_##FC##_##FH##_##FW)

	// The flops computation in these benchmarks is the same as in
	// eigen_benchmark_cpu_test.cc.

	// Defines and registers one forward Conv2D benchmark function.
	//   kind:  prefix of the graph factory that is called as kind##Conv2D(dims).
	//   type:  token that is stringified and passed as the first argument of
	//          test::Benchmark; it is also part of the function name.
	//   LABEL: string passed to testing::SetLabel.
	// Items processed per iteration are estimated as
	// (N * H * W * FC) * (C * FH * FW). The products are widened to int64 before
	// multiplying so larger shapes cannot overflow `int`. FH and FW are handed to
	// Conv2DDimensions in the order its constructor takes them (filter height,
	// then filter width).
	#define BM_Conv2DT(kind, N, H, W, C, FC, FH, FW, type, LABEL) \
	  static void BM_NAME(Conv2D_##kind, type, N, H, W, C, FC, FH, \
	                      FW)(int iters) { \
	    testing::SetLabel(LABEL); \
	    \
	    int64 num_computed_elements = \
	        static_cast<int64>(N) * (H) * (W) * (FC); \
	    int64 flops_per_iter = \
	        num_computed_elements * (static_cast<int64>(C) * (FH) * (FW)); \
	    testing::ItemsProcessed(static_cast<int64>(iters) * flops_per_iter); \
	    \
	    Conv2DDimensions dims(N, H, W, C, FC, FH, FW); \
	    test::Benchmark(#type, BM_CONCAT(kind, Conv2D)(dims)).Run(iters); \
	  } \
	  BENCHMARK(BM_NAME(Conv2D_##kind, type, N, H, W, C, FC, FH, FW))

	// BM_Conv2D registers the forward Conv2D benchmark for the Default graph and,
	// when INTEL_MKL_DNN_ONLY is defined, also for the Mkl graph with the same
	// shape arguments and label.
	#if defined(INTEL_MKL_DNN_ONLY)
	#define BM_Conv2D(N, H, W, C, FC, FH, FW, type, LABEL) \
	BM_Conv2DT(Default, N, H, W, C, FC, FH, FW, type, LABEL); \
	BM_Conv2DT(Mkl, N, H, W, C, FC, FH, FW, type, LABEL);
	#else
	#define BM_Conv2D(N, H, W, C, FC, FH, FW, type, LABEL) \
	BM_Conv2DT(Default, N, H, W, C, FC, FH, FW, type, LABEL);
	#endif

	// Defines and registers one Conv2D backprop-input benchmark function.
	//   kind:  prefix of the graph factory called as kind##Conv2DBwdInput(dims).
	//   type:  token that is stringified and passed as the first argument of
	//          test::Benchmark; it is also part of the function name.
	//   LABEL: string passed to testing::SetLabel.
	// Items processed per iteration are estimated as
	// (N * H * W * C) * (C * FH * FW). The products are widened to int64 before
	// multiplying so larger shapes cannot overflow `int`. FH and FW are handed to
	// Conv2DDimensions in the order its constructor takes them (filter height,
	// then filter width).
	// NOTE(review): the per-element factor uses C where FC * FH * FW might be
	// expected for a backprop-input element; it is kept unchanged because this
	// file states the formulas mirror eigen_benchmark_cpu_test.cc - confirm.
	#define BM_Conv2DBwdInputT(kind, N, H, W, C, FC, FH, FW, type, LABEL) \
	  static void BM_NAME(Conv2DBwdInput_##kind, type, N, H, W, C, FC, FH, \
	                      FW)(int iters) { \
	    testing::SetLabel(LABEL); \
	    \
	    int64 num_computed_elements = \
	        static_cast<int64>(N) * (H) * (W) * (C); \
	    int64 flops_per_iter = \
	        num_computed_elements * (static_cast<int64>(C) * (FH) * (FW)); \
	    testing::ItemsProcessed(static_cast<int64>(iters) * flops_per_iter); \
	    \
	    Conv2DDimensions dims(N, H, W, C, FC, FH, FW); \
	    test::Benchmark(#type, BM_CONCAT(kind, Conv2DBwdInput)(dims)).Run(iters); \
	  } \
	  BENCHMARK(BM_NAME(Conv2DBwdInput_##kind, type, N, H, W, C, FC, FH, FW))

	// BM_Conv2DBwdInput registers the backprop-input benchmark for the Default
	// graph and, when INTEL_MKL_DNN_ONLY is defined, also for the Mkl graph with
	// the same shape arguments and label.
	#if defined(INTEL_MKL_DNN_ONLY)
	#define BM_Conv2DBwdInput(N, H, W, C, FC, FH, FW, type, LABEL) \
	BM_Conv2DBwdInputT(Default, N, H, W, C, FC, FH, FW, type, LABEL); \
	BM_Conv2DBwdInputT(Mkl, N, H, W, C, FC, FH, FW, type, LABEL);
	#else
	#define BM_Conv2DBwdInput(N, H, W, C, FC, FH, FW, type, LABEL) \
	BM_Conv2DBwdInputT(Default, N, H, W, C, FC, FH, FW, type, LABEL);
	#endif

	// Defines and registers one Conv2D backprop-filter benchmark function.
	//   kind:  prefix of the graph factory called as kind##Conv2DBwdFilter(dims).
	//   type:  token that is stringified and passed as the first argument of
	//          test::Benchmark; it is also part of the function name.
	//   LABEL: string passed to testing::SetLabel.
	// Items processed per iteration are estimated as
	// (FH * FW * C * FC) * (N * H * W). The products are widened to int64 before
	// multiplying so larger shapes cannot overflow `int`. FH and FW are handed to
	// Conv2DDimensions in the order its constructor takes them (filter height,
	// then filter width).
	#define BM_Conv2DBwdFilterT(kind, N, H, W, C, FC, FH, FW, type, LABEL) \
	  static void BM_NAME(Conv2DBwdFilter_##kind, type, N, H, W, C, FC, FH, \
	                      FW)(int iters) { \
	    testing::SetLabel(LABEL); \
	    \
	    int64 num_computed_elements = \
	        static_cast<int64>(FH) * (FW) * (C) * (FC); \
	    int64 flops_per_iter = \
	        num_computed_elements * (static_cast<int64>(N) * (H) * (W)); \
	    testing::ItemsProcessed(static_cast<int64>(iters) * flops_per_iter); \
	    \
	    Conv2DDimensions dims(N, H, W, C, FC, FH, FW); \
	    test::Benchmark(#type, BM_CONCAT(kind, Conv2DBwdFilter)(dims)).Run(iters); \
	  } \
	  BENCHMARK(BM_NAME(Conv2DBwdFilter_##kind, type, N, H, W, C, FC, FH, FW))

	// BM_Conv2DBwdFilter registers the backprop-filter benchmark for the Default
	// graph and, when INTEL_MKL_DNN_ONLY is defined, also for the Mkl graph with
	// the same shape arguments and label.
	#if defined(INTEL_MKL_DNN_ONLY)
	#define BM_Conv2DBwdFilter(N, H, W, C, FC, FH, FW, type, LABEL) \
	BM_Conv2DBwdFilterT(Default, N, H, W, C, FC, FH, FW, type, LABEL); \
	BM_Conv2DBwdFilterT(Mkl, N, H, W, C, FC, FH, FW, type, LABEL);
	#else
	#define BM_Conv2DBwdFilter(N, H, W, C, FC, FH, FW, type, LABEL) \
	BM_Conv2DBwdFilterT(Default, N, H, W, C, FC, FH, FW, type, LABEL);
	#endif

	// ImageNet Convolutions ---------------------------------------------------- //
	// Argument order for every registration below:
	//   N, H, W, C, FC, FH, FW, type, label.
	// The same seven shapes are registered for the forward pass, the
	// backprop-input pass and the backprop-filter pass.

	// Forward convolution.
	BM_Conv2D(32, 28, 28, 96, 128, 3, 3, cpu, "conv3a_00_3x3");
	BM_Conv2D(32, 28, 28, 16, 32, 5, 5, cpu, "conv3a_00_5x5");
	BM_Conv2D(32, 28, 28, 128, 192, 3, 3, cpu, "conv3_00_3x3");
	BM_Conv2D(32, 28, 28, 32, 96, 5, 5, cpu, "conv3_00_5x5");
	BM_Conv2D(32, 14, 14, 96, 204, 3, 3, cpu, "conv4a_00_3x3");
	BM_Conv2D(32, 14, 14, 16, 48, 5, 5, cpu, "conv4a_00_5x5");
	BM_Conv2D(32, 14, 14, 112, 224, 3, 3, cpu, "conv4b_00_3x3");

	// Gradient with respect to the input.
	BM_Conv2DBwdInput(32, 28, 28, 96, 128, 3, 3, cpu, "conv3a_00_3x3");
	BM_Conv2DBwdInput(32, 28, 28, 16, 32, 5, 5, cpu, "conv3a_00_5x5");
	BM_Conv2DBwdInput(32, 28, 28, 128, 192, 3, 3, cpu, "conv3_00_3x3");
	BM_Conv2DBwdInput(32, 28, 28, 32, 96, 5, 5, cpu, "conv3_00_5x5");
	BM_Conv2DBwdInput(32, 14, 14, 96, 204, 3, 3, cpu, "conv4a_00_3x3");
	BM_Conv2DBwdInput(32, 14, 14, 16, 48, 5, 5, cpu, "conv4a_00_5x5");
	BM_Conv2DBwdInput(32, 14, 14, 112, 224, 3, 3, cpu, "conv4b_00_3x3");

	// Gradient with respect to the filter.
	BM_Conv2DBwdFilter(32, 28, 28, 96, 128, 3, 3, cpu, "conv3a_00_3x3");
	BM_Conv2DBwdFilter(32, 28, 28, 16, 32, 5, 5, cpu, "conv3a_00_5x5");
	BM_Conv2DBwdFilter(32, 28, 28, 128, 192, 3, 3, cpu, "conv3_00_3x3");
	BM_Conv2DBwdFilter(32, 28, 28, 32, 96, 5, 5, cpu, "conv3_00_5x5");
	BM_Conv2DBwdFilter(32, 14, 14, 96, 204, 3, 3, cpu, "conv4a_00_3x3");
	BM_Conv2DBwdFilter(32, 14, 14, 16, 48, 5, 5, cpu, "conv4a_00_5x5");
	BM_Conv2DBwdFilter(32, 14, 14, 112, 224, 3, 3, cpu, "conv4b_00_3x3");

	} // namespace tensorflow