// caffe2/mobile/contrib/nnapi/nnapi_benchmark.cc - platform/external/pytorch - Git at Google

 /**
  * Copyright (c) 2016-present, Facebook, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "caffe2/core/init.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor.h"
 #include "caffe2/core/timer.h"
 #include "caffe2/utils/math.h"
 #include "caffe2/utils/proto_utils.h"
 #include "nnapi.h"

 namespace caffe2 {

 namespace {

 /**
  * Benchmarks a single float32 Caffe2 "Conv" operator in NCHW order.
  *
  * Creates the blobs "X_cpu" (N x C x H x W), "W" and "B" (length K) in the
  * workspace, fills each with Gaussian noise (mean 0, std 30), and builds a
  * Conv operator that reads them and writes "Y_cpu". The operator is run once
  * under CAFFE_ENFORCE, then `warmup` untimed times and `run` timed times.
  *
  * The filter is sized K x (C / group) x kernel x kernel. For group == 1 and
  * group == C this is exactly the shape the benchmark has always used; other
  * group values previously got a K x 1 x kernel x kernel filter, which only
  * matches the depthwise case.
  *
  * @param ws      Workspace to populate; a function-local one is used if null.
  * @param N       Batch size.
  * @param C       Input channels.
  * @param H       Input height.
  * @param W       Input width.
  * @param K       Output channels (filter count and bias length).
  * @param kernel  Square kernel size, passed as the "kernel" argument.
  * @param group   Passed as the "group" argument; must be > 0 and divide C.
  * @param warmup  Number of untimed iterations.
  * @param run     Number of timed iterations.
  * @param engine  Engine string set on the operator definition.
  * @return Elapsed milliseconds of the timed loop divided by `run`.
  */
 static double benchmark_conv_caffe2(
     Workspace* ws,
     int N,
     int C,
     int H,
     int W,
     int K,
     int kernel,
     int group,
     int warmup = 5,
     int run = 10,
     std::string engine = "NNPACK") {
   caffe2::Workspace localWs;
   if (!ws) {
     ws = &localWs;
   }
   // Fail early with a readable message instead of dividing by zero below or
   // handing the operator a filter whose depth does not match the input.
   CAFFE_ENFORCE(
       group > 0 && C % group == 0,
       "group must be positive and evenly divide the input channel count");
   {
     auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU);
     t->Resize(N, C, H, W);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
     auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU);
     // Per-group input depth: C when group == 1, 1 when group == C.
     t->Resize(K, C / group, kernel, kernel);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }
   {
     auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU);
     t->Resize(K);
     CPUContext ctx;
     math::RandGaussian<float, CPUContext>(
         t->size(), 0, 30, t->mutable_data<float>(), &ctx);
   }

   OperatorDef op;
   {
     op.set_type("Conv");
     op.add_input("X_cpu");
     op.add_input("W");
     op.add_input("B");
     op.add_output("Y_cpu");
     op.set_engine(engine);
     {
       auto& arg = *(op.add_arg());
       arg.set_name("order");
       arg.set_s("NCHW");
     }
     {
       auto& arg = *(op.add_arg());
       arg.set_name("convolution_transform_strategy");
       arg.set_s("PRECOMPUTE");
     }
     {
       auto& arg = *(op.add_arg());
       arg.set_name("kernel");
       arg.set_i(kernel);
     }
     {
       auto& arg = *(op.add_arg());
       arg.set_name("group");
       arg.set_i(group);
     }
   }

   std::unique_ptr<caffe2::OperatorBase> op1(CreateOperator(op, ws));

   // Only the very first run is checked; it doubles as a sanity test that the
   // operator definition is valid for the requested engine.
   CAFFE_ENFORCE(op1->Run());
   for (int i = 0; i < warmup; i++) {
     op1->Run();
   }

   Timer timer;
   timer.Start();
   for (int i = 0; i < run; i++) {
     op1->Run();
   }
   return double(timer.MilliSeconds()) / run;
 }

 /**
  * Measures the average latency of one float32 NHWC "Conv" executed through
  * the NNApi wrapper.
  *
  * Blobs "X_cpu" (N x H x W x C), "W" and "B" (length K) are created in the
  * workspace and filled with Gaussian noise (mean 0, std 30). When group > 1
  * the function enforces C == group and sizes the filter
  * 1 x kernel x kernel x C; otherwise the filter is K x kernel x kernel x C.
  * A one-operator NetDef is wrapped in an NNApi model, run once under
  * CAFFE_ENFORCE, and then run `warmup` untimed and `run` timed times.
  *
  * @param ws      Workspace to populate; a function-local one is used if null.
  * @param N       Batch size.
  * @param C       Input channels.
  * @param H       Input height.
  * @param W       Input width.
  * @param K       Output channels (bias length; filter count when group <= 1).
  * @param kernel  Square kernel size, passed as the "kernel" argument.
  * @param group   Passed as the "group" argument.
  * @param warmup  Number of untimed iterations.
  * @param run     Number of timed iterations.
  * @return Elapsed milliseconds of the timed loop divided by `run`.
  */
 static double benchmark_conv_nnapi(
     Workspace* ws,
     int N,
     int C,
     int H,
     int W,
     int K,
     int kernel,
     int group,
     int warmup = 5,
     int run = 10) {
   caffe2::Workspace fallbackWs;
   Workspace* const workspace = ws ? ws : &fallbackWs;

   // Fills an already-resized tensor with float Gaussian noise (mean 0, std 30).
   const auto fillGaussian = [](TensorCPU* tensor) {
     CPUContext context;
     math::RandGaussian<float, CPUContext>(
         tensor->size(), 0, 30, tensor->mutable_data<float>(), &context);
   };

   auto* input = BlobGetMutableTensor(workspace->CreateBlob("X_cpu"), CPU);
   input->Resize(N, H, W, C);
   fillGaussian(input);

   auto* filter = BlobGetMutableTensor(workspace->CreateBlob("W"), CPU);
   if (group <= 1) {
     filter->Resize(K, kernel, kernel, C);
   } else {
     CAFFE_ENFORCE_EQ(C, group);
     filter->Resize(1, kernel, kernel, C);
   }
   fillGaussian(filter);

   auto* bias = BlobGetMutableTensor(workspace->CreateBlob("B"), CPU);
   bias->Resize(K);
   fillGaussian(bias);

   // Describe the single-operator network handed to NNApi.
   NetDef netdef;
   auto& conv = *(netdef.add_op());
   conv.set_type("Conv");
   conv.add_input("X_cpu");
   conv.add_input("W");
   conv.add_input("B");
   conv.add_output("Y_cpu");

   const auto addStringArg = [&conv](const char* name, const char* value) {
     auto* arg = conv.add_arg();
     arg->set_name(name);
     arg->set_s(value);
   };
   const auto addIntArg = [&conv](const char* name, int value) {
     auto* arg = conv.add_arg();
     arg->set_name(name);
     arg->set_i(value);
   };
   addStringArg("order", "NHWC");
   addIntArg("kernel", kernel);
   addIntArg("group", group);

   netdef.add_external_input("X_cpu");
   netdef.add_external_input("W");
   netdef.add_external_input("B");
   netdef.add_external_output("Y_cpu");

   // NN API
   NetDef initNet;
   NNApi model(initNet, netdef, workspace);
   std::vector<TensorCPU*> inputs, outputs;
   inputs.push_back(BlobGetMutableTensor(workspace->GetBlob("X_cpu"), CPU));
   CAFFE_ENFORCE(model.run(inputs, &outputs));

   for (int remaining = warmup; remaining > 0; --remaining) {
     model.run(inputs, &outputs);
   }

   Timer timer;
   timer.Start();
   for (int remaining = run; remaining > 0; --remaining) {
     model.run(inputs, &outputs);
   }
   return static_cast<double>(timer.MilliSeconds()) / run;
 }

 /**
  * Measures the average latency of one quantized NHWC "Conv" executed through
  * the NNApi wrapper.
  *
  * Blobs "X_cpu" (N x H x W x C) and "W" hold uint8_t values and "B"
  * (length K) holds int32_t values, all drawn as rand() % 10. When group > 1
  * the function enforces C == group and sizes the filter
  * 1 x kernel x kernel x C; otherwise the filter is K x kernel x kernel x C.
  * Quantization parameters are passed as arguments: weight scale 1.0 / zero
  * point 0 and output scale 2.0 / zero point 0 on the operator, and input
  * "scale" 1.0 / "zero_point" 0 on the net.
  *
  * Each buffer pointer and element count is fetched once per tensor rather
  * than once per element, and the index uses the 64-bit width so it is not
  * compared against a wider count.
  *
  * @param ws      Workspace to populate; a function-local one is used if null.
  * @param N       Batch size.
  * @param C       Input channels.
  * @param H       Input height.
  * @param W       Input width.
  * @param K       Output channels (bias length; filter count when group <= 1).
  * @param kernel  Square kernel size, passed as the "kernel" argument.
  * @param group   Passed as the "group" argument.
  * @param warmup  Number of untimed iterations.
  * @param run     Number of timed iterations.
  * @return Elapsed milliseconds of the timed loop divided by `run`.
  */
 static double benchmark_conv_nnapi_int8(
     Workspace* ws,
     int N,
     int C,
     int H,
     int W,
     int K,
     int kernel,
     int group,
     int warmup = 5,
     int run = 10) {
   caffe2::Workspace localWs;
   if (!ws) {
     ws = &localWs;
   }

   // Fills an already-resized tensor with uint8_t values in [0, 9].
   const auto fillUint8 = [](TensorCPU* tensor) {
     uint8_t* data = tensor->mutable_data<uint8_t>();
     const int64_t count = tensor->size();
     for (int64_t i = 0; i < count; ++i) {
       data[i] = rand() % 10;
     }
   };

   {
     auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU);
     t->Resize(N, H, W, C);
     fillUint8(t);
   }
   {
     auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU);
     if (group > 1) {
       CAFFE_ENFORCE_EQ(C, group);
       t->Resize(1, kernel, kernel, C);
     } else {
       t->Resize(K, kernel, kernel, C);
     }
     fillUint8(t);
   }

   // For input tensor of ANEURALNETWORKS_TENSOR_QUANT8_ASYMM type, the bias
   // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 and
   // bias_scale == input_scale * filter_scale.
   {
     auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU);
     t->Resize(K);
     int32_t* data = t->mutable_data<int32_t>();
     const int64_t count = t->size();
     for (int64_t i = 0; i < count; ++i) {
       data[i] = rand() % 10;
     }
   }

   NetDef netdef;
   {
     auto& op = *(netdef.add_op());
     op.set_type("Conv");
     op.add_input("X_cpu");
     op.add_input("W");
     op.add_input("B");
     op.add_output("Y_cpu");

     const auto addIntArg = [&op](const char* name, int value) {
       auto& arg = *(op.add_arg());
       arg.set_name(name);
       arg.set_i(value);
     };
     const auto addFloatArg = [&op](const char* name, float value) {
       auto& arg = *(op.add_arg());
       arg.set_name(name);
       arg.set_f(value);
     };
     {
       auto& arg = *(op.add_arg());
       arg.set_name("order");
       arg.set_s("NHWC");
     }
     addIntArg("kernel", kernel);
     addIntArg("group", group);
     // Hack: quantization parameters travel as operator arguments.
     // for weight tensor
     addFloatArg("weight_scale", 1.0);
     addIntArg("weight_zero_point", 0);
     // for output tensor
     // For output tensor of ANEURALNETWORKS_TENSOR_QUANT8_ASYMM type, the
     // following condition must be satisfied: output_scale > input_scale *
     // filter_scale
     addFloatArg("output_scale", 2.0);
     addIntArg("output_zero_point", 0);

     netdef.add_external_input("X_cpu");
     netdef.add_external_input("W");
     netdef.add_external_input("B");
     netdef.add_external_output("Y_cpu");
     // scale and zero_point for the input tensor
     {
       auto& arg = *(netdef.add_arg());
       arg.set_name("scale");
       arg.set_f(1.0);
     }
     {
       auto& arg = *(netdef.add_arg());
       arg.set_name("zero_point");
       arg.set_i(0);
     }
   }

   // NN API
   NetDef initNet;
   NNApi model(initNet, netdef, ws);
   std::vector<TensorCPU*> inputs, outputs;
   inputs.push_back(BlobGetMutableTensor(ws->GetBlob("X_cpu"), CPU));
   // Only the first run is checked; the loops below ignore the result.
   CAFFE_ENFORCE(model.run(inputs, &outputs));

   for (int i = 0; i < warmup; i++) {
     model.run(inputs, &outputs);
   }
   Timer timer;
   timer.Start();
   for (int i = 0; i < run; i++) {
     model.run(inputs, &outputs);
   }
   return double(timer.MilliSeconds()) / run;
 }

 } // namespace

 } // namespace caffe2

 /**
  * Benchmark driver.
  *
  * Pass 1 (dense, group == 1): for every spatial size, channel count and
  * kernel size in the lists below, times the Caffe2 "NNPACK" convolution, the
  * NNAPI float32 convolution and the NNAPI quantized convolution, and prints
  * throughput derived from 2 * Cin * Cout * k * k * Hout * Wout operations.
  *
  * Pass 2 (depthwise, group == channels, 3x3): times the same three paths
  * with the "DEPTHWISE_3x3" engine on the Caffe2 side and prints bandwidth
  * derived from the per-channel element count input + output + weights. The
  * quantized column divides by sizeof(float) so it counts one byte per
  * element.
  *
  * Times are in milliseconds, so value / ms / 1E6 yields giga-units per
  * second.
  */
 int main(int argc, char** argv) {
   caffe2::Workspace ws;
   ws.GetThreadPool()->setMinWorkSize(0);

   const int warmup = 2, mainrun = 10;
   // float32
   for (int space : {14, 26, 52, 104}) {
     for (int input_channel : {64, 128, 256, 512}) {
       for (int kernel : {1, 3}) {
         const int output_channel = input_channel;
         const double cpu_time = caffe2::benchmark_conv_caffe2(
             &ws,
             1,
             input_channel,
             space,
             space,
             output_channel,
             kernel,
             1,
             warmup,
             mainrun,
             "NNPACK");
         const double nn_time_fp32 = caffe2::benchmark_conv_nnapi(
             &ws,
             1,
             input_channel,
             space,
             space,
             output_channel,
             kernel,
             1,
             warmup,
             mainrun);
         const double nn_time_int8 = caffe2::benchmark_conv_nnapi_int8(
             &ws,
             1,
             input_channel,
             space,
             space,
             output_channel,
             kernel,
             1,
             warmup,
             mainrun);
         // The benchmarked ops set no padding argument, so a 3x3 kernel is
         // assumed to shrink each spatial dimension by 2.
         const int out_space = kernel == 1 ? space : space - 2;
         const double flops = double(input_channel) * output_channel * kernel *
             kernel * out_space * out_space * 2;
         printf(
             "Conv: X: %ix%i  \tC: %i -> %i\tK: %ix%i\t32b"
             "NNPACK GFLOPS: %.2f\t32b"
             "NN-API GFLOPS: %.2f\t8b"
             "NN-API GOPS: %.2f\n",
             space,
             space,
             input_channel,
             output_channel,
             kernel,
             kernel,
             flops / cpu_time / 1E6,
             flops / nn_time_fp32 / 1E6,
             flops / nn_time_int8 / 1E6);
       }
     }
   }
   fflush(stdout);

   // depthwise
   for (int space : {14, 26, 52, 104}) {
     for (int channel : {64, 128, 256, 512}) {
       for (int kernel : {3}) {
         const double cpu_time = caffe2::benchmark_conv_caffe2(
             &ws,
             1,
             channel,
             space,
             space,
             channel,
             kernel,
             channel,
             warmup,
             mainrun,
             "DEPTHWISE_3x3");
         const double nn_time_fp32_dwise = caffe2::benchmark_conv_nnapi(
             &ws,
             1,
             channel,
             space,
             space,
             channel,
             kernel,
             channel,
             warmup,
             mainrun);
         const double nn_time_int8_dwise = caffe2::benchmark_conv_nnapi_int8(
             &ws,
             1,
             channel,
             space,
             space,
             channel,
             kernel,
             channel,
             warmup,
             mainrun);
         // Elements touched per channel: input plane + output plane + filter.
         // The earlier form `space * space + kernel == 1 ? a : b` parsed as
         // `(space * space + kernel == 1) ? a : b`, which silently dropped the
         // input plane from the total.
         const int out_space = kernel == 1 ? space : space - 2;
         const double dwise_bandwidth = sizeof(float) * double(channel) *
             (space * space + out_space * out_space + kernel * kernel);
         printf(
             "Conv: X: %ix%i  \tC: %i -> %i\tK: %ix%i\t32b"
             "Caffe2 Dwise GB/s: %.2f\t32b"
             "NN-API Dwise GB/s: %.2f\t8b"
             "NN-API Dwise GB/s: %.2f\n",
             space,
             space,
             channel,
             channel,
             kernel,
             kernel,
             dwise_bandwidth / cpu_time / 1E6,
             dwise_bandwidth / nn_time_fp32_dwise / 1E6,
             dwise_bandwidth / sizeof(float) / nn_time_int8_dwise / 1E6);
       }
     }
   }
   return 0;
 }
	/**
	* Copyright (c) 2016-present, Facebook, Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "caffe2/core/init.h"
	#include "caffe2/core/operator.h"
	#include "caffe2/core/tensor.h"
	#include "caffe2/core/timer.h"
	#include "caffe2/utils/math.h"
	#include "caffe2/utils/proto_utils.h"
	#include "nnapi.h"

	namespace caffe2 {

	namespace {

	/**
	 * Benchmarks a single float32 Caffe2 "Conv" operator in NCHW order.
	 *
	 * Creates the blobs "X_cpu" (N x C x H x W), "W" and "B" (length K) in the
	 * workspace, fills each with Gaussian noise (mean 0, std 30), and builds a
	 * Conv operator that reads them and writes "Y_cpu". The operator is run once
	 * under CAFFE_ENFORCE, then `warmup` untimed times and `run` timed times.
	 *
	 * The filter is sized K x (C / group) x kernel x kernel. For group == 1 and
	 * group == C this is exactly the shape the benchmark has always used; other
	 * group values previously got a K x 1 x kernel x kernel filter, which only
	 * matches the depthwise case.
	 *
	 * @param ws      Workspace to populate; a function-local one is used if null.
	 * @param N       Batch size.
	 * @param C       Input channels.
	 * @param H       Input height.
	 * @param W       Input width.
	 * @param K       Output channels (filter count and bias length).
	 * @param kernel  Square kernel size, passed as the "kernel" argument.
	 * @param group   Passed as the "group" argument; must be > 0 and divide C.
	 * @param warmup  Number of untimed iterations.
	 * @param run     Number of timed iterations.
	 * @param engine  Engine string set on the operator definition.
	 * @return Elapsed milliseconds of the timed loop divided by `run`.
	 */
	static double benchmark_conv_caffe2(
	    Workspace* ws,
	    int N,
	    int C,
	    int H,
	    int W,
	    int K,
	    int kernel,
	    int group,
	    int warmup = 5,
	    int run = 10,
	    std::string engine = "NNPACK") {
	  caffe2::Workspace localWs;
	  if (!ws) {
	    ws = &localWs;
	  }
	  // Fail early with a readable message instead of dividing by zero below or
	  // handing the operator a filter whose depth does not match the input.
	  CAFFE_ENFORCE(
	      group > 0 && C % group == 0,
	      "group must be positive and evenly divide the input channel count");
	  {
	    auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU);
	    t->Resize(N, C, H, W);
	    CPUContext ctx;
	    math::RandGaussian<float, CPUContext>(
	        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
	  }
	  {
	    auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU);
	    // Per-group input depth: C when group == 1, 1 when group == C.
	    t->Resize(K, C / group, kernel, kernel);
	    CPUContext ctx;
	    math::RandGaussian<float, CPUContext>(
	        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
	  }
	  {
	    auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU);
	    t->Resize(K);
	    CPUContext ctx;
	    math::RandGaussian<float, CPUContext>(
	        t->size(), 0, 30, t->mutable_data<float>(), &ctx);
	  }

	  OperatorDef op;
	  {
	    op.set_type("Conv");
	    op.add_input("X_cpu");
	    op.add_input("W");
	    op.add_input("B");
	    op.add_output("Y_cpu");
	    op.set_engine(engine);
	    {
	      auto& arg = *(op.add_arg());
	      arg.set_name("order");
	      arg.set_s("NCHW");
	    }
	    {
	      auto& arg = *(op.add_arg());
	      arg.set_name("convolution_transform_strategy");
	      arg.set_s("PRECOMPUTE");
	    }
	    {
	      auto& arg = *(op.add_arg());
	      arg.set_name("kernel");
	      arg.set_i(kernel);
	    }
	    {
	      auto& arg = *(op.add_arg());
	      arg.set_name("group");
	      arg.set_i(group);
	    }
	  }

	  std::unique_ptr<caffe2::OperatorBase> op1(CreateOperator(op, ws));

	  // Only the very first run is checked; it doubles as a sanity test that the
	  // operator definition is valid for the requested engine.
	  CAFFE_ENFORCE(op1->Run());
	  for (int i = 0; i < warmup; i++) {
	    op1->Run();
	  }

	  Timer timer;
	  timer.Start();
	  for (int i = 0; i < run; i++) {
	    op1->Run();
	  }
	  return double(timer.MilliSeconds()) / run;
	}

	/**
	 * Measures the average latency of one float32 NHWC "Conv" executed through
	 * the NNApi wrapper.
	 *
	 * Blobs "X_cpu" (N x H x W x C), "W" and "B" (length K) are created in the
	 * workspace and filled with Gaussian noise (mean 0, std 30). When group > 1
	 * the function enforces C == group and sizes the filter
	 * 1 x kernel x kernel x C; otherwise the filter is K x kernel x kernel x C.
	 * A one-operator NetDef is wrapped in an NNApi model, run once under
	 * CAFFE_ENFORCE, and then run `warmup` untimed and `run` timed times.
	 *
	 * @param ws      Workspace to populate; a function-local one is used if null.
	 * @param N       Batch size.
	 * @param C       Input channels.
	 * @param H       Input height.
	 * @param W       Input width.
	 * @param K       Output channels (bias length; filter count when group <= 1).
	 * @param kernel  Square kernel size, passed as the "kernel" argument.
	 * @param group   Passed as the "group" argument.
	 * @param warmup  Number of untimed iterations.
	 * @param run     Number of timed iterations.
	 * @return Elapsed milliseconds of the timed loop divided by `run`.
	 */
	static double benchmark_conv_nnapi(
	    Workspace* ws,
	    int N,
	    int C,
	    int H,
	    int W,
	    int K,
	    int kernel,
	    int group,
	    int warmup = 5,
	    int run = 10) {
	  caffe2::Workspace fallbackWs;
	  Workspace* const workspace = ws ? ws : &fallbackWs;

	  // Fills an already-resized tensor with float Gaussian noise (mean 0, std 30).
	  const auto fillGaussian = [](TensorCPU* tensor) {
	    CPUContext context;
	    math::RandGaussian<float, CPUContext>(
	        tensor->size(), 0, 30, tensor->mutable_data<float>(), &context);
	  };

	  auto* input = BlobGetMutableTensor(workspace->CreateBlob("X_cpu"), CPU);
	  input->Resize(N, H, W, C);
	  fillGaussian(input);

	  auto* filter = BlobGetMutableTensor(workspace->CreateBlob("W"), CPU);
	  if (group <= 1) {
	    filter->Resize(K, kernel, kernel, C);
	  } else {
	    CAFFE_ENFORCE_EQ(C, group);
	    filter->Resize(1, kernel, kernel, C);
	  }
	  fillGaussian(filter);

	  auto* bias = BlobGetMutableTensor(workspace->CreateBlob("B"), CPU);
	  bias->Resize(K);
	  fillGaussian(bias);

	  // Describe the single-operator network handed to NNApi.
	  NetDef netdef;
	  auto& conv = *(netdef.add_op());
	  conv.set_type("Conv");
	  conv.add_input("X_cpu");
	  conv.add_input("W");
	  conv.add_input("B");
	  conv.add_output("Y_cpu");

	  const auto addStringArg = [&conv](const char* name, const char* value) {
	    auto* arg = conv.add_arg();
	    arg->set_name(name);
	    arg->set_s(value);
	  };
	  const auto addIntArg = [&conv](const char* name, int value) {
	    auto* arg = conv.add_arg();
	    arg->set_name(name);
	    arg->set_i(value);
	  };
	  addStringArg("order", "NHWC");
	  addIntArg("kernel", kernel);
	  addIntArg("group", group);

	  netdef.add_external_input("X_cpu");
	  netdef.add_external_input("W");
	  netdef.add_external_input("B");
	  netdef.add_external_output("Y_cpu");

	  // NN API
	  NetDef initNet;
	  NNApi model(initNet, netdef, workspace);
	  std::vector<TensorCPU*> inputs, outputs;
	  inputs.push_back(BlobGetMutableTensor(workspace->GetBlob("X_cpu"), CPU));
	  CAFFE_ENFORCE(model.run(inputs, &outputs));

	  for (int remaining = warmup; remaining > 0; --remaining) {
	    model.run(inputs, &outputs);
	  }

	  Timer timer;
	  timer.Start();
	  for (int remaining = run; remaining > 0; --remaining) {
	    model.run(inputs, &outputs);
	  }
	  return static_cast<double>(timer.MilliSeconds()) / run;
	}

	/**
	 * Measures the average latency of one quantized NHWC "Conv" executed through
	 * the NNApi wrapper.
	 *
	 * Blobs "X_cpu" (N x H x W x C) and "W" hold uint8_t values and "B"
	 * (length K) holds int32_t values, all drawn as rand() % 10. When group > 1
	 * the function enforces C == group and sizes the filter
	 * 1 x kernel x kernel x C; otherwise the filter is K x kernel x kernel x C.
	 * Quantization parameters are passed as arguments: weight scale 1.0 / zero
	 * point 0 and output scale 2.0 / zero point 0 on the operator, and input
	 * "scale" 1.0 / "zero_point" 0 on the net.
	 *
	 * Each buffer pointer and element count is fetched once per tensor rather
	 * than once per element, and the index uses the 64-bit width so it is not
	 * compared against a wider count.
	 *
	 * @param ws      Workspace to populate; a function-local one is used if null.
	 * @param N       Batch size.
	 * @param C       Input channels.
	 * @param H       Input height.
	 * @param W       Input width.
	 * @param K       Output channels (bias length; filter count when group <= 1).
	 * @param kernel  Square kernel size, passed as the "kernel" argument.
	 * @param group   Passed as the "group" argument.
	 * @param warmup  Number of untimed iterations.
	 * @param run     Number of timed iterations.
	 * @return Elapsed milliseconds of the timed loop divided by `run`.
	 */
	static double benchmark_conv_nnapi_int8(
	    Workspace* ws,
	    int N,
	    int C,
	    int H,
	    int W,
	    int K,
	    int kernel,
	    int group,
	    int warmup = 5,
	    int run = 10) {
	  caffe2::Workspace localWs;
	  if (!ws) {
	    ws = &localWs;
	  }

	  // Fills an already-resized tensor with uint8_t values in [0, 9].
	  const auto fillUint8 = [](TensorCPU* tensor) {
	    uint8_t* data = tensor->mutable_data<uint8_t>();
	    const int64_t count = tensor->size();
	    for (int64_t i = 0; i < count; ++i) {
	      data[i] = rand() % 10;
	    }
	  };

	  {
	    auto* t = BlobGetMutableTensor(ws->CreateBlob("X_cpu"), CPU);
	    t->Resize(N, H, W, C);
	    fillUint8(t);
	  }
	  {
	    auto* t = BlobGetMutableTensor(ws->CreateBlob("W"), CPU);
	    if (group > 1) {
	      CAFFE_ENFORCE_EQ(C, group);
	      t->Resize(1, kernel, kernel, C);
	    } else {
	      t->Resize(K, kernel, kernel, C);
	    }
	    fillUint8(t);
	  }

	  // For input tensor of ANEURALNETWORKS_TENSOR_QUANT8_ASYMM type, the bias
	  // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 and
	  // bias_scale == input_scale * filter_scale.
	  {
	    auto* t = BlobGetMutableTensor(ws->CreateBlob("B"), CPU);
	    t->Resize(K);
	    int32_t* data = t->mutable_data<int32_t>();
	    const int64_t count = t->size();
	    for (int64_t i = 0; i < count; ++i) {
	      data[i] = rand() % 10;
	    }
	  }

	  NetDef netdef;
	  {
	    auto& op = *(netdef.add_op());
	    op.set_type("Conv");
	    op.add_input("X_cpu");
	    op.add_input("W");
	    op.add_input("B");
	    op.add_output("Y_cpu");

	    const auto addIntArg = [&op](const char* name, int value) {
	      auto& arg = *(op.add_arg());
	      arg.set_name(name);
	      arg.set_i(value);
	    };
	    const auto addFloatArg = [&op](const char* name, float value) {
	      auto& arg = *(op.add_arg());
	      arg.set_name(name);
	      arg.set_f(value);
	    };
	    {
	      auto& arg = *(op.add_arg());
	      arg.set_name("order");
	      arg.set_s("NHWC");
	    }
	    addIntArg("kernel", kernel);
	    addIntArg("group", group);
	    // Hack: quantization parameters travel as operator arguments.
	    // for weight tensor
	    addFloatArg("weight_scale", 1.0);
	    addIntArg("weight_zero_point", 0);
	    // for output tensor
	    // For output tensor of ANEURALNETWORKS_TENSOR_QUANT8_ASYMM type, the
	    // following condition must be satisfied: output_scale > input_scale *
	    // filter_scale
	    addFloatArg("output_scale", 2.0);
	    addIntArg("output_zero_point", 0);

	    netdef.add_external_input("X_cpu");
	    netdef.add_external_input("W");
	    netdef.add_external_input("B");
	    netdef.add_external_output("Y_cpu");
	    // scale and zero_point for the input tensor
	    {
	      auto& arg = *(netdef.add_arg());
	      arg.set_name("scale");
	      arg.set_f(1.0);
	    }
	    {
	      auto& arg = *(netdef.add_arg());
	      arg.set_name("zero_point");
	      arg.set_i(0);
	    }
	  }

	  // NN API
	  NetDef initNet;
	  NNApi model(initNet, netdef, ws);
	  std::vector<TensorCPU*> inputs, outputs;
	  inputs.push_back(BlobGetMutableTensor(ws->GetBlob("X_cpu"), CPU));
	  // Only the first run is checked; the loops below ignore the result.
	  CAFFE_ENFORCE(model.run(inputs, &outputs));

	  for (int i = 0; i < warmup; i++) {
	    model.run(inputs, &outputs);
	  }
	  Timer timer;
	  timer.Start();
	  for (int i = 0; i < run; i++) {
	    model.run(inputs, &outputs);
	  }
	  return double(timer.MilliSeconds()) / run;
	}

	} // namespace

	} // namespace caffe2

	/**
	 * Benchmark driver.
	 *
	 * Pass 1 (dense, group == 1): for every spatial size, channel count and
	 * kernel size in the lists below, times the Caffe2 "NNPACK" convolution, the
	 * NNAPI float32 convolution and the NNAPI quantized convolution, and prints
	 * throughput derived from 2 * Cin * Cout * k * k * Hout * Wout operations.
	 *
	 * Pass 2 (depthwise, group == channels, 3x3): times the same three paths
	 * with the "DEPTHWISE_3x3" engine on the Caffe2 side and prints bandwidth
	 * derived from the per-channel element count input + output + weights. The
	 * quantized column divides by sizeof(float) so it counts one byte per
	 * element.
	 *
	 * Times are in milliseconds, so value / ms / 1E6 yields giga-units per
	 * second.
	 */
	int main(int argc, char** argv) {
	  caffe2::Workspace ws;
	  ws.GetThreadPool()->setMinWorkSize(0);

	  const int warmup = 2, mainrun = 10;
	  // float32
	  for (int space : {14, 26, 52, 104}) {
	    for (int input_channel : {64, 128, 256, 512}) {
	      for (int kernel : {1, 3}) {
	        const int output_channel = input_channel;
	        const double cpu_time = caffe2::benchmark_conv_caffe2(
	            &ws,
	            1,
	            input_channel,
	            space,
	            space,
	            output_channel,
	            kernel,
	            1,
	            warmup,
	            mainrun,
	            "NNPACK");
	        const double nn_time_fp32 = caffe2::benchmark_conv_nnapi(
	            &ws,
	            1,
	            input_channel,
	            space,
	            space,
	            output_channel,
	            kernel,
	            1,
	            warmup,
	            mainrun);
	        const double nn_time_int8 = caffe2::benchmark_conv_nnapi_int8(
	            &ws,
	            1,
	            input_channel,
	            space,
	            space,
	            output_channel,
	            kernel,
	            1,
	            warmup,
	            mainrun);
	        // The benchmarked ops set no padding argument, so a 3x3 kernel is
	        // assumed to shrink each spatial dimension by 2.
	        const int out_space = kernel == 1 ? space : space - 2;
	        const double flops = double(input_channel) * output_channel * kernel *
	            kernel * out_space * out_space * 2;
	        printf(
	            "Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t32b"
	            "NNPACK GFLOPS: %.2f\t32b"
	            "NN-API GFLOPS: %.2f\t8b"
	            "NN-API GOPS: %.2f\n",
	            space,
	            space,
	            input_channel,
	            output_channel,
	            kernel,
	            kernel,
	            flops / cpu_time / 1E6,
	            flops / nn_time_fp32 / 1E6,
	            flops / nn_time_int8 / 1E6);
	      }
	    }
	  }
	  fflush(stdout);

	  // depthwise
	  for (int space : {14, 26, 52, 104}) {
	    for (int channel : {64, 128, 256, 512}) {
	      for (int kernel : {3}) {
	        const double cpu_time = caffe2::benchmark_conv_caffe2(
	            &ws,
	            1,
	            channel,
	            space,
	            space,
	            channel,
	            kernel,
	            channel,
	            warmup,
	            mainrun,
	            "DEPTHWISE_3x3");
	        const double nn_time_fp32_dwise = caffe2::benchmark_conv_nnapi(
	            &ws,
	            1,
	            channel,
	            space,
	            space,
	            channel,
	            kernel,
	            channel,
	            warmup,
	            mainrun);
	        const double nn_time_int8_dwise = caffe2::benchmark_conv_nnapi_int8(
	            &ws,
	            1,
	            channel,
	            space,
	            space,
	            channel,
	            kernel,
	            channel,
	            warmup,
	            mainrun);
	        // Elements touched per channel: input plane + output plane + filter.
	        // The earlier form `space * space + kernel == 1 ? a : b` parsed as
	        // `(space * space + kernel == 1) ? a : b`, which silently dropped the
	        // input plane from the total.
	        const int out_space = kernel == 1 ? space : space - 2;
	        const double dwise_bandwidth = sizeof(float) * double(channel) *
	            (space * space + out_space * out_space + kernel * kernel);
	        printf(
	            "Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t32b"
	            "Caffe2 Dwise GB/s: %.2f\t32b"
	            "NN-API Dwise GB/s: %.2f\t8b"
	            "NN-API Dwise GB/s: %.2f\n",
	            space,
	            space,
	            channel,
	            channel,
	            kernel,
	            kernel,
	            dwise_bandwidth / cpu_time / 1E6,
	            dwise_bandwidth / nn_time_fp32_dwise / 1E6,
	            dwise_bandwidth / sizeof(float) / nn_time_int8_dwise / 1E6);
	      }
	    }
	  }
	  return 0;
	}