/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/delegates/gpu/metal/kernels/conv.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/winograd.h"
#import <Metal/Metal.h>
#import <XCTest/XCTest.h>

#include <cmath>
#include <map>
#include <memory>
#include <string>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/util.h"
#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h"
#include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h"
using ::tflite::gpu::Axis;
using ::tflite::gpu::BHWC;
using ::tflite::gpu::Convolution2DAttributes;
using ::tflite::gpu::DataType;
using ::tflite::gpu::DivideRoundUp;
using ::tflite::gpu::HW;
using ::tflite::gpu::Linear;
using ::tflite::gpu::OHWI;
using ::tflite::gpu::OperationType;
using ::tflite::gpu::Tensor;
using ::tflite::gpu::TensorFloat32;
using ::tflite::gpu::TensorRef;
using ::tflite::gpu::ValueId;
using ::tflite::gpu::metal::ConvolutionGeneric;
using ::tflite::gpu::metal::ConvolutionWino4x4To6x6;
using ::tflite::gpu::metal::CompareVectors;
using ::tflite::gpu::metal::SingleOpModel;
@interface ConvTest : XCTestCase
@end
@implementation ConvTest
- (void)setUp {
[super setUp];
}
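
// 2x1 kernel, 2 output channels, single input channel; one row of bottom padding,
// so the second output row only sees the top weight of each filter.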
- (void)testO2H2W1I1Stride1x1Dilation1x1 {
TensorRef<BHWC> input;
input.type = DataType::FLOAT32;
input.ref = 0;
input.shape = BHWC(1, 2, 2, 1);
Convolution2DAttributes attr;
Tensor<Linear, DataType::FLOAT32> bias;
bias.shape.v = 2;
bias.id = 1;
bias.data = {1, 1};
attr.bias = std::move(bias);
Tensor<OHWI, DataType::FLOAT32> weights;
weights.shape = OHWI(2, 2, 1, 1);
weights.id = 2;
weights.data = {1, 2, 3, 4};
attr.weights = std::move(weights);
attr.dilations = HW(1, 1);
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(1, 0);
attr.strides = HW(1, 1);
TensorRef<BHWC> output;
output.type = DataType::FLOAT32;
output.ref = 3;
output.shape = BHWC(1, 2, 2, 2);
SingleOpModel model({ToString(OperationType::CONVOLUTION_2D), std::move(attr)}, {input},
{output});
XCTAssertTrue(model.PopulateTensor(0, {1, 1, 1, 1}));
auto status = model.Invoke();
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
status = CompareVectors({4, 8, 4, 8, 2, 4, 2, 4}, model.GetOutput(0), 1e-6f);
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
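
// 2x2 kernel with dilation 2 covers the whole 3x3 input of ones, so the single
// output value is the sum of the weights (1+2+3+4) plus a zero bias.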
- (void)testO1H2W2I1Stride1x1Dilation2x2 {
TensorRef<BHWC> input;
input.type = DataType::FLOAT32;
input.ref = 0;
input.shape = BHWC(1, 3, 3, 1);
Convolution2DAttributes attr;
Tensor<Linear, DataType::FLOAT32> bias;
bias.shape.v = 1;
bias.id = 1;
bias.data.push_back(0.0);
attr.bias = std::move(bias);
Tensor<OHWI, DataType::FLOAT32> weights;
weights.shape = OHWI(1, 2, 2, 1);
weights.id = 2;
weights.data = {1, 2, 3, 4};
attr.weights = std::move(weights);
attr.dilations = HW(2, 2);
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(1, 1);
TensorRef<BHWC> output;
output.type = DataType::FLOAT32;
output.ref = 3;
output.shape = BHWC(1, 1, 1, 1);
SingleOpModel model({ToString(OperationType::CONVOLUTION_2D), std::move(attr)}, {input},
{output});
XCTAssertTrue(model.PopulateTensor(0, {1, 1, 1, 1, 1, 1, 1, 1, 1}));
auto status = model.Invoke();
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
status = CompareVectors({10}, model.GetOutput(0), 1e-6f);
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
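
// 3x3 kernel on a 2x2 input of ones with one pixel of top/left padding: only the
// bottom-right 2x2 of the kernel overlaps real data, giving 10 plus a bias of 1.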
- (void)testO1H3W3I1Stride1x1Dilation1x1 {
TensorRef<BHWC> input;
input.type = DataType::FLOAT32;
input.ref = 0;
input.shape = BHWC(1, 2, 2, 1);
Convolution2DAttributes attr;
Tensor<Linear, DataType::FLOAT32> bias;
bias.shape.v = 1;
bias.id = 1;
bias.data.push_back(1.0);
attr.bias = std::move(bias);
Tensor<OHWI, DataType::FLOAT32> weights;
weights.shape = OHWI(1, 3, 3, 1);
weights.id = 2;
weights.data = {1, 2, 3, 1, 2, 3, 1, 2, 3};
attr.weights = std::move(weights);
attr.dilations = HW(1, 1);
attr.padding.prepended = HW(1, 1);
attr.padding.appended = HW(0, 0);
attr.strides = HW(1, 1);
TensorRef<BHWC> output;
output.type = DataType::FLOAT32;
output.ref = 3;
output.shape = BHWC(1, 1, 1, 1);
SingleOpModel model({ToString(OperationType::CONVOLUTION_2D), std::move(attr)}, {input},
{output});
XCTAssertTrue(model.PopulateTensor(0, {1, 1, 1, 1}));
auto status = model.Invoke();
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
status = CompareVectors({11}, model.GetOutput(0), 1e-6f);
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
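
// 1x1 kernel mixing 2 input channels into 2 output channels at each of the two
// spatial positions.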
- (void)testO2H1W1I2Stride1x1Dilation1x1 {
TensorRef<BHWC> input;
input.type = DataType::FLOAT32;
input.ref = 0;
input.shape = BHWC(1, 2, 1, 2);
Convolution2DAttributes attr;
Tensor<Linear, DataType::FLOAT32> bias;
bias.shape.v = 2;
bias.id = 1;
bias.data = {1, 1};
attr.bias = std::move(bias);
Tensor<OHWI, DataType::FLOAT32> weights;
weights.shape = OHWI(2, 1, 1, 2);
weights.id = 2;
weights.data = {1, 2, 3, 4};
attr.weights = std::move(weights);
attr.dilations = HW(1, 1);
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(1, 1);
TensorRef<BHWC> output;
output.type = DataType::FLOAT32;
output.ref = 3;
output.shape = BHWC(1, 2, 1, 2);
SingleOpModel model({ToString(OperationType::CONVOLUTION_2D), std::move(attr)}, {input},
{output});
XCTAssertTrue(model.PopulateTensor(0, {1, 1, 1, 1}));
auto status = model.Invoke();
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
status = CompareVectors({4, 8, 4, 8}, model.GetOutput(0), 1e-6f);
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
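
// 1x1 kernel with stride 2x2 doubles every other input value.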
- (void)testO1H1W1I1Stride2x2Dilation1x1 {
TensorRef<BHWC> input;
input.type = DataType::FLOAT32;
input.ref = 0;
input.shape = BHWC(1, 3, 3, 1);
Convolution2DAttributes attr;
Tensor<Linear, DataType::FLOAT32> bias;
bias.shape.v = 1;
bias.id = 1;
bias.data.push_back(0.0);
attr.bias = std::move(bias);
Tensor<OHWI, DataType::FLOAT32> weights;
weights.shape = OHWI(1, 1, 1, 1);
weights.id = 2;
weights.data.push_back(2.0);
attr.weights = std::move(weights);
attr.dilations = HW(1, 1);
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(0, 0);
attr.strides = HW(2, 2);
TensorRef<BHWC> output;
output.type = DataType::FLOAT32;
output.ref = 3;
output.shape = BHWC(1, 2, 2, 1);
SingleOpModel model({ToString(OperationType::CONVOLUTION_2D), std::move(attr)}, {input},
{output});
XCTAssertTrue(model.PopulateTensor(0, {1, 0, 2, 0, 0, 0, 4, 0, 8}));
auto status = model.Invoke();
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
status = CompareVectors({2, 4, 8, 16}, model.GetOutput(0), 1e-6f);
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
}
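
// Runs the same 3x3 convolution through the reference ConvolutionGeneric kernel
// and through the three-stage Winograd 4x4-to-6x6 pipeline, then compares results.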
- (void)testWinograd4x4To6x6 {
const int src_channels = 7;
const int dst_channels = 13;
Convolution2DAttributes attr;
attr.padding.prepended = HW(0, 0);
attr.padding.appended = HW(10, 10);
attr.strides = HW(1, 1);
attr.dilations = HW(1, 1);
attr.weights.shape = OHWI(dst_channels, 3, 3, src_channels);
attr.weights.data.resize(attr.weights.shape.DimensionsProduct());
for (int i = 0; i < attr.weights.data.size(); ++i) {
attr.weights.data[i] = sin(i);
}
attr.bias.shape = Linear(dst_channels);
attr.bias.data.resize(attr.bias.shape.DimensionsProduct());
for (int i = 0; i < attr.bias.data.size(); ++i) {
attr.bias.data[i] = sin(i);
}
auto src_shape = BHWC(1, 17, 13, src_channels);
auto dst_shape = CalculateOutputShape(src_shape, attr);
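// Padded input extent shrinks by kernel_size - 1 (= 2) for the 3x3 kernel.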
int new_width = src_shape.w + attr.padding.prepended.w +
attr.padding.appended.w - 2;
int new_height = src_shape.h + attr.padding.prepended.h +
attr.padding.appended.h - 2;
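// Intermediate layout for the Winograd path: 36 transformed values per 4x4
// output tile (h = 36), one column per tile (w = number of tiles).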
BHWC conv_shape;
conv_shape.b = dst_shape.b;
conv_shape.h = 36;
conv_shape.w = DivideRoundUp(new_width, 4) * DivideRoundUp(new_height, 4);
conv_shape.c = dst_shape.c;
TensorFloat32 src_tensor;
src_tensor.shape = src_shape;
src_tensor.data.resize(src_tensor.shape.DimensionsProduct());
for (int i = 0; i < src_tensor.data.size(); ++i) {
src_tensor.data[i] = sin(i);
}
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
std::map<ValueId, TensorFloat32> inputs_v0;
inputs_v0[0] = src_tensor;
std::map<ValueId, TensorFloat32> outputs_v0;
outputs_v0[1].shape = dst_shape;
outputs_v0[1].data.resize(dst_shape.DimensionsProduct());
tflite::gpu::OperationDef op_def;
op_def.precision = tflite::gpu::CalculationsPrecision::F32;
tflite::gpu::TensorDescriptor tensor_descriptor = tflite::gpu::TensorDescriptor{
DataType::FLOAT32, tflite::gpu::TensorStorageType::BUFFER, tflite::gpu::Layout::HWC};
op_def.src_tensors.push_back(tensor_descriptor);
op_def.dst_tensors.push_back(tensor_descriptor);
std::string device_name = std::string([[device name] UTF8String]);
tflite::gpu::GpuInfo gpu_info;
tflite::gpu::GetGpuInfoFromDeviceDescription(device_name, tflite::gpu::GpuApi::kMetal, &gpu_info);
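// Reference path: a single ConvolutionGeneric task that produces dst_shape directly.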
auto gpu_op0 = ConvolutionGeneric(op_def, dst_shape, attr, gpu_info);
std::vector<tflite::gpu::metal::NodeDescriptor> nodes(1);
nodes[0].task = std::make_shared<tflite::gpu::metal::ComputeTaskDescriptor>(std::move(gpu_op0));
nodes[0].src_tensors_ids = {0};
nodes[0].dst_tensors_ids = {1};
auto status = RunGraph(nodes, device, inputs_v0, &outputs_v0);
XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str());
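// Winograd path: input transform -> convolution in the transformed domain ->
// output transform with bias addition.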
tflite::gpu::metal::Winograd4x4To36Attributes wino_up_attr;
wino_up_attr.padding = attr.padding;
auto gpu_op1 = tflite::gpu::metal::Winograd4x4To36(op_def, wino_up_attr);
auto gpu_op2 = ConvolutionWino4x4To6x6(op_def, conv_shape, attr, gpu_info);
tflite::gpu::metal::Winograd36To4x4Attributes wino_down_attr;
wino_down_attr.output_shape = dst_shape;
wino_down_attr.biases = attr.bias;
auto gpu_op3 = tflite::gpu::metal::Winograd36To4x4(op_def, wino_down_attr);
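// Stage 1: Winograd4x4To36 input transform. Its output keeps the source channel
// count, hence conv_shape with c = src_shape.c.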
std::map<ValueId, TensorFloat32> inputs_v1;
inputs_v1[0] = src_tensor;
std::map<ValueId, TensorFloat32> outputs_v1;
outputs_v1[2].shape = conv_shape;
outputs_v1[2].shape.c = src_shape.c;
outputs_v1[2].data.resize(outputs_v1[2].shape.DimensionsProduct());
nodes[0].task = std::make_shared<tflite::gpu::metal::ComputeTaskDescriptor>(std::move(gpu_op1));
nodes[0].src_tensors_ids = {0};
nodes[0].dst_tensors_ids = {2};
status = RunGraph(nodes, device, inputs_v1, &outputs_v1);
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
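// Stage 2: ConvolutionWino4x4To6x6 in the transformed domain.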
std::map<ValueId, TensorFloat32> inputs_v2;
inputs_v2[2] = outputs_v1[2];
std::map<ValueId, TensorFloat32> outputs_v2;
outputs_v2[3].shape = conv_shape;
outputs_v2[3].data.resize(outputs_v2[3].shape.DimensionsProduct());
nodes[0].task = std::make_shared<tflite::gpu::metal::ComputeTaskDescriptor>(std::move(gpu_op2));
nodes[0].src_tensors_ids = {2};
nodes[0].dst_tensors_ids = {3};
status = RunGraph(nodes, device, inputs_v2, &outputs_v2);
XCTAssertTrue(status.ok(), @"%s", std::string(status.message()).c_str());
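// Stage 3: Winograd36To4x4 output transform back to the spatial domain.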
std::map<ValueId, TensorFloat32> inputs_v3;
inputs_v3[3] = outputs_v2[3];
std::map<ValueId, TensorFloat32> outputs_v3;
outputs_v3[1].shape = dst_shape;
outputs_v3[1].data.resize(outputs_v3[1].shape.DimensionsProduct());
nodes[0].task = std::make_shared<tflite::gpu::metal::ComputeTaskDescriptor>(std::move(gpu_op3));
nodes[0].src_tensors_ids = {3};
nodes[0].dst_tensors_ids = {1};
status = RunGraph(nodes, device, inputs_v3, &outputs_v3);
XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str());
status = CompareVectors(outputs_v0[1].data, outputs_v3[1].data, 1e-4f);
XCTAssertTrue(status.ok(), @"%s", status.error_message().c_str());
}
@end