| /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| #include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h" |
| |
| #include <cassert> |
| #include <cmath> |
| #include <cstdio> |
| #include <cstdlib> |
| #include <iostream> |
| #include <limits> |
| |
| #include "tensorflow/lite/c/builtin_op_data.h" |
| #include "tensorflow/lite/c/common.h" |
| #include "tensorflow/lite/kernels/cpu_backend_context.h" |
| #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" |
| #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_multithread.h" |
| #include "tensorflow/lite/kernels/internal/quantization_util.h" |
| #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h" |
| #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h" |
| #include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h" |
| #include "tensorflow/lite/kernels/internal/tensor.h" |
| #include "tensorflow/lite/kernels/kernel_util.h" |
| #include "tensorflow/lite/kernels/op_macros.h" |
| #include "tensorflow/lite/kernels/padding.h" |
| |
| namespace tflite { |
| namespace ops { |
| namespace builtin { |
| namespace depthwise_conv { |
| |
| constexpr int kInputTensor = 0; |
| constexpr int kFilterTensor = 1; |
| constexpr int kBiasTensor = 2; |
| constexpr int kOutputTensor = 0; |
| |
| // This file has three implementations of DepthwiseConv. |
| enum KernelType { |
| kReference, |
| kGenericOptimized, // Neon-free |
| kNeonOptimized, |
| }; |
| |
| struct OpData { |
| TfLitePaddingValues padding; |
| // The scaling factor from input to output (aka the 'real multiplier') can |
| // be represented as a fixed point multiplier plus a left shift. |
| int32_t output_multiplier; |
| int output_shift; |
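| // For example, a real multiplier of 0.25 would typically be stored as a |
| // fixed-point multiplier representing 0.5 together with an output_shift of |
| // -1 (a left shift by -1, i.e. a right shift by one bit). |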
| // The range of the fused activation layer. For example, for kNone and |
| // uint8_t these would be 0 and 255. |
| int32_t output_activation_min; |
| int32_t output_activation_max; |
| |
| // Per channel output multiplier and shift. |
| std::vector<int32_t> per_channel_output_multiplier; |
| std::vector<int> per_channel_output_shift; |
| }; |
| |
| void* Init(TfLiteContext* context, const char* buffer, size_t length) { |
| // This is a builtin op, so we don't use the contents in 'buffer', if any. |
| // Instead, we allocate a new object to carry information from Prepare() to |
| // Eval(). |
| return new OpData; |
| } |
| |
| void Free(TfLiteContext* context, void* buffer) { |
| delete reinterpret_cast<OpData*>(buffer); |
| } |
| |
| TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { |
| auto* params = |
| reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data); |
| OpData* data = reinterpret_cast<OpData*>(node->user_data); |
| |
| // TODO(ahentz): we could use GetOptionalInputTensor() here, but we need to |
| // decide whether we are OK with optional tensors being completely absent, as |
| // opposed to having -1 as their index. |
| bool hasBias = NumInputs(node) == 3; |
| |
| TF_LITE_ENSURE(context, hasBias || NumInputs(node) == 2); |
| const TfLiteTensor* input = GetInput(context, node, kInputTensor); |
| const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); |
| const TfLiteTensor* bias = nullptr; |
| |
| TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); |
| TfLiteTensor* output = GetOutput(context, node, kOutputTensor); |
| |
| TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4); |
| TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 4); |
| |
| const TfLiteType data_type = input->type; |
| TF_LITE_ENSURE(context, |
| data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8 || |
| data_type == kTfLiteInt8 || data_type == kTfLiteInt16); |
| TF_LITE_ENSURE_EQ(context, output->type, data_type); |
| TF_LITE_ENSURE(context, |
| filter->type == data_type || data_type == kTfLiteInt16); |
| // Filter in DepthwiseConv is expected to be [1, H, W, O]. |
| TF_LITE_ENSURE_EQ(context, SizeOfDimension(filter, 0), 1); |
| |
| if (hasBias) { |
| bias = GetInput(context, node, kBiasTensor); |
| if (data_type == kTfLiteUInt8 || data_type == kTfLiteInt8) { |
| TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32); |
| TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); |
| } else if (data_type == kTfLiteInt16) { |
| TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt64); |
| TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); |
| } else { |
| TF_LITE_ENSURE_EQ(context, bias->type, data_type); |
| } |
| TF_LITE_ENSURE_EQ(context, NumDimensions(bias), 1); |
| TF_LITE_ENSURE_EQ(context, SizeOfDimension(filter, 3), |
| SizeOfDimension(bias, 0)); |
| } |
| |
| int channels_out = SizeOfDimension(filter, 3); |
| int width = SizeOfDimension(input, 2); |
| int height = SizeOfDimension(input, 1); |
| int filter_width = SizeOfDimension(filter, 2); |
| int filter_height = SizeOfDimension(filter, 1); |
| int batches = SizeOfDimension(input, 0); |
| |
| // Matching GetWindowedOutputSize in TensorFlow. |
| auto padding = params->padding; |
| int out_width, out_height; |
| |
| data->padding = ComputePaddingHeightWidth( |
| params->stride_height, params->stride_width, |
| params->dilation_height_factor, params->dilation_width_factor, height, |
| width, filter_height, filter_width, padding, &out_height, &out_width); |
| |
| // Note that quantized inference requires that all tensors have their |
| // quantization parameters set. This is usually done during quantized |
| // training or calibration. |
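| // For quantized types, the filter must carry affine quantization parameters |
| // (either a single scale or one scale per output channel); the per-channel |
| // output multipliers and shifts are precomputed here so that Eval() only |
| // has to apply them. |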
| if (data_type != kTfLiteFloat32) { |
| TF_LITE_ENSURE_EQ(context, filter->quantization.type, |
| kTfLiteAffineQuantization); |
| const auto* affine_quantization = |
| reinterpret_cast<TfLiteAffineQuantization*>( |
| filter->quantization.params); |
| TF_LITE_ENSURE(context, affine_quantization); |
| TF_LITE_ENSURE(context, affine_quantization->scale); |
| TF_LITE_ENSURE(context, (affine_quantization->scale->size == 1 || |
| affine_quantization->scale->size == channels_out)); |
| |
| data->per_channel_output_multiplier.resize(channels_out); |
| data->per_channel_output_shift.resize(channels_out); |
| TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams( |
| context, input, filter, bias, output, params->activation, |
| &data->output_multiplier, &data->output_shift, |
| &data->output_activation_min, &data->output_activation_max, |
| data->per_channel_output_multiplier.data(), |
| data->per_channel_output_shift.data(), channels_out)); |
| } |
| |
| TfLiteIntArray* outputSize = TfLiteIntArrayCreate(4); |
| outputSize->data[0] = batches; |
| outputSize->data[1] = out_height; |
| outputSize->data[2] = out_width; |
| outputSize->data[3] = channels_out; |
| return context->ResizeTensor(context, output, outputSize); |
| } |
| |
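| // Derives the depth multiplier from the tensor shapes: the filter layout is |
| // [1, H, W, channels_out], so the output channel count must be an integer |
| // multiple of the input channel count. |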
| TfLiteStatus ComputeDepthMultiplier(TfLiteContext* context, |
| const TfLiteTensor* input, |
| const TfLiteTensor* filter, |
| int16* depth_multiplier) { |
| int num_filter_channels = SizeOfDimension(filter, 3); |
| int num_input_channels = SizeOfDimension(input, 3); |
| TF_LITE_ENSURE_EQ(context, num_filter_channels % num_input_channels, 0); |
| |
| *depth_multiplier = num_filter_channels / num_input_channels; |
| return kTfLiteOk; |
| } |
| |
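| // Float path. Computes the activation clamp range from the fused activation |
| // and dispatches to either the reference kernel or the multithreaded |
| // optimized kernel. |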
| template <KernelType kernel_type> |
| TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, |
| TfLiteDepthwiseConvParams* params, OpData* data, |
| const TfLiteTensor* input, const TfLiteTensor* filter, |
| const TfLiteTensor* bias, TfLiteTensor* output) { |
| float output_activation_min, output_activation_max; |
| CalculateActivationRange(params->activation, &output_activation_min, |
| &output_activation_max); |
| |
| DepthwiseParams op_params; |
| op_params.padding_type = PaddingType::kSame; |
| op_params.padding_values.width = data->padding.width; |
| op_params.padding_values.height = data->padding.height; |
| op_params.stride_width = params->stride_width; |
| op_params.stride_height = params->stride_height; |
| op_params.dilation_width_factor = params->dilation_width_factor; |
| op_params.dilation_height_factor = params->dilation_height_factor; |
| op_params.float_activation_min = output_activation_min; |
| op_params.float_activation_max = output_activation_max; |
| TF_LITE_ENSURE_STATUS(ComputeDepthMultiplier(context, input, filter, |
| &op_params.depth_multiplier)); |
| if (kernel_type == kReference) { |
| reference_ops::DepthwiseConv( |
| op_params, GetTensorShape(input), GetTensorData<float>(input), |
| GetTensorShape(filter), GetTensorData<float>(filter), |
| GetTensorShape(bias), GetTensorData<float>(bias), |
| GetTensorShape(output), GetTensorData<float>(output)); |
| } else { |
| optimized_ops::DepthwiseConv<float, float>( |
| op_params, GetTensorShape(input), GetTensorData<float>(input), |
| GetTensorShape(filter), GetTensorData<float>(filter), |
| GetTensorShape(bias), GetTensorData<float>(bias), |
| GetTensorShape(output), GetTensorData<float>(output), |
| CpuBackendContext::GetFromContext(context)); |
| } |
| return kTfLiteOk; |
| } |
| |
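| // Uint8 path with per-tensor quantization. The input and filter offsets are |
| // the negated zero points, matching the convention expected by the quantized |
| // kernels; the output offset is the output zero point itself. |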
| template <KernelType kernel_type> |
| TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, |
| TfLiteDepthwiseConvParams* params, OpData* data, |
| const TfLiteTensor* input, |
| const TfLiteTensor* filter, const TfLiteTensor* bias, |
| TfLiteTensor* output) { |
| auto input_offset = -input->params.zero_point; |
| auto filter_offset = -filter->params.zero_point; |
| auto output_offset = output->params.zero_point; |
| |
| DepthwiseParams op_params; |
| op_params.padding_type = PaddingType::kSame; |
| op_params.padding_values.width = data->padding.width; |
| op_params.padding_values.height = data->padding.height; |
| op_params.stride_width = params->stride_width; |
| op_params.stride_height = params->stride_height; |
| op_params.dilation_width_factor = params->dilation_width_factor; |
| op_params.dilation_height_factor = params->dilation_height_factor; |
| op_params.input_offset = input_offset; |
| op_params.weights_offset = filter_offset; |
| op_params.output_offset = output_offset; |
| op_params.output_multiplier = data->output_multiplier; |
| op_params.output_shift = -data->output_shift; |
| op_params.quantized_activation_min = data->output_activation_min; |
| op_params.quantized_activation_max = data->output_activation_max; |
| TF_LITE_ENSURE_STATUS(ComputeDepthMultiplier(context, input, filter, |
| &op_params.depth_multiplier)); |
| if (kernel_type == kReference) { |
| reference_ops::DepthwiseConv( |
| op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), |
| GetTensorShape(filter), GetTensorData<uint8_t>(filter), |
| GetTensorShape(bias), GetTensorData<int32_t>(bias), |
| GetTensorShape(output), GetTensorData<uint8_t>(output)); |
| } else { |
| optimized_ops::DepthwiseConv<uint8, int32>( |
| op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), |
| GetTensorShape(filter), GetTensorData<uint8_t>(filter), |
| GetTensorShape(bias), GetTensorData<int32_t>(bias), |
| GetTensorShape(output), GetTensorData<uint8_t>(output), |
| CpuBackendContext::GetFromContext(context)); |
| } |
| return kTfLiteOk; |
| } |
| |
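| // Int8 path with per-channel quantization. Weights are expected to be |
| // symmetrically quantized (zero point of 0), so weights_offset is 0; the |
| // per-channel multipliers and shifts precomputed in Prepare() are passed |
| // through to the kernel. |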
| template <KernelType kernel_type> |
| TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, |
| TfLiteDepthwiseConvParams* params, |
| OpData* data, const TfLiteTensor* input, |
| const TfLiteTensor* filter, |
| const TfLiteTensor* bias, |
| TfLiteTensor* output) { |
| DepthwiseParams op_params; |
| op_params.padding_type = PaddingType::kSame; |
| op_params.padding_values.width = data->padding.width; |
| op_params.padding_values.height = data->padding.height; |
| op_params.stride_width = params->stride_width; |
| op_params.stride_height = params->stride_height; |
| op_params.dilation_width_factor = params->dilation_width_factor; |
| op_params.dilation_height_factor = params->dilation_height_factor; |
| op_params.input_offset = -input->params.zero_point; |
| op_params.weights_offset = 0; |
| op_params.output_offset = output->params.zero_point; |
| // TODO(b/130439627): Use calculated value for clamping. |
| op_params.quantized_activation_min = std::numeric_limits<int8_t>::min(); |
| op_params.quantized_activation_max = std::numeric_limits<int8_t>::max(); |
| TF_LITE_ENSURE_STATUS(ComputeDepthMultiplier(context, input, filter, |
| &op_params.depth_multiplier)); |
| |
| if (kernel_type == kReference) { |
| reference_integer_ops::DepthwiseConvPerChannel( |
| op_params, data->per_channel_output_multiplier.data(), |
| data->per_channel_output_shift.data(), GetTensorShape(input), |
| GetTensorData<int8>(input), GetTensorShape(filter), |
| GetTensorData<int8>(filter), GetTensorShape(bias), |
| GetTensorData<int32>(bias), GetTensorShape(output), |
| GetTensorData<int8>(output)); |
| } else { |
| optimized_integer_ops::DepthwiseConvPerChannel( |
| op_params, data->per_channel_output_multiplier.data(), |
| data->per_channel_output_shift.data(), GetTensorShape(input), |
| GetTensorData<int8>(input), GetTensorShape(filter), |
| GetTensorData<int8>(filter), GetTensorShape(bias), |
| GetTensorData<int32>(bias), GetTensorShape(output), |
| GetTensorData<int8>(output), |
| CpuBackendContext::GetFromContext(context)); |
| } |
| return kTfLiteOk; |
| } |
| |
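| // 16x8 path: int16 activations with int8 weights and int64 bias, using |
| // per-channel quantization. This path always uses the reference kernel. |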
| TfLiteStatus EvalQuantizedPerChannel16x8( |
| TfLiteContext* context, TfLiteNode* node, TfLiteDepthwiseConvParams* params, |
| OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, |
| const TfLiteTensor* bias, TfLiteTensor* output) { |
| DepthwiseParams op_params; |
| op_params.padding_type = PaddingType::kSame; |
| op_params.padding_values.width = data->padding.width; |
| op_params.padding_values.height = data->padding.height; |
| op_params.stride_width = params->stride_width; |
| op_params.stride_height = params->stride_height; |
| op_params.dilation_width_factor = params->dilation_width_factor; |
| op_params.dilation_height_factor = params->dilation_height_factor; |
| op_params.depth_multiplier = params->depth_multiplier; |
| op_params.input_offset = -input->params.zero_point; |
| op_params.weights_offset = 0; |
| op_params.output_offset = output->params.zero_point; |
| // TODO(b/130439627): Use calculated value for clamping. |
| op_params.quantized_activation_min = std::numeric_limits<int16_t>::min(); |
| op_params.quantized_activation_max = std::numeric_limits<int16_t>::max(); |
| |
| reference_integer_ops::DepthwiseConvPerChannel( |
| op_params, data->per_channel_output_multiplier.data(), |
| data->per_channel_output_shift.data(), GetTensorShape(input), |
| GetTensorData<int16>(input), GetTensorShape(filter), |
| GetTensorData<int8>(filter), GetTensorShape(bias), |
| GetTensorData<std::int64_t>(bias), GetTensorShape(output), |
| GetTensorData<int16>(output)); |
| return kTfLiteOk; |
| } |
| |
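| // Type-specialized implementation. The input type is a template parameter so |
| // that registrations which fix the type at build time (e.g. the UINT8-only |
| // variant below) can bypass the runtime type dispatch in Eval(). |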
| template <KernelType kernel_type, TfLiteType input_type> |
| TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node) { |
| auto* params = |
| reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data); |
| OpData* data = reinterpret_cast<OpData*>(node->user_data); |
| |
| TfLiteTensor* output = GetOutput(context, node, kOutputTensor); |
| const TfLiteTensor* input = GetInput(context, node, kInputTensor); |
| const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); |
| const TfLiteTensor* bias = |
| (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr; |
| TFLITE_DCHECK_EQ(input_type, input->type); |
| |
| switch (input_type) { // Already know in/out types are the same. |
| case kTfLiteFloat32: |
| return EvalFloat<kernel_type>(context, node, params, data, input, filter, |
| bias, output); |
| break; |
| case kTfLiteUInt8: |
| return EvalQuantized<kernel_type>(context, node, params, data, input, |
| filter, bias, output); |
| break; |
| case kTfLiteInt8: |
| return EvalQuantizedPerChannel<kernel_type>(context, node, params, data, |
| input, filter, bias, output); |
| break; |
| case kTfLiteInt16: |
| return EvalQuantizedPerChannel16x8(context, node, params, data, input, |
| filter, bias, output); |
| break; |
| default: |
| context->ReportError(context, "Type %d not currently supported.", |
| input->type); |
| return kTfLiteError; |
| } |
| } |
| |
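| // Dispatches on the runtime input tensor type to the type-specialized |
| // EvalImpl instantiations above. |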
| template <KernelType kernel_type> |
| TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { |
| const TfLiteTensor* input = GetInput(context, node, kInputTensor); |
| |
| switch (input->type) { // Already know in/out types are the same. |
| case kTfLiteFloat32: |
| return EvalImpl<kernel_type, kTfLiteFloat32>(context, node); |
| case kTfLiteUInt8: |
| return EvalImpl<kernel_type, kTfLiteUInt8>(context, node); |
| case kTfLiteInt8: |
| return EvalImpl<kernel_type, kTfLiteInt8>(context, node); |
| case kTfLiteInt16: |
| return EvalImpl<kernel_type, kTfLiteInt16>(context, node); |
| default: |
| context->ReportError(context, "Type %d not currently supported.", |
| input->type); |
| return kTfLiteError; |
| } |
| } |
| |
| } // namespace depthwise_conv |
| |
| TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_REF() { |
| static TfLiteRegistration r = { |
| depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare, |
| depthwise_conv::Eval<depthwise_conv::kReference>}; |
| return &r; |
| } |
| |
| TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT() { |
| static TfLiteRegistration r = { |
| depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare, |
| depthwise_conv::Eval<depthwise_conv::kGenericOptimized>}; |
| return &r; |
| } |
| |
| TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_NEON_OPT() { |
| static TfLiteRegistration r = { |
| depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare, |
| depthwise_conv::Eval<depthwise_conv::kNeonOptimized>}; |
| return &r; |
| } |
| |
| TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_NEON_OPT_UINT8() { |
| static TfLiteRegistration r = { |
| depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare, |
| depthwise_conv::EvalImpl<depthwise_conv::kNeonOptimized, kTfLiteUInt8>}; |
| return &r; |
| } |
| |
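| // Default registration: selects the NEON-optimized kernel when built with |
| // NEON support, otherwise the generic (Neon-free) optimized kernel. |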
| TfLiteRegistration* Register_DEPTHWISE_CONV_2D() { |
| #ifdef USE_NEON |
| return Register_DEPTHWISE_CONVOLUTION_NEON_OPT(); |
| #else |
| return Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT(); |
| #endif |
| } |
| |
| // Warning: Clients using this variant are responsible for ensuring that their |
| // models only need the UINT8 type. TFLite's op registration mechanism doesn't |
| // yet allow for more nuanced registration. |
| TfLiteRegistration* Register_DEPTHWISE_CONV_2D_UINT8() { |
| #ifdef USE_NEON |
| return Register_DEPTHWISE_CONVOLUTION_NEON_OPT_UINT8(); |
| #else |
| return Register_DEPTHWISE_CONV_2D(); |
| #endif |
| } |
| |
| } // namespace builtin |
| } // namespace ops |
| } // namespace tflite |