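Added CMSIS-NN specialization for int8 depthwise conv op.

Also check the status returned by the CMSIS-NN fully connected kernel and
by get_cmsis_scratch_buffer(), and make get_cmsis_scratch_buffer() take its
requested size in bytes.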

Change-Id: Icc8b933363677eca7cc444078ff15721c734ca8f
diff --git a/tensorflow/lite/experimental/micro/kernels/cmsis-nn/ b/tensorflow/lite/experimental/micro/kernels/cmsis-nn/
index 79b6120..948d672 100644
--- a/tensorflow/lite/experimental/micro/kernels/cmsis-nn/
+++ b/tensorflow/lite/experimental/micro/kernels/cmsis-nn/
@@ -20,9 +20,11 @@
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/padding.h"
+#include "tensorflow/lite/experimental/micro/kernels/cmsis-nn/scratch_buffer.h"
 namespace tflite {
 namespace ops {
@@ -34,6 +36,7 @@
 constexpr int kFilterTensor = 1;
 constexpr int kBiasTensor = 2;
 constexpr int kOutputTensor = 0;
+constexpr int kMaxChannels = 256;
 struct OpData {
   TfLitePaddingValues padding;
@@ -41,6 +44,12 @@
   // be represented as a fixed point multiplier plus a left shift.
   int32_t output_multiplier;
   int output_shift;
+  // Per channel output multiplier and shift.
+  // TODO(b/141139247): Allocate these dynamically when possible.
+  int32_t per_channel_output_multiplier[kMaxChannels];
+  int32_t per_channel_output_shift[kMaxChannels];
   // The range of the fused activation layer. For example for kNone and
   // uint8_t these would be 0 and 255.
   int32_t output_activation_min;
@@ -50,12 +59,17 @@
 TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
                              TfLiteDepthwiseConvParams* params, int width,
                              int height, int filter_width, int filter_height,
-                             int out_width, int out_height,
                              const TfLiteType data_type, OpData* data) {
-  data->padding.height = ComputePadding(params->stride_height, 1, height,
-                                        filter_height, out_height);
-  data->padding.width =
-      ComputePadding(params->stride_width, 1, width, filter_width, out_width);
+  bool has_bias = node->inputs->size == 3;
+  // Check number of inputs/outputs
+  TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+  int unused_output_height, unused_output_width;
+  data->padding = ComputePaddingHeightWidth(
+      params->stride_height, params->stride_width, 1, 1, height, width,
+      filter_height, filter_width, params->padding, &unused_output_height,
+      &unused_output_width);
   // Note that quantized inference requires that all tensors have their
   // parameters set. This is usually done during quantized training.
@@ -66,15 +80,12 @@
         GetOptionalInputTensor(context, node, kBiasTensor);
     TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-    double real_multiplier = 0.0;
-    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
-        context, input, filter, bias, output, &real_multiplier));
-    int exponent;
-    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
-    data->output_shift = -exponent;
-    CalculateActivationRangeUint8(params->activation, output,
-                                  &data->output_activation_min,
-                                  &data->output_activation_max);
+    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
+        context, input, filter, bias, output, params->activation,
+        &data->output_multiplier, &data->output_shift,
+        &data->output_activation_min, &data->output_activation_max,
+        data->per_channel_output_multiplier,
+        reinterpret_cast<int*>(data->per_channel_output_shift)));
   return kTfLiteOk;
@@ -91,10 +102,10 @@
   return kTfLiteOk;
-void EvalFloat(TfLiteContext* context, TfLiteNode* node,
-               TfLiteDepthwiseConvParams* params, OpData* data,
-               const TfLiteTensor* input, const TfLiteTensor* filter,
-               const TfLiteTensor* bias, TfLiteTensor* output) {
+TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
+                       TfLiteDepthwiseConvParams* params, OpData* data,
+                       const TfLiteTensor* input, const TfLiteTensor* filter,
+                       const TfLiteTensor* bias, TfLiteTensor* output) {
   float output_activation_min, output_activation_max;
   CalculateActivationRange(params->activation, &output_activation_min,
@@ -117,12 +128,113 @@
       GetTensorShape(filter), GetTensorData<float>(filter),
       GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
+  return kTfLiteOk;
-void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                   TfLiteDepthwiseConvParams* params, OpData* data,
-                   const TfLiteTensor* input, const TfLiteTensor* filter,
-                   const TfLiteTensor* bias, TfLiteTensor* output) {
+TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
+                             TfLiteDepthwiseConvParams* params, OpData* data,
+                             const TfLiteTensor* input,
+                             const TfLiteTensor* filter,
+                             const TfLiteTensor* bias, TfLiteTensor* output) {
+#if defined(ARM_MATH_DSP) && defined(ARM_MATH_LOOPUNROLL)
+    DepthwiseParams op_params;
+    op_params.padding_type = PaddingType::kSame;
+    op_params.padding_values.width = data->padding.width;
+    op_params.padding_values.height = data->padding.height;
+    op_params.stride_width = params->stride_width;
+    op_params.stride_height = params->stride_height;
+    op_params.dilation_width_factor = params->dilation_width_factor;
+    op_params.dilation_height_factor = params->dilation_height_factor;
+    op_params.depth_multiplier = params->depth_multiplier;
+    op_params.input_offset = -input->params.zero_point;
+    op_params.weights_offset = 0;
+    op_params.output_offset = output->params.zero_point;
+    // TODO(b/130439627): Clamp with data->output_activation_min/max instead.
+    op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
+    op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
+    RuntimeShape filter_shape = GetTensorShape(filter);
+    const int filter_height = filter_shape.Dims(1);
+    const int filter_width = filter_shape.Dims(2);
+    RuntimeShape input_shape = GetTensorShape(input);
+    const int input_height = input_shape.Dims(1);
+    const int input_width = input_shape.Dims(2);
+    const int input_depth = input_shape.Dims(3);
+    RuntimeShape output_shape = GetTensorShape(output);
+    const int output_height = output_shape.Dims(1);
+    const int output_width = output_shape.Dims(2);
+    RuntimeShape bias_shape = GetTensorShape(bias);
+    if (op_params.depth_multiplier == 1) {
+      int16_t* buf = nullptr;
+      const int32_t buf_size =
+        arm_depthwise_conv_s8_opt_get_buffer_size(input_depth,
+                                                  filter_width,
+                                                  filter_height);
+      TF_LITE_ENSURE_OK(context,
+                        get_cmsis_scratch_buffer(context, &buf, buf_size));
+      TF_LITE_ENSURE_EQ(context,
+                        arm_depthwise_conv_s8_opt(
+                          GetTensorData<int8_t>(input),
+                          input_width, input_height, input_depth,
+                          GetTensorData<int8_t>(filter),
+                          input_depth,
+                          filter_width, filter_height,
+                          op_params.padding_values.width,
+                          op_params.padding_values.height,
+                          op_params.stride_width,
+                          op_params.stride_height,
+                          GetTensorData<int32>(bias),
+                          GetTensorData<int8_t>(output),
+                          data->per_channel_output_shift,
+                          data->per_channel_output_multiplier,
+                          output_width,
+                          output_height,
+                          op_params.output_offset,
+                          op_params.input_offset,
+                          op_params.quantized_activation_min,
+                          op_params.quantized_activation_max,
+                          op_params.dilation_width_factor,
+                          op_params.dilation_height_factor,
+                          buf),
+                        ARM_MATH_SUCCESS);
+    } else {
+      TF_LITE_ENSURE_EQ(context,
+                        arm_depthwise_conv_s8(
+                          GetTensorData<int8_t>(input),
+                          input_width, input_height, input_depth,
+                          GetTensorData<int8_t>(filter),
+                          op_params.depth_multiplier * input_depth,
+                          op_params.depth_multiplier,
+                          filter_width, filter_height,
+                          op_params.padding_values.width,
+                          op_params.padding_values.height,
+                          op_params.stride_width,
+                          op_params.stride_height,
+                          GetTensorData<int32>(bias),
+                          GetTensorData<int8_t>(output),
+                          data->per_channel_output_shift,
+                          data->per_channel_output_multiplier,
+                          output_width,
+                          output_height,
+                          op_params.output_offset,
+                          op_params.input_offset,
+                          op_params.quantized_activation_min,
+                          op_params.quantized_activation_max,
+                          op_params.dilation_width_factor,
+                          op_params.dilation_height_factor,
+                          nullptr),
+                        ARM_MATH_SUCCESS);
+    }
+  #error ARM_MATH_DSP and ARM_MATH_LOOPUNROLL must be set
+  return kTfLiteOk;
+TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                          TfLiteDepthwiseConvParams* params, OpData* data,
+                          const TfLiteTensor* input, const TfLiteTensor* filter,
+                          const TfLiteTensor* bias, TfLiteTensor* output) {
   const int32_t input_offset = -input->params.zero_point;
   const int32_t filter_offset = -filter->params.zero_point;
   const int32_t output_offset = output->params.zero_point;
@@ -181,6 +293,7 @@
         GetTensorShape(bias), GetTensorData<int32_t>(bias),
         GetTensorShape(output), GetTensorData<uint8_t>(output));
+  return kTfLiteOk;
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
@@ -198,28 +311,42 @@
   int height = SizeOfDimension(input, 1);
   int filter_width = SizeOfDimension(filter, 2);
   int filter_height = SizeOfDimension(filter, 1);
-  int out_width = ComputeOutSize(params->padding, width, filter_width,
-                                 params->stride_width);
-  int out_height = ComputeOutSize(params->padding, height, filter_height,
-                                  params->stride_height);
-  OpData local_data_object;
-  OpData* data = &local_data_object;
+  OpData data;
+  if (input->type != kTfLiteFloat32) {
+    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
+                      kTfLiteAffineQuantization);
+    const auto* affine_quantization =
+        reinterpret_cast<TfLiteAffineQuantization*>(
+            filter->quantization.params);
+    TF_LITE_ENSURE(context, affine_quantization);
+    TF_LITE_ENSURE(context, affine_quantization->scale);
+  }
   TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
-                                        filter_width, filter_height, out_width,
-                                        out_height, data_type, data));
+                                        filter_width, filter_height, data_type,
+                                        &data));
   // TODO(aselle): Consider whether float conv and quantized conv should be
   // separate ops to avoid dispatch overhead here.
   switch (input->type) {  // Already know in/out types are same.
     case kTfLiteFloat32:
-      EvalFloat(context, node, params, data, input, filter, bias, output);
+      return EvalFloat(context, node, params, &data, input, filter, bias,
+                       output);
+      break;
+    case kTfLiteInt8:
+      return EvalQuantizedPerChannel(context, node, params, &data, input,
+                                     filter, bias, output);
     case kTfLiteUInt8:
-      EvalQuantized(context, node, params, data, input, filter, bias, output);
+      return EvalQuantized(context, node, params, &data, input, filter, bias,
+                           output);
-      context->ReportError(context, "Type %d not currently supported.",
-                           input->type);
+      context->ReportError(context, "Type %s (%d) not supported.",
+                           TfLiteTypeGetName(input->type), input->type);
       return kTfLiteError;
   return kTfLiteOk;
diff --git a/tensorflow/lite/experimental/micro/kernels/cmsis-nn/ b/tensorflow/lite/experimental/micro/kernels/cmsis-nn/
index 5c8b371..d8dda97 100644
--- a/tensorflow/lite/experimental/micro/kernels/cmsis-nn/
+++ b/tensorflow/lite/experimental/micro/kernels/cmsis-nn/
@@ -89,7 +89,6 @@
                                const TfLiteTensor* input,
                                const TfLiteTensor* filter,
                                const TfLiteTensor* bias, TfLiteTensor* output) {
-  TfLiteStatus status = kTfLiteOk;
   RuntimeShape output_shape = GetTensorShape(output);
   const int batches = output_shape.Dims(0);
   const int output_depth = output_shape.Dims(1);
@@ -100,18 +99,23 @@
 #if defined(ARM_MATH_DSP) && defined(ARM_MATH_LOOPUNROLL)
   const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(accum_depth);
   int16_t* buf = nullptr;
-  status = get_cmsis_scratch_buffer(context, &buf, buf_size);
-  arm_fully_connected_s8(
-      GetTensorData<int8_t>(input), GetTensorData<int8_t>(filter), accum_depth,
-      output_depth, batches, -input->params.zero_point,
-      -filter->params.zero_point, data->output_multiplier, -data->output_shift,
-      output->params.zero_point, GetTensorData<int32_t>(bias),
-      GetTensorData<int8_t>(output), data->output_activation_min,
-      data->output_activation_max, buf);
+  TF_LITE_ENSURE_OK(context,
+                    get_cmsis_scratch_buffer(context, &buf, buf_size));
+  TF_LITE_ENSURE_EQ(context,
+                    arm_fully_connected_s8(
+                      GetTensorData<int8_t>(input),
+                      GetTensorData<int8_t>(filter),
+                      accum_depth, output_depth, batches,
+                      -input->params.zero_point, -filter->params.zero_point,
+                      data->output_multiplier, -data->output_shift,
+                      output->params.zero_point, GetTensorData<int32_t>(bias),
+                      GetTensorData<int8_t>(output), data->output_activation_min,
+                      data->output_activation_max, buf),
+                    ARM_MATH_SUCCESS);
 #error ARM_MATH_DSP and ARM_MATH_LOOPUNROLL must be set
-  return status;
+  return kTfLiteOk;
 TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
diff --git a/tensorflow/lite/experimental/micro/kernels/cmsis-nn/ b/tensorflow/lite/experimental/micro/kernels/cmsis-nn/
index b41420e..a9bc82d 100644
--- a/tensorflow/lite/experimental/micro/kernels/cmsis-nn/
+++ b/tensorflow/lite/experimental/micro/kernels/cmsis-nn/
@@ -19,17 +19,18 @@
 // implemented.
 // This buffer is used by CMSIS-NN optimized operator implementations.
-// SCRATCH_BUFFER_BYTES bytes is chosenn empirically. It needs to be large
+// SCRATCH_BUFFER_BYTES is chosen empirically. It needs to be large
 // enough to hold the biggest buffer needed by all CMSIS-NN operators in the
 // network.
-    4))) static int16_t cmsis_scratch_buffer[SCRATCH_BUFFER_BYTES / 2] = {0};
+__attribute__((aligned(4))) static int16_t
+  cmsis_scratch_buffer[SCRATCH_BUFFER_BYTES/2] = {0};
 TfLiteStatus get_cmsis_scratch_buffer(TfLiteContext* context, int16_t** buf,
-                                      int32_t buf_size) {
-  TF_LITE_ENSURE(context, buf_size <= SCRATCH_BUFFER_BYTES / 2);
+                                      int32_t buf_size_bytes)
+  TF_LITE_ENSURE(context, buf_size_bytes <= SCRATCH_BUFFER_BYTES);
   *buf = cmsis_scratch_buffer;
   return kTfLiteOk;
\ No newline at end of file