Remove xtensa_hifimini_staging (in favor of consolidated xtensa directory).

See http://b/173043817 for more details.

PiperOrigin-RevId: 343329539
Change-Id: I0b38539f197b924f2f9f58f5f84ed5bc612ae7a2
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/fully_connected.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/fully_connected.cc
deleted file mode 100644
index f9b49a2..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/fully_connected.cc
+++ /dev/null
@@ -1,197 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
-
-#include <xtensa/tie/xt_hifi2.h>
-
-#include "tensorflow/lite/c/builtin_op_data.h"
-#include "tensorflow/lite/c/common.h"
-#include "tensorflow/lite/kernels/internal/common.h"
-#include "tensorflow/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
-#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xtensa_tf_micro_common.h"
-namespace tflite {
-namespace ops {
-namespace micro {
-
-namespace fully_connected {
-namespace {
-
-struct OpData {
-  // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multiplier plus a left shift.
-  int32_t output_multiplier;
-  int output_shift;
-  // The range of the fused activation layer. For example for kNone and
-  // uint8_t these would be 0 and 255.
-  int32_t output_activation_min;
-  int32_t output_activation_max;
-  // The index of the temporary tensor where the quantized inputs are cached.
-  int input_quantized_index;
-};
-
-constexpr int kInputTensor = 0;
-constexpr int kWeightsTensor = 1;
-constexpr int kBiasTensor = 2;
-constexpr int kOutputTensor = 0;
-
-TfLiteStatus CalculateOpData(TfLiteContext* context,
-                             TfLiteFusedActivation activation,
-                             TfLiteType data_type, const TfLiteTensor* input,
-                             const TfLiteTensor* filter,
-                             const TfLiteTensor* bias, TfLiteTensor* output,
-                             OpData* data) {
-  if (data_type != kTfLiteInt8) {
-    TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
-                       TfLiteTypeGetName(data_type), data_type);
-    return kTfLiteError;
-  }
-
-  double real_multiplier = 0.0;
-  TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
-      context, input, filter, bias, output, &real_multiplier));
-  xtensa::hifimini::QuantizeMultiplier(
-      real_multiplier, &data->output_multiplier, &data->output_shift);
-  return CalculateActivationRangeQuantized(context, activation, output,
-                                           &data->output_activation_min,
-                                           &data->output_activation_max);
-}
-
-}  // namespace
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
-  return context->AllocatePersistentBuffer(context, sizeof(OpData));
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TFLITE_DCHECK(node->user_data != nullptr);
-  TFLITE_DCHECK(node->builtin_data != nullptr);
-
-  OpData* data = static_cast<OpData*>(node->user_data);
-  const auto* params =
-      reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
-  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  return CalculateOpData(context, params->activation, input->type, input,
-                         filter, bias, output, data);
-}
-
-TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
-                               const OpData& data, const TfLiteTensor* input,
-                               const TfLiteTensor* filter,
-                               const TfLiteTensor* bias, TfLiteTensor* output) {
-  // TODO(b/154032858): Investigate removing extra copies.
-  FullyConnectedParams op_params;
-  op_params.input_offset = -input->params.zero_point;
-  op_params.weights_offset = -filter->params.zero_point;
-  op_params.output_offset = output->params.zero_point;
-  op_params.output_multiplier = data.output_multiplier;
-  op_params.output_shift = data.output_shift;
-  op_params.quantized_activation_min = data.output_activation_min;
-  op_params.quantized_activation_max = data.output_activation_max;
-
-  {
-    int ret, b, weight_depth, out_depth, batches;
-    int8_t* p_out = GetTensorData<int8_t>(output);
-    weight_depth = GetTensorShape(filter).Dims(
-        GetTensorShape(filter).DimensionsCount() - 1);
-    out_depth = GetTensorShape(output).Dims(
-        GetTensorShape(output).DimensionsCount() - 1);
-    batches = FlatSizeSkipDim(GetTensorShape(output),
-                              GetTensorShape(output).DimensionsCount() - 1);
-
-    // TODO: Use xa_nn_fully_connected_sym8xasym8s_asym8s? The kernel tests
-    // fail with it.
-    for (b = 0; b < batches; b++) {
-      ret = xa_nn_fully_connected_asym8sxasym8s_asym8s(
-          (GetTensorData<int8_t>(output) + b * out_depth),
-          GetTensorData<int8_t>(filter),
-          (GetTensorData<int8_t>(input) + b * weight_depth),
-          GetTensorData<int32_t>(bias), weight_depth, out_depth,
-          op_params.weights_offset, op_params.input_offset,
-          (op_params.output_multiplier << 8), op_params.output_shift,
-          op_params.output_offset);
-      CHECK_ERR_HIFI_NNLIB_KER(
-          ret, "xa_nn_fully_connected_asym8sxasym8s_asym8s failed");
-    }
-    ret = xa_nn_vec_activation_min_max_asym8s_asym8s(
-        p_out, p_out, data.output_activation_min, data.output_activation_max,
-        batches * out_depth);
-    CHECK_ERR_HIFI_NNLIB_KER(
-        ret,
-        "fully_connected: xa_nn_vec_activation_min_max_asym8s_asym8s failed");
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  TFLITE_DCHECK(node->user_data != nullptr);
-  const OpData& data = *(static_cast<const OpData*>(node->user_data));
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
-  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  TFLITE_DCHECK(filter->type == kTfLiteInt8);
-  return EvalQuantizedInt8(context, node, data, input, filter, bias, output);
-}
-
-}  // namespace fully_connected
-
-TfLiteRegistration Register_FULLY_CONNECTED() {
-  return {/*init=*/fully_connected::Init,
-          /*free=*/nullptr,
-          /*prepare=*/fully_connected::Prepare,
-          /*invoke=*/fully_connected::Eval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
-}
-
-}  // namespace micro
-}  // namespace ops
-}  // namespace tflite
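Note: the OpData comment above describes the standard TFLite scheme of holding the real input-to-output multiplier as a fixed-point value plus a shift. A minimal standalone sketch of that decomposition follows; it is illustrative only and shows the generic 32-bit contract of the reference kernels, not the HiFiMini variant of QuantizeMultiplier, which targets the 24-bit multiply path (hence the output_multiplier << 8 at the NNLib call site).

#include <cmath>
#include <cstdint>
#include <cstdio>

// Decompose `real` so that real ~= (quantized / 2^31) * 2^shift.
void QuantizeMultiplierSketch(double real, int32_t* quantized, int* shift) {
  if (real == 0.0) {
    *quantized = 0;
    *shift = 0;
    return;
  }
  // frexp returns q in [0.5, 1) with real == q * 2^shift.
  const double q = std::frexp(real, shift);
  int64_t q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
  if (q_fixed == (1LL << 31)) {  // Rounding can overflow; renormalize.
    q_fixed /= 2;
    ++*shift;
  }
  *quantized = static_cast<int32_t>(q_fixed);
}

int main() {
  int32_t multiplier = 0;
  int shift = 0;
  QuantizeMultiplierSketch(0.0057, &multiplier, &shift);
  std::printf("multiplier=%d shift=%d approx=%g\n",
              static_cast<int>(multiplier), shift,
              static_cast<double>(multiplier) / (1LL << 31) *
                  std::pow(2.0, shift));
}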
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/quantize.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/quantize.cc
deleted file mode 100644
index 13c19cc..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/quantize.cc
+++ /dev/null
@@ -1,172 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/kernels/internal/reference/quantize.h"
-
-#include <xtensa/tie/xt_hifi2.h>
-
-#include "tensorflow/lite/c/common.h"
-#include "tensorflow/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
-
-namespace tflite {
-namespace ops {
-namespace micro {
-
-namespace xtensa {
-namespace hifimini {
-
-void AffineQuantize(int scale_multiplier,
-                    const tflite::QuantizationParams& op_params,
-                    const RuntimeShape& input_shape, const int16_t* input_data,
-                    const RuntimeShape& output_shape, int8_t* output_data) {
-  const int32_t zero_point = op_params.zero_point;
-  const int flat_size = MatchingFlatSize(input_shape, output_shape);
-  ae_q56s min_val_56 = AE_CVTQ48A32S(INT16_MIN);
-  ae_q56s max_val_56 = AE_CVTQ48A32S(INT16_MAX);
-  ae_q56s zero_point_56 = AE_CVTQ48A32S(zero_point);
-
-  const ae_p16x2s* input_data_ptr = (const ae_p16x2s*)(input_data - 2);
-
-  ae_p24x2s scale_multiplier_24x2 = AE_MOVPA24(scale_multiplier);
-
-  int iters = flat_size / 2;
-  for (int i = 0; i < iters; i++) {
-    // Load two 16bit pairs into the 2x24bit register PR:
-    // Values need to be right shifted 8 bits to align from upper 16bits to a
-    // 24bit value:
-    ae_p24x2s inputs_24x2;
-    AE_LP16X2F_IU(inputs_24x2, input_data_ptr, 4);
-    inputs_24x2 = AE_P24X2S_SRAI(inputs_24x2, 8);
-
-    // Q0.23 * Q16.0 == Q16.23
-    {
-      ae_q56s sum_56 = AE_MULP24S_HH(scale_multiplier_24x2, inputs_24x2);
-
-      // Q16.23 -> Q16.0
-      // Shift right only 7 bits (23 - 16). This truncated shift aligns the
-      // 16bit value at the truncation line for 32bit in the QR register. The
-      // lower 16 bits will be used for rounding in AE_ROUNDSQ32SYM.
-      sum_56 = AE_Q56S_SRAI(sum_56, 7);
-
-      // Round and truncate 32 bits
-      sum_56 = AE_ROUNDSQ32SYM(sum_56);
-
-      // Add offset (zero_point_56 is already aligned at 32 bits).
-      sum_56 = AE_ADDQ56(sum_56, zero_point_56);
-
-      // Saturate:
-      sum_56 = AE_MINQ56S(sum_56, max_val_56);
-      sum_56 = AE_MAXQ56S(sum_56, min_val_56);
-
-      output_data[i * 2] = static_cast<int8_t>(AE_TRUNCA32Q48(sum_56));
-    }
-    {
-      ae_q56s sum_56 = AE_MULP24S_LL(scale_multiplier_24x2, inputs_24x2);
-
-      // Q16.23 -> Q16.0
-      // Shift right only 7 bits (23 - 16). This truncated shift aligns the
-      // 16bit value at the truncation line for 32bit in the QR register. The
-      // lower 16 bits will be used for rounding in AE_ROUNDSQ32SYM.
-      sum_56 = AE_Q56S_SRAI(sum_56, 23 - 16);
-
-      // Round and truncate 32 bits
-      sum_56 = AE_ROUNDSQ32SYM(sum_56);
-
-      // Add offset (zero_point_56 is already aligned at 32 bits).
-      sum_56 = AE_ADDQ56(sum_56, zero_point_56);
-
-      // Saturate:
-      sum_56 = AE_MINQ56S(sum_56, max_val_56);
-      sum_56 = AE_MAXQ56S(sum_56, min_val_56);
-
-      output_data[i * 2 + 1] = static_cast<int8_t>(AE_TRUNCA32Q48(sum_56));
-    }
-  }
-}
-
-}  // namespace hifimini
-}  // namespace xtensa
-
-namespace quantize {
-
-struct OpData {
-  int scale_multiplier = 0;
-};
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
-  return context->AllocatePersistentBuffer(context, sizeof(OpData));
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TFLITE_DCHECK(node->user_data != nullptr);
-  auto* op_data = static_cast<OpData*>(node->user_data);
-
-  TfLiteTensor* output = GetOutput(context, node, 0);
-  const TfLiteTensor* input = GetInput(context, node, 0);
-
-  // TODO(b/155682734): Fix dangerous input/output scale ratio assumptions.
-  op_data->scale_multiplier = xtensa::hifimini::CreateQConstantForInt24(
-      0, input->params.scale / output->params.scale);
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  TFLITE_DCHECK(node->user_data != nullptr);
-  auto* op_data = static_cast<OpData*>(node->user_data);
-
-  const TfLiteTensor* input = GetInput(context, node, 0);
-  TfLiteTensor* output = GetOutput(context, node, 0);
-
-  tflite::QuantizationParams op_params;
-  op_params.zero_point = output->params.zero_point;
-
-  if (input->type != kTfLiteInt16 && output->type != kTfLiteInt8) {
-    TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
-                       TfLiteTypeGetName(input->type),
-                       TfLiteTypeGetName(output->type));
-    return kTfLiteError;
-  }
-
-  xtensa::hifimini::AffineQuantize(
-      op_data->scale_multiplier, op_params, GetTensorShape(input),
-      GetTensorData<int16_t>(input), GetTensorShape(output),
-      GetTensorData<int8_t>(output));
-  return kTfLiteOk;
-}
-
-}  // namespace quantize
-
-// This Op (QUANTIZE) quantizes the input and produces quantized output.
-// This port only requantizes: AffineQuantize takes the scale ratio and zero
-// point and converts int16_t input to int8_t output.
-TfLiteRegistration Register_QUANTIZE() {
-  return {/*init=*/quantize::Init,
-          /*free=*/nullptr,
-          /*prepare=*/quantize::Prepare,
-          /*invoke=*/quantize::Eval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
-}
-
-}  // namespace micro
-}  // namespace ops
-}  // namespace tflite
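Note: AffineQuantize above keeps input_scale / output_scale as a Q0.23 constant (CreateQConstantForInt24) and requantizes two int16 lanes per iteration. A scalar model of the per-element arithmetic follows, with round-half-away-from-zero standing in for AE_ROUNDSQ32SYM; the function name is illustrative. One difference is called out in the comments: this model clamps to int8 limits, while the vector code saturates against the wider INT16 bounds before the int8 store.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// out = sat(round(in * multiplier_q23 / 2^23) + zero_point).
int8_t RequantizeInt16ToInt8(int16_t in, int32_t multiplier_q23,
                             int32_t zero_point) {
  const int64_t prod = static_cast<int64_t>(in) * multiplier_q23;  // Q16.23
  const int64_t half = INT64_C(1) << 22;
  // Symmetric rounding: bias by half away from zero, truncate toward zero.
  const int64_t rounded =
      (prod >= 0 ? prod + half : prod - half) / (INT64_C(1) << 23);
  const int64_t with_zp = rounded + zero_point;
  return static_cast<int8_t>(
      std::min<int64_t>(127, std::max<int64_t>(-128, with_zp)));
}

int main() {
  // 1 << 22 is 0.5 in Q0.23, so this halves the input and adds zero_point 3.
  std::printf("%d\n", RequantizeInt16ToInt8(100, 1 << 22, 3));  // prints 53
}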
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/softmax.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/softmax.cc
deleted file mode 100644
index 3e5ef19..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/softmax.cc
+++ /dev/null
@@ -1,189 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/kernels/internal/reference/softmax.h"
-
-#include "tensorflow/lite/c/builtin_op_data.h"
-#include "tensorflow/lite/c/common.h"
-#include "tensorflow/lite/kernels/internal/common.h"
-#include "tensorflow/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xtensa_tf_micro_common.h"
-
-namespace tflite {
-namespace ops {
-namespace micro {
-namespace activations {
-namespace {
-
-struct OpData {
-  int32_t input_multiplier;
-  int32_t input_left_shift;
-  int32_t diff_min;
-  int scratch_tensor_index;
-};
-
-}  // namespace
-
-TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context,
-                                    const TfLiteTensor* input,
-                                    TfLiteTensor* output,
-                                    const TfLiteSoftmaxParams* params,
-                                    OpData* op_data) {
-  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
-    if (input->type == kTfLiteUInt8) {
-      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
-    } else {
-      if (output->type == kTfLiteInt16) {
-        TF_LITE_ENSURE_EQ(context, output->params.zero_point,
-                          std::numeric_limits<int16_t>::min());
-        // NOTE: Current int16_t softmax output does not require symmetric
-        // scaling, so there is no need to verify the scale here.
-      } else {
-        TF_LITE_ENSURE_EQ(context, output->params.zero_point,
-                          std::numeric_limits<int8_t>::min());
-        TF_LITE_ENSURE(context, output->params.scale == 1.f / 256);
-      }
-    }
-
-    static const int kScaledDiffIntegerBits = 5;
-
-    int input_left_shift;
-    tflite::PreprocessSoftmaxScaling(
-        static_cast<double>(params->beta),
-        static_cast<double>(input->params.scale), kScaledDiffIntegerBits,
-        &op_data->input_multiplier, &input_left_shift);
-    op_data->input_left_shift = input_left_shift;
-    op_data->diff_min =
-        -1.0 * tflite::CalculateInputRadius(kScaledDiffIntegerBits,
-                                            op_data->input_left_shift);
-  }
-  return kTfLiteOk;
-}
-
-void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) {
-  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
-  return context->AllocatePersistentBuffer(context, sizeof(OpData));
-}
-
-TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
-
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  const TfLiteTensor* input = GetInput(context, node, 0);
-  TfLiteTensor* output = GetOutput(context, node, 0);
-  TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
-
-  TFLITE_DCHECK(node->user_data != nullptr);
-  OpData* op_data = static_cast<OpData*>(node->user_data);
-
-  const RuntimeShape& input_shape = GetTensorShape(input);
-  const RuntimeShape& output_shape = GetTensorShape(output);
-  const int trailing_dim = input_shape.DimensionsCount() - 1;
-  const int depth =
-      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
-  int scratch_size =
-      xa_nn_get_softmax_scratch_size(PREC_SYM8S, PREC_SYM8S, depth);
-
-  const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
-      context, scratch_size, &(op_data->scratch_tensor_index));
-  TF_LITE_ENSURE_OK(context, scratch_status);
-  // Precompute the input multiplier and shift, folding beta and the input
-  // scale in before exp is evaluated. This must be done per op, since each
-  // softmax op may have different beta and scale values; both remain
-  // constant for a given softmax op.
-
-  TF_LITE_ENSURE_STATUS(
-      CalculateSoftmaxOpData(context, input, output, params, op_data));
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
-  auto* op_data = static_cast<OpData*>(node->user_data);
-
-  const TfLiteTensor* input = GetInput(context, node, 0);
-  TfLiteTensor* output = GetOutput(context, node, 0);
-
-  if (input->type == kTfLiteInt8 && output->type == kTfLiteInt16) {
-    const RuntimeShape& input_shape = GetTensorShape(input);
-    const int8_t* input_data = GetTensorData<int8_t>(input);
-    const RuntimeShape& output_shape = GetTensorShape(output);
-    int16_t* output_data = GetTensorData<int16_t>(output);
-    const int trailing_dim = input_shape.DimensionsCount() - 1;
-    const int outer_size =
-        MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
-    const int depth =
-        MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
-
-    void* p_scratch = static_cast<void*>(
-        context->GetScratchBuffer(context, op_data->scratch_tensor_index));
-    TFLITE_DCHECK(p_scratch != nullptr);
-
-    for (int i = 0; i < outer_size; ++i) {
-      int err = xa_nn_vec_softmax_asym8s_16(
-          &output_data[i * depth], &input_data[i * depth], op_data->diff_min,
-          op_data->input_left_shift, op_data->input_multiplier, depth,
-          p_scratch);
-      CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_softmax_asym8s_16 failed");
-    }
-    return kTfLiteOk;
-  } else {
-    TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
-                       TfLiteTypeGetName(input->type), input->type);
-    return kTfLiteError;
-  }
-}
-}  // namespace activations
-
-TfLiteRegistration Register_SOFTMAX() {
-  return {/*init=*/activations::SoftmaxInit,
-          /*free=*/nullptr,
-          /*prepare=*/activations::SoftmaxPrepare,
-          /*invoke=*/activations::SoftmaxEval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
-}
-
-}  // namespace micro
-}  // namespace ops
-}  // namespace tflite
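Note: the Eval loop above flattens the tensor to an [outer_size, depth] view, depth being the trailing dimension, and runs one NNLib softmax per row. Ignoring quantization entirely, each xa_nn_vec_softmax_asym8s_16 call computes the following per row (minimal float reference, for illustration only):

#include <cmath>

void SoftmaxRows(const float* in, float* out, int outer_size, int depth) {
  for (int i = 0; i < outer_size; ++i) {
    const float* row_in = in + i * depth;
    float* row_out = out + i * depth;
    float max_v = row_in[0];
    for (int d = 1; d < depth; ++d) max_v = std::fmax(max_v, row_in[d]);
    float sum = 0.f;
    for (int d = 0; d < depth; ++d) {
      // Subtracting the row max keeps exp() in range; the result is unchanged.
      row_out[d] = std::exp(row_in[d] - max_v);
      sum += row_out[d];
    }
    for (int d = 0; d < depth; ++d) row_out[d] /= sum;
  }
}

int main() {
  const float in[4] = {1.f, 2.f, 3.f, 4.f};
  float out[4] = {};
  SoftmaxRows(in, out, /*outer_size=*/1, /*depth=*/4);
}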
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/svdf.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/svdf.cc
deleted file mode 100644
index 05256f3..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/svdf.cc
+++ /dev/null
@@ -1,356 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <math.h>
-#include <xtensa/tie/xt_hifi2.h>
-
-#include "tensorflow/lite/c/builtin_op_data.h"
-#include "tensorflow/lite/c/common.h"
-#include "tensorflow/lite/kernels/internal/common.h"
-#include "tensorflow/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
-#include "tensorflow/lite/micro/kernels/activation_utils.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xtensa_tf_micro_common.h"
-
-namespace tflite {
-namespace ops {
-namespace micro {
-namespace svdf {
-namespace {
-
-struct OpData {
-  int32_t effective_scale_1_a;
-  int32_t effective_scale_2_a;
-  // b versions of each scale are kept at int since the numbers are just the
-  // shift value - typically between [-32, 32].
-  int effective_scale_1_b;
-  int effective_scale_2_b;
-  int scratch_tensor_index;
-  int scratch_output_tensor_index;
-};
-
-// Input tensors.
-constexpr int kInputTensor = 0;
-constexpr int kWeightsFeatureTensor = 1;
-constexpr int kWeightsTimeTensor = 2;
-constexpr int kBiasTensor = 3;
-// This is a variable tensor, and will be modified by this op.
-constexpr int kInputActivationStateTensor = 4;
-
-// Output tensor.
-constexpr int kOutputTensor = 0;
-
-/**
- * This version of SVDF is specific to TFLite Micro. It contains only a
- * full-integer recipe with optimizations for the Xtensa HiFiMini platform.
- *
- * Note: passing OpData by value might seem like an oversight, but it helps
- * reduce latency. See b/155656675 for more details.
- */
-TfLiteStatus EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
-                             const TfLiteTensor* input_tensor,
-                             const TfLiteTensor* weights_feature_tensor,
-                             const TfLiteTensor* weights_time_tensor,
-                             const TfLiteTensor* bias_tensor,
-                             const TfLiteSVDFParams* params,
-                             TfLiteTensor* activation_state_tensor,
-                             TfLiteTensor* output_tensor, OpData data,
-                             int32_t input_zp, int32_t output_zp) {
-  const int n_rank = params->rank;
-  const int n_batch = input_tensor->dims->data[0];
-  const int n_input = input_tensor->dims->data[1];
-  const int n_filter = weights_feature_tensor->dims->data[0];
-  const int n_unit = n_filter / n_rank;
-  const int n_memory = weights_time_tensor->dims->data[1];
-
-  TFLITE_DCHECK(context != nullptr);
-  TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
-
-  int32_t* scratch_tensor = static_cast<int32_t*>(
-      context->GetScratchBuffer(context, data.scratch_tensor_index));
-  TFLITE_DCHECK(scratch_tensor != nullptr);
-  int32_t* scratch_output_tensor = static_cast<int32_t*>(
-      context->GetScratchBuffer(context, data.scratch_output_tensor_index));
-  TFLITE_DCHECK(scratch_output_tensor != nullptr);
-
-  // Shift the activation state left by one time step.
-  int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
-
-  // 4-byte alignment check for state_ptr.
-  if ((reinterpret_cast<uintptr_t>(state_ptr) & 0x3) == 0) {
-    // 4-bytes aligned processing
-    ae_p16x2s* new_state_start = (ae_p16x2s*)(state_ptr - 2);
-    const ae_p16x2s* old_state_start = (ae_p16x2s*)(state_ptr - 2);
-    int loopcnt = (n_batch * n_filter * n_memory) - 1;
-    ae_p24x2s dstate, dtmp, dout;
-
-    AE_LP16X2F_IU(dtmp, old_state_start, 4);
-    AE_LP16X2F_IU(dstate, old_state_start, 4);
-    for (int i = 0; i < (loopcnt >> 1); i++) {
-      dout = AE_SELP24_LH(dtmp, dstate);
-      dtmp = dstate;
-      AE_LP16X2F_IU(dstate, old_state_start, 4);
-      AE_SP16X2F_IU(dout, new_state_start, 4);
-    }
-    if (loopcnt & 0x1) {
-      AE_SP16F_L_I(dtmp, (ae_p16s*)new_state_start, 4);
-    }
-  } else {
-    // 2-bytes aligned processing
-    ae_p16s* new_state_start = (ae_p16s*)(state_ptr - 1);
-    const ae_p16s* old_state_start = (ae_p16s*)(state_ptr);
-    int loopcnt = (n_batch * n_filter * n_memory) - 1;
-    ae_p24x2s dstate;
-    for (int i = 0; i < loopcnt; i++) {
-      AE_LP16F_IU(dstate, old_state_start, 2);
-      AE_SP16F_L_IU(dstate, new_state_start, 2);
-    }
-  }
-  // Note: no need to clear the newest state slot; the feature matmul below
-  // overwrites it rather than accumulating into it.
-
-  // Feature matmul.
-  {
-    int16_t* state = GetTensorData<int16_t>(activation_state_tensor);
-    const int8_t* input = GetTensorData<int8_t>(input_tensor);
-    const int8_t* weight_feature =
-        GetTensorData<int8_t>(weights_feature_tensor);
-    int16_t* result_in_batch = state + (n_memory - 1);
-    int err = 0;
-
-    for (int b = 0; b < n_batch; b++) {
-      err = xa_nn_matXvec_out_stride_sym8sxasym8s_16(
-          &result_in_batch[b * n_filter * n_memory], weight_feature,
-          &input[b * n_input], NULL, n_filter, n_input, n_input, n_memory,
-          -input_zp, (data.effective_scale_1_a << 8), data.effective_scale_1_b);
-      CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_matXvec_sym8sxasym8s_16 failed");
-    }
-  }
-
-  // Time.
-  {
-    for (int b = 0; b < n_batch; ++b) {
-      int8_t* output_ptr = GetTensorData<int8_t>(output_tensor) + b * n_unit;
-
-      const int16_t* vector1_ptr = GetTensorData<int16_t>(weights_time_tensor);
-      const int16_t* vector2_ptr =
-          GetTensorData<int16_t>(activation_state_tensor) +
-          b * n_memory * n_filter;
-      int err = 0;
-      const int32_t* bias_ptr = GetTensorData<int32_t>(bias_tensor);
-      err = xa_nn_dot_prod_16x16_asym8s(
-          output_ptr, vector1_ptr, vector2_ptr, bias_ptr, n_memory * n_rank,
-          (data.effective_scale_2_a << 8), data.effective_scale_2_b, output_zp,
-          n_unit);
-      CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_dot_prod_16x16_asym8s failed");
-    }
-  }
-  return kTfLiteOk;
-}
-
-}  // namespace
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  TFLITE_DCHECK(context != nullptr);
-  TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
-  return context->AllocatePersistentBuffer(context, sizeof(OpData));
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TFLITE_DCHECK(node->builtin_data != nullptr);
-  const auto* params = static_cast<const TfLiteSVDFParams*>(node->builtin_data);
-
-  // Validate Tensor Inputs (dtype depends on quantization):
-  // [0] = Input, {2, batch_size, input_size}
-  // [1] = Weights Feature, {2, num_filters, input_size}
-  // [2] = Weights Time, {2, num_filters, memory_size}
-  // [3] = Bias (optional), {1, num_units}
-  // [4] = Activation State (variable),
-  //         {2, batch_size, memory_size * num_filters}
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* weights_feature =
-      GetInput(context, node, kWeightsFeatureTensor);
-  const TfLiteTensor* weights_time =
-      GetInput(context, node, kWeightsTimeTensor);
-  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
-  const TfLiteTensor* activation_state =
-      GetInput(context, node, kInputActivationStateTensor);
-
-  // Define input constants based on input tensor definition above:
-  const int rank = params->rank;
-  const int input_size = input->dims->data[1];
-  const int batch_size = input->dims->data[0];
-  // Ensure the input size is a multiple of two.  This is necessary since
-  // optimized kernels access the memory in chunks of two, and all accesses
-  // must be aligned to 16 bits.
-  // TODO(b/153202598): Remove when padding is allowed in TFLite tensors.
-  TF_LITE_ENSURE_EQ(context, input_size % 2, 0);
-
-  const int num_filters = weights_feature->dims->data[0];
-  TF_LITE_ENSURE_EQ(context, num_filters % rank, 0);
-  const int num_units = num_filters / rank;
-  const int memory_size = weights_time->dims->data[1];
-
-  if (input->type != kTfLiteInt8) {
-    TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
-                       TfLiteTypeGetName(input->type), input->type);
-    return kTfLiteError;
-  }
-
-  // Validate Input Tensor:
-  TF_LITE_ENSURE(context, input->type == kTfLiteInt8);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2);
-
-  // Validate Tensor Output:
-  // [0] = float/int8_t, {2, batch_size, num_units}
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(output), 2);
-  TF_LITE_ENSURE_EQ(context, output->dims->data[0], batch_size);
-  TF_LITE_ENSURE_EQ(context, output->dims->data[1], num_units);
-
-  // Validate Weights Feature Input Tensor:
-  TF_LITE_ENSURE_EQ(context, NumDimensions(weights_feature), 2);
-  TF_LITE_ENSURE_EQ(context, weights_feature->dims->data[1], input_size);
-
-  // Validate Weights Time Input Tensor:
-  TF_LITE_ENSURE_EQ(context, NumDimensions(weights_time), 2);
-  TF_LITE_ENSURE_EQ(context, weights_time->dims->data[0], num_filters);
-  TF_LITE_ENSURE_EQ(context, weights_time->dims->data[1], memory_size);
-
-  // Validate Optional Bias Input Tensor:
-  if (bias != nullptr) {
-    TF_LITE_ENSURE_EQ(context, bias->dims->data[0], num_units);
-    TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
-  }
-
-  // Validate Activation State Input Tensor:
-  TF_LITE_ENSURE_EQ(context, NumDimensions(activation_state), 2);
-  TF_LITE_ENSURE_EQ(context, activation_state->dims->data[0], batch_size);
-  TF_LITE_ENSURE_EQ(context, activation_state->dims->data[1],
-                    memory_size * num_filters);
-
-  TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
-  TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteInt8);
-  TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteInt16);
-  TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteInt16);
-
-  // Validate output tensor:
-  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt8);
-
-  // Calculate effective scales.
-  auto* input_params =
-      static_cast<TfLiteAffineQuantization*>(input->quantization.params);
-  auto* weights_feature_params = static_cast<TfLiteAffineQuantization*>(
-      weights_feature->quantization.params);
-  auto* state_params = static_cast<TfLiteAffineQuantization*>(
-      activation_state->quantization.params);
-  auto* weight_time_params =
-      static_cast<TfLiteAffineQuantization*>(weights_time->quantization.params);
-  auto* output_params =
-      static_cast<TfLiteAffineQuantization*>(output->quantization.params);
-  const float effective_scale_1 = input_params->scale->data[0] *
-                                  weights_feature_params->scale->data[0] /
-                                  state_params->scale->data[0];
-  const float effective_scale_2 = state_params->scale->data[0] *
-                                  weight_time_params->scale->data[0] /
-                                  output_params->scale->data[0];
-
-  TFLITE_DCHECK(node->user_data != nullptr);
-  OpData* data = static_cast<OpData*>(node->user_data);
-
-  xtensa::hifimini::QuantizeMultiplier(effective_scale_1,
-                                       &data->effective_scale_1_a,
-                                       &data->effective_scale_1_b);
-  xtensa::hifimini::QuantizeMultiplier(effective_scale_2,
-                                       &data->effective_scale_2_a,
-                                       &data->effective_scale_2_b);
-
-  const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
-      context, batch_size * num_filters * sizeof(int32_t),
-      &(data->scratch_tensor_index));
-  TF_LITE_ENSURE_OK(context, scratch_status);
-  const TfLiteStatus scratch_output_status =
-      context->RequestScratchBufferInArena(
-          context, batch_size * num_units * sizeof(int32_t),
-          &(data->scratch_output_tensor_index));
-  TF_LITE_ENSURE_OK(context, scratch_output_status);
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = static_cast<TfLiteSVDFParams*>(node->builtin_data);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* weights_feature =
-      GetInput(context, node, kWeightsFeatureTensor);
-  const TfLiteTensor* weights_time =
-      GetInput(context, node, kWeightsTimeTensor);
-  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
-  TfLiteTensor* activation_state =
-      GetVariableInput(context, node, kInputActivationStateTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActRelu);
-
-  TFLITE_DCHECK(node->user_data != nullptr);
-  const OpData& data = *(static_cast<const OpData*>(node->user_data));
-
-  return EvalIntegerSVDF(context, node, input, weights_feature, weights_time,
-                         bias, params, activation_state, output, data,
-                         input->params.zero_point, output->params.zero_point);
-}
-
-}  // namespace svdf
-
-TfLiteRegistration Register_SVDF() {
-  return {/*init=*/svdf::Init,
-          /*free=*/nullptr,
-          /*prepare=*/svdf::Prepare,
-          /*invoke=*/svdf::Eval,
-          /*profiling_string=*/nullptr,
-          /*builtin_code=*/0,
-          /*custom_name=*/nullptr,
-          /*version=*/0};
-}
-
-}  // namespace micro
-}  // namespace ops
-}  // namespace tflite
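Note: the aligned and unaligned paths in EvalIntegerSVDF above shift the whole flat activation-state buffer left by one element in a single pass, relying on the feature matmul to overwrite each row's newest slot. Logically that is a per-filter shift of the activation history; a scalar model of the intended semantics (illustrative only):

#include <cstdint>
#include <cstring>

void ShiftSvdfState(int16_t* state, int batches, int filters, int memory) {
  for (int b = 0; b < batches; ++b) {
    for (int f = 0; f < filters; ++f) {
      int16_t* row = state + (b * filters + f) * memory;
      // Drop the oldest sample; the newest slot (row[memory - 1]) is
      // overwritten by the feature matmul, so it needs no clearing.
      std::memmove(row, row + 1, (memory - 1) * sizeof(int16_t));
    }
  }
}

int main() {
  int16_t state[6] = {1, 2, 3, 4, 5, 6};  // 1 batch, 2 filters, memory 3.
  ShiftSvdfState(state, /*batches=*/1, /*filters=*/2, /*memory=*/3);
  // state is now {2, 3, 3, 5, 6, 6}; indices 2 and 5 await the new samples.
}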
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_api_defs.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_api_defs.h
deleted file mode 100644
index a3eac67..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_api_defs.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_API_DEFS_H__
-#define __XA_API_DEFS_H__
-
-/*****************************************************************************/
-/* Constant hash defines                                                     */
-/*****************************************************************************/
-/* A constant to let the API copy small strings to external buffers */
-#define XA_API_STR_LEN 30
-#define XA_APIVERSION_MAJOR 1
-#define XA_APIVERSION_MINOR 0
-
-/* Last compatible version.                                                 */
-/* Sometimes a new API version is just for a bugfix or an added feature; in */
-/* this case it is better to use a newer version even though a library was  */
-/* made for an older version. The library API can then be upgraded to the   */
-/* newer API version after checking for compatibility or by adding features.*/
-#define XA_LASTCOMP_APIVERSION_MAJOR 1
-#define XA_LASTCOMP_APIVERSION_MINOR 0
-
-#define XA_STR(str) #str
-#define XA_MAKE_VERSION_STR(maj, min) XA_STR(maj) "." XA_STR(min)
-#define XA_APIVERSION \
-  XA_MAKE_VERSION_STR(XA_APIVERSION_MAJOR, XA_APIVERSION_MINOR)
-
-#define XA_LAST_COMP_APIVERSION                     \
-  XA_MAKE_VERSION_STR(XA_LASTCOMP_APIVERSION_MAJOR, \
-                      XA_LASTCOMP_APIVERSION_MINOR)
-
-#endif /* __XA_API_DEFS_H__ */
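Note: XA_MAKE_VERSION_STR above is the standard two-level stringization idiom. The arguments are macro-expanded during substitution (they are not operands of # at that level), and only then does XA_STR turn the expanded tokens into string literals, which the preprocessor concatenates. A standalone illustration; the DEMO_* macros are hypothetical:

#include <cstdio>

#define XA_STR(str) #str
#define XA_MAKE_VERSION_STR(maj, min) XA_STR(maj) "." XA_STR(min)

#define DEMO_MAJOR 1
#define DEMO_MINOR 0

int main() {
  // Expands to XA_STR(1) "." XA_STR(0) -> "1" "." "0" -> "1.0".
  std::puts(XA_MAKE_VERSION_STR(DEMO_MAJOR, DEMO_MINOR));
}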
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_common.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_common.h
deleted file mode 100644
index 71e6682..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_common.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_NNLIB_COMMON_H__
-#define __XA_NNLIB_COMMON_H__
-
-#include <inttypes.h>
-#include <stddef.h>
-#include <xtensa/config/core-isa.h>
-#include <xtensa/tie/xt_core.h>
-#include <xtensa/tie/xt_hifi2.h>
-#include <xtensa/tie/xt_misc.h>
-#if XCHAL_HAVE_HIFI4_VFPU
-#include <xtensa/tie/xt_FP.h>
-#endif
-
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_standards.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/xa_type_def.h"
-
-#endif /* __XA_NNLIB_COMMON_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_common_macros.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_common_macros.h
deleted file mode 100644
index d04752b..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_common_macros.h
+++ /dev/null
@@ -1,921 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_NNLIB_COMMON_MACROS_H__
-#define __XA_NNLIB_COMMON_MACROS_H__
-
-#ifndef NULL
-#define NULL (void *)0
-#endif /* NULL */
-
-#define ALIGNMENT 8
-
-/* Macro for zero value */
-#define ZERO64 AE_MOVINT64_FROMINT32X2(AE_MOVDA32(0))
-#define ZERO16X4 AE_MOVDA16(0)
-#define ZERO16 (0)
-#define ZERO32 (0)
-
-/* Macro for 1 */
-#define ONE16X4 AE_MOVDA16(1)
-
-/* Value of ROW_UNROLL currently supported are 1,2,4,8 only */
-#ifndef ROW_UNROLL
-#define ROW_UNROLL 8
-#endif
-#define VEC_UNROLL 2
-
-#define ACC_LSH_AFTER_FIRST_MATXVEC 0
-
-/* Increment in bytes required for particular load
- * instructions. */
-#define INCREMENT_IN_BYTES_FOR_WORD8 1
-#define INCREMENT_IN_BYTES_FOR_INT16 2
-#define INCREMENT_IN_BYTES_FOR_INT32 (INCREMENT_IN_BYTES_FOR_INT16 * 2)
-#define INCREMENT_IN_BYTES_FOR_WORD8X4 (INCREMENT_IN_BYTES_FOR_WORD8 * 4)
-#define INCREMENT_IN_BYTES_FOR_INT16X4 (INCREMENT_IN_BYTES_FOR_INT16 * 4)
-#define INCREMENT_IN_BYTES_FOR_INT64 INCREMENT_IN_BYTES_FOR_INT16X4
-#define INCREMENT_IN_BYTES_FOR_FLOAT32 4
-#define INCREMENT_IN_BYTES_FOR_FLOAT32x2 (INCREMENT_IN_BYTES_FOR_FLOAT32 * 2)
-
-#define HF2_AE_ADDCIRC16X4_XC(ptr, offset) \
-  ptr = ptr + offset;                      \
-  if (ptr >= p_end) ptr = ptr - size;
-
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(q_out, inp, out_multiplier, \
-                                         left_shift, right_shift)    \
-  {                                                                  \
-    ae_q56s d1;                                                      \
-    ae_p24x2s d_mul;                                                 \
-    d_mul = AE_CVTP24A16X2_HL(out_multiplier, out_multiplier);       \
-    d1 = AE_CVTQ48A32S(inp);                                         \
-    d1 = AE_SLLAQ56(d1, left_shift);                                 \
-    q_out = AE_MULFQ32SP16U_L(d1, d_mul);                            \
-    q_out = AE_SRAIQ56(q_out, 16);                                   \
-    AE_MULAFQ32SP16S_H(q_out, d1, d_mul);                            \
-    q_out = AE_SRAAQ56(q_out, right_shift);                          \
-    q_out = AE_ROUNDSQ32SYM(q_out);                                  \
-  }
-
-/* Limit effective bias_shift and acc_shift to [-63 ... 63] */
-#define LIMIT_VARIABLE(_var, _left_limit, _right_limit) \
-  _var = _var > _right_limit ? _right_limit             \
-                             : _var < _left_limit ? _left_limit : _var;
-
-#define LIMIT_ACC_LSH LIMIT_VARIABLE(acc_shift, -63, 63);
-
-#define LIMIT_BIAS_LSH LIMIT_VARIABLE(bias_shift, -63, 63);
-
-#define BW(_datatype) sizeof(_datatype)
-
-#define ADJUST_VAR_AxB(A, B) (((8 * (4 - (BW(A) + BW(B))))))
-
-#define ADJUST_VAR_C(C) (((64 - (8 * BW(C)))))
-
-#define ADJUST_ACC_LSH_AxB_C(A, B, C) \
-  acc_shift = acc_shift + 32;         \
-  LIMIT_ACC_LSH;
-
-#define ADJUST_BIAS_LSH_AxB(A, B) LIMIT_BIAS_LSH;
-
-#define ADJUST_ACC_LSH_AND_BIAS_LSH_AxB_C(A, B, C) \
-  ADJUST_ACC_LSH_AxB_C(A, B, C);                   \
-  ADJUST_BIAS_LSH_AxB(A, B);
-
-/* ====================================================================================================
- */
-#define SETUP_BIAS_f32                   \
-  xtfloat _xtfloat_bias = (xtfloat)0.0f; \
-  xtfloat *_xtfloat_p_bias = (xtfloat *)p_bias;
-
-#define SETUP_BIAS_ASYM8b               \
-  WORD32 _WORD32_bias;                  \
-  ae_int64 _ae_int64_sat_bias = ZERO64; \
-  WORD32 *_WORD32_p_bias = (WORD32 *)p_bias;
-
-#define SETUP_BIAS_8b                   \
-  WORD8 _WORD8_bias;                    \
-  UWORD32 _UWORD32_bias;                \
-  ae_int64 _ae_int64_bias = ZERO64;     \
-  ae_int64 _ae_int64_sat_bias = ZERO64; \
-  WORD8 *_WORD8_p_bias = (WORD8 *)p_bias;
-
-#define SETUP_BIAS_8b_BATCH                     \
-  WORD8 _WORD8_bias;                            \
-  WORD16 _WORD16_bias;                          \
-  ae_int16 _ae_int16_bias = ZERO16;             \
-  ae_int16 *_ae_int16_p_bias = &_ae_int16_bias; \
-  ae_int64 _ae_int64_sat_bias = ZERO64;         \
-  WORD8 *_WORD8_p_bias = (WORD8 *)p_bias;
-
-#define SETUP_BIAS_32b                  \
-  ae_int32 _ae_int32_bias = ZERO32;     \
-  ae_int64 _ae_int64_sat_bias = ZERO64; \
-  ae_int32 *_ae_int32_p_bias = (ae_int32 *)p_bias;
-
-#define SETUP_BIAS_16b                  \
-  ae_int16 _ae_int16_bias = ZERO16;     \
-  ae_int64 _ae_int64_sat_bias = ZERO64; \
-  ae_int16 *_ae_int16_p_bias = (ae_int16 *)p_bias;
-
-#define SETUP_BIAS_64b                  \
-  ae_int64 _ae_int64_bias = ZERO64;     \
-  ae_int64 _ae_int64_sat_bias = ZERO64; \
-  ae_int64 *_ae_int64_p_bias = (ae_int64 *)p_bias;
-
-#define SETUP_ACC_FOR_8bx8b(idx) SETUP_ACC_64b(idx)
-#define SETUP_ACC_FOR_8bx16b(idx) SETUP_ACC_64b(idx)
-#define SETUP_ACC_FOR_16bx8b(idx) SETUP_ACC_64b(idx)
-#define SETUP_ACC_FOR_16bx16b(idx) SETUP_ACC_64b(idx)
-#define SETUP_ACC_FOR_ASYM8bxASYM8b(idx) SETUP_ACC_64b(idx)
-
-/*------------------ time batching macros ----------------- */
-
-#define SETUP_ACC_BATCH_ROW_FOR_16bx8b SETUP_ACC_BATCH_ROW_FOR_16bx16b
-#define SETUP_ACC_BATCH_ROW_FOR_8bx16b SETUP_ACC_BATCH_ROW_FOR_16bx16b
-#define SETUP_ACC_BATCH_ROW_FOR_8bx8b SETUP_ACC_BATCH_ROW_FOR_16bx16b
-#define SETUP_ACC_BATCH_ROW_FOR_ASYM8bxASYM8b SETUP_ACC_BATCH_ROW_FOR_16bx16b
-
-#define SETUP_ACC_BATCH_FOR_16bx8b SETUP_ACC_BATCH_FOR_16bx16b
-#define SETUP_ACC_BATCH_FOR_8bx16b SETUP_ACC_BATCH_FOR_16bx16b
-#define SETUP_ACC_BATCH_FOR_8bx8b SETUP_ACC_BATCH_FOR_16bx16b
-#define SETUP_ACC_BATCH_FOR_ASYM8bxASYM8b SETUP_ACC_BATCH_FOR_16bx16b
-
-#define SETUP_ACC_BATCH_ROW_FOR_16bx16b(idx_row) \
-  SETUP_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define SETUP_ACC_BATCH_FOR_16bx16b(idx_row, idx_vec) \
-  ae_int64 _ae_int64_acc_##idx_row##_##idx_vec = ZERO64;
-
-#define SETUP_ACC_BATCH_ROW_FOR_f32(idx_row) \
-  SETUP_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define SETUP_ACC_BATCH_FOR_f32(idx_row, idx_vec)                   \
-  xtfloatx2 _xtfloatx2_acc_##idx_row##_##idx_vec = (xtfloatx2)0.0f; \
-  xtfloat _xtfloat_acc_##idx_row##_##idx_vec = (xtfloat)0.0f;       \
-  /*---------------------------------------------------------*/
-
-#define SETUP_ACC_64b(idx) ae_int64 _ae_int64_acc_##idx = ZERO64;
-
-#define SETUP_VEC1_8b                     \
-  ae_int16x4 _ae_int16x4_vec1 = ZERO16X4; \
-  WORD8 *_WORD8_p_vec1 = (WORD8 *)p_vec1;
-
-#define SETUP_VEC2_8b                     \
-  ae_int16x4 _ae_int16x4_vec2 = ZERO16X4; \
-  WORD8 *_WORD8_p_vec2 = (WORD8 *)p_vec2;
-
-#define SETUP_VEC1_16b                    \
-  ae_int16x4 _ae_int16x4_vec1 = ZERO16X4; \
-  ae_int16x4 *_ae_int16x4_p_vec1 = (ae_int16x4 *)p_vec1;
-
-#define SETUP_VEC2_16b                    \
-  ae_int16x4 _ae_int16x4_vec2 = ZERO16X4; \
-  ae_int16x4 *_ae_int16x4_p_vec2 = (ae_int16x4 *)p_vec2;
-
-#define SETUP_VEC1_ASYM8b SETUP_VEC1_8b
-#define SETUP_VEC2_ASYM8b SETUP_VEC2_8b
-/*------------------ time batching macros ----------------- */
-
-#define SETUP_VEC_BATCH_8b(idx_vec)                      \
-  ae_int16x4 _ae_int16x4_vec_batch_##idx_vec = ZERO16X4; \
-  WORD8 *_WORD8_p_vec_batch_##idx_vec = (WORD8 *)(p_vec1[vec_itr + idx_vec]);
-
-#define SETUP_VEC_BATCH_16b(idx_vec)                     \
-  ae_int16x4 _ae_int16x4_vec_batch_##idx_vec = ZERO16X4; \
-  ae_int16x4 *_ae_int16x4_p_vec_batch_##idx_vec =        \
-      (ae_int16x4 *)(p_vec1[vec_itr + idx_vec]);
-
-#define SETUP_VEC_OFFSET_BATCH_16b(idx_vec)              \
-  ae_int16x4 _ae_int16x4_vec_batch_##idx_vec = ZERO16X4; \
-  ae_int16x4 *_ae_int16x4_p_vec_batch_##idx_vec =        \
-      (ae_int16x4 *)(p_vec1 + (vec_itr + idx_vec) * vec_offset);
-
-#define SETUP_VEC_BATCH_f32(idx_vec)                          \
-  xtfloatx2 _xtfloatx2_vec_batch_##idx_vec = (xtfloatx2)0.0f; \
-  xtfloatx2 *_xtfloatx2_p_vec_batch_##idx_vec =               \
-      (xtfloatx2 *)(p_vec1[vec_itr + idx_vec]);
-
-#define SETUP_VEC_BATCH_ASYM8b SETUP_VEC_BATCH_8b
-/*---------------------------------------------------------*/
-
-#define SETUP_MAT1_8b(idx)                      \
-  ae_int16x4 _ae_int16x4_mat1_##idx = ZERO16X4; \
-  WORD8 *_WORD8_p_mat1_##idx = (WORD8 *)&p_mat1[(m_itr + idx) * row_stride1];
-
-#define SETUP_MAT2_8b(idx)                      \
-  ae_int16x4 _ae_int16x4_mat2_##idx = ZERO16X4; \
-  WORD8 *_WORD8_p_mat2_##idx = (WORD8 *)&p_mat2[(m_itr + idx) * row_stride2];
-
-#define SETUP_MAT1_16b(idx)                     \
-  ae_int16x4 _ae_int16x4_mat1_##idx = ZERO16X4; \
-  ae_int16x4 *_ae_int16x4_p_mat1_##idx =        \
-      (ae_int16x4 *)&p_mat1[(m_itr + idx) * row_stride1];
-
-#define SETUP_MAT2_16b(idx)                     \
-  ae_int16x4 _ae_int16x4_mat2_##idx = ZERO16X4; \
-  ae_int16x4 *_ae_int16x4_p_mat2_##idx =        \
-      (ae_int16x4 *)&p_mat2[(m_itr + idx) * row_stride2];
-
-#define SETUP_MAT1_f32(idx)                          \
-  xtfloatx2 _xtfloatx2_mat1_##idx = (xtfloatx2)0.0f; \
-  xtfloatx2 *_xtfloatx2_p_mat1_##idx =               \
-      (xtfloatx2 *)&p_mat1[(m_itr + idx) * row_stride1];
-
-#define SETUP_MAT1_ASYM8b SETUP_MAT1_8b
-#define SETUP_MAT2_ASYM8b SETUP_MAT2_8b
-/* ====================================================================== */
-
-#define LOAD_VEC1_8b \
-  AE_L8X4F_IP(_ae_int16x4_vec1, _WORD8_p_vec1, INCREMENT_IN_BYTES_FOR_WORD8X4);
-
-#define LOAD_VEC2_8b \
-  AE_L8X4F_IP(_ae_int16x4_vec2, _WORD8_p_vec2, INCREMENT_IN_BYTES_FOR_WORD8X4);
-
-#define LOAD_VEC1_16b                               \
-  AE_L16X4_IP(_ae_int16x4_vec1, _ae_int16x4_p_vec1, \
-              INCREMENT_IN_BYTES_FOR_INT16X4);
-
-#define LOAD_VEC2_16b                               \
-  AE_L16X4_IP(_ae_int16x4_vec2, _ae_int16x4_p_vec2, \
-              INCREMENT_IN_BYTES_FOR_INT16X4);
-
-#define LOAD_VEC1_ASYM8b                                    \
-  AE_L8X4F_IP(_ae_int16x4_vec1, _WORD8_p_vec1,              \
-              INCREMENT_IN_BYTES_FOR_WORD8X4);              \
-  _ae_int16x4_vec1 = AE_MOVF16X4_FROMF64(                   \
-      AE_SRLI64(AE_MOVF64_FROMF16X4(_ae_int16x4_vec1), 8)); \
-  _ae_int16x4_vec1 = AE_ADD16(_ae_int16x4_vec1, AE_MOVDA16(vec1_zero_bias));
-
-#define LOAD_VEC2_ASYM8b                                                     \
-  AE_L8X4F_IP(_ae_int16x4_vec2, _WORD8_p_vec2,                               \
-              INCREMENT_IN_BYTES_FOR_WORD8X4);                               \
-  _ae_int16x4_vec2 = AE_MOVF16X4_FROMF64(                                    \
-      AE_SRLI64(AE_MOVF64_FROMF16X4(_ae_int16x4_vec2), 8));                  \
-  _ae_int16x4_vec2 = AE_ADD16(_ae_int16x4_vec2, AE_MOVDA16(vec2_zero_bias));
-/*------------------ time batching macros ----------------- */
-#define LOAD_VEC_BATCH_f32(idx_vec)                                           \
-  XT_LSX2IP(_xtfloatx2_vec_batch_##idx_vec, _xtfloatx2_p_vec_batch_##idx_vec, \
-            INCREMENT_IN_BYTES_FOR_FLOAT32x2);
-
-#define LOAD_VEC_BATCH_8b(idx_vec)                                           \
-  AE_L8X4F_IP(_ae_int16x4_vec_batch_##idx_vec, _WORD8_p_vec_batch_##idx_vec, \
-              INCREMENT_IN_BYTES_FOR_WORD8X4);
-
-#define LOAD_VEC_BATCH_16b(idx_vec)              \
-  AE_L16X4_IP(_ae_int16x4_vec_batch_##idx_vec,   \
-              _ae_int16x4_p_vec_batch_##idx_vec, \
-              INCREMENT_IN_BYTES_FOR_INT16X4);
-
-#define LOAD_VEC_BATCH_ASYM8b(idx_vec)                                       \
-  AE_L8X4F_IP(_ae_int16x4_vec_batch_##idx_vec, _WORD8_p_vec_batch_##idx_vec, \
-              INCREMENT_IN_BYTES_FOR_WORD8X4);                               \
-  _ae_int16x4_vec_batch_##idx_vec = AE_MOVF16X4_FROMF64(                     \
-      AE_SRLI64(AE_MOVF64_FROMF16X4(_ae_int16x4_vec_batch_##idx_vec), 8));   \
-  _ae_int16x4_vec_batch_##idx_vec =                                          \
-      AE_ADD16(_ae_int16x4_vec_batch_##idx_vec, AE_MOVDA16(vec1_zero_bias));
-
-#define LOAD_BIAS_8b_FOR_8bx8b                  \
-  _WORD8_bias = *_WORD8_p_bias++;               \
-  _WORD16_bias = _WORD8_bias;                   \
-  *((WORD16 *)_ae_int16_p_bias) = _WORD16_bias; \
-  _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int16_bias), bias_shift);
-
-#define LOAD_BIAS_16b_FOR_8bx16b                    \
-  ae_int16_loadip(_ae_int16_bias, _ae_int16_p_bias, \
-                  INCREMENT_IN_BYTES_FOR_INT16);    \
-  _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int16_bias), bias_shift);
-
-#define LOAD_BIAS_16b_FOR_16bx8b LOAD_BIAS_16b_FOR_8bx16b
-
-#define LOAD_BIAS_16b_FOR_16bx16b                   \
-  ae_int16_loadip(_ae_int16_bias, _ae_int16_p_bias, \
-                  INCREMENT_IN_BYTES_FOR_INT16);    \
-  _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int16_bias), bias_shift);
-
-#define LOAD_BIAS_f32 \
-  XT_LSIP(_xtfloat_bias, _xtfloat_p_bias, INCREMENT_IN_BYTES_FOR_FLOAT32);
-
-#define LOAD_BIAS_ASYM8b                                                \
-  _WORD32_bias = *_WORD32_p_bias++;                                     \
-  _ae_int64_sat_bias =                                                  \
-      AE_SRAI64(AE_MOVINT64_FROMINT32X2(AE_MOVDA32(_WORD32_bias)), 32);
-/*---------------------------------------------------------*/
-#define LOAD_ROW_MAT1_8b(idx)                              \
-  AE_L8X4F_IP(_ae_int16x4_mat1_##idx, _WORD8_p_mat1_##idx, \
-              INCREMENT_IN_BYTES_FOR_WORD8X4);
-
-#define LOAD_ROW_MAT2_8b(idx)                              \
-  AE_L8X4F_IP(_ae_int16x4_mat2_##idx, _WORD8_p_mat2_##idx, \
-              INCREMENT_IN_BYTES_FOR_WORD8X4);
-
-#define LOAD_ROW_MAT1_16b(idx)                                  \
-  AE_L16X4_IP(_ae_int16x4_mat1_##idx, _ae_int16x4_p_mat1_##idx, \
-              INCREMENT_IN_BYTES_FOR_INT16X4);
-
-#define LOAD_ROW_MAT2_16b(idx)                                  \
-  AE_L16X4_IP(_ae_int16x4_mat2_##idx, _ae_int16x4_p_mat2_##idx, \
-              INCREMENT_IN_BYTES_FOR_INT16X4);
-
-#define LOAD_ROW_MAT1_f32(idx)                              \
-  XT_LSX2IP(_xtfloatx2_mat1_##idx, _xtfloatx2_p_mat1_##idx, \
-            INCREMENT_IN_BYTES_FOR_FLOAT32x2);
-
-#define LOAD_ROW_MAT1_ASYM8b(idx)                                 \
-  AE_L8X4F_IP(_ae_int16x4_mat1_##idx, _WORD8_p_mat1_##idx,        \
-              INCREMENT_IN_BYTES_FOR_WORD8X4);                    \
-  _ae_int16x4_mat1_##idx = AE_MOVF16X4_FROMF64(                   \
-      AE_SRLI64(AE_MOVF64_FROMF16X4(_ae_int16x4_mat1_##idx), 8)); \
-  _ae_int16x4_mat1_##idx =                                        \
-      AE_ADD16(_ae_int16x4_mat1_##idx, AE_MOVDA16(mat1_zero_bias));
-
-#define LOAD_ROW_MAT2_ASYM8b(idx)                                 \
-  AE_L8X4F_IP(_ae_int16x4_mat2_##idx, _WORD8_p_mat2_##idx,        \
-              INCREMENT_IN_BYTES_FOR_WORD8X4);                    \
-  _ae_int16x4_mat2_##idx = AE_MOVF16X4_FROMF64(                   \
-      AE_SRLI64(AE_MOVF64_FROMF16X4(_ae_int16x4_mat2_##idx), 8)); \
-  _ae_int16x4_mat2_##idx =                                        \
-      AE_ADD16(_ae_int16x4_mat2_##idx, AE_MOVDA16(mat2_zero_bias));
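-
-/* Editorial note: a scalar model of the ASYM8b loads above (a sketch, not
-   part of the library). AE_L8X4F places each byte in the high byte of a
-   16-bit lane; the logical 64-bit shift right by 8 turns the four lanes
-   into plain zero-extended byte values, and AE_ADD16 then applies the
-   zero-point bias (presumably the negated quantization zero point). */
-#include <stdint.h>
-static inline void load_row_asym8b_model(int16_t lanes[4], const uint8_t *src,
-                                         int16_t zero_bias) {
-  for (int k = 0; k < 4; ++k) {
-    lanes[k] = (int16_t)(src[k] + zero_bias); /* shift-by-8 + AE_ADD16 */
-  }
-}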
-
-#define KERNEL_MAT1_VEC1_8b_8b(idx) \
-  LOAD_ROW_MAT1_8b(idx);            \
-  AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec1, _ae_int16x4_mat1_##idx);
-
-#define KERNEL_MAT2_VEC2_8b_8b(idx) \
-  LOAD_ROW_MAT2_8b(idx);            \
-  AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec2, _ae_int16x4_mat2_##idx);
-
-#define KERNEL_MAT1_VEC1_16b_8b(idx) \
-  LOAD_ROW_MAT1_16b(idx);            \
-  AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec1, _ae_int16x4_mat1_##idx);
-
-#define KERNEL_MAT2_VEC2_16b_8b(idx) \
-  LOAD_ROW_MAT2_16b(idx);            \
-  AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec2, _ae_int16x4_mat2_##idx);
-
-#define KERNEL_MAT1_VEC1_8b_16b(idx) \
-  LOAD_ROW_MAT1_8b(idx);             \
-  AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec1, _ae_int16x4_mat1_##idx);
-
-#define KERNEL_MAT2_VEC2_8b_16b(idx) \
-  LOAD_ROW_MAT2_8b(idx);             \
-  AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec2, _ae_int16x4_mat2_##idx);
-
-#define KERNEL_MAT1_VEC1_16b_16b(idx) \
-  LOAD_ROW_MAT1_16b(idx);             \
-  AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec1, _ae_int16x4_mat1_##idx);
-
-#define KERNEL_MAT2_VEC2_16b_16b(idx) \
-  LOAD_ROW_MAT2_16b(idx);             \
-  AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec2, _ae_int16x4_mat2_##idx);
-
-#define KERNEL_MAT1_VEC1_ASYM8b_ASYM8b(idx) \
-  LOAD_ROW_MAT1_ASYM8b(idx);                \
-  AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec1, _ae_int16x4_mat1_##idx);
-
-#define KERNEL_MAT2_VEC2_ASYM8b_ASYM8b(idx) \
-  LOAD_ROW_MAT2_ASYM8b(idx);                \
-  AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec2, _ae_int16x4_mat2_##idx);
-
-/*------------------ time batching macros ----------------- */
-
-#define KERNEL_MAT1_VEC_BATCH_ROW_8b_8b KERNEL_MAT1_VEC_BATCH_ROW_16b_16b
-#define KERNEL_MAT1_VEC_BATCH_ROW_16b_8b KERNEL_MAT1_VEC_BATCH_ROW_16b_16b
-#define KERNEL_MAT1_VEC_BATCH_ROW_8b_16b KERNEL_MAT1_VEC_BATCH_ROW_16b_16b
-#define KERNEL_MAT1_VEC_BATCH_ROW_ASYM8b_ASYM8b \
-  KERNEL_MAT1_VEC_BATCH_ROW_16b_16b
-#define KERNEL_MAT1_VEC_BATCH_8b_8b KERNEL_MAT1_VEC_BATCH_16b_16b
-#define KERNEL_MAT1_VEC_BATCH_16b_8b KERNEL_MAT1_VEC_BATCH_16b_16b
-#define KERNEL_MAT1_VEC_BATCH_8b_16b KERNEL_MAT1_VEC_BATCH_16b_16b
-#define KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b KERNEL_MAT1_VEC_BATCH_16b_16b
-
-#define KERNEL_MAT1_VEC_BATCH_ROW_16b_16b(idx_row) \
-  KERNEL_MAT1_VEC_BATCH_VEC_UNROLL(idx_row);
-
-#define KERNEL_MAT1_VEC_BATCH_16b_16b(idx_row, idx_vec) \
-  AE_MULAAAAQ16(_ae_int64_acc_##idx_row##_##idx_vec,    \
-                _ae_int16x4_vec_batch_##idx_vec, _ae_int16x4_mat1_##idx_row);
-
-#define KERNEL_MAT1_VEC_BATCH_ROW_f32(idx_row) \
-  KERNEL_MAT1_VEC_BATCH_VEC_UNROLL(idx_row);
-
-#define KERNEL_MAT1_VEC_BATCH_f32(idx_row, idx_vec) \
-  XT_MADD_SX2(_xtfloatx2_acc_##idx_row##_##idx_vec, \
-              _xtfloatx2_vec_batch_##idx_vec, _xtfloatx2_mat1_##idx_row);
-
-/*---------------------------------------------------------*/
-#define ADD_BIAS_8b_ACC_FOR_8bx8b(idx)                                        \
-  /* Load 8b bias */                                                          \
-  _WORD8_bias = *_WORD8_p_bias++;                                             \
-  /* Copy 8 bits to unsigned 32 bits */                                       \
-  _UWORD32_bias = _WORD8_bias;                                                \
-  /* Move the unsigned 32-bit value to a DR register */                       \
-  _ae_int64_bias = AE_MOVINT64_FROMINT32X2((AE_MOVDA32X2(_UWORD32_bias, 0))); \
-  _ae_int64_bias = AE_SRAA64(_ae_int64_bias, 32);                             \
-  _ae_int64_sat_bias = AE_SLAA64S(_ae_int64_bias, bias_shift);                \
-  _ae_int64_acc_##idx = AE_SRAA64(_ae_int64_acc_##idx, 16);                   \
-  _ae_int64_acc_##idx = AE_ADD64S(_ae_int64_acc_##idx, _ae_int64_sat_bias);
-
-#define ADD_BIAS_32b_ACC_FOR_8bx8b(idx)                                    \
-  ae_int32_loadip(_ae_int32_bias, _ae_int32_p_bias,                        \
-                  INCREMENT_IN_BYTES_FOR_INT32);                           \
-  _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int32_bias), bias_shift); \
-  _ae_int64_acc_##idx = AE_SRAA64(_ae_int64_acc_##idx, 16);                \
-  _ae_int64_acc_##idx = AE_ADD64S(_ae_int64_acc_##idx, _ae_int64_sat_bias);
-
-#define ADD_BIAS_16b_ACC_FOR_8bx16b(idx)                                   \
-  ae_int16_loadip(_ae_int16_bias, _ae_int16_p_bias,                        \
-                  INCREMENT_IN_BYTES_FOR_INT16);                           \
-  /* Saturate 16b bias after shift to 64b */                               \
-  _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int16_bias), bias_shift); \
-  _ae_int64_acc_##idx = AE_SRAA64(_ae_int64_acc_##idx, 8);                 \
-  _ae_int64_acc_##idx = AE_ADD64S(_ae_int64_acc_##idx, _ae_int64_sat_bias);
-
-#define ADD_BIAS_16b_ACC_FOR_16bx8b ADD_BIAS_16b_ACC_FOR_8bx16b
-
-#define ADD_BIAS_64b_ACC_FOR_8bx16b(idx)                                   \
-  ae_int64_loadip(_ae_int64_bias, _ae_int64_p_bias,                        \
-                  INCREMENT_IN_BYTES_FOR_INT64);                           \
-  /* Saturate 64b bias after shift to 64b */                               \
-  _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int64_bias), bias_shift); \
-  _ae_int64_acc_##idx = AE_SRAA64(_ae_int64_acc_##idx, 8);                 \
-  _ae_int64_acc_##idx = AE_ADD64S(_ae_int64_acc_##idx, _ae_int64_sat_bias);
-
-#define ADD_BIAS_16b_ACC_FOR_16bx16b(idx)                                  \
-  ae_int16_loadip(_ae_int16_bias, _ae_int16_p_bias,                        \
-                  INCREMENT_IN_BYTES_FOR_INT16);                           \
-  /* Saturate 16b bias after shift to 64b */                               \
-  _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int16_bias), bias_shift); \
-  _ae_int64_acc_##idx = AE_ADD64S(_ae_int64_acc_##idx, _ae_int64_sat_bias);
-
-#define ADD_BIAS_64b_ACC_FOR_16bx16b(idx)                                  \
-  ae_int64_loadip(_ae_int64_bias, _ae_int64_p_bias,                        \
-                  INCREMENT_IN_BYTES_FOR_INT64);                           \
-  /* Saturate 64b bias after shift to 64b */                               \
-  _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int64_bias), bias_shift); \
-  _ae_int64_acc_##idx = AE_ADD64S(_ae_int64_acc_##idx, _ae_int64_sat_bias);
-
-#define ADD_BIAS_ASYM8b_ACC_FOR_ASYM8bxASYM8b(idx)                      \
-  /* Load 32b bias */                                                   \
-  _WORD32_bias = *_WORD32_p_bias++;                                     \
-  _ae_int64_sat_bias =                                                  \
-      AE_SRAI64(AE_MOVINT64_FROMINT32X2(AE_MOVDA32(_WORD32_bias)), 32); \
-  _ae_int64_acc_##idx = AE_ADD64S(_ae_int64_acc_##idx, _ae_int64_sat_bias);
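-
-/* Editorial note on the differing accumulator shifts: AE_L8X4F leaves each
-   8-bit operand scaled by 2^8 in its 16-bit lane, so an 8b x 8b product is
-   scaled by 2^16 (hence AE_SRAA64(acc, 16)), an 8b x 16b product by 2^8
-   (hence the shift by 8), and a 16b x 16b product needs no correction; the
-   ASYM8b loads shift the bytes back down first, so no shift there either.
-   A scalar model of the 8b x 8b case (a sketch, not part of the library): */
-#include <stdint.h>
-static inline int64_t acc_scale_model_8bx8b(int8_t a, int8_t b) {
-  int64_t acc = ((int64_t)a << 8) * ((int64_t)b << 8); /* lanes as loaded */
-  return acc >> 16; /* matches AE_SRAA64(_ae_int64_acc, 16) */
-}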
-
-/*------------------ time batching macros ----------------- */
-#define ADD_BIAS_BATCH_ROW_8b_ACC_FOR_8bx8b(idx_row) \
-  LOAD_BIAS_8b_FOR_8bx8b;                            \
-  ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row);
-
-#define ADD_BIAS_BATCH_ROW_16b_ACC_FOR_8bx16b(idx_row) \
-  LOAD_BIAS_16b_FOR_8bx16b;                            \
-  ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row);
-
-#define ADD_BIAS_BATCH_ROW_16b_ACC_FOR_16bx8b(idx_row) \
-  LOAD_BIAS_16b_FOR_16bx8b;                            \
-  ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row);
-
-#define ADD_BIAS_BATCH_ROW_16b_ACC_FOR_16bx16b(idx_row) \
-  LOAD_BIAS_16b_FOR_16bx16b;                            \
-  ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row);
-
-#define ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b(idx_row) \
-  LOAD_BIAS_ASYM8b                                               \
-  ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row);
-
-#define ADD_BIAS_BATCH_8b_ACC_FOR_8bx8b(idx_row, idx_vec) \
-  _ae_int64_acc_##idx_row##_##idx_vec =                   \
-      AE_SRAA64(_ae_int64_acc_##idx_row##_##idx_vec, 16); \
-  _ae_int64_acc_##idx_row##_##idx_vec =                   \
-      AE_ADD64S(_ae_int64_acc_##idx_row##_##idx_vec, _ae_int64_sat_bias);
-
-#define ADD_BIAS_BATCH_16b_ACC_FOR_8bx16b(idx_row, idx_vec) \
-  _ae_int64_acc_##idx_row##_##idx_vec =                     \
-      AE_SRAA64(_ae_int64_acc_##idx_row##_##idx_vec, 8);    \
-  _ae_int64_acc_##idx_row##_##idx_vec =                     \
-      AE_ADD64S(_ae_int64_acc_##idx_row##_##idx_vec, _ae_int64_sat_bias);
-
-#define ADD_BIAS_BATCH_16b_ACC_FOR_16bx16b(idx_row, idx_vec) \
-  _ae_int64_acc_##idx_row##_##idx_vec =                      \
-      AE_ADD64S(_ae_int64_acc_##idx_row##_##idx_vec, _ae_int64_sat_bias);
-
-#define ADD_BIAS_BATCH_16b_ACC_FOR_16bx8b ADD_BIAS_BATCH_16b_ACC_FOR_8bx16b
-#define ADD_BIAS_BATCH_ASYM8b_ACC_FOR_ASYM8bxASYM8b \
-  ADD_BIAS_BATCH_16b_ACC_FOR_16bx16b
-
-#define ADD_BIAS_BATCH_ROW_ACC_FOR_f32(idx_row) \
-  LOAD_BIAS_f32;                                \
-  ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row);
-
-#define ADD_BIAS_BATCH_ACC_FOR_f32(idx_row, idx_vec)     \
-  _xtfloat_acc_##idx_row##_##idx_vec =                   \
-      XT_RADD_SX2(_xtfloatx2_acc_##idx_row##_##idx_vec); \
-  _xtfloat_acc_##idx_row##_##idx_vec =                   \
-      XT_ADD_S(_xtfloat_acc_##idx_row##_##idx_vec, _xtfloat_bias);
-
-#define STORE_ACC_8bx8b_AT_SCRATCH_32b(idx)  \
-  (*((ae_int32 *)p_scratch + m_itr + idx)) = \
-      AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift));
-
-#define STORE_ACC_8bx8b_AT_OUT_8b(idx)                                    \
-  ae_int32 _ae_int32_tmp_var_##idx;                                       \
-  ae_f32x2 _ae_f32x2_tmp_var_##idx = AE_SLAA32S(                          \
-      AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift)), 24); \
-  _ae_int32_tmp_var_##idx = AE_SLAA32S(_ae_f32x2_tmp_var_##idx, -24);     \
-  (*((WORD8 *)p_out + m_itr + idx)) = (*((UWORD32 *)&_ae_int32_tmp_var_##idx));
-
-#define STORE_ACC_8bx8b_AT_OUT_16b(idx)                                   \
-  ae_int32 _ae_int32_tmp_var_##idx;                                       \
-  ae_f32x2 _ae_f32x2_tmp_var_##idx = AE_SLAA32S(                          \
-      AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift)), 16); \
-  _ae_int32_tmp_var_##idx = AE_SLAA32S(_ae_f32x2_tmp_var_##idx, -16);     \
-  (*((WORD16 *)p_out + m_itr + idx)) = (*((UWORD32 *)&_ae_int32_tmp_var_##idx));
-
-#define STORE_ACC_8bx8b_AT_OUT_32b(idx)  \
-  (*((ae_int32 *)p_out + m_itr + idx)) = \
-      AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift));
-
-#define STORE_ACC_ASYM8bxASYM8b_AT_OUT_ASYM8b(idx)                      \
-  _ae_int32x2_acc_##idx = AE_MIN32(                                     \
-      AE_MAX32(_ae_int32x2_acc_##idx, AE_MOVDA32(0)), AE_MOVDA32(255)); \
-  (*((UWORD8 *)p_out + m_itr + idx)) =                                  \
-      (UWORD8)AE_MOVAD32_L(_ae_int32x2_acc_##idx);
-
-/* ====================================================================== */
-#define STORE_ACC_8bx16b_AT_SCRATCH_32b(idx) \
-  (*((ae_int32 *)p_scratch + m_itr + idx)) = \
-      AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift));
-
-#define STORE_ACC_8bx16b_AT_OUT_16b(idx)                                  \
-  ae_int32 _ae_int32_tmp_var_##idx;                                       \
-  ae_f32x2 _ae_f32x2_tmp_var_##idx = AE_SLAA32S(                          \
-      AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift)), 16); \
-  _ae_int32_tmp_var_##idx = AE_SLAA32S(_ae_f32x2_tmp_var_##idx, -16);     \
-  (*((WORD16 *)p_out + m_itr + idx)) = (*((UWORD32 *)&_ae_int32_tmp_var_##idx));
-
-#define STORE_ACC_16bx8b_AT_OUT_16b STORE_ACC_8bx16b_AT_OUT_16b
-
-#define STORE_ACC_8bx16b_AT_OUT_32b(idx) \
-  (*((ae_int32 *)p_out + m_itr + idx)) = \
-      AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift));
-
-#define STORE_ACC_8bx16b_AT_OUT_64b(idx) \
-  (*((ae_int64 *)p_out + m_itr + idx)) = \
-      AE_SLAA64S(_ae_int64_acc_##idx, acc_shift);
-
-/* ====================================================================== */
-#define STORE_ACC_16bx16b_AT_SCRATCH_32b(idx) \
-  (*((ae_int32 *)p_scratch + m_itr + idx)) =  \
-      AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift));
-
-#define STORE_ACC_16bx16b_AT_OUT_16b(idx)                                 \
-  ae_int32 _ae_int32_tmp_var_##idx;                                       \
-  ae_f32x2 _ae_f32x2_tmp_var_##idx = AE_SLAA32S(                          \
-      AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift)), 16); \
-  _ae_int32_tmp_var_##idx = AE_SLAA32S(_ae_f32x2_tmp_var_##idx, -16);     \
-  (*((WORD16 *)p_out + m_itr + idx)) = (*((UWORD32 *)&_ae_int32_tmp_var_##idx));
-
-#define STORE_ACC_16bx16b_AT_OUT_32b(idx) \
-  (*((ae_int32 *)p_out + m_itr + idx)) =  \
-      AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift));
-
-#define STORE_ACC_16bx16b_AT_OUT_64b(idx) \
-  (*((ae_int64 *)p_out + m_itr + idx)) =  \
-      AE_SLAA64S(_ae_int64_acc_##idx, acc_shift);
-
-/*------------------ time batching macros ----------------- */
-#define STORE_ACC_BATCH_ROW_8bx8b_AT_OUT_32b(idx_row) \
-  STORE_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define STORE_ACC_BATCH_ROW_8bx8b_AT_OUT_8b(idx_row) \
-  STORE_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define STORE_ACC_BATCH_8bx8b_AT_OUT_32b(idx_row, idx_vec)      \
-  (*((ae_int32 *)p_out[vec_itr + idx_vec] + m_itr + idx_row)) = \
-      AE_ROUND32F64SSYM(                                        \
-          AE_SLAA64S(_ae_int64_acc_##idx_row##_##idx_vec, acc_shift));
-
-#define STORE_ACC_BATCH_8bx8b_AT_OUT_8b(idx_row, idx_vec)              \
-  ae_int32 _ae_int32_tmp_var_##idx_row##_##idx_vec;                    \
-  ae_f32x2 _ae_f32x2_tmp_var_##idx_row##_##idx_vec =                   \
-      AE_SLAA32S(AE_ROUND32F64SSYM(AE_SLAA64S(                         \
-                     _ae_int64_acc_##idx_row##_##idx_vec, acc_shift)), \
-                 24);                                                  \
-  _ae_int32_tmp_var_##idx_row##_##idx_vec =                            \
-      AE_SLAA32S(_ae_f32x2_tmp_var_##idx_row##_##idx_vec, -24);        \
-  (*((WORD8 *)p_out[vec_itr + idx_vec] + m_itr + idx_row)) =           \
-      (*((UWORD32 *)&_ae_int32_tmp_var_##idx_row##_##idx_vec));
-
-#define STORE_ACC_BATCH_ROW_8bx16b_AT_OUT_64b(idx_row) \
-  STORE_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define STORE_ACC_BATCH_ROW_16bx8b_AT_OUT_16b \
-  STORE_ACC_BATCH_ROW_8bx16b_AT_OUT_64b
-
-#define STORE_ACC_BATCH_ROW_8bx16b_AT_OUT_16b \
-  STORE_ACC_BATCH_ROW_8bx16b_AT_OUT_64b
-
-#define STORE_ACC_BATCH_8bx16b_AT_OUT_64b(idx_row, idx_vec)     \
-  (*((ae_int64 *)p_out[vec_itr + idx_vec] + m_itr + idx_row)) = \
-      AE_SLAA64S(_ae_int64_acc_##idx_row##_##idx_vec, acc_shift);
-
-#define STORE_ACC_BATCH_8bx16b_AT_OUT_16b(idx_row, idx_vec) \
-  STORE_ACC_BATCH_16bx16b_AT_OUT_16b(idx_row, idx_vec);
-
-#define STORE_ACC_BATCH_ROW_16bx16b_AT_OUT_64b(idx_row) \
-  STORE_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define STORE_ACC_BATCH_ROW_16bx16b_AT_OUT_16b \
-  STORE_ACC_BATCH_ROW_16bx16b_AT_OUT_64b
-
-#define STORE_ACC_BATCH_16bx16b_AT_OUT_64b(idx_row, idx_vec)    \
-  (*((ae_int64 *)p_out[vec_itr + idx_vec] + m_itr + idx_row)) = \
-      AE_SLAA64S(_ae_int64_acc_##idx_row##_##idx_vec, acc_shift);
-
-#define STORE_STRIDE_ACC_BATCH_16bx16b_AT_OUT_16b(idx_row, idx_vec)    \
-  ae_int32 _ae_int32_tmp_var_##idx_row##_##idx_vec;                    \
-  ae_f32x2 _ae_f32x2_tmp_var_##idx_row##_##idx_vec =                   \
-      AE_SLAA32S(AE_ROUND32F64SSYM(AE_SLAA64S(                         \
-                     _ae_int64_acc_##idx_row##_##idx_vec, acc_shift)), \
-                 16);                                                  \
-  _ae_int32_tmp_var_##idx_row##_##idx_vec =                            \
-      AE_SLAA32S(_ae_f32x2_tmp_var_##idx_row##_##idx_vec, -16);        \
-  (*((WORD16 *)p_out + (vec_itr + idx_vec) * out_offset +              \
-     (m_itr + idx_row) * out_stride)) =                                \
-      (*((UWORD32 *)&_ae_int32_tmp_var_##idx_row##_##idx_vec));
-
-#define STORE_ACC_BATCH_ROW_AT_OUT_f32(idx_row) \
-  STORE_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define STORE_ACC_BATCH_AT_OUT_f32(idx_row, idx_vec)                \
-  /* p_out goes through a temporary pointer so the ISA store can treat  \
-     it as an in/out operand */                                         \
-  p_out_tmp = (p_out[vec_itr + idx_vec] + m_itr + idx_row);         \
-  XT_SSIP(_xtfloat_acc_##idx_row##_##idx_vec, p_out_tmp, 0);
-
-#define STORE_ACC_BATCH_ROW_ASYM8bxASYM8b_AT_OUT_ASYM8b(idx_row) \
-  STORE_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define STORE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(idx_row, idx_vec)          \
-  _ae_int32x2_acc_##idx_row##_##idx_vec =                                      \
-      AE_MIN32(AE_MAX32(_ae_int32x2_acc_##idx_row##_##idx_vec, AE_MOVDA32(0)), \
-               AE_MOVDA32(255));                                               \
-  (*((UWORD8 *)(p_out[vec_itr + idx_vec] + m_itr + idx_row))) =                \
-      (UWORD8)AE_MOVAD32_L(_ae_int32x2_acc_##idx_row##_##idx_vec);
-
-/*---------------------------------------------------------*/
-/* Specific macros needed for the extra calculations involved in the
-   ASYM8b variants. */
-
-/* This is written to match TensorFlow. */
-#define ADJUST_ACC_ASYM8b(idx)                                             \
-  /* Multiply the accumulator by 'out_multiplier', same as TensorFlow */   \
-  ae_int32x2 _ae_int32x2_acc_##idx =                                       \
-      AE_SLAA32(AE_MOVINT32X2_FROMINT64(_ae_int64_acc_##idx), left_shift); \
-  _ae_int32x2_acc_##idx =                                                  \
-      AE_MULFP32X2RAS(_ae_int32x2_acc_##idx, AE_MOVDA32(out_multiplier));  \
-  /* Shift by out_shift, same as TensorFlow */                             \
-  _ae_int64_acc_##idx =                                                    \
-      AE_SLAI64(AE_MOVINT64_FROMINT32X2(_ae_int32x2_acc_##idx), 32);       \
-  _ae_int64_acc_##idx = AE_SRAA64(_ae_int64_acc_##idx, right_shift);       \
-  _ae_int32x2_acc_##idx = AE_ROUND32F64SSYM(_ae_int64_acc_##idx);          \
-  /* Add output zero point */                                              \
-  (_ae_int32x2_acc_##idx) =                                                \
-      AE_ADD32S(_ae_int32x2_acc_##idx, AE_MOVDA32(out_zero_bias));
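-
-/* Editorial note: the sequence above follows TensorFlow's
-   MultiplyByQuantizedMultiplier: scale by a Q31 multiplier with rounding,
-   shift, then add the output zero point. A scalar model (a sketch; the
-   ISA rounds symmetrically where this rounds asymmetrically, and
-   saturation is omitted): */
-#include <stdint.h>
-static inline int32_t adjust_acc_model(int32_t acc, int32_t out_multiplier,
-                                       int left_shift, int right_shift,
-                                       int32_t out_zero_bias) {
-  int64_t x = ((int64_t)acc << left_shift) * (int64_t)out_multiplier;
-  x = (x + (1LL << 30)) >> 31; /* rounding Q31 multiply (AE_MULFP32X2RAS) */
-  if (right_shift > 0) {       /* rounding shift (AE_SRAA64 + rounding)   */
-    x = (x + (1LL << (right_shift - 1))) >> right_shift;
-  }
-  return (int32_t)x + out_zero_bias; /* AE_ADD32S adds out_zero_bias */
-}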
-
-/* For time batching */
-#define ADJUST_ACC_BATCH_ROW_ASYM8b(idx_row) \
-  ADJUST_ACC_BATCH_VEC_UNROLL(idx_row);
-
-/* For time batching */
-#define ADJUST_ACC_BATCH_ASYM8b(idx_row, idx_vec)                             \
-  /* Multiply the accumulator by 'out_multiplier', same as TensorFlow */      \
-  ae_int32x2 _ae_int32x2_acc_##idx_row##_##idx_vec =                          \
-      AE_SLAA32(AE_MOVINT32X2_FROMINT64(_ae_int64_acc_##idx_row##_##idx_vec), \
-                left_shift);                                                  \
-  _ae_int32x2_acc_##idx_row##_##idx_vec = AE_MULFP32X2RAS(                    \
-      _ae_int32x2_acc_##idx_row##_##idx_vec, AE_MOVDA32(out_multiplier));     \
-  /* Shift by out_shift, same as TensorFlow */                                \
-  _ae_int64_acc_##idx_row##_##idx_vec = AE_SLAI64(                            \
-      AE_MOVINT64_FROMINT32X2(_ae_int32x2_acc_##idx_row##_##idx_vec), 32);    \
-  _ae_int64_acc_##idx_row##_##idx_vec =                                       \
-      AE_SRAA64(_ae_int64_acc_##idx_row##_##idx_vec, right_shift);            \
-  _ae_int32x2_acc_##idx_row##_##idx_vec =                                     \
-      AE_ROUND32F64SSYM(_ae_int64_acc_##idx_row##_##idx_vec);                 \
-  /* Add output zero point */                                                 \
-  (_ae_int32x2_acc_##idx_row##_##idx_vec) = AE_ADD32S(                        \
-      _ae_int32x2_acc_##idx_row##_##idx_vec, AE_MOVDA32(out_zero_bias));
-
-/*---------------------------------------------------------*/
-/* ====================================================================== */
-#if (ROW_UNROLL == 1)
-#define SETUP_ACC UNROLL_SETUP_ACC(0)
-#define SETUP_MAT1 UNROLL_SETUP_MAT1(0)
-#define SETUP_MAT2 UNROLL_SETUP_MAT2(0)
-#define KERNEL_MAT1_VEC1 UNROLL_KERNEL_MAT1_VEC1(0)
-#define KERNEL_MAT2_VEC2 UNROLL_KERNEL_MAT2_VEC2(0)
-#define ADD_BIAS_ACC UNROLL_ADD_BIAS_ACC(0)
-#define ADJUST_ACC UNROLL_ADJUST_ACC(0)
-#define STORE_ACC UNROLL_STORE_ACC(0)
-
-#elif (ROW_UNROLL == 2)
-#define SETUP_ACC UNROLL_SETUP_ACC(0) UNROLL_SETUP_ACC(1)
-#define SETUP_MAT1 UNROLL_SETUP_MAT1(0) UNROLL_SETUP_MAT1(1)
-#define SETUP_MAT2 UNROLL_SETUP_MAT2(0) UNROLL_SETUP_MAT2(1)
-#define KERNEL_MAT1_VEC1 UNROLL_KERNEL_MAT1_VEC1(0) UNROLL_KERNEL_MAT1_VEC1(1)
-#define KERNEL_MAT2_VEC2 UNROLL_KERNEL_MAT2_VEC2(0) UNROLL_KERNEL_MAT2_VEC2(1)
-#define ADD_BIAS_ACC UNROLL_ADD_BIAS_ACC(0) UNROLL_ADD_BIAS_ACC(1)
-#define ADJUST_ACC UNROLL_ADJUST_ACC(0) UNROLL_ADJUST_ACC(1)
-#define STORE_ACC UNROLL_STORE_ACC(0) UNROLL_STORE_ACC(1)
-
-#elif (ROW_UNROLL == 4)
-#define SETUP_ACC     \
-  UNROLL_SETUP_ACC(0) \
-  UNROLL_SETUP_ACC(1) UNROLL_SETUP_ACC(2) UNROLL_SETUP_ACC(3)
-#define SETUP_MAT1     \
-  UNROLL_SETUP_MAT1(0) \
-  UNROLL_SETUP_MAT1(1) UNROLL_SETUP_MAT1(2) UNROLL_SETUP_MAT1(3)
-#define SETUP_MAT2     \
-  UNROLL_SETUP_MAT2(0) \
-  UNROLL_SETUP_MAT2(1) UNROLL_SETUP_MAT2(2) UNROLL_SETUP_MAT2(3)
-#define KERNEL_MAT1_VEC1     \
-  UNROLL_KERNEL_MAT1_VEC1(0) \
-  UNROLL_KERNEL_MAT1_VEC1(1) \
-  UNROLL_KERNEL_MAT1_VEC1(2) UNROLL_KERNEL_MAT1_VEC1(3)
-#define KERNEL_MAT2_VEC2     \
-  UNROLL_KERNEL_MAT2_VEC2(0) \
-  UNROLL_KERNEL_MAT2_VEC2(1) \
-  UNROLL_KERNEL_MAT2_VEC2(2) UNROLL_KERNEL_MAT2_VEC2(3)
-#define ADD_BIAS_ACC     \
-  UNROLL_ADD_BIAS_ACC(0) \
-  UNROLL_ADD_BIAS_ACC(1) UNROLL_ADD_BIAS_ACC(2) UNROLL_ADD_BIAS_ACC(3)
-#define ADJUST_ACC     \
-  UNROLL_ADJUST_ACC(0) \
-  UNROLL_ADJUST_ACC(1) UNROLL_ADJUST_ACC(2) UNROLL_ADJUST_ACC(3)
-#define STORE_ACC     \
-  UNROLL_STORE_ACC(0) \
-  UNROLL_STORE_ACC(1) UNROLL_STORE_ACC(2) UNROLL_STORE_ACC(3)
-
-#elif (ROW_UNROLL == 8)
-#define SETUP_ACC     \
-  UNROLL_SETUP_ACC(0) \
-  UNROLL_SETUP_ACC(1) \
-  UNROLL_SETUP_ACC(2) \
-  UNROLL_SETUP_ACC(3) \
-  UNROLL_SETUP_ACC(4) \
-  UNROLL_SETUP_ACC(5) UNROLL_SETUP_ACC(6) UNROLL_SETUP_ACC(7)
-#define SETUP_MAT1     \
-  UNROLL_SETUP_MAT1(0) \
-  UNROLL_SETUP_MAT1(1) \
-  UNROLL_SETUP_MAT1(2) \
-  UNROLL_SETUP_MAT1(3) \
-  UNROLL_SETUP_MAT1(4) \
-  UNROLL_SETUP_MAT1(5) UNROLL_SETUP_MAT1(6) UNROLL_SETUP_MAT1(7)
-#define SETUP_MAT2     \
-  UNROLL_SETUP_MAT2(0) \
-  UNROLL_SETUP_MAT2(1) \
-  UNROLL_SETUP_MAT2(2) \
-  UNROLL_SETUP_MAT2(3) \
-  UNROLL_SETUP_MAT2(4) \
-  UNROLL_SETUP_MAT2(5) UNROLL_SETUP_MAT2(6) UNROLL_SETUP_MAT2(7)
-#define KERNEL_MAT1_VEC1     \
-  UNROLL_KERNEL_MAT1_VEC1(0) \
-  UNROLL_KERNEL_MAT1_VEC1(1) \
-  UNROLL_KERNEL_MAT1_VEC1(2) \
-  UNROLL_KERNEL_MAT1_VEC1(3) \
-  UNROLL_KERNEL_MAT1_VEC1(4) \
-  UNROLL_KERNEL_MAT1_VEC1(5) \
-  UNROLL_KERNEL_MAT1_VEC1(6) UNROLL_KERNEL_MAT1_VEC1(7)
-#define KERNEL_MAT2_VEC2     \
-  UNROLL_KERNEL_MAT2_VEC2(0) \
-  UNROLL_KERNEL_MAT2_VEC2(1) \
-  UNROLL_KERNEL_MAT2_VEC2(2) \
-  UNROLL_KERNEL_MAT2_VEC2(3) \
-  UNROLL_KERNEL_MAT2_VEC2(4) \
-  UNROLL_KERNEL_MAT2_VEC2(5) \
-  UNROLL_KERNEL_MAT2_VEC2(6) UNROLL_KERNEL_MAT2_VEC2(7)
-#define ADD_BIAS_ACC     \
-  UNROLL_ADD_BIAS_ACC(0) \
-  UNROLL_ADD_BIAS_ACC(1) \
-  UNROLL_ADD_BIAS_ACC(2) \
-  UNROLL_ADD_BIAS_ACC(3) \
-  UNROLL_ADD_BIAS_ACC(4) \
-  UNROLL_ADD_BIAS_ACC(5) UNROLL_ADD_BIAS_ACC(6) UNROLL_ADD_BIAS_ACC(7)
-#define ADJUST_ACC     \
-  UNROLL_ADJUST_ACC(0) \
-  UNROLL_ADJUST_ACC(1) \
-  UNROLL_ADJUST_ACC(2) \
-  UNROLL_ADJUST_ACC(3) \
-  UNROLL_ADJUST_ACC(4) \
-  UNROLL_ADJUST_ACC(5) UNROLL_ADJUST_ACC(6) UNROLL_ADJUST_ACC(7)
-#define STORE_ACC     \
-  UNROLL_STORE_ACC(0) \
-  UNROLL_STORE_ACC(1) \
-  UNROLL_STORE_ACC(2) \
-  UNROLL_STORE_ACC(3) \
-  UNROLL_STORE_ACC(4) \
-  UNROLL_STORE_ACC(5) UNROLL_STORE_ACC(6) UNROLL_STORE_ACC(7)
-
-#endif /* (ROW_UNROLL == 1) */
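-
-/* Editorial note: the blocks above are consumed by the matXvec kernels,
-   which define the UNROLL_* names to the type-specific macros and then
-   walk the rows ROW_UNROLL at a time. A sketch of that usage pattern
-   (illustrative only; LOAD_VEC1 etc. stand for the dispatched macros): */
-#if 0
-  for (m_itr = 0; m_itr < (rows & ~(ROW_UNROLL - 1)); m_itr += ROW_UNROLL) {
-    SETUP_ACC;                /* one accumulator per unrolled row  */
-    SETUP_MAT1; SETUP_VEC1;   /* row and vector load pointers      */
-    for (c_itr = 0; c_itr < (cols1 >> 2); c_itr++) {
-      LOAD_VEC1;              /* next four vector elements         */
-      KERNEL_MAT1_VEC1;       /* 4-wide MAC into each accumulator  */
-    }
-    ADD_BIAS_ACC; ADJUST_ACC; STORE_ACC;
-  }
-#endif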
-
-#if (ROW_UNROLL == 4 && VEC_UNROLL == 2)
-
-#define SETUP_VEC_BATCH UNROLL_SETUP_VEC_BATCH(0) UNROLL_SETUP_VEC_BATCH(1)
-
-#define SETUP_ACC_BATCH         \
-  UNROLL_ROW_SETUP_ACC_BATCH(0) \
-  UNROLL_ROW_SETUP_ACC_BATCH(1) \
-  UNROLL_ROW_SETUP_ACC_BATCH(2) UNROLL_ROW_SETUP_ACC_BATCH(3)
-#define SETUP_ACC_BATCH_VEC_UNROLL(idx_row) \
-  UNROLL_SETUP_ACC_BATCH(idx_row, 0) UNROLL_SETUP_ACC_BATCH(idx_row, 1)
-#define SETUP_ACC_BATCH_TAIL   \
-  UNROLL_SETUP_ACC_BATCH(0, 0) \
-  UNROLL_SETUP_ACC_BATCH(1, 0) \
-  UNROLL_SETUP_ACC_BATCH(2, 0) UNROLL_SETUP_ACC_BATCH(3, 0)
-
-#define LOAD_VEC_BATCH UNROLL_LOAD_VEC_BATCH(0) UNROLL_LOAD_VEC_BATCH(1)
-#define LOAD_MAT1         \
-  UNROLL_LOAD_ROW_MAT1(0) \
-  UNROLL_LOAD_ROW_MAT1(1) UNROLL_LOAD_ROW_MAT1(2) UNROLL_LOAD_ROW_MAT1(3)
-
-#define KERNEL_MAT1_VEC_BATCH         \
-  UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(0) \
-  UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(1) \
-  UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(2) UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(3)
-#define KERNEL_MAT1_VEC_BATCH_VEC_UNROLL(idx_row) \
-  UNROLL_KERNEL_MAT1_VEC_BATCH(idx_row, 0)        \
-  UNROLL_KERNEL_MAT1_VEC_BATCH(idx_row, 1)
-#define KERNEL_MAT1_VEC_BATCH_TAIL   \
-  UNROLL_KERNEL_MAT1_VEC_BATCH(0, 0) \
-  UNROLL_KERNEL_MAT1_VEC_BATCH(1, 0) \
-  UNROLL_KERNEL_MAT1_VEC_BATCH(2, 0) UNROLL_KERNEL_MAT1_VEC_BATCH(3, 0)
-
-#define ADD_BIAS_ACC_BATCH   \
-  UNROLL_ROW_ADD_BIAS_ACC(0) \
-  UNROLL_ROW_ADD_BIAS_ACC(1) \
-  UNROLL_ROW_ADD_BIAS_ACC(2) UNROLL_ROW_ADD_BIAS_ACC(3)
-#define ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row) \
-  UNROLL_ADD_BIAS_ACC_BATCH(idx_row, 0) UNROLL_ADD_BIAS_ACC_BATCH(idx_row, 1)
-#define ADD_BIAS_ACC_BATCH_TAIL                     \
-  LOAD_BIAS UNROLL_ADD_BIAS_ACC_BATCH(0, 0)         \
-      LOAD_BIAS UNROLL_ADD_BIAS_ACC_BATCH(1, 0)     \
-          LOAD_BIAS UNROLL_ADD_BIAS_ACC_BATCH(2, 0) \
-              LOAD_BIAS UNROLL_ADD_BIAS_ACC_BATCH(3, 0)
-
-#define STORE_ACC_BATCH   \
-  UNROLL_ROW_STORE_ACC(0) \
-  UNROLL_ROW_STORE_ACC(1) UNROLL_ROW_STORE_ACC(2) UNROLL_ROW_STORE_ACC(3)
-#define STORE_ACC_BATCH_VEC_UNROLL(idx_row) \
-  UNROLL_STORE_ACC_BATCH(idx_row, 0) UNROLL_STORE_ACC_BATCH(idx_row, 1)
-#define STORE_ACC_BATCH_TAIL   \
-  UNROLL_STORE_ACC_BATCH(0, 0) \
-  UNROLL_STORE_ACC_BATCH(1, 0) \
-  UNROLL_STORE_ACC_BATCH(2, 0) UNROLL_STORE_ACC_BATCH(3, 0)
-
-#define ADJUST_ACC_BATCH_TAIL   \
-  UNROLL_ADJUST_ACC_BATCH(0, 0) \
-  UNROLL_ADJUST_ACC_BATCH(1, 0) \
-  UNROLL_ADJUST_ACC_BATCH(2, 0) UNROLL_ADJUST_ACC_BATCH(3, 0)
-#define ADJUST_ACC_BATCH   \
-  UNROLL_ROW_ADJUST_ACC(0) \
-  UNROLL_ROW_ADJUST_ACC(1) UNROLL_ROW_ADJUST_ACC(2) UNROLL_ROW_ADJUST_ACC(3)
-#define ADJUST_ACC_BATCH_VEC_UNROLL(idx_row) \
-  UNROLL_ADJUST_ACC_BATCH(idx_row, 0) UNROLL_ADJUST_ACC_BATCH(idx_row, 1)
-
-#endif /* (ROW_UNROLL == 4 && VEC_UNROLL == 2)*/
-
-#endif /* __XA_NNLIB_COMMON_MACROS_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_definitions.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_definitions.h
deleted file mode 100644
index 7199887..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_definitions.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_OPUS_CODEC_DEFINITIONS_H__
-#define __XA_OPUS_CODEC_DEFINITIONS_H__
-
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_api_defs.h"
-
-/* Identification Strings */
-#define LIBNAME "HiFi Mini Neural Network Library"
-#define LIBVERSION "0.6.0"
-
-#define LIB_APIVERSION_MAJOR 1
-#define LIB_APIVERSION_MINOR 0
-
-#if LIB_APIVERSION_MAJOR != XA_APIVERSION_MAJOR || \
-    LIB_APIVERSION_MINOR != XA_APIVERSION_MINOR
-// #error "Version Mismatch"
-#endif
-
-#define LIB_APIVERSION \
-  XA_MAKE_VERSION_STR(LIB_APIVERSION_MAJOR, LIB_APIVERSION_MINOR)
-
-#endif /* __XA_OPUS_CODEC_DEFINITIONS_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h
deleted file mode 100644
index 8508e54..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_NNLIB_ERR_CHK_H__
-#define __XA_NNLIB_ERR_CHK_H__
-
-#ifndef NULL
-#define NULL (void *)0
-#endif /* NULL */
-
-#ifndef DISABLE_ARG_CHK
-
-#define XA_NNLIB_ARG_CHK_PTR(_ptr, _err) \
-  do {                                   \
-    if ((_ptr) == NULL) return (_err);   \
-  } while (0)
-
-#define XA_NNLIB_ARG_CHK_ALIGN(_ptr, _align, _err)                 \
-  do {                                                             \
-    if (((unsigned int)(_ptr) & ((_align)-1)) != 0) return (_err); \
-  } while (0)
-
-#define XA_NNLIB_ARG_CHK_COND(_cond, _err) \
-  do {                                     \
-    if ((_cond)) return (_err);            \
-  } while (0)
-
-#else /* DISABLE_ARG_CHK */
-
-#define XA_NNLIB_ARG_CHK_PTR(_ptr, _err)
-#define XA_NNLIB_ARG_CHK_ALIGN(_ptr, _align, _err)
-#define XA_NNLIB_ARG_CHK_COND(_cond, _err)
-
-#endif /* DISABLE_ARG_CHK */
-
-#define XA_NNLIB_CHK_PTR(_ptr, _err)   \
-  do {                                 \
-    if ((_ptr) == NULL) return (_err); \
-  } while (0)
-
-#define XA_NNLIB_CHK_ALIGN(_ptr, _align, _err)                     \
-  do {                                                             \
-    if (((unsigned int)(_ptr) & ((_align)-1)) != 0) return (_err); \
-  } while (0)
-
-#define XA_NNLIB_CHK_COND(_cond, _err) \
-  do {                                 \
-    if ((_cond)) return (_err);        \
-  } while (0)
-
-#endif /* __XA_NNLIB_ERR_CHK_H__ */
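-
-/* Editorial note: a sketch of how these checks are used at the top of a
-   kernel (compare the activation kernel below); illustrative only, with a
-   hypothetical kernel name. Defining DISABLE_ARG_CHK compiles the
-   argument checks away while keeping the XA_NNLIB_CHK_* variants. */
-#if 0
-WORD32 xa_nn_example_kernel(WORD8 *p_out, const WORD8 *p_inp, WORD32 n) {
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);      /* NULL pointer checks    */
-  XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp, 4, -1); /* 4-byte alignment check */
-  XA_NNLIB_ARG_CHK_COND((n <= 0), -1);  /* basic parameter check  */
-  /* ... kernel body ... */
-  return 0;
-}
-#endif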
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_activations_asym8s_asym8s.c b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_activations_asym8s_asym8s.c
deleted file mode 100644
index 060b706..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_activations_asym8s_asym8s.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xa_nnlib_common.h"
-
-#define ALIGNMENT 8 /* 8-byte alignment */
-
-#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1)))
-
-#define LIMIT(out, inp, min, max) \
-  {                               \
-    out = min;                    \
-    out = AE_MAXP24S(inp, min);   \
-    out = AE_MINP24S(out, max);   \
-  }
-
-#define STORE_8X2_FROM_24X2(out_ptr, val) \
-  {                                       \
-    int o1, o2;                           \
-    o1 = AE_MOVAP24S_H(val);              \
-    o2 = AE_MOVAP24S_L(val);              \
-    *out_ptr++ = (WORD8)o1;               \
-    *out_ptr++ = (WORD8)o2;               \
-  }
-
-/*
- * inp: p_vec: 4-byte aligned input pointer
- * out: p_out: no alignment needed for the output pointer
- */
-WORD32 xa_nn_vec_activation_min_max_asym8s_asym8s(
-    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_vec,
-    int activation_min, int activation_max, WORD32 vec_length) {
-  int i;
-  ae_p24x2s x, y, min, max;
-
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_vec, -1);
-
-  /* Basic Parameter checks */
-  XA_NNLIB_ARG_CHK_COND((vec_length <= 0), -1);
-
-  /* Basic Parameter checks */
-  XA_NNLIB_ARG_CHK_COND((activation_max < activation_min), -1);
-
-  WORD8 *p_o = p_out;
-  WORD8 *p_v = (WORD8 *)p_vec;
-
-  min = AE_SRAIP24(AE_CVTP24A16(activation_min), 8);
-  max = AE_SRAIP24(AE_CVTP24A16(activation_max), 8);
-
-  int pre_loop_count = 0;
-  // Pre-loop, active when the input pointer is not 4-byte aligned.
-  pre_loop_count = (int)((unsigned)ALIGN_PTR(p_v, 4) - (unsigned)p_v);
-  pre_loop_count = (pre_loop_count < vec_length) ? pre_loop_count : vec_length;
-
-  vec_length = vec_length - pre_loop_count;
-  vec_length = (vec_length < 0) ? 0 : vec_length;
-
-  for (i = 0; i < pre_loop_count; i++) {
-    int i1;
-    i1 = ((WORD8)*p_v++);
-    x = AE_MOVPA24(i1);
-    LIMIT(y, x, min, max)
-    i1 = AE_MOVAP24S_H(y);
-    *p_o++ = (WORD8)i1;
-  }
-
-  if ((activation_max >= (int)127) && (activation_min <= (int)-128)) {
-    p_v = p_v - 2;
-    for (i = 0; i < (vec_length >> 1); i++) {
-      AE_LP8X2F_IU(x, (WORD8 *)p_v, 2 * sizeof(WORD8));
-      y = AE_SRAIP24(x, 16);
-
-      STORE_8X2_FROM_24X2(p_o, y)
-    }
-    if (vec_length & 1) {
-      p_v = p_v + 2;
-      int i1;
-      i1 = (WORD8)p_v[0];
-      *p_o++ = (WORD8)i1;
-    }
-  } else if ((activation_max < (int)127) && (activation_min <= (int)-128)) {
-    p_v = p_v - 2;
-    for (i = 0; i < (vec_length >> 1); i++) {
-      AE_LP8X2F_IU(x, (WORD8 *)p_v, 2 * sizeof(WORD8));
-      y = AE_SRAIP24(x, 16);
-
-      y = AE_MINP24S(y, max);
-
-      STORE_8X2_FROM_24X2(p_o, y)
-    }
-    if (vec_length & 1) {
-      p_v = p_v + 2;
-      int i1;
-      i1 = (WORD8)p_v[0];
-      y = AE_MOVPA24(i1);
-
-      y = AE_MINP24S(y, max);
-
-      i1 = AE_MOVAP24S_H(y);
-      *p_o++ = (WORD8)i1;
-    }
-  } else if ((activation_max >= (int)127) && (activation_min > (int)-128)) {
-    p_v = p_v - 2;
-    for (i = 0; i < (vec_length >> 1); i++) {
-      AE_LP8X2F_IU(x, (WORD8 *)p_v, 2 * sizeof(WORD8));
-      y = AE_SRAIP24(x, 16);
-
-      y = AE_MAXP24S(y, min);
-
-      STORE_8X2_FROM_24X2(p_o, y)
-    }
-    if (vec_length & 1) {
-      p_v = p_v + 2;
-      int i1;
-      i1 = (WORD8)p_v[0];
-      y = AE_MOVPA24(i1);
-
-      y = AE_MAXP24S(y, min);
-
-      i1 = AE_MOVAP24S_H(y);
-      *p_o++ = (WORD8)i1;
-    }
-  } else {
-    p_v = p_v - 2;
-    for (i = 0; i < (vec_length >> 1); i++) {
-      AE_LP8X2F_IU(x, (WORD8 *)p_v, 2 * sizeof(WORD8));
-      x = AE_SRAIP24(x, 16);
-      LIMIT(y, x, min, max)
-      STORE_8X2_FROM_24X2(p_o, y)
-    }
-    if (vec_length & 1) {
-      p_v = p_v + 2;
-      int i1;
-      i1 = (WORD8)p_v[0];
-      x = AE_MOVPA24(i1);
-      LIMIT(y, x, min, max)
-      i1 = AE_MOVAP24S_H(y);
-      *p_o++ = (WORD8)i1;
-    }
-  }
-  return 0;
-}
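-
-/* Editorial note: all four branches above compute the same clamp; the
-   specializations only skip the min or max comparison when a bound
-   already covers the full int8 range. A scalar reference model (a
-   sketch, not part of the library): */
-#include <stdint.h>
-static void activation_min_max_asym8s_ref(int8_t *p_out, const int8_t *p_vec,
-                                          int act_min, int act_max, int n) {
-  for (int i = 0; i < n; ++i) {
-    int v = p_vec[i];
-    if (v < act_min) v = act_min; /* AE_MAXP24S against min */
-    if (v > act_max) v = act_max; /* AE_MINP24S against max */
-    p_out[i] = (int8_t)v;
-  }
-}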
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_softmax_asym8_asym8.c b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_softmax_asym8_asym8.c
deleted file mode 100644
index 4f7dce8..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_softmax_asym8_asym8.c
+++ /dev/null
@@ -1,1005 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xa_nnlib_common.h"
-
-#define ALIGNMENT 8 /* 8-byte alignment */
-#define ALIGNED_SIZE(x, bytes) (((x) + (bytes - 1)) & (~(bytes - 1)))
-#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1)))
-
-#ifndef AE_LP8X2F_IU
-#define AE_LP8X2F_IU(p_x, p_in, x)                           \
-  AE_LP16F_IU(p_x, (ae_p16s *)p_in, x);                      \
-  ae_p24x2s p_tmp1 = AE_SLLIP24(p_x, 8);                     \
-  ae_p24x2s p_tmp2 = AE_ANDP48(p_x, AE_MOVPA24(0xFFFF0000)); \
-  p_x = AE_SELP24_LL(p_tmp2, p_tmp1);
-
-#endif
-
-#define NSA64_T(y, x)               \
-  {                                 \
-    ae_q56s q_tmp = *(ae_q56s *)&x; \
-    y = AE_NSAQ56S(q_tmp) + 8;      \
-  }
-
-#define MULFP32X2RAS_T(result, a, b)             \
-  {                                              \
-    ae_q56s q_a = AE_CVTQ48A32S(a);              \
-    ae_p24x2s p_b = AE_CVTP24A16X2_HL(b, b);     \
-    ae_q56s q_out = AE_MULFQ32SP16U_L(q_a, p_b); \
-    q_out = AE_SRAIQ56(q_out, 16);               \
-    AE_MULAFQ32SP16S_H(q_out, q_a, p_b);         \
-    q_out = AE_ROUNDSQ32ASYM(q_out);             \
-    *(ae_q32s *)&result = q_out;                 \
-  }
-
-#define MULFP32X2RS_T(result, a, b)              \
-  {                                              \
-    ae_q56s q_a = AE_CVTQ48A32S(a);              \
-    ae_p24x2s p_b = AE_CVTP24A16X2_HL(b, b);     \
-    ae_q56s q_out = AE_MULFQ32SP16U_L(q_a, p_b); \
-    q_out = AE_SRAIQ56(q_out, 16);               \
-    AE_MULAFQ32SP16S_H(q_out, q_a, p_b);         \
-    q_out = AE_ROUNDSQ32SYM(q_out);              \
-    *(ae_q32s *)&result = q_out;                 \
-  }
-#define ADD32S_T(result, a, b)             \
-  {                                        \
-    ae_q56s q_a = AE_CVTQ48A32S(a);        \
-    ae_q56s q_b = AE_CVTQ48A32S(b);        \
-    ae_q56s q_out = AE_ADDSQ56S(q_a, q_b); \
-    q_out = AE_SATQ48S(q_out);             \
-    *(ae_q32s *)&result = q_out;           \
-  }
-
-#define SUB32S_T(result, a, b)             \
-  {                                        \
-    ae_q56s q_a = AE_CVTQ48A32S(a);        \
-    ae_q56s q_b = AE_CVTQ48A32S(b);        \
-    ae_q56s q_out = AE_SUBSQ56S(q_a, q_b); \
-    q_out = AE_SATQ48S(q_out);             \
-    *(ae_q32s *)&result = q_out;           \
-  }
-
-#define SLAI32S_T(result, a, b)         \
-  {                                     \
-    ae_q56s q_a = AE_CVTQ48A32S(a);     \
-    ae_q56s q_out = AE_SLLIQ56(q_a, b); \
-    q_out = AE_SATQ48S(q_out);          \
-    *(ae_q32s *)&result = q_out;        \
-  }
-
-#define SRAA32RS_T(result, a, b)             \
-  {                                          \
-    ae_q56s q_a = AE_CVTQ48A32S(a);          \
-    ae_q56s q_out = AE_SLAASQ56S(q_a, (-b)); \
-    q_out = AE_ROUNDSQ32ASYM(q_out);         \
-    *(ae_q32s *)&result = q_out;             \
-  }
-
-#define SRAI32R_T(result, a, b)         \
-  {                                     \
-    ae_q56s q_a = AE_CVTQ48A32S(a);     \
-    ae_q56s q_out = AE_SRAIQ56(q_a, b); \
-    q_out = AE_ROUNDSQ32ASYM(q_out);    \
-    *(ae_q32s *)&result = q_out;        \
-  }
-
-static const int CONSTANT_TERM = (0x70f5a894);
-static const int CONSTANT_1_OVER_3 = (0x2aaaaaab);
-static const int CONSTANT_1_OVER_8 = (0x10000000);
-static const int ONE_QUATER_Q26 = (0x1000000);  // Q6.26
-static const int MASK = (0xffffff);
-static const int Q31 = 0x7fffffff;
-static const int constant_48_over_17 = 1515870810;
-static const int constant_neg_32_over_17 = -1010580540;  // Q29
-static const int F2_ONE = 0x20000000;
-
-static const int constant_neg_32_over_17_Q21 = -3947580;  // Q21
-static const int constant_48_over_17_Q21 = 5921370;       // Q21
-
-static ae_p24x2s GetReciprocal(ae_q56s q_x, int x_integerbits, int *lsh) {
-  int headroom_plus_one;
-  ae_p24x2s p_x;
-  ae_q56s q_tmp;
-  ae_p24x2s p_half_den;
-  int i;
-
-  headroom_plus_one = AE_NSAQ56S(q_x) + 8;
-  headroom_plus_one = headroom_plus_one - 31;
-  *lsh = x_integerbits - headroom_plus_one;
-
-  q_x = (q_x << (headroom_plus_one + 15));
-  p_half_den = AE_ROUNDSP24Q48SYM(q_x);
-
-  q_tmp = AE_CVTQ48A32S(constant_48_over_17);
-  AE_MULAFP24S_LL(q_tmp, p_half_den, AE_MOVPA24(constant_neg_32_over_17_Q21));
-  p_x = AE_ROUNDSP24Q48SYM(q_tmp);
-
-  for (i = 0; i < 3; i++) {
-    q_tmp = AE_CVTQ48A32S(F2_ONE);
-    AE_MULSFP24S_LL(q_tmp, p_x, p_half_den);
-    ae_p24x2s p_one_minus_half_denominator_times_x = AE_ROUNDSP24Q48SYM(q_tmp);
-
-    q_tmp = AE_MULFP24S_LL(p_x, p_one_minus_half_denominator_times_x);
-    ae_p24x2s p_m = AE_ROUNDSP24Q48SYM(q_tmp);
-    p_m = AE_SLLISP24S(p_m, 2);
-    p_x = AE_ADDSP24S(p_x, p_m);
-  }
-
-  p_x = AE_SLLISP24S(p_x, 1);
-
-  return p_x;
-}
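-
-/* Editorial note: GetReciprocal normalizes the denominator into [0.5, 1),
-   seeds with the classic 48/17 - 32/17 * d estimate, and runs three
-   Newton-Raphson steps, reporting the normalization shift via *lsh. A
-   floating-point model (a sketch, not part of the library): */
-static double reciprocal_model(double d /* normalized into [0.5, 1) */) {
-  double x = 48.0 / 17.0 - (32.0 / 17.0) * d; /* initial estimate */
-  for (int i = 0; i < 3; ++i) {
-    x = x + x * (1.0 - d * x); /* one Newton-Raphson refinement */
-  }
-  return x; /* ~= 1/d */
-}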
-
-static const int MASK_16BITS = (0xffff);
-static const int ONE_QUATER_Q18 = (0x10000);          // Q18
-static const int CONSTANT_1_OVER_8_Q23 = (0x100000);  // Q23
-static const int CONSTANT_1_OVER_3_Q23 = (0x2aaaaa);  // Q23
-static const int CONSTANT_TERM_Q23 = (0x70f5a8);      // Q23
-static const int Q23 = 0x7fffff;
-
-#define GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_in_out, exponent,                \
-                                           FixedPointMultiplier, p_remainder) \
-  {                                                                           \
-    ae_p24x2s p_out;                                                          \
-                                                                              \
-    ae_p24x2s p_zero = AE_ZEROP48();                                          \
-                                                                              \
-    ae_p24x2s p_scale = AE_MOVPA24(1 << (18 + exponent));                     \
-    ae_p24x2s p_mask = p_remainder & p_scale;                                 \
-                                                                              \
-    ae_p24x2s p_FixedPointMultiplier = AE_MOVPA24(FixedPointMultiplier >> 8); \
-                                                                              \
-    ae_q56s q_tmp1 = AE_MULFP24S_HH(p_in_out, p_FixedPointMultiplier);        \
-    ae_q56s q_tmp2 = AE_MULFP24S_LL(p_in_out, p_FixedPointMultiplier);        \
-    ae_p24x2s p_t1 = AE_ROUNDSP24Q48SYM(q_tmp1);                              \
-    ae_p24x2s p_t2 = AE_ROUNDSP24Q48SYM(q_tmp2);                              \
-    p_out = AE_SELP24_LL(p_t1, p_t2);                                         \
-                                                                              \
-    xtbool2 flag_le = AE_LTP24S(p_zero, p_mask);                              \
-    AE_MOVTP24X2(p_in_out, p_out, flag_le);                                   \
-  }
-
-#define EXP_Q26_II(p_exp_y, p_inp_t)                                        \
-  {                                                                         \
-    ae_p24x2s p_x1_in, p_x2, p_x3, p_x4, p_x4_by_4, p_y1, p_y2, p_y3, p_y4, \
-        p_y5, p_y6, p_y;                                                    \
-                                                                            \
-    p_x2 = p_inp_t & AE_MOVPA24(MASK_16BITS);                               \
-    ae_p24x2s p_a_mod_quater_minus_q_1_by_4 =                               \
-        p_x2 - AE_MOVPA24(ONE_QUATER_Q18);                                  \
-    ae_p24x2s p_x_in = p_a_mod_quater_minus_q_1_by_4 << 5;                  \
-    ae_p24x2s p_remainder = p_a_mod_quater_minus_q_1_by_4 - p_inp_t;        \
-                                                                            \
-    p_x1_in = AE_ADDSP24S(p_x_in, AE_MOVPA24(CONSTANT_1_OVER_8_Q23));       \
-                                                                            \
-    ae_q56s q_tmp1 = AE_MULFP24S_HH(p_x1_in, p_x1_in);                      \
-    ae_q56s q_tmp2 = AE_MULFP24S_LL(p_x1_in, p_x1_in);                      \
-    ae_p24x2s p_t1 = AE_ROUNDSP24Q48SYM(q_tmp1);                            \
-    ae_p24x2s p_t2 = AE_ROUNDSP24Q48SYM(q_tmp2);                            \
-    p_x2 = AE_SELP24_LL(p_t1, p_t2);                                        \
-                                                                            \
-    q_tmp1 = AE_MULFP24S_HH(p_t1, p_x1_in);                                 \
-    q_tmp2 = AE_MULFP24S_LL(p_t2, p_x1_in);                                 \
-    p_t1 = AE_ROUNDSP24Q48SYM(q_tmp1);                                      \
-    p_t2 = AE_ROUNDSP24Q48SYM(q_tmp2);                                      \
-    p_x3 = AE_SELP24_LL(p_t1, p_t2);                                        \
-                                                                            \
-    q_tmp1 = AE_MULFP24S_HH(p_x2, p_x2);                                    \
-    q_tmp2 = AE_MULFP24S_LL(p_x2, p_x2);                                    \
-    p_t1 = AE_ROUNDSP24Q48SYM(q_tmp1);                                      \
-    p_t2 = AE_ROUNDSP24Q48SYM(q_tmp2);                                      \
-    p_x4 = AE_SELP24_LL(p_t1, p_t2);                                        \
-    p_x4_by_4 = p_x4 >> 2;                                                  \
-                                                                            \
-    p_y1 = AE_ADDSP24S(p_x4_by_4, p_x3);                                    \
-                                                                            \
-    ae_p24x2s p_const = AE_MOVPA24(CONSTANT_1_OVER_3_Q23);                  \
-    q_tmp1 = AE_MULFP24S_HH(p_y1, p_const);                                 \
-    q_tmp2 = AE_MULFP24S_LL(p_y1, p_const);                                 \
-    p_t1 = AE_ROUNDSP24Q48SYM(q_tmp1);                                      \
-    p_t2 = AE_ROUNDSP24Q48SYM(q_tmp2);                                      \
-    p_y2 = AE_SELP24_LL(p_t1, p_t2);                                        \
-                                                                            \
-    p_y3 = AE_ADDSP24S(p_y2, p_x2);                                         \
-    p_y4 = p_y3 >> 1;                                                       \
-                                                                            \
-    p_y5 = AE_ADDSP24S(p_x1_in, p_y4); /* ADD32S_T(y5, x1_in, y4);  */      \
-                                                                            \
-    p_const = AE_MOVPA24(CONSTANT_TERM_Q23);                                \
-    q_tmp1 = AE_MULFP24S_HH(p_y5, p_const);                                 \
-    q_tmp2 = AE_MULFP24S_LL(p_y5, p_const);                                 \
-    p_t1 = AE_ROUNDSP24Q48SYM(q_tmp1);                                      \
-    p_t2 = AE_ROUNDSP24Q48SYM(q_tmp2);                                      \
-    p_y6 = AE_SELP24_LL(p_t1, p_t2);                                        \
-    p_y = AE_ADDSP24S(p_y6, p_const);                                       \
-                                                                            \
-    {                                                                       \
-      GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_y, -2, 1672461947, p_remainder); \
-      GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_y, -1, 1302514674, p_remainder); \
-      GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_y, 0, 790015084, p_remainder);   \
-      GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_y, 1, 290630308, p_remainder);   \
-      GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_y, 2, 39332535, p_remainder);    \
-      GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_y, 3, 720401, p_remainder);      \
-      GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_y, 4, 242, p_remainder);         \
-    }                                                                       \
-    p_exp_y = p_y;                                                          \
-    p_const = AE_MOVPA24(Q23);                                              \
-    xtbool2 flag_eq = AE_EQP24(p_inp_t, AE_ZEROP48());                      \
-    AE_MOVTP24X2(p_exp_y, p_const, flag_eq);                                \
-  }
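-
-/* Reference only (not part of the kernel): a scalar float sketch of the
-   per-lane math in the EXP_Q26 macros. CONSTANT_TERM_Q23 (0x70f5a8) is
-   exp(-1/8) in Q23 and CONSTANT_1_OVER_8_Q23 recentres the argument, so
-   each lane evaluates a degree-4 Taylor polynomial of exp() around -1/8,
-   in the style of gemmlowp's
-   exp_on_interval_between_negative_one_quarter_and_0_excl(). */
-#if 0 /* illustrative sketch; helper name is hypothetical */
-static float ref_exp_on_interval(float a) { /* a in [-1/4, 0) */
-  float x = a + 0.125f;                     /* recentre to [-1/8, 1/8) */
-  float x2 = x * x;
-  float poly = 1.0f + x + x2 / 2.0f + x2 * x / 6.0f + x2 * x2 / 24.0f;
-  return 0.8824969f * poly;                 /* exp(-1/8) * exp(x) == exp(a) */
-}
-#endif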
-
-#define GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_in_out, exponent,                 \
-                                          FixedPointMultiplier, p_remainder)  \
-  {                                                                           \
-    ae_p24x2s p_out;                                                          \
-                                                                              \
-    ae_p24x2s p_zero = AE_ZEROP48();                                          \
-                                                                              \
-    ae_p24x2s p_scale = AE_MOVPA24(1 << (18 + exponent));                     \
-    ae_p24x2s p_mask = p_remainder & p_scale;                                 \
-                                                                              \
-    ae_p24x2s p_FixedPointMultiplier = AE_MOVPA24(FixedPointMultiplier >> 8); \
-                                                                              \
-    ae_q56s q_tmp1 = AE_MULFP24S_HH(p_in_out, p_FixedPointMultiplier);        \
-    p_out = AE_ROUNDSP24Q48SYM(q_tmp1);                                       \
-                                                                              \
-    xtbool2 flag_le = AE_LTP24S(p_zero, p_mask);                              \
-    AE_MOVTP24X2(p_in_out, p_out, flag_le);                                   \
-  }
-
-#define EXP_Q26_I(p_exp_y, p_inp_t)                                         \
-  {                                                                         \
-    ae_p24x2s p_x1_in, p_x2, p_x3, p_x4, p_x4_by_4, p_y1, p_y2, p_y3, p_y4, \
-        p_y5, p_y6, p_y;                                                    \
-                                                                            \
-    p_x2 = p_inp_t & AE_MOVPA24(MASK_16BITS);                               \
-    ae_p24x2s p_a_mod_quater_minus_q_1_by_4 =                               \
-        p_x2 - AE_MOVPA24(ONE_QUATER_Q18);                                  \
-    ae_p24x2s p_x_in = p_a_mod_quater_minus_q_1_by_4 << 5;                  \
-    ae_p24x2s p_remainder = p_a_mod_quater_minus_q_1_by_4 - p_inp_t;        \
-                                                                            \
-    p_x1_in = AE_ADDSP24S(p_x_in, AE_MOVPA24(CONSTANT_1_OVER_8_Q23));       \
-                                                                            \
-    ae_q56s q_tmp1 = AE_MULFP24S_HH(p_x1_in, p_x1_in);                      \
-    p_x2 = AE_ROUNDSP24Q48SYM(q_tmp1);                                      \
-                                                                            \
-    q_tmp1 = AE_MULFP24S_HH(p_x2, p_x1_in);                                 \
-    p_x3 = AE_ROUNDSP24Q48SYM(q_tmp1);                                      \
-                                                                            \
-    q_tmp1 = AE_MULFP24S_HH(p_x2, p_x2);                                    \
-    p_x4 = AE_ROUNDSP24Q48SYM(q_tmp1);                                      \
-    p_x4_by_4 = p_x4 >> 2;                                                  \
-                                                                            \
-    p_y1 = AE_ADDSP24S(p_x4_by_4, p_x3);                                    \
-                                                                            \
-    ae_p24x2s p_const = AE_MOVPA24(CONSTANT_1_OVER_3_Q23);                  \
-    q_tmp1 = AE_MULFP24S_HH(p_y1, p_const);                                 \
-    p_y2 = AE_ROUNDSP24Q48SYM(q_tmp1);                                      \
-                                                                            \
-    p_y3 = AE_ADDSP24S(p_y2, p_x2);                                         \
-    p_y4 = p_y3 >> 1;                                                       \
-                                                                            \
-    p_y5 = AE_ADDSP24S(p_x1_in, p_y4); /* ADD32S_T(y5, x1_in, y4);  */      \
-                                                                            \
-    p_const = AE_MOVPA24(CONSTANT_TERM_Q23);                                \
-    q_tmp1 = AE_MULFP24S_HH(p_y5, p_const);                                 \
-    p_y6 = AE_ROUNDSP24Q48SYM(q_tmp1);                                      \
-    p_y = AE_ADDSP24S(p_y6, p_const);                                       \
-                                                                            \
-    {                                                                       \
-      GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_y, -2, 1672461947, p_remainder);  \
-      GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_y, -1, 1302514674, p_remainder);  \
-      GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_y, 0, 790015084, p_remainder);    \
-      GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_y, 1, 290630308, p_remainder);    \
-      GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_y, 2, 39332535, p_remainder);     \
-      GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_y, 3, 720401, p_remainder);       \
-      GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_y, 4, 242, p_remainder);          \
-    }                                                                       \
-    p_exp_y = p_y;                                                          \
-    p_const = AE_MOVPA24(Q23);                                              \
-    xtbool2 flag_eq = AE_EQP24(p_inp_t, AE_ZEROP48());                      \
-    AE_MOVTP24X2(p_exp_y, p_const, flag_eq);                                \
-  }
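-
-/* The _I macros above are the single-element counterparts of the _II macros:
-   each step keeps one MULFP24S/ROUNDSP24 pair instead of computing both the
-   H and L lanes, and they serve the unaligned head and odd tail elements
-   while the _II forms process two packed elements per main-loop iteration. */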
-
-WORD32 xa_nn_vec_softmax_asym8u_8(UWORD8 *__restrict__ pOut,
-                                  const UWORD8 *__restrict__ pVec,
-                                  WORD32 diffmin, WORD32 input_beta_left_shift,
-                                  WORD32 input_beta_multiplier,
-                                  WORD32 vec_length, pVOID pScratch) {
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(pOut, -1);
-  XA_NNLIB_ARG_CHK_PTR(pVec, -1);
-  XA_NNLIB_ARG_CHK_PTR(pScratch, -1);
-  /* Pointer alignment checks */
-  /* No alignment needed; all pointers may be 1-byte aligned */
-  /* Basic Parameter checks */
-  XA_NNLIB_ARG_CHK_COND((vec_length <= 0), -1);
-  XA_NNLIB_ARG_CHK_COND(
-      ((input_beta_left_shift < -31) || (input_beta_left_shift > 31)), -1);
-  XA_NNLIB_ARG_CHK_COND((input_beta_multiplier < 0), -1);
-
-  int i;
-  int shift_bits_reciprocal;
-  UWORD8 *p_in;
-  WORD32 *__restrict pExp = (WORD32 *)ALIGN_PTR(pScratch, ALIGNMENT);
-  ae_p24f *__restrict pTmpScratch = (ae_p24f *)pExp;
-  int max;
-  ae_p24x2s p_x;
-  ae_p24x2s p_max = AE_MOVPA24(0xFF800000);
-  ae_p24x2s p_recip_sum_exp;
-  int pre_loop_count;
-  int main_loop_count;
-  int post_loop_count;
-
-  if (vec_length > 1) {
-    pre_loop_count = (int)pVec & 0x1;
-    main_loop_count = vec_length - pre_loop_count;
-    post_loop_count = (main_loop_count & 1);
-    main_loop_count = main_loop_count >> 1;
-  } else {
-    pre_loop_count = 0;
-    main_loop_count = 0;
-    post_loop_count = vec_length;
-  }
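-
-  /* Loop peeling: when pVec starts at an odd address one element is handled
-     scalar up front so the 2-byte AE_LP8X2F loads in the main loop stay
-     aligned; a leftover odd element is handled in the post loop. */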
-
-  /* Calculating Max */
-  {
-    p_in = (UWORD8 *)pVec;
-
-    if (pre_loop_count) {
-      p_x = AE_MOVPA24(*p_in++);
-      p_max = AE_MAXP24S(p_max, p_x);
-    }
-
-    p_in -= 2;
-    for (i = 0; i < main_loop_count; i++) {
-      AE_LP8X2F_IU(p_x, p_in, 2 * sizeof(WORD8));
-      p_x = AE_SRLIP24(p_x, 16);
-      p_max = AE_MAXP24S(p_max, p_x);
-    }
-
-    if (post_loop_count) {
-      p_in += 2;
-      p_x = AE_MOVPA24(*p_in);
-      p_max = AE_MAXP24S(p_max, p_x);
-    }
-    p_max = AE_MAXP24S(p_max, AE_SELP24_LH(p_max, p_max));
-    max = AE_MOVAP24S_L(p_max);
-  }
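-
-  /* The max is subtracted from every element before exponentiation; this is
-     the standard numerically stable softmax, softmax(x)_i =
-     exp(x_i - max(x)) / sum_j exp(x_j - max(x)), which keeps every exponent
-     argument <= 0 and inside the domain handled by the EXP_Q26 macros. */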
-
-  /* Calculate exponents */
-  {
-    ae_q56s q_sum_exp = AE_ZEROQ56();
-    ae_p24x2s p_rem_x, p_y, p_exp_y;
-    ae_p24x2s p_zero = AE_ZEROP48();
-    ae_p24x2s p_input_beta_multiplier =
-        AE_MOVPA24((input_beta_multiplier >> 8));
-    ae_p24x2s p_diffmin = AE_MOVPA24(diffmin);
-    int input_beta_left_shift_for_24bit = input_beta_left_shift - 8;
-
-    p_in = (UWORD8 *)pVec;
-    WUR_AE_SAR(input_beta_left_shift_for_24bit);
-
-    if (pre_loop_count) {
-      p_x = AE_MOVPA24(*p_in++);
-      p_rem_x = p_x - p_max;
-      p_y = AE_SLLSSP24S(p_rem_x);
-
-      ae_q56s q_dequantized_y1 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
-      ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-
-      EXP_Q26_I(p_exp_y, p_dequantized_y1)
-
-      xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
-      AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
-      *pTmpScratch++ = p_exp_y;
-
-      p_exp_y = p_exp_y >> 4;
-
-      AE_MULAP24S_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
-    }
-
-    p_in -= 2;
-    for (i = 0; i < main_loop_count; i++) {
-      AE_LP8X2F_IU(p_x, p_in, 2 * sizeof(WORD8));
-      p_x = AE_SRLIP24(p_x, 16);
-      p_rem_x = p_x - p_max;
-      p_y = AE_SLLSSP24S(p_rem_x);
-
-      ae_q56s q_dequantized_y1 = AE_MULFP24S_HH(p_y, p_input_beta_multiplier);
-      ae_q56s q_dequantized_y2 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
-      ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-      ae_p24x2s p_dequantized_y2 = AE_ROUNDSP24Q48ASYM(q_dequantized_y2);
-
-      ae_p24x2s p_dequantized =
-          AE_SELP24_LL(p_dequantized_y1, p_dequantized_y2);
-
-      EXP_Q26_II(p_exp_y, p_dequantized)
-
-      xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
-      AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
-      *pTmpScratch++ = AE_SELP24_HH(p_exp_y, p_exp_y);
-      *pTmpScratch++ = p_exp_y; /* store lower element */
-
-      p_exp_y = p_exp_y >> 4;
-
-      AE_MULAAP24S_HH_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
-    }
-    if (post_loop_count) {
-      p_in += 2;
-
-      p_x = AE_MOVPA24(*p_in);
-      p_rem_x = p_x - p_max;
-      p_y = AE_SLLSSP24S(p_rem_x);
-
-      ae_q56s q_dequantized_y1 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
-      ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-
-      EXP_Q26_I(p_exp_y, p_dequantized_y1)
-
-      xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
-      AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
-      *pTmpScratch = p_exp_y;
-
-      p_exp_y = p_exp_y >> 4;
-
-      AE_MULAP24S_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
-    }
-    p_recip_sum_exp = GetReciprocal(q_sum_exp, 12, &shift_bits_reciprocal);
-  }
-
-  /* Calculate output */
-  {
-    ae_p24x2s p_exp;
-
-    int shift_val = -(shift_bits_reciprocal + 31 - 8 - 8);
-
-    ae_p24x2s p_min = AE_ZEROP48();
-    ae_p24x2s p_max = AE_MOVPA24(255);
-
-    for (i = 0; i < (vec_length >> 1); i++) {
-      int out;
-
-      p_exp = *(ae_p24x2f *)&pExp[2 * i];
-
-      ae_q56s q_tmp1 = AE_MULFP24S_HH(p_exp, p_recip_sum_exp);
-      ae_q56s q_tmp2 = AE_MULFP24S_LL(p_exp, p_recip_sum_exp);
-
-      q_tmp1 = AE_SLAASQ56S(q_tmp1, shift_val);
-      q_tmp2 = AE_SLAASQ56S(q_tmp2, shift_val);
-
-      ae_p24x2s p_out1 = AE_ROUNDSP24Q48ASYM(q_tmp1);
-      ae_p24x2s p_out2 = AE_ROUNDSP24Q48ASYM(q_tmp2);
-
-      ae_p24x2s p_out = AE_SELP24_LL(p_out1, p_out2);
-
-      p_out = AE_MAXP24S(p_out, p_min);
-      p_out = AE_MINP24S(p_out, p_max);
-
-      out = AE_MOVAP24S_H(p_out);
-      *pOut++ = (UWORD8)out;
-
-      out = AE_MOVAP24S_L(p_out);
-      *pOut++ = (UWORD8)out;
-    }
-
-    if (vec_length & 0x1) {
-      int out;
-
-      p_exp = *(ae_p24f *)&pExp[vec_length - 1];
-
-      ae_q56s q_tmp1 = AE_MULFP24S_LL(p_exp, p_recip_sum_exp);
-
-      q_tmp1 = AE_SLAASQ56S(q_tmp1, shift_val);
-
-      ae_p24x2s p_out = AE_ROUNDSP24Q48ASYM(q_tmp1);
-
-      p_out = AE_MAXP24S(p_out, p_min);
-      p_out = AE_MINP24S(p_out, p_max);
-
-      out = AE_MOVAP24S_L(p_out);
-      *pOut++ = (UWORD8)out;
-    }
-  }
-
-  return 0;
-}
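-
-/* A minimal usage sketch (illustrative only; the buffer names, scratch size
-   and quantization parameters below are made up). The scratch buffer should
-   be sized with xa_nn_get_softmax_scratch_size(), defined later in this
-   file. */
-#if 0
-static UWORD8 in[64], out[64];
-static WORD32 scratch[64 + 2]; /* >= xa_nn_get_softmax_scratch_size(...) */
-void softmax_example(void) {
-  WORD32 err = xa_nn_vec_softmax_asym8u_8(
-      out, in, /*diffmin=*/-32, /*input_beta_left_shift=*/0,
-      /*input_beta_multiplier=*/0x40000000, /*vec_length=*/64, scratch);
-  (void)err; /* 0 on success, -1 on invalid arguments */
-}
-#endif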
-
-WORD32 xa_nn_vec_softmax_asym8s_8(WORD8 *__restrict__ pOut,
-                                  const WORD8 *__restrict__ pVec,
-                                  WORD32 diffmin, WORD32 input_beta_left_shift,
-                                  WORD32 input_beta_multiplier,
-                                  WORD32 vec_length, pVOID pScratch) {
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(pOut, -1);
-  XA_NNLIB_ARG_CHK_PTR(pVec, -1);
-  XA_NNLIB_ARG_CHK_PTR(pScratch, -1);
-  /* Pointer alignment checks */
-  /* No alignment needed; all pointers may be 1-byte aligned */
-  /* Basic Parameter checks */
-  XA_NNLIB_ARG_CHK_COND((vec_length <= 0), -1);
-  XA_NNLIB_ARG_CHK_COND(
-      ((input_beta_left_shift < -31) || (input_beta_left_shift > 31)), -1);
-  XA_NNLIB_ARG_CHK_COND((input_beta_multiplier < 0), -1);
-
-  int i;
-  int shift_bits_reciprocal;
-  WORD8 *p_in;
-  WORD32 *__restrict pExp = (WORD32 *)ALIGN_PTR(pScratch, ALIGNMENT);
-  ae_p24x2s p_recip_sum_exp;
-  ae_p24x2s p_x;
-  ae_p24x2s p_max = AE_MOVPA24(0xFF800000);
-
-  int pre_loop_count;
-  int main_loop_count;
-  int post_loop_count;
-
-  if (vec_length > 1) {
-    pre_loop_count = (int)pVec & 0x1;
-    main_loop_count = vec_length - pre_loop_count;
-    post_loop_count = (main_loop_count & 1);
-    main_loop_count = main_loop_count >> 1;
-  } else {
-    pre_loop_count = 0;
-    main_loop_count = 0;
-    post_loop_count = vec_length;
-  }
-
-  /* Calculating Max */
-  {
-    p_in = (WORD8 *)pVec;
-
-    if (pre_loop_count) {
-      p_x = AE_MOVPA24(*p_in++);
-      p_max = AE_MAXP24S(p_max, p_x);
-    }
-
-    p_in -= 2;
-    for (i = 0; i < main_loop_count; i++) {
-      AE_LP8X2F_IU(p_x, p_in, 2 * sizeof(WORD8));
-      p_max = AE_MAXP24S(p_max, p_x);
-    }
-    p_max = AE_SRAIP24(p_max, 16);
-
-    if (post_loop_count) {
-      p_in += 2;
-      p_x = AE_MOVPA24(*p_in);
-      p_max = AE_MAXP24S(p_max, p_x);
-    }
-    p_max = AE_MAXP24S(p_max, AE_SELP24_LH(p_max, p_max));
-  }
-
-  /* Calculate exponents */
-  {
-    ae_q56s q_sum_exp = AE_ZEROQ56();
-    ae_p24x2s p_rem_x, p_y, p_exp_y;
-    ae_p24x2s p_zero = AE_ZEROP48();
-    ae_p24x2s p_input_beta_multiplier =
-        AE_MOVPA24((input_beta_multiplier >> 8));
-    ae_p24x2s p_diffmin = AE_MOVPA24(diffmin);
-    int input_beta_left_shift_for_24bit = input_beta_left_shift - 8;
-
-    p_in = (WORD8 *)pVec;
-    WUR_AE_SAR(input_beta_left_shift_for_24bit);
-
-    if (pre_loop_count) {
-      p_x = AE_MOVPA24(*p_in++);
-      p_rem_x = p_x - p_max;
-      p_y = AE_SLLSSP24S(p_rem_x);
-
-      ae_q56s q_dequantized_y1 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
-      ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-
-      EXP_Q26_I(p_exp_y, p_dequantized_y1)
-
-      xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
-      AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
-      *(ae_p24f *)&pExp[0] = p_exp_y;
-
-      p_exp_y = p_exp_y >> 4;
-
-      AE_MULAP24S_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
-    }
-
-    p_in -= 2;
-    for (i = 0; i < main_loop_count; i++) {
-      AE_LP8X2F_IU(p_x, p_in, 2 * sizeof(WORD8));
-      p_x = AE_SRAIP24(p_x, 16);
-      p_rem_x = p_x - p_max;
-      p_y = AE_SLLSSP24S(p_rem_x);
-
-      ae_q56s q_dequantized_y1 = AE_MULFP24S_HH(p_y, p_input_beta_multiplier);
-      ae_q56s q_dequantized_y2 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
-      ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-      ae_p24x2s p_dequantized_y2 = AE_ROUNDSP24Q48ASYM(q_dequantized_y2);
-
-      ae_p24x2s p_dequantized =
-          AE_SELP24_LL(p_dequantized_y1, p_dequantized_y2);
-
-      EXP_Q26_II(p_exp_y, p_dequantized)
-
-      xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
-      AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
-      *(ae_p24f *)&pExp[pre_loop_count + 2 * i] =
-          AE_SELP24_HH(p_exp_y, p_exp_y);
-      *(ae_p24f *)&pExp[pre_loop_count + 2 * i + 1] =
-          AE_SELP24_LL(p_exp_y, p_exp_y);
-
-      p_exp_y = p_exp_y >> 4;
-
-      AE_MULAAP24S_HH_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
-    }
-
-    if (post_loop_count) {
-      p_in += 2;
-
-      p_x = AE_MOVPA24(*p_in);
-      p_rem_x = p_x - p_max;
-      p_y = AE_SLLSSP24S(p_rem_x);
-
-      ae_q56s q_dequantized_y1 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
-      ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-
-      EXP_Q26_I(p_exp_y, p_dequantized_y1)
-
-      xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
-      AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
-      *(ae_p24f *)&pExp[vec_length - 1] = p_exp_y;
-
-      p_exp_y = p_exp_y >> 4;
-
-      AE_MULAP24S_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
-    }
-
-    p_recip_sum_exp = GetReciprocal(q_sum_exp, 12, &shift_bits_reciprocal);
-  }
-
-  /* Calculate output */
-  pExp = (WORD32 *)ALIGN_PTR(pScratch, ALIGNMENT);
-  {
-    ae_p24x2s p_exp;
-
-    int shift_val = -(shift_bits_reciprocal + 31 - 8 - 8);
-
-    ae_p24x2s p_min = AE_MOVPA24(-128);
-    ae_p24x2s p_max = AE_MOVPA24(127);
-
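-    /* The quotient below lands in [0, 256] (output scale 1/256); subtracting
-       128 re-centres it on the int8 zero point of -128 used by TFLite's
-       signed softmax before clamping to [-128, 127]. */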
-    for (i = 0; i < (vec_length >> 1); i++) {
-      int out;
-
-      p_exp = *(ae_p24x2f *)&pExp[2 * i];
-
-      ae_q56s q_tmp1 = AE_MULFP24S_HH(p_exp, p_recip_sum_exp);
-      ae_q56s q_tmp2 = AE_MULFP24S_LL(p_exp, p_recip_sum_exp);
-
-      q_tmp1 = AE_SLAASQ56S(q_tmp1, shift_val);
-      q_tmp2 = AE_SLAASQ56S(q_tmp2, shift_val);
-
-      ae_p24x2s p_out1 = AE_ROUNDSP24Q48ASYM(q_tmp1);
-      ae_p24x2s p_out2 = AE_ROUNDSP24Q48ASYM(q_tmp2);
-
-      ae_p24x2s p_out = AE_SELP24_LL(p_out1, p_out2);
-
-      p_out = AE_SUBSP24S(p_out, AE_MOVPA24(128));
-      p_out = AE_MAXP24S(p_out, p_min);
-      p_out = AE_MINP24S(p_out, p_max);
-
-      out = AE_MOVAP24S_H(p_out);
-      *pOut++ = (WORD8)out;
-
-      out = AE_MOVAP24S_L(p_out);
-      *pOut++ = (WORD8)out;
-    }
-
-    if (vec_length & 0x1) {
-      int out;
-
-      p_exp = *(ae_p24f *)&pExp[vec_length - 1];
-
-      ae_q56s q_tmp1 = AE_MULFP24S_LL(p_exp, p_recip_sum_exp);
-
-      q_tmp1 = AE_SLAASQ56S(q_tmp1, shift_val);
-
-      ae_p24x2s p_out = AE_ROUNDSP24Q48ASYM(q_tmp1);
-
-      p_out = AE_SUBSP24S(p_out, AE_MOVPA24(128));
-      p_out = AE_MAXP24S(p_out, p_min);
-      p_out = AE_MINP24S(p_out, p_max);
-
-      out = AE_MOVAP24S_L(p_out);
-      *pOut++ = (WORD8)out;
-    }
-  }
-
-  return 0;
-}
-
-WORD32 xa_nn_vec_softmax_asym8s_16(WORD16 *__restrict__ pOut,
-                                   const WORD8 *__restrict__ pVec,
-                                   WORD32 diffmin, WORD32 input_beta_left_shift,
-                                   WORD32 input_beta_multiplier,
-                                   WORD32 vec_length, pVOID pScratch) {
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(pOut, -1);
-  XA_NNLIB_ARG_CHK_PTR(pVec, -1);
-  XA_NNLIB_ARG_CHK_PTR(pScratch, -1);
-  /* Pointer alignment checks */
-  /* No alignment needed; all pointers may be 1-byte aligned */
-  /* Basic Parameter checks */
-  XA_NNLIB_ARG_CHK_COND((vec_length <= 0), -1);
-  XA_NNLIB_ARG_CHK_COND(
-      ((input_beta_left_shift < -31) || (input_beta_left_shift > 31)), -1);
-  XA_NNLIB_ARG_CHK_COND((input_beta_multiplier < 0), -1);
-
-  int i;
-  int shift_bits_reciprocal;
-  WORD8 *p_in;
-  WORD32 *__restrict pExp = (WORD32 *)ALIGN_PTR(pScratch, ALIGNMENT);
-  ae_p24x2s p_recip_sum_exp;
-  ae_p24x2s p_x;
-  ae_p24x2s p_max = AE_MOVPA24(0xFF800000);
-
-  int pre_loop_count;
-  int main_loop_count;
-  int post_loop_count;
-
-  if (vec_length > 1) {
-    pre_loop_count = (int)pVec & 0x1;
-    main_loop_count = vec_length - pre_loop_count;
-    post_loop_count = (main_loop_count & 1);
-    main_loop_count = main_loop_count >> 1;
-  } else {
-    pre_loop_count = 0;
-    main_loop_count = 0;
-    post_loop_count = vec_length;
-  }
-
-  /* Calculating Max */
-  {
-    p_in = (WORD8 *)pVec;
-
-    if (pre_loop_count) {
-      p_x = AE_MOVPA24(*p_in++);
-      p_max = AE_MAXP24S(p_max, p_x);
-    }
-
-    p_in -= 2;
-    for (i = 0; i < main_loop_count; i++) {
-      AE_LP8X2F_IU(p_x, p_in, 2 * sizeof(WORD8));
-      p_max = AE_MAXP24S(p_max, p_x);
-    }
-    p_max = AE_SRAIP24(p_max, 16);
-
-    if (post_loop_count) {
-      p_in += 2;
-      p_x = AE_MOVPA24(*p_in);
-      p_max = AE_MAXP24S(p_max, p_x);
-    }
-    p_max = AE_MAXP24S(p_max, AE_SELP24_LH(p_max, p_max));
-  }
-
-  /* Calculate exponents */
-  {
-    ae_q56s q_sum_exp = AE_ZEROQ56();
-    ae_p24x2s p_rem_x, p_y, p_exp_y;
-    ae_p24x2s p_zero = AE_ZEROP48();
-    ae_p24x2s p_input_beta_multiplier =
-        AE_MOVPA24((input_beta_multiplier >> 8));
-    ae_p24x2s p_diffmin = AE_MOVPA24(diffmin);
-    int input_beta_left_shift_for_24bit = input_beta_left_shift - 8;
-
-    p_in = (WORD8 *)pVec;
-    WUR_AE_SAR(input_beta_left_shift_for_24bit);
-
-    if (pre_loop_count) {
-      p_x = AE_MOVPA24(*p_in++);
-      p_rem_x = p_x - p_max;
-      p_y = AE_SLLSSP24S(p_rem_x);
-
-      ae_q56s q_dequantized_y1 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
-      ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-
-      EXP_Q26_I(p_exp_y, p_dequantized_y1)
-
-      xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
-      AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
-      *(ae_p24f *)&pExp[0] = p_exp_y;
-
-      p_exp_y = p_exp_y >> 4;
-
-      AE_MULAP24S_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
-    }
-
-    p_in -= 2;
-    for (i = 0; i < main_loop_count; i++) {
-      AE_LP8X2F_IU(p_x, p_in, 2 * sizeof(WORD8));
-      p_x = AE_SRAIP24(p_x, 16);
-      p_rem_x = p_x - p_max;
-      p_y = AE_SLLSSP24S(p_rem_x);
-
-      ae_q56s q_dequantized_y1 = AE_MULFP24S_HH(p_y, p_input_beta_multiplier);
-      ae_q56s q_dequantized_y2 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
-      ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-      ae_p24x2s p_dequantized_y2 = AE_ROUNDSP24Q48ASYM(q_dequantized_y2);
-
-      ae_p24x2s p_dequantized =
-          AE_SELP24_LL(p_dequantized_y1, p_dequantized_y2);
-
-      EXP_Q26_II(p_exp_y, p_dequantized)
-
-      xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
-      AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
-      *(ae_p24f *)&pExp[pre_loop_count + 2 * i] =
-          AE_SELP24_HH(p_exp_y, p_exp_y);
-      *(ae_p24f *)&pExp[pre_loop_count + 2 * i + 1] =
-          AE_SELP24_LL(p_exp_y, p_exp_y);
-
-      p_exp_y = p_exp_y >> 4;
-
-      AE_MULAAP24S_HH_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
-    }
-
-    if (post_loop_count) {
-      p_in += 2;
-
-      p_x = AE_MOVPA24(*p_in);
-      p_rem_x = p_x - p_max;
-      p_y = AE_SLLSSP24S(p_rem_x);
-
-      ae_q56s q_dequantized_y1 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
-      ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-
-      EXP_Q26_I(p_exp_y, p_dequantized_y1)
-
-      xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
-      AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
-      *(ae_p24f *)&pExp[vec_length - 1] = p_exp_y;
-
-      p_exp_y = p_exp_y >> 4;
-
-      AE_MULAP24S_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
-    }
-
-    p_recip_sum_exp = GetReciprocal(q_sum_exp, 12, &shift_bits_reciprocal);
-  }
-
-  /* Calculate output */
-  pExp = (WORD32 *)ALIGN_PTR(pScratch, ALIGNMENT);
-  {
-    ae_p24x2s p_exp;
-
-    int shift_val = -(shift_bits_reciprocal + 31 - 8 - 16);
-
-    ae_p24x2s p_min = AE_MOVPA24(-32768);
-    ae_p24x2s p_max = AE_MOVPA24(32767);
-
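-    /* Same normalization as the int8 variant, but with 16 output bits: the
-       quotient lands in [0, 65536] (output scale 1/65536) and the 32768
-       offset re-centres it on the int16 zero point of -32768 before
-       clamping. */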
-    for (i = 0; i < (vec_length >> 1); i++) {
-      int out;
-
-      p_exp = *(ae_p24x2f *)&pExp[2 * i];
-
-      ae_q56s q_tmp1 = AE_MULFP24S_HH(p_exp, p_recip_sum_exp);
-      ae_q56s q_tmp2 = AE_MULFP24S_LL(p_exp, p_recip_sum_exp);
-
-      q_tmp1 = AE_SLAASQ56S(q_tmp1, shift_val);
-      q_tmp2 = AE_SLAASQ56S(q_tmp2, shift_val);
-
-      ae_p24x2s p_out1 = AE_ROUNDSP24Q48ASYM(q_tmp1);
-      ae_p24x2s p_out2 = AE_ROUNDSP24Q48ASYM(q_tmp2);
-
-      ae_p24x2s p_out = AE_SELP24_LL(p_out1, p_out2);
-
-      p_out = AE_SUBSP24S(p_out, AE_MOVPA24(32768));
-      p_out = AE_MAXP24S(p_out, p_min);
-      p_out = AE_MINP24S(p_out, p_max);
-
-      out = AE_MOVAP24S_H(p_out);
-      *pOut++ = (WORD16)out;
-
-      out = AE_MOVAP24S_L(p_out);
-      *pOut++ = (WORD16)out;
-    }
-
-    if (vec_length & 0x1) {
-      int out;
-
-      p_exp = *(ae_p24f *)&pExp[vec_length - 1];
-
-      ae_q56s q_tmp1 = AE_MULFP24S_LL(p_exp, p_recip_sum_exp);
-
-      q_tmp1 = AE_SLAASQ56S(q_tmp1, shift_val);
-
-      ae_p24x2s p_out = AE_ROUNDSP24Q48ASYM(q_tmp1);
-
-      p_out = AE_SUBSP24S(p_out, AE_MOVPA24(32768));
-      p_out = AE_MAXP24S(p_out, p_min);
-      p_out = AE_MINP24S(p_out, p_max);
-
-      out = AE_MOVAP24S_L(p_out);
-      *pOut++ = (WORD16)out;
-    }
-  }
-
-  return 0;
-}
-
-int xa_nn_get_softmax_scratch_size(int inp_precision, int out_precision,
-                                   int length) {
-  int size_of_one_elm_in_bytes, total_bytes;
-  (void)out_precision;
-
-  /* This function returns the scratch size, in bytes, required by the
-     softmax implementation. Scratch memory is needed to store the exponents
-     of the inputs computed inside the function; currently every exponent is
-     stored as a 32 bit (4 byte) number. */
-  switch (inp_precision) {
-    case PREC_ASYM8U:
-      size_of_one_elm_in_bytes = 4;
-      break;
-    case PREC_SYM8S:
-      size_of_one_elm_in_bytes = 4;
-      break;
-    default:
-      size_of_one_elm_in_bytes = 4;
-      break;
-  }
-
-  total_bytes = size_of_one_elm_in_bytes * length;
-  total_bytes = ALIGNED_SIZE(total_bytes, ALIGNMENT);
-
-  return total_bytes;
-}
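-
-/* Example (illustrative): a 10-class int8 softmax requests
-   ALIGNED_SIZE(4 * 10, ALIGNMENT) bytes of scratch, i.e. 40 bytes rounded up
-   to the platform alignment; the result is currently independent of the
-   input and output precisions because every exponent is stored as a
-   WORD32. */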
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/basic/hifi_mini/xa_nn_dot_prod_16x16.c b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/basic/hifi_mini/xa_nn_dot_prod_16x16.c
deleted file mode 100644
index 80697ca..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/basic/hifi_mini/xa_nn_dot_prod_16x16.c
+++ /dev/null
@@ -1,175 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xa_nnlib_common.h"
-#include "xa_nnlib_common_macros.h"
-
-/*----------------------------Main function---------------------------------*/
-WORD32 xa_nn_dot_prod_16x16_asym8s(
-    WORD8 *__restrict__ p_out,               /* pointer to output */
-    const WORD16 *__restrict__ p_inp1_start, /* pointer to input1 */
-    const WORD16 *__restrict__ p_inp2_start, /* pointer to input2 */
-    const WORD32 *bias_ptr, WORD32 vec_length, WORD32 out_multiplier,
-    WORD32 out_shift, WORD32 out_zero_bias, WORD32 vec_count) {
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1_start, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2_start, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1_start, sizeof(WORD16), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2_start, sizeof(WORD16), -1);
-  /* Basic Parameter checks */
-  XA_NNLIB_ARG_CHK_COND((vec_length <= 0), -1);
-  XA_NNLIB_ARG_CHK_COND((out_shift < -31 || out_shift > 31), -1);
-  XA_NNLIB_ARG_CHK_COND((out_zero_bias < -128 || out_zero_bias > 127), -1);
-  int left_shift, right_shift;
-  int loopcnt;
-  const WORD32 bias_buffer[2] = {0, 0};
-  const WORD32 *p_bias_load;
-  WORD32 bias_address_increment = sizeof(WORD32);
-
-  if (bias_ptr == NULL) {
-    p_bias_load = bias_buffer - 1;
-    bias_address_increment = 0;
-  } else {
-    p_bias_load = bias_ptr - 1;
-  }
-
-  left_shift = out_shift < 0 ? 0 : out_shift;
-  right_shift = out_shift > 0 ? 0 : -out_shift;
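-  /* TFLite requantization convention: the signed out_shift is split into a
-     left pre-shift (for positive shifts) and a right post-shift (for
-     negative shifts) around the saturating rounding-doubling high multiply,
-     i.e. result = RoundingDivideByPOT(
-         SaturatingRoundingDoublingHighMul(acc << left_shift, out_multiplier),
-         right_shift). */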
-  /* inp1 4-byte aligned, inp2 4-byte aligned and vec_length is a multiple
-     of 2 */
-  if (((((unsigned)p_inp1_start) & 0x3) == 0) &&
-      ((((unsigned)p_inp2_start) & 0x3) == 0) && ((vec_length & 0x1) == 0)) {
-    const ae_p16x2s *pt_inp1, *pt_inp2;
-    pt_inp1 = (const ae_p16x2s *)&p_inp1_start[-2];
-    pt_inp2 = (const ae_p16x2s *)&p_inp2_start[-2];
-
-    ae_q56s output_int8_max_56 = AE_CVTQ48A32S(127);
-    ae_q56s output_int8_min_56 = AE_CVTQ48A32S(-128);
-    for (loopcnt = 0; loopcnt < vec_count; loopcnt++) {
-      ae_p24x2s dp_inp1, dp_inp2;
-      ae_q32s dq_out32;
-      ae_q56s dq_out;
-      int i;
-
-      AE_LQ32F_XU(dq_out, (ae_q32s *)p_bias_load, bias_address_increment);
-
-      for (i = 0; i < (vec_length >> 1); i++) {
-        AE_LP16X2F_IU(dp_inp1, pt_inp1, 4);
-        AE_LP16X2F_IU(dp_inp2, pt_inp2, 4);
-        AE_MULAAP24S_HH_LL(dq_out, dp_inp1, dp_inp2);
-      }
-
-      dq_out32 = AE_SATQ48S(dq_out);
-      MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                       out_multiplier, left_shift, right_shift);
-      dq_out = AE_ADDSQ56S(dq_out, AE_CVTQ48A32S(out_zero_bias));
-
-      dq_out = AE_MAXQ56S(dq_out, output_int8_min_56);
-      dq_out = AE_MINQ56S(dq_out, output_int8_max_56);
-      *p_out++ = (WORD8)AE_TRUNCA32Q48(dq_out);
-    }
-  } else {
-#ifndef DISABLE_NNLIB_UNALIGNED_SUPPORT
-    for (loopcnt = 0; loopcnt < vec_count; loopcnt++) {
-      ae_p24x2s dp_inp1, dp_inp2;
-      ae_q32s dq_out32;
-      ae_q56s dq_out;
-      int i;
-      const WORD16 *p_inp1 = (WORD16 *)&p_inp1_start[loopcnt * vec_length];
-      const WORD16 *p_inp2 = (WORD16 *)&p_inp2_start[loopcnt * vec_length];
-
-      AE_LQ32F_XU(dq_out, (ae_q32s *)p_bias_load, bias_address_increment);
-
-      if (((((unsigned)p_inp1) & 3) != 0 && (((unsigned)p_inp2) & 3) != 0) ||
-          ((((unsigned)p_inp1) & 3) == 0 && (((unsigned)p_inp2) & 3) == 0)) {
-        int pre_loop_count = ((int)(((unsigned)p_inp1) & 3)) >> 1;
-        if (pre_loop_count != 0) {
-          dp_inp1 = AE_CVTP24A16X2_LL(*p_inp1++, *p_inp2++);
-          AE_MULAP24S_HL(dq_out, dp_inp1, dp_inp1);
-        }
-        const ae_p16x2s *pt_inp1, *pt_inp2;
-        pt_inp1 = (const ae_p16x2s *)(p_inp1 - 2);
-        pt_inp2 = (const ae_p16x2s *)(p_inp2 - 2);
-        for (i = 0; i < (vec_length - pre_loop_count - 1); i += 2) {
-          AE_LP16X2F_IU(dp_inp1, pt_inp1, 4);
-          AE_LP16X2F_IU(dp_inp2, pt_inp2, 4);
-          AE_MULAAP24S_HH_LL(dq_out, dp_inp1, dp_inp2);
-        }
-        if ((vec_length - pre_loop_count) & 1) {
-          dp_inp1 = AE_CVTP24A16X2_LL(p_inp1[i], p_inp2[i]);
-          AE_MULAP24S_HL(dq_out, dp_inp1, dp_inp1);
-        }
-      } else {
-        /* One of the pointers is not aligned to 4 bytes; if it is p_inp1,
-         * swap them */
-        if ((((unsigned)p_inp1) & 3) != 0) {
-          const WORD16 *p_tmp;
-          p_tmp = p_inp1;
-          p_inp1 = p_inp2;
-          p_inp2 = p_tmp;
-        }
-        const ae_p16x2s *pt_inp1 = (const ae_p16x2s *)(p_inp1 - 2);
-        const ae_p16s *pt_inp2 = (const ae_p16s *)(p_inp2 - 1);
-        for (i = 0; i < (vec_length - 1); i += 2) {
-          ae_p24x2s dp_t0, dp_t1;
-          AE_LP16X2F_IU(dp_inp1, pt_inp1, 4);
-          AE_LP16F_IU(dp_t0, pt_inp2, 2);
-          AE_LP16F_IU(dp_t1, pt_inp2, 2);
-          dp_inp2 = AE_SELP24_LL(dp_t0, dp_t1);
-          AE_MULAAP24S_HH_LL(dq_out, dp_inp1, dp_inp2);
-        }
-        if (vec_length & 1) {
-          dp_inp1 = AE_CVTP24A16X2_LL(p_inp1[i], p_inp2[i]);
-          AE_MULAP24S_HL(dq_out, dp_inp1, dp_inp1);
-        }
-      }
-      dq_out32 = AE_SATQ48S(dq_out);
-      MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                       out_multiplier, left_shift, right_shift);
-      dq_out = AE_ADDSQ56S(dq_out, AE_CVTQ48A32S(out_zero_bias));
-      WORD32 out_i32 = AE_TRUNCA32Q48(AE_SATQ48S(dq_out));
-      out_i32 = out_i32 < -128 ? -128 : out_i32;
-      out_i32 = out_i32 > 127 ? 127 : out_i32;
-      *p_out++ = (WORD8)out_i32;
-    }
-#else
-    return 1;
-#endif
-  }
-  return 0;
-}
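-
-/* Reference semantics (scalar sketch, illustrative only): for each of the
-   vec_count vector pairs the kernel computes a 16x16 dot product, then
-   requantizes with the TFLite (out_multiplier, out_shift) scheme and
-   saturates to int8:
-
-     WORD32 acc = (bias_ptr != NULL) ? bias_ptr[v] : 0;
-     for (i = 0; i < vec_length; i++)
-       acc += (WORD32)p_inp1_start[v * vec_length + i] *
-              (WORD32)p_inp2_start[v * vec_length + i];
-     p_out[v] = clamp(MultiplyByQuantizedMultiplier(acc, out_multiplier,
-                                                    out_shift) +
-                          out_zero_bias,
-                      -128, 127);
-*/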
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/fc/hifi_mini/xa_nn_fully_connected.c b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/fc/hifi_mini/xa_nn_fully_connected.c
deleted file mode 100644
index 0a9325e..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/fc/hifi_mini/xa_nn_fully_connected.c
+++ /dev/null
@@ -1,142 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xa_nnlib_err_chk.h"
-#include "xa_nnlib_kernels_api.h"
-#include "xa_type_def.h"
-
-WORD32 xa_nn_fully_connected_asym8uxasym8u_asym8u(
-    UWORD8 *__restrict__ p_out, const UWORD8 *__restrict__ p_weight,
-    const UWORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
-    WORD32 weight_depth, WORD32 out_depth, WORD32 input_zero_bias,
-    WORD32 weight_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
-    WORD32 out_zero_bias) {
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_weight, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_bias, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_bias, sizeof(WORD32), -1);
-  /* Basic Parameter checks */
-  XA_NNLIB_ARG_CHK_COND((out_depth <= 0), -1);
-  XA_NNLIB_ARG_CHK_COND((input_zero_bias < -255 || input_zero_bias > 0), -1);
-  XA_NNLIB_ARG_CHK_COND((weight_zero_bias < -255 || weight_zero_bias > 0), -1);
-  XA_NNLIB_ARG_CHK_COND((out_shift < -31 || out_shift > 31), -1);
-  XA_NNLIB_ARG_CHK_COND((out_zero_bias < 0 || out_zero_bias > 255), -1);
-
-  WORD32 ret = 0;
-  ret = xa_nn_matXvec_out_stride_asym8uxasym8u_asym8u(
-      p_out, p_weight, p_inp, p_bias, out_depth /* rows */,
-      weight_depth /* cols */, weight_depth /* row_stride */,
-      1 /* out_stride */, weight_zero_bias, input_zero_bias, out_multiplier,
-      out_shift, out_zero_bias);
-  return ret;
-}
-
-WORD32 xa_nn_fully_connected_sym8sxasym8s_asym8s(
-    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_weight,
-    const WORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
-    WORD32 weight_depth, WORD32 out_depth, WORD32 input_zero_bias,
-    WORD32 out_multiplier, WORD32 out_shift, WORD32 out_zero_bias) {
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_weight, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_bias, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_bias, sizeof(WORD32), -1);
-  /* Basic Parameter checks */
-  XA_NNLIB_ARG_CHK_COND((out_depth <= 0), -1);
-  XA_NNLIB_ARG_CHK_COND((input_zero_bias < -127 || input_zero_bias > 128), -1);
-  XA_NNLIB_ARG_CHK_COND((out_shift < -31 || out_shift > 31), -1);
-  XA_NNLIB_ARG_CHK_COND((out_zero_bias < -128 || out_zero_bias > 127), -1);
-
-  WORD32 ret = 0;
-  ret = xa_nn_matXvec_out_stride_sym8sxasym8s_asym8s(
-      p_out, p_weight, p_inp, p_bias, out_depth /* rows */,
-      weight_depth /* cols */, weight_depth /* row_stride */,
-      1 /* out_stride */, input_zero_bias, out_multiplier, out_shift,
-      out_zero_bias);
-  return ret;
-}
-
-WORD32 xa_nn_fully_connected_asym8sxasym8s_asym8s(
-    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_weight,
-    const WORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
-    WORD32 weight_depth, WORD32 out_depth, WORD32 weight_zero_bias,
-    WORD32 input_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
-    WORD32 out_zero_bias) {
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_weight, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_bias, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_bias, sizeof(WORD32), -1);
-  /* Basic Parameter checks */
-  XA_NNLIB_ARG_CHK_COND((out_depth <= 0), -1);
-  XA_NNLIB_ARG_CHK_COND((weight_zero_bias < -127 || weight_zero_bias > 128),
-                        -1);
-  XA_NNLIB_ARG_CHK_COND((input_zero_bias < -127 || input_zero_bias > 128), -1);
-  XA_NNLIB_ARG_CHK_COND((out_shift < -31 || out_shift > 31), -1);
-  XA_NNLIB_ARG_CHK_COND((out_zero_bias < -128 || out_zero_bias > 127), -1);
-
-  WORD32 ret = 0;
-  ret = xa_nn_matXvec_out_stride_asym8sxasym8s_asym8s(
-      p_out, p_weight, p_inp, p_bias, out_depth /* rows */,
-      weight_depth /* cols */, weight_depth /* row_stride */,
-      1 /* out_stride */, weight_zero_bias, input_zero_bias, out_multiplier,
-      out_shift, out_zero_bias);
-  return ret;
-}
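-
-/* All three fully_connected entry points above are thin wrappers: a fully
-   connected layer with out_depth outputs over weight_depth inputs is exactly
-   a matrix-vector product, so each forwards to the matching
-   xa_nn_matXvec_out_stride_* kernel with rows = out_depth,
-   cols = row_stride = weight_depth and contiguous output (out_stride = 1). */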
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/matXvec/hifi_mini/xa_nn_matXvec_sym8sxasym8s.c b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/matXvec/hifi_mini/xa_nn_matXvec_sym8sxasym8s.c
deleted file mode 100644
index 71af822..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/matXvec/hifi_mini/xa_nn_matXvec_sym8sxasym8s.c
+++ /dev/null
@@ -1,1053 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xa_nnlib_common.h"
-#include "xa_nnlib_common_macros.h"
-
-#define ADD_OUT_OFFSET_STORE_INT8(ptr, data, out_offset) \
-  {                                                      \
-    data = AE_ADDSQ56S(data, AE_CVTQ48A32S(out_offset)); \
-    int out_i32 = AE_TRUNCA32Q48(AE_SATQ48S(data));      \
-    out_i32 = out_i32 < -128 ? -128 : out_i32;           \
-    out_i32 = out_i32 > 127 ? 127 : out_i32;             \
-    *(ptr) = (WORD8)out_i32;                             \
-  }
-
-WORD32 xa_nn_matXvec_out_stride_sym8sxasym8s_asym8s(
-    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
-    const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
-    WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
-    WORD32 vec1_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
-    WORD32 out_zero_bias) {
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_mat1, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_vec1, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_bias, sizeof(WORD32), -1);
-  /* Basic Parameter checks */
-  XA_NNLIB_ARG_CHK_COND((rows <= 0), -1);
-  XA_NNLIB_ARG_CHK_COND((cols1 <= 0), -1);
-  XA_NNLIB_ARG_CHK_COND((row_stride1 < cols1), -1);
-  XA_NNLIB_ARG_CHK_COND((vec1_zero_bias < -127 || vec1_zero_bias > 128), -1);
-  XA_NNLIB_ARG_CHK_COND((out_shift < -31 || out_shift > 31), -1);
-  XA_NNLIB_ARG_CHK_COND((out_zero_bias < -128 || out_zero_bias > 127), -1);
-
-  /* Iterators used in for loops */
-  int m_itr, c_itr, i;
-  /* Initialized here so the trailing loop picks up where the main loop
-     left off */
-  m_itr = 0;
-  /* Shifts to match TensorFlow's requantization */
-  int left_shift, right_shift;
-
-  left_shift = out_shift < 0 ? 0 : out_shift;
-  right_shift = out_shift > 0 ? 0 : -out_shift;
-
-  const WORD8 *p_mat1_0, *p_mat1_1, *p_mat1_2, *p_mat1_3;
-  const WORD8 *p_vec1_0;
-  ae_p24x2s dp_mat1_0, dp_mat1_1, dp_mat1_2, dp_mat1_3, dp_vec1_0;
-  ae_p24x2s dp_vec1_zb;
-  ae_q56s dq_acc[4];
-  ae_q56s dq_out32, dq_out;
-
-  dp_vec1_zb = AE_MOVPA24(vec1_zero_bias);
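-  /* The kernel computes 4 output rows per iteration against the shared input
-     vector, amortizing each vector load over 4 MACs; rows % 4 leftovers are
-     handled by the scalar trailing loops. The -2 bias on the row pointers
-     compensates for AE_LP8X2F_IU's update-before-load addressing. */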
-  if (((((unsigned)p_mat1) & 1) == 0) && ((((unsigned)p_vec1) & 1) == 0) &&
-      ((row_stride1 & 1) == 0)) {
-    for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
-      p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1 - 2];
-      p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1 - 2];
-      p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1 - 2];
-      p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1 - 2];
-      p_vec1_0 = p_vec1 - 2;
-
-      dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
-      /* The AE_LP8X2F* instructions load into the upper 8 bits of each P
-      register lane, so the vector is shifted right by 16 to place the
-      multiplication result in the middle 32 bits of the Q register (lower
-      16 bits are 0) */
-      for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-        AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
-        AE_LP8X2F_IU(dp_mat1_1, p_mat1_1, 2);
-        AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
-        AE_LP8X2F_IU(dp_mat1_3, p_mat1_3, 2);
-        AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
-        dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-        AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-        AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
-        AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
-        AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
-      }
-      /* Pointers are aligned, so we can do 8X2 loads and ignore the L parts
-       * of the registers */
-      if (cols1 & 1) {
-        AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
-        AE_LP8X2F_IU(dp_mat1_1, p_mat1_1, 2);
-        AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
-        AE_LP8X2F_IU(dp_mat1_3, p_mat1_3, 2);
-        AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
-        dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-        AE_MULAP24S_HH(dq_acc[0], dp_mat1_0, dp_vec1_0);
-        AE_MULAP24S_HH(dq_acc[1], dp_mat1_1, dp_vec1_0);
-        AE_MULAP24S_HH(dq_acc[2], dp_mat1_2, dp_vec1_0);
-        AE_MULAP24S_HH(dq_acc[3], dp_mat1_3, dp_vec1_0);
-      }
-
-      if (p_bias != NULL) {
-        for (i = 0; i < 4; i++)
-          dq_acc[i] = AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
-      }
-
-      for (i = 0; i < 4; i++) {
-        dq_out32 = AE_SATQ48S(dq_acc[i]);
-        MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                         out_multiplier, left_shift,
-                                         right_shift);
-        ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + i) * out_stride], dq_out,
-                                  out_zero_bias);
-      }
-    }
-    for (; m_itr < rows; m_itr++) {
-      p_mat1_0 = &p_mat1[m_itr * row_stride1 - 2];
-      p_vec1_0 = p_vec1 - 2;
-
-      dq_acc[0] = AE_ZEROQ56();
-
-      for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-        AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
-        AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
-        dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-        AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-      }
-      /* Pointers are aligned, so we can do 8X2 loads and ignore the L parts
-       * of the registers */
-      if (cols1 & 1) {
-        AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
-        AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
-        dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-        AE_MULAP24S_HH(dq_acc[0], dp_mat1_0, dp_vec1_0);
-      }
-
-      if (p_bias != NULL)
-        dq_acc[0] = AE_ADDSQ56S(dq_acc[0], *(ae_q32s *)(&p_bias[m_itr]));
-
-      dq_out32 = AE_SATQ48S(dq_acc[0]);
-      MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                       out_multiplier, left_shift, right_shift);
-      ADD_OUT_OFFSET_STORE_INT8(&p_out[m_itr * out_stride], dq_out,
-                                out_zero_bias);
-    }
-  } else {
-    if ((((unsigned)p_mat1) & 1) == 0) {
-      for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
-        p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1 - 2];
-        p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1];
-        p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1 - 2];
-        p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1];
-        p_vec1_0 = p_vec1;
-
-        dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
-        /* Matrix elements are kept in the upper 8 bits of the P registers and
-        vector elements in the lower 8 bits; the casts to UWORD8 avoid extra
-        extui instructions, since a signed 8-bit load is not available on
-        HiFi Mini */
-        for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-          AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
-          dp_mat1_1 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_1[c_itr],
-                                        (UWORD8)p_mat1_1[c_itr + 1]);
-          AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
-          dp_mat1_3 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_3[c_itr],
-                                        (UWORD8)p_mat1_3[c_itr + 1]);
-          dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
-                                        (UWORD8)p_vec1_0[c_itr + 1]);
-          dp_mat1_1 = AE_SLLIP24(dp_mat1_1, 8);
-          dp_mat1_3 = AE_SLLIP24(dp_mat1_3, 8);
-          dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
-          dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-          dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-          AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
-        }
-        if (cols1 & 1) {
-          ae_p24x2s dp_mat1_01, dp_mat1_23;
-          dp_mat1_01 =
-              AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[2], (UWORD8)p_mat1_1[c_itr]);
-          dp_mat1_23 =
-              AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[2], (UWORD8)p_mat1_3[c_itr]);
-          dp_vec1_0 = AE_MOVPA24(p_vec1_0[c_itr]);
-          dp_mat1_01 = AE_SLLIP24(dp_mat1_01, 8);
-          dp_mat1_23 = AE_SLLIP24(dp_mat1_23, 8);
-          dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-          AE_MULAP24S_HH(dq_acc[0], dp_mat1_01, dp_vec1_0);
-          AE_MULAP24S_LL(dq_acc[1], dp_mat1_01, dp_vec1_0);
-          AE_MULAP24S_HH(dq_acc[2], dp_mat1_23, dp_vec1_0);
-          AE_MULAP24S_LL(dq_acc[3], dp_mat1_23, dp_vec1_0);
-        }
-
-        if (p_bias != NULL) {
-          for (i = 0; i < 4; i++)
-            dq_acc[i] =
-                AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
-        }
-
-        for (i = 0; i < 4; i++) {
-          dq_out32 = AE_SATQ48S(dq_acc[i]);
-          MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                           out_multiplier, left_shift,
-                                           right_shift);
-          ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + i) * out_stride], dq_out,
-                                    out_zero_bias);
-        }
-      }
-    } else {
-      for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
-        p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1];
-        p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1];
-        p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1];
-        p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1];
-        p_vec1_0 = p_vec1;
-
-        dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
-        /* Matrix elements are kept in the upper 8 bits of the P registers and
-        vector elements in the lower 8 bits; the casts to UWORD8 avoid extra
-        extui instructions, since a signed 8-bit load is not available on
-        HiFi Mini */
-        for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-          dp_mat1_0 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
-                                        (UWORD8)p_mat1_0[c_itr + 1]);
-          dp_mat1_1 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_1[c_itr],
-                                        (UWORD8)p_mat1_1[c_itr + 1]);
-          dp_mat1_2 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[c_itr],
-                                        (UWORD8)p_mat1_2[c_itr + 1]);
-          dp_mat1_3 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_3[c_itr],
-                                        (UWORD8)p_mat1_3[c_itr + 1]);
-          dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
-                                        (UWORD8)p_vec1_0[c_itr + 1]);
-          dp_mat1_0 = AE_SLLIP24(dp_mat1_0, 8);
-          dp_mat1_1 = AE_SLLIP24(dp_mat1_1, 8);
-          dp_mat1_2 = AE_SLLIP24(dp_mat1_2, 8);
-          dp_mat1_3 = AE_SLLIP24(dp_mat1_3, 8);
-          dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
-          dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-          dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-          AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
-        }
-        if (cols1 & 1) {
-          ae_p24x2s dp_mat1_01, dp_mat1_23;
-          dp_mat1_01 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
-                                         (UWORD8)p_mat1_1[c_itr]);
-          dp_mat1_23 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[c_itr],
-                                         (UWORD8)p_mat1_3[c_itr]);
-          dp_vec1_0 = AE_MOVPA24(p_vec1_0[c_itr]);
-          dp_mat1_01 = AE_SLLIP24(dp_mat1_01, 8);
-          dp_mat1_23 = AE_SLLIP24(dp_mat1_23, 8);
-          dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-          AE_MULAP24S_HH(dq_acc[0], dp_mat1_01, dp_vec1_0);
-          AE_MULAP24S_LL(dq_acc[1], dp_mat1_01, dp_vec1_0);
-          AE_MULAP24S_HH(dq_acc[2], dp_mat1_23, dp_vec1_0);
-          AE_MULAP24S_LL(dq_acc[3], dp_mat1_23, dp_vec1_0);
-        }
-
-        if (p_bias != NULL) {
-          for (i = 0; i < 4; i++)
-            dq_acc[i] =
-                AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
-        }
-
-        for (i = 0; i < 4; i++) {
-          dq_out32 = AE_SATQ48S(dq_acc[i]);
-          MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                           out_multiplier, left_shift,
-                                           right_shift);
-          ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + i) * out_stride], dq_out,
-                                    out_zero_bias);
-        }
-      }
-    }
-    for (; m_itr < rows; m_itr++) {
-      p_mat1_0 = &p_mat1[m_itr * row_stride1];
-      p_vec1_0 = p_vec1;
-
-      dq_acc[0] = AE_ZEROQ56();
-
-      for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-        dp_mat1_0 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
-                                      (UWORD8)p_mat1_0[c_itr + 1]);
-        dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
-                                      (UWORD8)p_vec1_0[c_itr + 1]);
-        dp_mat1_0 = AE_SLLIP24(dp_mat1_0, 8);
-        dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
-        dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-        AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-      }
-      if (cols1 & 1) {
-        dp_mat1_0 = AE_CVTP24A16(p_mat1_0[c_itr]);
-        dp_vec1_0 = AE_CVTP24A16(p_vec1_0[c_itr]);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, AE_CVTP24A16(vec1_zero_bias));
-        AE_MULAP24S_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-      }
-
-      if (p_bias != NULL)
-        dq_acc[0] = AE_ADDSQ56S(dq_acc[0], *(ae_q32s *)(&p_bias[m_itr]));
-
-      dq_out32 = AE_SATQ48S(dq_acc[0]);
-      MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                       out_multiplier, left_shift, right_shift);
-      ADD_OUT_OFFSET_STORE_INT8(&p_out[m_itr * out_stride], dq_out,
-                                out_zero_bias);
-    }
-  }
-
-  return 0;
-}
-
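For reference, here is a minimal plain-C sketch of the arithmetic the matXvec
kernels above implement. The requantization steps stand in for
MULTIPLY_BY_QUANTIZED_MULTIPLIER and ADD_OUT_OFFSET_STORE_INT8 and are assumed
to mirror TensorFlow's rounding behavior (the rounding nudge is shown for
non-negative products only); this is a scalar model, not the intrinsic
sequence.

    #include <stdint.h>

    /* Scalar model of the sym8s x asym8s strided matXvec kernel. */
    static void matxvec_ref(int8_t *p_out, const int8_t *p_mat1,
                            const int8_t *p_vec1, const int32_t *p_bias,
                            int rows, int cols1, int row_stride1,
                            int out_stride, int32_t vec1_zero_bias,
                            int32_t out_multiplier, int out_shift,
                            int32_t out_zero_bias) {
      int left_shift = out_shift < 0 ? 0 : out_shift;
      int right_shift = out_shift > 0 ? 0 : -out_shift;
      for (int m = 0; m < rows; m++) {
        int64_t acc = (p_bias != NULL) ? p_bias[m] : 0;
        for (int c = 0; c < cols1; c++) {
          acc += (int32_t)p_mat1[m * row_stride1 + c] *
                 ((int32_t)p_vec1[c] + vec1_zero_bias);
        }
        /* Requantize: fixed-point multiply, then rounding right shift. */
        int64_t prod = acc * (1LL << left_shift);
        prod = (prod * out_multiplier + (1LL << 30)) >> 31;
        if (right_shift > 0)
          prod = (prod + (1LL << (right_shift - 1))) >> right_shift;
        prod += out_zero_bias;
        if (prod < -128) prod = -128;  /* saturate to int8 */
        if (prod > 127) prod = 127;
        p_out[m * out_stride] = (int8_t)prod;
      }
    }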
-WORD32 xa_nn_matXvec_out_stride_asym8sxasym8s_asym8s(
-    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
-    const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
-    WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
-    WORD32 mat1_zero_bias, WORD32 vec1_zero_bias, WORD32 out_multiplier,
-    WORD32 out_shift, WORD32 out_zero_bias) {
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_mat1, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_vec1, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_bias, sizeof(WORD32), -1);
-  /* Basic Parameter checks */
-  XA_NNLIB_ARG_CHK_COND((rows <= 0), -1);
-  XA_NNLIB_ARG_CHK_COND((cols1 <= 0), -1);
-  XA_NNLIB_ARG_CHK_COND((row_stride1 < cols1), -1);
-  XA_NNLIB_ARG_CHK_COND((mat1_zero_bias < -127 || mat1_zero_bias > 128), -1);
-  XA_NNLIB_ARG_CHK_COND((vec1_zero_bias < -127 || vec1_zero_bias > 128), -1);
-  XA_NNLIB_ARG_CHK_COND((out_shift < -31 || out_shift > 31), -1);
-  XA_NNLIB_ARG_CHK_COND((out_zero_bias < -128 || out_zero_bias > 127), -1);
-
-  /* Iterators used in for loops */
-  int m_itr, c_itr, i;
-  /* Initialized here so the value carries into the trailing loop */
-  m_itr = 0;
-  /* Shifts to match TensorFlow's requantization convention */
-  int left_shift, right_shift;
-
-  left_shift = out_shift < 0 ? 0 : out_shift;
-  right_shift = out_shift > 0 ? 0 : -out_shift;
-
-  const WORD8 *p_mat1_0, *p_mat1_1, *p_mat1_2, *p_mat1_3;
-  const WORD8 *p_vec1_0;
-  ae_p24x2s dp_mat1_0, dp_mat1_1, dp_mat1_2, dp_mat1_3, dp_vec1_0;
-  ae_p24x2s dp_vec1_zb, dp_mat1_zb;
-  ae_q56s dq_acc_0, dq_acc_1, dq_acc_2, dq_acc_3;
-  ae_q56s dq_out32, dq_out;
-
-  const WORD32 bias_buffer[1] = {0};
-  const WORD32 *p_bias_load;
-  WORD32 bias_address_increment = sizeof(WORD32);
-
-  dp_mat1_zb = AE_MOVPA24(mat1_zero_bias);
-  dp_vec1_zb = AE_MOVPA24(vec1_zero_bias);
-
-  /* Check for alignment conditions */
-  if (((((unsigned)p_mat1) & 1) == 0) && ((((unsigned)p_vec1) & 1) == 0) &&
-      ((row_stride1 & 1) == 0) && ((cols1 & 1) == 0)) {
-    /* Calculate partial zero offset adjustment outside the loop */
-    WORD32 zero_offset_adjustment;
-
-    // Constant part of total zero bias
-    ae_q56s dq_zero_bias_sum =
-        AE_CVTQ48A32S(vec1_zero_bias * cols1 * mat1_zero_bias);
-
-    WORD8 *p_inp = (WORD8 *)p_vec1 - 2;
-    for (i = 0; i < (cols1 >> 1); i++) {
-      /* Input vector is in MSB 8 bits, matrix zero bias in LSB 8 bits */
-      AE_LP8X2F_IU(dp_vec1_0, p_inp, 2);
-      AE_MULAAP24S_HH_LL(dq_zero_bias_sum, dp_vec1_0, dp_mat1_zb);
-    }
-    /* Product is already aligned to bits 16 to 47 in QR register. */
-    zero_offset_adjustment = AE_TRUNCA32Q48(dq_zero_bias_sum);
-
-    /* If bias is not provided, use a dummy zero value from bias_buffer. */
-    if (p_bias == NULL) {
-      p_bias_load = bias_buffer;
-      bias_address_increment = 0;
-    } else {
-      p_bias_load = p_bias - 1;
-    }
-
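The factoring here is the usual algebraic split of the fully offset dot
product. Assuming the per-row loops below accumulate
sum_c mat[c] * (vec[c] + vec_zb), the precomputed term supplies everything
that depends only on the zero biases and the vector:

    sum_c (mat[c] + mat_zb) * (vec[c] + vec_zb)
        = sum_c mat[c] * (vec[c] + vec_zb)                 /* per-row loops */
        + mat_zb * sum_c vec[c] + mat_zb * vec_zb * cols1  /* adjustment    */

Since the second and third terms are identical for every row, they are
computed once and added into each row's accumulator along with the bias.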
-    for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
-      p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1 - 2];
-      p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1 - 2];
-      p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1 - 2];
-      p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1 - 2];
-      p_vec1_0 = p_vec1 - 2;
-
-      AE_LQ32F_XU(dq_acc_0, (ae_q32s *)p_bias_load, bias_address_increment);
-      AE_LQ32F_XU(dq_acc_1, (ae_q32s *)p_bias_load, bias_address_increment);
-      AE_LQ32F_XU(dq_acc_2, (ae_q32s *)p_bias_load, bias_address_increment);
-      AE_LQ32F_XU(dq_acc_3, (ae_q32s *)p_bias_load, bias_address_increment);
-
-      dq_acc_0 = AE_ADDQ56(dq_acc_0, AE_CVTQ48A32S(zero_offset_adjustment));
-      dq_acc_1 = AE_ADDQ56(dq_acc_1, AE_CVTQ48A32S(zero_offset_adjustment));
-      dq_acc_2 = AE_ADDQ56(dq_acc_2, AE_CVTQ48A32S(zero_offset_adjustment));
-      dq_acc_3 = AE_ADDQ56(dq_acc_3, AE_CVTQ48A32S(zero_offset_adjustment));
-
-      /* AE_LP8X2F* loads into the upper 8 bits of the P register, so the
-      vector is shifted right by 16 to put the multiplication result in the
-      middle 32 bits of the Q register (lower 16 bits are 0). */
-      for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-        AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
-        AE_LP8X2F_IU(dp_mat1_1, p_mat1_1, 2);
-        AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
-        AE_LP8X2F_IU(dp_mat1_3, p_mat1_3, 2);
-        AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
-
-        dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
-        AE_MULAAP24S_HH_LL(dq_acc_0, dp_mat1_0, dp_vec1_0);
-        AE_MULAAP24S_HH_LL(dq_acc_1, dp_mat1_1, dp_vec1_0);
-        AE_MULAAP24S_HH_LL(dq_acc_2, dp_mat1_2, dp_vec1_0);
-        AE_MULAAP24S_HH_LL(dq_acc_3, dp_mat1_3, dp_vec1_0);
-      }
-
-      /* Pointers are aligned, so 8X2 loads can be used and the L parts of the
-       * registers ignored */
-      if (cols1 & 1) {
-        AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
-        AE_LP8X2F_IU(dp_mat1_1, p_mat1_1, 2);
-        AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
-        AE_LP8X2F_IU(dp_mat1_3, p_mat1_3, 2);
-        AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
-
-        dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
-        AE_MULAP24S_HH(dq_acc_0, dp_mat1_0, dp_vec1_0);
-        AE_MULAP24S_HH(dq_acc_1, dp_mat1_1, dp_vec1_0);
-        AE_MULAP24S_HH(dq_acc_2, dp_mat1_2, dp_vec1_0);
-        AE_MULAP24S_HH(dq_acc_3, dp_mat1_3, dp_vec1_0);
-      }
-
-      dq_out32 = AE_SATQ48S(dq_acc_0);
-      MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                       out_multiplier, left_shift, right_shift);
-      ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + 0) * out_stride], dq_out,
-                                out_zero_bias);
-
-      dq_out32 = AE_SATQ48S(dq_acc_1);
-      MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                       out_multiplier, left_shift, right_shift);
-      ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + 1) * out_stride], dq_out,
-                                out_zero_bias);
-
-      dq_out32 = AE_SATQ48S(dq_acc_2);
-      MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                       out_multiplier, left_shift, right_shift);
-      ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + 2) * out_stride], dq_out,
-                                out_zero_bias);
-
-      dq_out32 = AE_SATQ48S(dq_acc_3);
-      MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                       out_multiplier, left_shift, right_shift);
-      ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + 3) * out_stride], dq_out,
-                                out_zero_bias);
-    }
-    for (; m_itr < rows; m_itr++) {
-      p_mat1_0 = &p_mat1[m_itr * row_stride1 - 2];
-      p_vec1_0 = p_vec1 - 2;
-
-      AE_LQ32F_XU(dq_acc_0, (ae_q32s *)p_bias_load, bias_address_increment);
-      dq_acc_0 = AE_ADDQ56(dq_acc_0, AE_CVTQ48A32S(zero_offset_adjustment));
-
-      for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-        AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
-        AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
-        dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
-        AE_MULAAP24S_HH_LL(dq_acc_0, dp_mat1_0, dp_vec1_0);
-      }
-
-      dq_out32 = AE_SATQ48S(dq_acc_0);
-      MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                       out_multiplier, left_shift, right_shift);
-      ADD_OUT_OFFSET_STORE_INT8(&p_out[m_itr * out_stride], dq_out,
-                                out_zero_bias);
-    }
-  } else {
-#ifndef DISABLE_NNLIB_UNALIGNED_SUPPORT
-    ae_q56s dq_acc[4];
-
-    if ((((unsigned)p_mat1) & 1) == 0) {
-      for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
-        p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1 - 2];
-        p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1];
-        p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1 - 2];
-        p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1];
-        p_vec1_0 = p_vec1;
-
-        dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
-        /* Matrix elements are kept in the upper 8 bits of the P registers and
-        vector elements in the lower 8 bits; casting to UWORD8 avoids extra
-        extui instructions, since HiFiMini has no signed 8-bit load. */
-        for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-          AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
-          dp_mat1_1 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_1[c_itr],
-                                        (UWORD8)p_mat1_1[c_itr + 1]);
-          AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
-          dp_mat1_3 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_3[c_itr],
-                                        (UWORD8)p_mat1_3[c_itr + 1]);
-          dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
-                                        (UWORD8)p_vec1_0[c_itr + 1]);
-          dp_mat1_1 = AE_SLLIP24(dp_mat1_1, 8);
-          dp_mat1_3 = AE_SLLIP24(dp_mat1_3, 8);
-          dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
-          dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-          dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
-          dp_mat1_0 = AE_SRAIP24(dp_mat1_0, 16);
-          dp_mat1_0 = AE_ADDSP24S(dp_mat1_0, dp_mat1_zb);
-          dp_mat1_1 = AE_SRAIP24(dp_mat1_1, 16);
-          dp_mat1_1 = AE_ADDSP24S(dp_mat1_1, dp_mat1_zb);
-          dp_mat1_2 = AE_SRAIP24(dp_mat1_2, 16);
-          dp_mat1_2 = AE_ADDSP24S(dp_mat1_2, dp_mat1_zb);
-          dp_mat1_3 = AE_SRAIP24(dp_mat1_3, 16);
-          dp_mat1_3 = AE_ADDSP24S(dp_mat1_3, dp_mat1_zb);
-
-          AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
-        }
-        if (cols1 & 1) {
-          ae_p24x2s dp_mat1_01, dp_mat1_23;
-          dp_mat1_01 =
-              AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[2], (UWORD8)p_mat1_1[c_itr]);
-          dp_mat1_23 =
-              AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[2], (UWORD8)p_mat1_3[c_itr]);
-          dp_vec1_0 = AE_MOVPA24(p_vec1_0[c_itr]);
-          dp_mat1_01 = AE_SLLIP24(dp_mat1_01, 8);
-          dp_mat1_23 = AE_SLLIP24(dp_mat1_23, 8);
-          dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
-          dp_mat1_01 = AE_SRAIP24(dp_mat1_01, 16);
-          dp_mat1_01 = AE_ADDSP24S(dp_mat1_01, dp_mat1_zb);
-          dp_mat1_23 = AE_SRAIP24(dp_mat1_23, 16);
-          dp_mat1_23 = AE_ADDSP24S(dp_mat1_23, dp_mat1_zb);
-
-          AE_MULAP24S_HH(dq_acc[0], dp_mat1_01, dp_vec1_0);
-          AE_MULAP24S_LL(dq_acc[1], dp_mat1_01, dp_vec1_0);
-          AE_MULAP24S_HH(dq_acc[2], dp_mat1_23, dp_vec1_0);
-          AE_MULAP24S_LL(dq_acc[3], dp_mat1_23, dp_vec1_0);
-        }
-
-        dq_acc[0] = AE_SLLISQ56S(dq_acc[0], 16);
-        dq_acc[1] = AE_SLLISQ56S(dq_acc[1], 16);
-        dq_acc[2] = AE_SLLISQ56S(dq_acc[2], 16);
-        dq_acc[3] = AE_SLLISQ56S(dq_acc[3], 16);
-
-        if (p_bias != NULL) {
-          for (i = 0; i < 4; i++)
-            dq_acc[i] =
-                AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
-        }
-
-        for (i = 0; i < 4; i++) {
-          dq_out32 = AE_SATQ48S(dq_acc[i]);
-          MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                           out_multiplier, left_shift,
-                                           right_shift);
-          ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + i) * out_stride], dq_out,
-                                    out_zero_bias);
-        }
-      }
-    } else {
-      for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
-        p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1];
-        p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1];
-        p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1];
-        p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1];
-        p_vec1_0 = p_vec1;
-
-        dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
-        /* Matrix elements are kept in the upper 8 bits of the P registers and
-        vector elements in the lower 8 bits; casting to UWORD8 avoids extra
-        extui instructions, since HiFiMini has no signed 8-bit load. */
-        for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-          dp_mat1_0 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
-                                        (UWORD8)p_mat1_0[c_itr + 1]);
-          dp_mat1_1 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_1[c_itr],
-                                        (UWORD8)p_mat1_1[c_itr + 1]);
-          dp_mat1_2 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[c_itr],
-                                        (UWORD8)p_mat1_2[c_itr + 1]);
-          dp_mat1_3 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_3[c_itr],
-                                        (UWORD8)p_mat1_3[c_itr + 1]);
-          dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
-                                        (UWORD8)p_vec1_0[c_itr + 1]);
-          dp_mat1_0 = AE_SLLIP24(dp_mat1_0, 8);
-          dp_mat1_1 = AE_SLLIP24(dp_mat1_1, 8);
-          dp_mat1_2 = AE_SLLIP24(dp_mat1_2, 8);
-          dp_mat1_3 = AE_SLLIP24(dp_mat1_3, 8);
-          dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
-          dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-          dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
-          dp_mat1_0 = AE_SRAIP24(dp_mat1_0, 16);
-          dp_mat1_0 = AE_ADDSP24S(dp_mat1_0, dp_mat1_zb);
-          dp_mat1_1 = AE_SRAIP24(dp_mat1_1, 16);
-          dp_mat1_1 = AE_ADDSP24S(dp_mat1_1, dp_mat1_zb);
-          dp_mat1_2 = AE_SRAIP24(dp_mat1_2, 16);
-          dp_mat1_2 = AE_ADDSP24S(dp_mat1_2, dp_mat1_zb);
-          dp_mat1_3 = AE_SRAIP24(dp_mat1_3, 16);
-          dp_mat1_3 = AE_ADDSP24S(dp_mat1_3, dp_mat1_zb);
-
-          AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
-        }
-        if (cols1 & 1) {
-          ae_p24x2s dp_mat1_01, dp_mat1_23;
-          dp_mat1_01 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
-                                         (UWORD8)p_mat1_1[c_itr]);
-          dp_mat1_23 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[c_itr],
-                                         (UWORD8)p_mat1_3[c_itr]);
-          dp_vec1_0 = AE_MOVPA24(p_vec1_0[c_itr]);
-          dp_mat1_01 = AE_SLLIP24(dp_mat1_01, 8);
-          dp_mat1_23 = AE_SLLIP24(dp_mat1_23, 8);
-          dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
-          dp_mat1_01 = AE_SRAIP24(dp_mat1_01, 16);
-          dp_mat1_01 = AE_ADDSP24S(dp_mat1_01, dp_mat1_zb);
-          dp_mat1_23 = AE_SRAIP24(dp_mat1_23, 16);
-          dp_mat1_23 = AE_ADDSP24S(dp_mat1_23, dp_mat1_zb);
-
-          AE_MULAP24S_HH(dq_acc[0], dp_mat1_01, dp_vec1_0);
-          AE_MULAP24S_LL(dq_acc[1], dp_mat1_01, dp_vec1_0);
-          AE_MULAP24S_HH(dq_acc[2], dp_mat1_23, dp_vec1_0);
-          AE_MULAP24S_LL(dq_acc[3], dp_mat1_23, dp_vec1_0);
-        }
-
-        dq_acc[0] = AE_SLLISQ56S(dq_acc[0], 16);
-        dq_acc[1] = AE_SLLISQ56S(dq_acc[1], 16);
-        dq_acc[2] = AE_SLLISQ56S(dq_acc[2], 16);
-        dq_acc[3] = AE_SLLISQ56S(dq_acc[3], 16);
-
-        if (p_bias != NULL) {
-          for (i = 0; i < 4; i++)
-            dq_acc[i] =
-                AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
-        }
-
-        for (i = 0; i < 4; i++) {
-          dq_out32 = AE_SATQ48S(dq_acc[i]);
-          MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                           out_multiplier, left_shift,
-                                           right_shift);
-          ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + i) * out_stride], dq_out,
-                                    out_zero_bias);
-        }
-      }
-    }
-    for (; m_itr < rows; m_itr++) {
-      p_mat1_0 = &p_mat1[m_itr * row_stride1];
-      p_vec1_0 = p_vec1;
-
-      dq_acc[0] = AE_ZEROQ56();
-
-      for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-        dp_mat1_0 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
-                                      (UWORD8)p_mat1_0[c_itr + 1]);
-        dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
-                                      (UWORD8)p_vec1_0[c_itr + 1]);
-        dp_mat1_0 = AE_SLLIP24(dp_mat1_0, 8);
-        dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
-        dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
-        dp_mat1_0 = AE_SRAIP24(dp_mat1_0, 16);
-        dp_mat1_0 = AE_ADDSP24S(dp_mat1_0, dp_mat1_zb);
-
-        AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-      }
-      if (cols1 & 1) {
-        dp_mat1_0 = AE_CVTP24A16(p_mat1_0[c_itr]);
-        dp_vec1_0 = AE_CVTP24A16(p_vec1_0[c_itr]);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, AE_CVTP24A16(vec1_zero_bias));
-
-        dp_mat1_0 = AE_SRAIP24(dp_mat1_0, 16);
-        dp_mat1_0 = AE_ADDSP24S(dp_mat1_0, dp_mat1_zb);
-
-        AE_MULAP24S_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-      }
-
-      dq_acc[0] = AE_SLLISQ56S(dq_acc[0], 16);
-
-      if (p_bias != NULL)
-        dq_acc[0] = AE_ADDSQ56S(dq_acc[0], *(ae_q32s *)(&p_bias[m_itr]));
-
-      dq_out32 = AE_SATQ48S(dq_acc[0]);
-      MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                       out_multiplier, left_shift, right_shift);
-      ADD_OUT_OFFSET_STORE_INT8(&p_out[m_itr * out_stride], dq_out,
-                                out_zero_bias);
-    }
-#else
-    return 1;
-#endif
-  }
-
-  return 0;
-}
-
-#define STORE_INT16(ptr, data)                                         \
-  {                                                                    \
-    int out_i32 = AE_TRUNCA32Q48(AE_SATQ48S(data));                    \
-    out_i32 = out_i32 < (int)0xffff8000L ? (int)0xffff8000L : out_i32; \
-    out_i32 = out_i32 > (int)0x7fff ? (int)0x7fff : out_i32;           \
-    *(ptr) = (WORD16)out_i32;                                          \
-  }
-
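In plain C, the macro is a clamp to the int16 range applied after the 48-bit
accumulator has been saturated and truncated to 32 bits (0xffff8000L is
-32768 in two's complement). A rough equivalent, taking the already-truncated
32-bit value:

    static inline void store_int16(int16_t *ptr, int32_t x) {
      if (x < -32768) x = -32768;  /* (int)0xffff8000L */
      if (x > 32767) x = 32767;    /* (int)0x7fff */
      *ptr = (int16_t)x;
    }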
-WORD32 xa_nn_matXvec_out_stride_sym8sxasym8s_16(
-    WORD16 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
-    const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
-    WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
-    WORD32 vec1_zero_bias, WORD32 out_multiplier, WORD32 out_shift) {
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_mat1, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_vec1, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD16), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_bias, sizeof(WORD32), -1);
-  /* Basic Parameter checks */
-  XA_NNLIB_ARG_CHK_COND((rows <= 0), -1);
-  XA_NNLIB_ARG_CHK_COND((cols1 <= 0), -1);
-  XA_NNLIB_ARG_CHK_COND((row_stride1 < cols1), -1);
-  XA_NNLIB_ARG_CHK_COND((vec1_zero_bias < -127 || vec1_zero_bias > 128), -1);
-  XA_NNLIB_ARG_CHK_COND((out_shift < -31 || out_shift > 31), -1);
-
-  /* Iterators used in for loops */
-  int m_itr, c_itr, i;
-  /* Initialized here so the value carries into the trailing loop */
-  m_itr = 0;
-  /* Shifts to match TensorFlow's requantization convention */
-  int left_shift, right_shift;
-
-  left_shift = out_shift < 0 ? 0 : out_shift;
-  right_shift = out_shift > 0 ? 0 : -out_shift;
-
-  const WORD8 *p_mat1_0, *p_mat1_1, *p_mat1_2, *p_mat1_3;
-  const WORD8 *p_vec1_0;
-  ae_p24x2s dp_mat1_0, dp_mat1_1, dp_mat1_2, dp_mat1_3, dp_vec1_0;
-  ae_p24x2s dp_vec1_zb;
-  ae_q56s dq_acc[4];
-  ae_q56s dq_out32, dq_out;
-
-  dp_vec1_zb = AE_MOVPA24(vec1_zero_bias);
-  if (((((unsigned)p_mat1) & 1) == 0) && ((((unsigned)p_vec1) & 1) == 0) &&
-      ((row_stride1 & 1) == 0)) {
-    for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
-      p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1 - 2];
-      p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1 - 2];
-      p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1 - 2];
-      p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1 - 2];
-      p_vec1_0 = p_vec1 - 2;
-
-      dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
-      /* AE_LP8X2F* loads into the upper 8 bits of the P register, so the
-      vector is shifted right by 16 to put the multiplication result in the
-      middle 32 bits of the Q register (lower 16 bits are 0). */
-      for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-        AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
-        AE_LP8X2F_IU(dp_mat1_1, p_mat1_1, 2);
-        AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
-        AE_LP8X2F_IU(dp_mat1_3, p_mat1_3, 2);
-        AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
-        dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-        AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-        AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
-        AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
-        AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
-      }
-      /* Pointers are aligned, so 8X2 loads can be used and the L parts of the
-       * registers ignored */
-      if (cols1 & 1) {
-        AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
-        AE_LP8X2F_IU(dp_mat1_1, p_mat1_1, 2);
-        AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
-        AE_LP8X2F_IU(dp_mat1_3, p_mat1_3, 2);
-        AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
-        dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-        AE_MULAP24S_HH(dq_acc[0], dp_mat1_0, dp_vec1_0);
-        AE_MULAP24S_HH(dq_acc[1], dp_mat1_1, dp_vec1_0);
-        AE_MULAP24S_HH(dq_acc[2], dp_mat1_2, dp_vec1_0);
-        AE_MULAP24S_HH(dq_acc[3], dp_mat1_3, dp_vec1_0);
-      }
-
-      if (p_bias != NULL) {
-        for (i = 0; i < 4; i++)
-          dq_acc[i] = AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
-      }
-
-      for (i = 0; i < 4; i++) {
-        dq_out32 = AE_SATQ48S(dq_acc[i]);
-        MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                         out_multiplier, left_shift,
-                                         right_shift);
-        STORE_INT16(&p_out[(m_itr + i) * out_stride], dq_out);
-      }
-    }
-    for (; m_itr < rows; m_itr++) {
-      p_mat1_0 = &p_mat1[m_itr * row_stride1 - 2];
-      p_vec1_0 = p_vec1 - 2;
-
-      dq_acc[0] = AE_ZEROQ56();
-
-      for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-        AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
-        AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
-        dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-        AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-      }
-      /* Pointers are aligned, so 8X2 loads can be used and the L parts of the
-       * registers ignored */
-      if (cols1 & 1) {
-        AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
-        AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
-        dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-        AE_MULAP24S_HH(dq_acc[0], dp_mat1_0, dp_vec1_0);
-      }
-
-      if (p_bias != NULL)
-        dq_acc[0] = AE_ADDSQ56S(dq_acc[0], *(ae_q32s *)(&p_bias[m_itr]));
-
-      dq_out32 = AE_SATQ48S(dq_acc[0]);
-      MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                       out_multiplier, left_shift, right_shift);
-      STORE_INT16(&p_out[m_itr * out_stride], dq_out);
-    }
-  } else {
-#ifndef DISABLE_NNLIB_UNALIGNED_SUPPORT
-    if ((((unsigned)p_mat1) & 1) == 0) {
-      for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
-        p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1 - 2];
-        p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1];
-        p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1 - 2];
-        p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1];
-        p_vec1_0 = p_vec1;
-
-        dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
-        /* Matrix elements are kept in the upper 8 bits of the P registers and
-        vector elements in the lower 8 bits; casting to UWORD8 avoids extra
-        extui instructions, since HiFiMini has no signed 8-bit load. */
-        for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-          AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
-          dp_mat1_1 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_1[c_itr],
-                                        (UWORD8)p_mat1_1[c_itr + 1]);
-          AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
-          dp_mat1_3 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_3[c_itr],
-                                        (UWORD8)p_mat1_3[c_itr + 1]);
-          dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
-                                        (UWORD8)p_vec1_0[c_itr + 1]);
-          dp_mat1_1 = AE_SLLIP24(dp_mat1_1, 8);
-          dp_mat1_3 = AE_SLLIP24(dp_mat1_3, 8);
-          dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
-          dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-          dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-          AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
-        }
-        if (cols1 & 1) {
-          ae_p24x2s dp_mat1_01, dp_mat1_23;
-          dp_mat1_01 =
-              AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[2], (UWORD8)p_mat1_1[c_itr]);
-          dp_mat1_23 =
-              AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[2], (UWORD8)p_mat1_3[c_itr]);
-          dp_vec1_0 = AE_MOVPA24(p_vec1_0[c_itr]);
-          dp_mat1_01 = AE_SLLIP24(dp_mat1_01, 8);
-          dp_mat1_23 = AE_SLLIP24(dp_mat1_23, 8);
-          dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-          AE_MULAP24S_HH(dq_acc[0], dp_mat1_01, dp_vec1_0);
-          AE_MULAP24S_LL(dq_acc[1], dp_mat1_01, dp_vec1_0);
-          AE_MULAP24S_HH(dq_acc[2], dp_mat1_23, dp_vec1_0);
-          AE_MULAP24S_LL(dq_acc[3], dp_mat1_23, dp_vec1_0);
-        }
-
-        if (p_bias != NULL) {
-          for (i = 0; i < 4; i++)
-            dq_acc[i] =
-                AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
-        }
-
-        for (i = 0; i < 4; i++) {
-          dq_out32 = AE_SATQ48S(dq_acc[i]);
-          MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                           out_multiplier, left_shift,
-                                           right_shift);
-          STORE_INT16(&p_out[(m_itr + i) * out_stride], dq_out);
-        }
-      }
-    } else {
-      for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
-        p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1];
-        p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1];
-        p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1];
-        p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1];
-        p_vec1_0 = p_vec1;
-
-        dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
-        /* Matrix elements are kept in the upper 8 bits of the P registers and
-        vector elements in the lower 8 bits; casting to UWORD8 avoids extra
-        extui instructions, since HiFiMini has no signed 8-bit load. */
-        for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-          dp_mat1_0 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
-                                        (UWORD8)p_mat1_0[c_itr + 1]);
-          dp_mat1_1 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_1[c_itr],
-                                        (UWORD8)p_mat1_1[c_itr + 1]);
-          dp_mat1_2 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[c_itr],
-                                        (UWORD8)p_mat1_2[c_itr + 1]);
-          dp_mat1_3 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_3[c_itr],
-                                        (UWORD8)p_mat1_3[c_itr + 1]);
-          dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
-                                        (UWORD8)p_vec1_0[c_itr + 1]);
-          dp_mat1_0 = AE_SLLIP24(dp_mat1_0, 8);
-          dp_mat1_1 = AE_SLLIP24(dp_mat1_1, 8);
-          dp_mat1_2 = AE_SLLIP24(dp_mat1_2, 8);
-          dp_mat1_3 = AE_SLLIP24(dp_mat1_3, 8);
-          dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
-          dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-          dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-          AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
-          AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
-        }
-        if (cols1 & 1) {
-          ae_p24x2s dp_mat1_01, dp_mat1_23;
-          dp_mat1_01 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
-                                         (UWORD8)p_mat1_1[c_itr]);
-          dp_mat1_23 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[c_itr],
-                                         (UWORD8)p_mat1_3[c_itr]);
-          dp_vec1_0 = AE_MOVPA24(p_vec1_0[c_itr]);
-          dp_mat1_01 = AE_SLLIP24(dp_mat1_01, 8);
-          dp_mat1_23 = AE_SLLIP24(dp_mat1_23, 8);
-          dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-          AE_MULAP24S_HH(dq_acc[0], dp_mat1_01, dp_vec1_0);
-          AE_MULAP24S_LL(dq_acc[1], dp_mat1_01, dp_vec1_0);
-          AE_MULAP24S_HH(dq_acc[2], dp_mat1_23, dp_vec1_0);
-          AE_MULAP24S_LL(dq_acc[3], dp_mat1_23, dp_vec1_0);
-        }
-
-        if (p_bias != NULL) {
-          for (i = 0; i < 4; i++)
-            dq_acc[i] =
-                AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
-        }
-
-        for (i = 0; i < 4; i++) {
-          dq_out32 = AE_SATQ48S(dq_acc[i]);
-          MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                           out_multiplier, left_shift,
-                                           right_shift);
-          STORE_INT16(&p_out[(m_itr + i) * out_stride], dq_out);
-        }
-      }
-    }
-    for (; m_itr < rows; m_itr++) {
-      p_mat1_0 = &p_mat1[m_itr * row_stride1];
-      p_vec1_0 = p_vec1;
-
-      dq_acc[0] = AE_ZEROQ56();
-
-      for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
-        dp_mat1_0 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
-                                      (UWORD8)p_mat1_0[c_itr + 1]);
-        dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
-                                      (UWORD8)p_vec1_0[c_itr + 1]);
-        dp_mat1_0 = AE_SLLIP24(dp_mat1_0, 8);
-        dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
-        dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-        AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-      }
-      if (cols1 & 1) {
-        dp_mat1_0 = AE_CVTP24A16(p_mat1_0[c_itr]);
-        dp_vec1_0 = AE_CVTP24A16(p_vec1_0[c_itr]);
-        dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, AE_CVTP24A16(vec1_zero_bias));
-        AE_MULAP24S_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
-      }
-
-      if (p_bias != NULL)
-        dq_acc[0] = AE_ADDSQ56S(dq_acc[0], *(ae_q32s *)(&p_bias[m_itr]));
-
-      dq_out32 = AE_SATQ48S(dq_acc[0]);
-      MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
-                                       out_multiplier, left_shift, right_shift);
-      STORE_INT16(&p_out[m_itr * out_stride], dq_out);
-    }
-#else
-    return 1;
-#endif
-  }
-
-  return 0;
-}
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_api.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_api.h
deleted file mode 100644
index e499e1e..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_api.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_NNLIB_API_H__
-#define __XA_NNLIB_API_H__
-
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/xa_type_def.h"
-
-#endif /* __XA_NNLIB_API_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h
deleted file mode 100644
index d3a5e29..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h
+++ /dev/null
@@ -1,300 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_NNLIB_KERNELS_API_H__
-#define __XA_NNLIB_KERNELS_API_H__
-
-/**
- * @file xa_nnlib_kernels_api.h
- * @brief This file gives the API definition for the HiFi NNLIB
- *
- * matXvec KERNELS API NAMING CONVENTION <br>
- * <br>
- * xa_nn_matXvec_<batch>_[m]x[n]_[p]_<activation>, where
- * - <batch>: Optional 'batch' tag to indicate time batching routine
- * - [m]: Matrix precision in bits
- * - [n]: Vector (and bias for non-activation routines) precision in bits
- * - [p]: Output precision in bits
- * - <activation>: optional activation tag 'sigmoid' / 'tanh'
- *
- * This set of kernels performs a dual matXvec followed by an optional
- * activation function. There are several variants based on the input and
- * output precisions and the use of activation functions.
- *
- * Restrictions:
- * - All pointers (p_out, p_mat1, p_mat2, p_vec1, p_vec2, p_bias, p_scratch)
- * must be SIMD (64-bit) aligned and should not overlap.
- * - p_mat2, p_vec2 can be 'NULL', but other pointers cannot be 'NULL'
- * - Variables cols1, cols2, row_stride1, row_stride2 must be multiples of 4
- *
- * Usage of a few critical variables:
- * - acc_shift:
- *   -# In case of valid activation tag i.e. <activation>: shift to be
- *   applied on accumulator to match accumulator's Q format with activation
- *   function's input's Q format
- *   -# In case of bypass i.e. no activation tag: shift to be applied on
- *   accumulator.
- *   -# Positive value denotes left shift, and negative value denotes right
- * shift.
- * - bias_shift: shift which is to be applied on bias to match bias's
- *   Q format with accumulator's Q format. Positive value denotes left shift,
- *   and negative value denotes right shift.
- * - bias_precision: This represents bias precision
- *   -# For 16x16, and 8x16 apis, valid values are '16' and '64'
- *   -# For 8x8 apis, valid values are '8' and '32'
- *
- * The 8b, 16b, and 32b outputs of the fixed point apis (bypass variants
- * only) are extracted from the 64b accumulator with symmetric rounding. The
- * 64b output of the fixed point apis (bypass variants only) is taken from the
- * 64b accumulator directly. The 8b and 16b outputs of the fixed point apis
- * (activation variants only) are symmetrically rounded.
- *
- * matXvec 16x16 Kernels,
- * - Bypass kernels with 16, 32, 64 bit output: 3
- * - Fused kernel with 2 activation variants:   2
- * - Time batching kernel:                      1 (Not implemented)
- * - Total:                                     6
- *
- * matXvec 8x16 Kernels,
- * - Bypass kernels with 16, 32, 64 bit output: 3
- * - Fused kernel with 2 activation variants:   2
- * - Time batching kernel:                      1 (Not implemented)
- * - Total:                                     6
- *
- * matXvec 8x8 Kernels,
- * - Bypass kernels with 8, 16, 32 bit output: 3
- * - Fused kernel with 2 activation variants:  2
- * - Time batching kernel:                     1 (Not implemented)
- * - Total:                                    6
- *
- * matXvec float32 x float32 Kernels,
- * - Bypass kernels 32 bit output:            1
- * - Fused kernel with 2 activation variants: 2
- * - Time batching kernel:                    1 (Not implemented)
- * - Total:                                   4
- *
- * ACTIVATION KERNELS API NAMING CONVENTION <br>
- * <br>
- * xa_nn_vec_[activation]_[n]_[p] for fixed point <br>
- * xa_nn_vec_[activation]_f32_f32 for floating point, where
- * - [activation]: One of activations - sigmoid/tanh/relu/relu1/relu6/softmax
- * - [n]:          Input precision in bits
- * - [p]:          Output precision in bits
- *
- * Possible values,
- * - 'n' takes value '32', and expects input in Q6.25 format.
- * - 'p' takes values '32' and '16', gives output in Q16.15 and Q0.15 formats
- * respectively.
- *
- * The 'relu' related apis take a WORD32 variable 'threshold', which expects a
- * value in Q16.15 format.
- *
- * Restrictions:
- * - All pointers (p_out, p_vec) must be 32-bit aligned and should not overlap.
- *
- * activation 32_32 kernels,
- * - Vector activation kernels: 6
- * - Total:                     6
- *
- * activation f32_f32 kernels,
- * - Vector activation kernels: 6
- * - Total:                     6
- *
- * activation 32_16 kernels,
- * - Vector activation kernels: 2
- * - Total:                     2
- */
-
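Read against this convention, for example,
xa_nn_matXvec_out_stride_sym8sxasym8s_asym8s is a matrix-times-vector kernel
taking a symmetric signed 8-bit matrix and an asymmetric signed 8-bit vector
and producing an asymmetric signed 8-bit output, with a configurable output
stride.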
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/xa_type_def.h"
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-WORD32 xa_nn_conv2d_depthwise_getsize(
-    WORD32 input_height, WORD32 input_width, WORD32 input_channels,
-    WORD32 kernel_height, WORD32 kernel_width, WORD32 channels_multiplier,
-    WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
-    WORD32 output_height, WORD32 output_width, WORD32 circ_buf_precision,
-    WORD32 inp_data_format);
-
-WORD32 xa_nn_vec_activation_min_max_asym8u_asym8u(
-    UWORD8 *__restrict__ p_out, const UWORD8 *__restrict__ p_vec,
-    int activation_min, int activation_max, WORD32 vec_length);
-
-WORD32 xa_nn_vec_activation_min_max_asym8s_asym8s(
-    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_vec,
-    int activation_min, int activation_max, WORD32 vec_length);
-
-WORD32 xa_nn_conv2d_std_getsize(WORD32 input_height, WORD32 input_channels,
-                                WORD32 kernel_height, WORD32 kernel_width,
-                                WORD32 y_stride, WORD32 y_padding,
-                                WORD32 out_height, WORD32 input_precision);
-
-WORD32 xa_nn_conv2d_std_asym8uxasym8u(
-    UWORD8 *__restrict__ p_out, const UWORD8 *__restrict__ p_inp,
-    const UWORD8 *__restrict__ p_kernel, const WORD32 *__restrict__ p_bias,
-    WORD32 input_height, WORD32 input_width, WORD32 input_channels,
-    WORD32 kernel_height, WORD32 kernel_width, WORD32 out_channels,
-    WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
-    WORD32 out_height, WORD32 out_width, WORD32 input_zero_bias,
-    WORD32 kernel_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
-    WORD32 out_zero_bias, WORD32 out_data_format, VOID *p_scratch);
-
-WORD32 xa_nn_conv2d_std_per_chan_sym8sxasym8s(
-    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_inp,
-    const WORD8 *__restrict__ p_kernel, const WORD32 *__restrict__ p_bias,
-    WORD32 input_height, WORD32 input_width, WORD32 input_channels,
-    WORD32 kernel_height, WORD32 kernel_width, WORD32 out_channels,
-    WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
-    WORD32 out_height, WORD32 out_width, WORD32 input_zero_bias,
-    WORD32 *p_out_multiplier, WORD32 *p_out_shift, WORD32 out_zero_bias,
-    WORD32 out_data_format, VOID *p_scratch);
-
-WORD32 xa_nn_conv2d_depthwise_asym8uxasym8u(
-    pUWORD8 __restrict__ p_out, const UWORD8 *__restrict__ p_kernel,
-    const UWORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
-    WORD32 input_height, WORD32 input_width, WORD32 input_channels,
-    WORD32 kernel_height, WORD32 kernel_width, WORD32 channels_multiplier,
-    WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
-    WORD32 out_height, WORD32 out_width, WORD32 input_zero_bias,
-    WORD32 kernel_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
-    WORD32 out_zero_bias, WORD32 inp_data_format, WORD32 out_data_format,
-    pVOID p_scratch);
-
-WORD32 xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
-    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_kernel,
-    const WORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
-    WORD32 input_height, WORD32 input_width, WORD32 input_channels,
-    WORD32 kernel_height, WORD32 kernel_width, WORD32 channels_multiplier,
-    WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
-    WORD32 out_height, WORD32 out_width, WORD32 input_zero_bias,
-    const WORD32 *p_out_multiplier, const WORD32 *p_out_shift,
-    WORD32 out_zero_bias, WORD32 inp_data_format, WORD32 out_data_format,
-    pVOID p_scratch);
-
-WORD32 xa_nn_fully_connected_asym8uxasym8u_asym8u(
-    pUWORD8 __restrict__ p_out, const UWORD8 *__restrict__ p_weight,
-    const UWORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
-    WORD32 weight_depth, WORD32 out_depth, WORD32 input_zero_bias,
-    WORD32 weight_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
-    WORD32 out_zero_bias);
-
-WORD32 xa_nn_fully_connected_sym8sxasym8s_asym8s(
-    pWORD8 __restrict__ p_out, const WORD8 *__restrict__ p_weight,
-    const WORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
-    WORD32 weight_depth, WORD32 out_depth, WORD32 input_zero_bias,
-    WORD32 out_multiplier, WORD32 out_shift, WORD32 out_zero_bias);
-
-WORD32 xa_nn_fully_connected_asym8sxasym8s_asym8s(
-    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_weight,
-    const WORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
-    WORD32 weight_depth, WORD32 out_depth, WORD32 weight_zero_bias,
-    WORD32 input_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
-    WORD32 out_zero_bias);
-
-WORD32 xa_nn_vec_softmax_asym8u_8(UWORD8 *__restrict__ p_out,
-                                  const UWORD8 *__restrict__ p_vec,
-                                  WORD32 diffmin, WORD32 input_left_shift,
-                                  WORD32 input_multiplier, WORD32 vec_length,
-                                  pVOID p_scratch);
-
-WORD32 xa_nn_vec_softmax_asym8s_16(WORD16 *__restrict__ p_out,
-                                   const WORD8 *__restrict__ p_vec,
-                                   WORD32 diffmin, WORD32 input_left_shift,
-                                   WORD32 input_multiplier, WORD32 vec_length,
-                                   pVOID p_scratch);
-
-WORD32 xa_nn_vec_softmax_asym8s_8(WORD8 *__restrict__ p_out,
-                                  const WORD8 *__restrict__ p_vec,
-                                  WORD32 diffmin, WORD32 input_left_shift,
-                                  WORD32 input_multiplier, WORD32 vec_length,
-                                  pVOID p_scratch);
-
-int xa_nn_get_softmax_scratch_size(int inp_precision, int out_precision,
-                                   int length);
-
-WORD32 xa_nn_matXvec_out_stride_asym8uxasym8u_asym8u(
-    UWORD8 *__restrict__ p_out, const UWORD8 *__restrict__ p_mat1,
-    const UWORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
-    WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
-    WORD32 mat1_zero_bias, WORD32 vec1_zero_bias, WORD32 out_multiplier,
-    WORD32 out_shift, WORD32 out_zero_bias);
-
-WORD32 xa_nn_matXvec_out_stride_sym8sxasym8s_asym8s(
-    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
-    const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
-    WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
-    WORD32 vec1_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
-    WORD32 out_zero_bias);
-
-WORD32 xa_nn_matXvec_out_stride_asym8sxasym8s_asym8s(
-    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
-    const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
-    WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
-    WORD32 mat1_zero_bias, WORD32 vec1_zero_bias, WORD32 out_multiplier,
-    WORD32 out_shift, WORD32 out_zero_bias);
-
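A usage sketch for the strided matXvec API above; the sizes and quantization
parameters are illustrative only (in practice they come from the converted
model):

    WORD8 out[16];
    WORD8 mat[16 * 32];
    WORD8 vec[32];
    WORD32 bias[16];
    WORD32 err = xa_nn_matXvec_out_stride_asym8sxasym8s_asym8s(
        out, mat, vec, bias,
        /*rows=*/16, /*cols1=*/32, /*row_stride1=*/32, /*out_stride=*/1,
        /*mat1_zero_bias=*/0, /*vec1_zero_bias=*/5,
        /*out_multiplier=*/0x40000000, /*out_shift=*/-7,
        /*out_zero_bias=*/-1);
    /* err is 0 on success, -1 on an argument error, and 1 when the inputs
       are unaligned but unaligned support was compiled out. */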
-WORD32 xa_nn_matXvec_out_stride_sym8sxasym8s_16(
-    WORD16 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
-    const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
-    WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
-    WORD32 vec1_zero_bias, WORD32 out_multiplier, WORD32 out_shift);
-
-WORD32 xa_nn_dot_prod_16x16_asym8s(
-    WORD8 *__restrict__ p_out,               /* pointer to output */
-    const WORD16 *__restrict__ p_inp1_start, /* pointer to input1 */
-    const WORD16 *__restrict__ p_inp2_start, /* pointer to input2 */
-    const WORD32 *bias_ptr, WORD32 vec_length, WORD32 out_multiplier,
-    WORD32 out_shift, WORD32 out_zero_bias, WORD32 vec_count);
-
-/* Mapping the function names from the previous naming convention for backward
- * compatibility */
-#define xa_nn_vec_activation_min_max_asym8_asym8 \
-  xa_nn_vec_activation_min_max_asym8u_asym8u
-#define xa_nn_conv2d_std_asym8xasym8 xa_nn_conv2d_std_asym8uxasym8u
-#define xa_nn_conv2d_depthwise_asym8xasym8 xa_nn_conv2d_depthwise_asym8uxasym8u
-#define xa_nn_fully_connected_asym8xasym8_asym8 \
-  xa_nn_fully_connected_asym8uxasym8u_asym8u
-#define xa_nn_vec_softmax_asym8_asym8 xa_nn_vec_softmax_asym8u_asym8u
-#define xa_nn_dot_prod_asym8xasym8_asym8 xa_nn_dot_prod_asym8uxasym8u_asym8u
-#define xa_nn_matXvec_out_stride_asym8xasym8_asym8 \
-  xa_nn_matXvec_out_stride_asym8uxasym8u_asym8u
-
-#if defined(__cplusplus)
-}
-#endif
-#endif /* __XA_NNLIB_KERNELS_API_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_standards.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_standards.h
deleted file mode 100644
index 36ea75d..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_standards.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __STANDARDS_H__
-#define __STANDARDS_H__
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-typedef double flt64;
-typedef char Int4;
-typedef char Int8;
-typedef int16_t Int16;
-typedef int Int32;
-typedef int Int24;
-typedef int64_t Int64;
-typedef int Bool;
-typedef float Flt32;
-
-#ifdef MODEL_FLT64
-typedef double vect_t;
-typedef double coeff_t;
-typedef double accu_t;
-
-#elif MODEL_INT16
-typedef int16_t vect_t;
-typedef int16_t coeff_t;
-typedef signed char coeff8_t;
-typedef int64_t accu_t;
-typedef float coefff32_t;
-#endif
-
-typedef struct xa_nnlib_opaque {
-  Int32 _;
-} * xa_nnlib_handle_t;
-
-typedef enum _xa_nnlib_prec_t {
-  PREC_8 = 8,
-  PREC_16 = 16,
-  PREC_32 = 32,
-  PREC_F32 = -1,
-  PREC_F16 = -2,
-  PREC_ASYM8U = -3,
-  PREC_ASYM8S = -4,
-  PREC_SYM8S = -5
-} xa_nnlib_prec_t;
-
-typedef enum _xa_nnlib_shape_type_t {
-  SHAPE_UNKNOWN_T = 0,
-  SHAPE_VECTOR_T = 1,
-  SHAPE_MATRIX_T = 2,
-  SHAPE_CUBE_DWH_T = 3,
-  SHAPE_CUBE_WHD_T = 4
-} xa_nnlib_shape_type_t;
-
-typedef struct _xa_nnlib_shape_t {
-  xa_nnlib_shape_type_t shape_type;
-  Int32 n_shapes;
-  Int32 shape_offset;  // Offset between current shape and next shape
-  union {
-    struct {
-      Int32 height;
-      Int32 height_offset;
-      Int32 width;
-      Int32 width_offset;
-      Int32 depth;
-      Int32 depth_offset;
-    } cube;
-
-    struct {
-      Int32 length;
-    } vector;
-    struct {
-      Int32 rows;
-      Int32 row_offset;  // Offset between current row and next row
-      Int32 cols;
-    } matrix;
-  } dim;
-} xa_nnlib_shape_t;
-
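For illustration, a matrix shape for a rows x cols operand might be filled in
as follows (the field choices are assumptions based on the struct layout
above, not a documented recipe):

    xa_nnlib_shape_t shape;
    shape.shape_type = SHAPE_MATRIX_T;
    shape.n_shapes = 1;
    shape.shape_offset = 0;
    shape.dim.matrix.rows = 16;
    shape.dim.matrix.row_offset = 32;  /* elements between row starts */
    shape.dim.matrix.cols = 32;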
-/*****************************************************************************/
-/* Constant hash defines                                                     */
-/*****************************************************************************/
-#define XA_NNLIB_NO_ERROR 0
-/* error handling 'AND' definition */
-#define XA_FATAL_ERROR 0x80000000
-
-enum xa_error_severity {
-  xa_severity_nonfatal = 0,
-  xa_severity_fatal = (int)0xffffffff
-};
-
-enum xa_error_class {
-  xa_class_nnlib = 0,
-  xa_class_config = 1,
-  xa_class_execute = 2
-};
-
-#define XA_NNLIB_GENERIC 0
-
-#define XA_ERROR_CODE(severity, class, codec, index) \
-  ((severity << 31) | (class << 12) | (codec << 7) | index)
-#define XA_ERROR_SEVERITY(code) (((code)&XA_FATAL_ERROR) != 0)
-#define XA_ERROR_CLASS(code) (((code) >> 12) & 0x0f)
-#define XA_ERROR_CODEC(code) (((code) >> 7) & 0x1f)
-#define XA_ERROR_SUBCODE(code) (((code) >> 0) & 0x3f)
-
-/* Our convention is that only nnlib-class errors can be generic ones. */
-
-/*****************************************************************************/
-/* Class 0: NNLib Errors                                                     */
-/*****************************************************************************/
-/* Non Fatal Errors */
-/* (none) */
-/* Fatal Errors */
-enum xa_error_fatal_nnlib_generic {
-  XA_NNLIB_FATAL_MEM_ALLOC =
-      XA_ERROR_CODE(xa_severity_fatal, xa_class_nnlib, XA_NNLIB_GENERIC, 0),
-  XA_NNLIB_FATAL_MEM_ALIGN =
-      XA_ERROR_CODE(xa_severity_fatal, xa_class_nnlib, XA_NNLIB_GENERIC, 1),
-  XA_NNLIB_FATAL_INVALID_SHAPE =
-      XA_ERROR_CODE(xa_severity_fatal, xa_class_nnlib, XA_NNLIB_GENERIC, 3)
-};
-
-/*****************************************************************************/
-/* NNLib Startup Functions                                                   */
-/*****************************************************************************/
-const Int8* xa_nnlib_get_lib_name_string(void);
-const Int8* xa_nnlib_get_lib_version_string(void);
-const Int8* xa_nnlib_get_lib_api_version_string(void);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* __STANDARDS_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/xa_type_def.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/xa_type_def.h
deleted file mode 100644
index 13a7469..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/xa_type_def.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_TYPE_DEF_H__
-#define __XA_TYPE_DEF_H__
-
-#include <stdint.h>
-
-/****************************************************************************/
-/*     types               type define    prefix        examples      bytes */
-/************************  ***********    ******    ****************  ***** */
-typedef signed char WORD8;      /* b       WORD8    b_name     1   */
-typedef signed char* pWORD8;    /* pb      pWORD8   pb_name    1   */
-typedef unsigned char UWORD8;   /* ub      UWORD8   ub_count   1   */
-typedef unsigned char* pUWORD8; /* pub     pUWORD8  pub_count  1   */
-
-typedef int16_t WORD16;     /* s       WORD16   s_count    2   */
-typedef int16_t* pWORD16;   /* ps      pWORD16  ps_count   2   */
-typedef uint16_t UWORD16;   /* us      UWORD16  us_count   2   */
-typedef uint16_t* pUWORD16; /* pus     pUWORD16 pus_count  2   */
-
-typedef signed int WORD24;      /* k       WORD24   k_count    3   */
-typedef signed int* pWORD24;    /* pk      pWORD24  pk_count   3   */
-typedef unsigned int UWORD24;   /* uk      UWORD24  uk_count   3   */
-typedef unsigned int* pUWORD24; /* puk     pUWORD24 puk_count  3   */
-
-typedef signed int WORD32;      /* i       WORD32   i_count    4   */
-typedef signed int* pWORD32;    /* pi      pWORD32  pi_count   4   */
-typedef unsigned int UWORD32;   /* ui      UWORD32  ui_count   4   */
-typedef unsigned int* pUWORD32; /* pui     pUWORD32 pui_count  4   */
-
-typedef int64_t WORD40;     /* m       WORD40   m_count    5   */
-typedef int64_t* pWORD40;   /* pm      pWORD40  pm_count   5   */
-typedef uint64_t UWORD40;   /* um      UWORD40  um_count   5   */
-typedef uint64_t* pUWORD40; /* pum     pUWORD40 pum_count  5   */
-
-typedef int64_t WORD64;     /* h       WORD64   h_count    8   */
-typedef int64_t* pWORD64;   /* ph      pWORD64  ph_count   8   */
-typedef uint64_t UWORD64;   /* uh      UWORD64  uh_count   8   */
-typedef uint64_t* pUWORD64; /* puh     pUWORD64 puh_count  8   */
-
-typedef float FLOAT32;    /* f       FLOAT32  f_count    4   */
-typedef float* pFLOAT32;  /* pf      pFLOAT32 pf_count   4   */
-typedef double FLOAT64;   /* d       FLOAT64  d_count    8   */
-typedef double* pFLOAT64; /* pd      pFLOAT64 pd_count   8   */
-
-typedef void VOID;   /* v       VOID     v_flag     4   */
-typedef void* pVOID; /* pv      pVOID    pv_flag    4   */
-
-/* variable size types: platform optimized implementation */
-typedef signed int BOOL;       /* bool    BOOL     bool_true      */
-typedef unsigned int UBOOL;    /* ubool   BOOL     ubool_true     */
-typedef signed int FLAG;       /* flag    FLAG     flag_false     */
-typedef unsigned int UFLAG;    /* uflag   FLAG     uflag_false    */
-typedef signed int LOOPIDX;    /* lp      LOOPIDX  lp_index       */
-typedef unsigned int ULOOPIDX; /* ulp     ULOOPIDX ulp_index      */
-typedef signed int WORD;       /* w       WORD     w_value        */
-typedef unsigned int UWORD;    /* uw      UWORD    uw_value       */
-
-typedef LOOPIDX LOOPINDEX;   /* lp    LOOPIDX  lp_index       */
-typedef ULOOPIDX ULOOPINDEX; /* ulp   ULOOPIDX ulp_index      */
-
-#define PLATFORM_INLINE __inline
-
-typedef struct xa_codec_opaque {
-  WORD32 _;
-} * xa_codec_handle_t;
-
-typedef int XA_ERRORCODE;
-
-typedef XA_ERRORCODE xa_codec_func_t(xa_codec_handle_t p_xa_module_obj,
-                                     WORD32 i_cmd, WORD32 i_idx,
-                                     pVOID pv_value);
-
-#endif /* __XA_TYPE_DEF_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xtensa_tf_micro_common.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xtensa_tf_micro_common.h
deleted file mode 100644
index 81847b6..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xtensa_tf_micro_common.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2019 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XTENSA_TF_MICRO_COMMON__
-#define __XTENSA_TF_MICRO_COMMON__
-
-#include <stdint.h>  // uintptr_t (ALIGN_PTR), int32_t/uint8_t (ACTIVATION_MIN_MAX_ASYM8)
-#include <stdio.h>   // printf/fflush (PRINT_VAR)
-
-#if defined HIFI_NNLIB_OPT || defined HIFI_MINI_NNLIB_OPT
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_api.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_standards.h"
-
-#define CHECK_ERR_HIFI_NNLIB_KER(ret, err_msg) \
-  if (ret != 0) {                              \
-    TF_LITE_KERNEL_LOG(context, err_msg);      \
-    return kTfLiteError;                       \
-  }
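-
-/* Note: the macro above expects a `TfLiteContext* context` to be in scope at
-   the call site. Illustrative use (kernel name and arguments hypothetical):
-     int ret = xa_nn_fully_connected_asym8xasym8_asym8(...);
-     CHECK_ERR_HIFI_NNLIB_KER(ret, "fully_connected: NNLib kernel failed.");
-*/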
-
-#ifndef XTENSA_NNLIB_MAX_SCRATCH_SIZE
-#define XTENSA_NNLIB_MAX_SCRATCH_SIZE (70 * 1024)
-#endif
-
-#define ALLOCATE_XTENSA_NNLIB_SCRATCH_MEM \
-  uint8_t xtensa_nnlib_scratch_buf[XTENSA_NNLIB_MAX_SCRATCH_SIZE];
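-
-/* Illustrative usage (names other than the macro's own are hypothetical):
-     ALLOCATE_XTENSA_NNLIB_SCRATCH_MEM;
-     void* scratch = (void*)ALIGN_PTR(xtensa_nnlib_scratch_buf, 8);
-*/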
-
-// Fully parenthesized and without a trailing semicolon so these expand safely
-// inside larger expressions.
-#define MIN(a, b) (((a) < (b)) ? (a) : (b))
-#define MAX(a, b) (((a) > (b)) ? (a) : (b))
-
-// Wrapped in do/while(0) so each macro expands to a single statement that
-// takes a trailing semicolon safely (e.g. in an unbraced if/else).
-#define ACTIVATION_MIN_MAX(data_type, out, inp, min, max) \
-  do {                                                    \
-    data_type temp = MAX(inp, min);                       \
-    out = MIN(temp, max);                                 \
-  } while (0)
-
-#define ACTIVATION_MIN_MAX_F32(out, inp, min, max) \
-  do {                                             \
-    float temp = MAX(inp, min);                    \
-    out = MIN(temp, max);                          \
-  } while (0)
-
-#define ACTIVATION_MIN_MAX_ASYM8(out, inp, min, max) \
-  do {                                               \
-    int32_t temp = MAX((int32_t)inp, min);           \
-    out = (uint8_t)MIN(temp, max);                   \
-  } while (0)
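-
-/* Typical (illustrative) call; `acc` and the min/max bounds are hypothetical
-   kernel-local variables:
-     int8_t out_val;
-     ACTIVATION_MIN_MAX(int8_t, out_val, acc, activation_min, activation_max);
-*/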
-
-#define ALIGNED_SIZE(x, bytes) (((x) + ((bytes)-1)) & (~((bytes)-1)))
-// Cast through uintptr_t (not unsigned int) so pointer values are not
-// truncated on targets where pointers are wider than int.
-#define ALIGN_PTR(x, bytes) \
-  ((((uintptr_t)(x)) + ((bytes)-1)) & (~(uintptr_t)((bytes)-1)))
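-// e.g. (illustrative) ALIGNED_SIZE(13, 8) == 16; `bytes` must be a power of
-// two for the mask arithmetic above to hold.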
-
-// Also a do/while(0) single statement; requires <stdio.h> (included above).
-#define PRINT_VAR(var)              \
-  do {                              \
-    printf("%s = %d\n", #var, var); \
-    fflush(stdout);                 \
-    fflush(stderr);                 \
-  } while (0)
-
-#endif /* HIFI_NNLIB_OPT || HIFI_MINI_NNLIB_OPT */
-
-#endif /* __XTENSA_TF_MICRO_COMMON__ */
diff --git a/tensorflow/lite/micro/tools/make/ext_libs/xtensa_hifimini_staging_nn_library.inc b/tensorflow/lite/micro/tools/make/ext_libs/xtensa_hifimini_staging_nn_library.inc
deleted file mode 100644
index df7d308..0000000
--- a/tensorflow/lite/micro/tools/make/ext_libs/xtensa_hifimini_staging_nn_library.inc
+++ /dev/null
@@ -1,30 +0,0 @@
-ifneq ($(filter xtensa_hifimini_staging, $(ALL_TAGS)),)
-
-    XTENSA_PATH = $(MAKEFILE_DIR)/../../kernels/xtensa_hifimini_staging
-
-    ifneq (,$(filter xtensa_hifimini%, $(ALL_TAGS)))
-
-        CCFLAGS += -DHIFI_MINI_NNLIB_OPT \
-                   -DDISABLE_NNLIB_UNALIGNED_SUPPORT \
-                   -DXTENSA_NNLIB_MAX_SCRATCH_SIZE=1024
-
-        CXXFLAGS += -DHIFI_MINI_NNLIB_OPT \
-                   -DDISABLE_NNLIB_UNALIGNED_SUPPORT \
-                   -DXTENSA_NNLIB_MAX_SCRATCH_SIZE=1024
-
-        MICROLITE_CC_SRCS += \
-                    $(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_activations_asym8s_asym8s.c \
-                    $(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_softmax_asym8_asym8.c \
-                    $(XTENSA_PATH)/xa_nnlib/algo/kernels/basic/hifi_mini/xa_nn_dot_prod_16x16.c \
-                    $(XTENSA_PATH)/xa_nnlib/algo/kernels/fc/hifi_mini/xa_nn_fully_connected.c \
-                    $(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi_mini/xa_nn_matXvec_sym8sxasym8s.c
-
-        INCLUDES += -I$(XTENSA_PATH)/xa_nnlib/algo/kernels/ \
-                    -I$(XTENSA_PATH)/xa_nnlib/include/nnlib/ \
-                    -I$(XTENSA_PATH)/xa_nnlib/include/ \
-                    -I$(XTENSA_PATH)/xa_nnlib/algo/common/include/ \
-
-    endif
-
-endif
diff --git a/tensorflow/lite/micro/tools/make/targets/xtensa_hifimini_staging_makefile.inc b/tensorflow/lite/micro/tools/make/targets/xtensa_hifimini_staging_makefile.inc
deleted file mode 100644
index 557b8f6..0000000
--- a/tensorflow/lite/micro/tools/make/targets/xtensa_hifimini_staging_makefile.inc
+++ /dev/null
@@ -1,62 +0,0 @@
-# Settings for the Xtensa toolchain for the hifimini kernels.
-# REQUIRED:
-#  Environment variables:
-#   - XTENSA_BASE: must be set to the location of the Xtensa developer tools
-#     installation directory.
-#  Command line arguments:
-#   - XTENSA_TOOLS_VERSION: for example, RI-2019.2-linux
-#   - XTENSA_CORE: the name of the Xtensa core to use,
-#     for example, hifimini.
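-#
-# Illustrative invocation (an assumption based on the variables above, not
-# taken from this file; adjust paths and versions to your installation):
-#   export XTENSA_BASE=/opt/xtensa
-#   make -f tensorflow/lite/micro/tools/make/Makefile \
-#     TARGET=xtensa_hifimini_staging \
-#     XTENSA_TOOLS_VERSION=RI-2019.2-linux \
-#     XTENSA_CORE=hifimini test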
-
-ifeq ($(TARGET), xtensa_hifimini_staging)
-  TARGET_ARCH := xtensa_hifimini_staging
-
-  ifndef XTENSA_BASE
-    $(error XTENSA_BASE is undefined)
-  endif
-
-  ifndef XTENSA_TOOLS_VERSION
-    $(error XTENSA_TOOLS_VERSION is undefined)
-  endif
-
-  ifndef XTENSA_CORE
-    $(error XTENSA_CORE is undefined)
-  endif
-
-  PLATFORM_ARGS = \
-    -DTF_LITE_MCU_DEBUG_LOG \
-    --xtensa-core=$(XTENSA_CORE) \
-    -mcoproc \
-    -DXTENSA -DMAX_RFFT_PWR=9 -DMIN_RFFT_PWR=MAX_RFFT_PWR \
-    -fdata-sections \
-    -ffunction-sections \
-    -fno-exceptions \
-    -fno-unwind-tables \
-    -fno-use-cxa-atexit \
-    -fmessage-length=0 \
-    -fno-threadsafe-statics
-
-  export PATH := $(XTENSA_BASE)/tools/$(XTENSA_TOOLS_VERSION)/XtensaTools/bin:$(PATH)
-  TARGET_TOOLCHAIN_PREFIX := xt-
-  CXX_TOOL := clang++
-  CC_TOOL := clang
-
-  CXXFLAGS += $(PLATFORM_ARGS)
-  CCFLAGS += $(PLATFORM_ARGS)
-
-  LDFLAGS += -Wl,-gc-sections
-
-  TEST_SCRIPT := tensorflow/lite/micro/testing/test_xtensa_hifimini_staging_binary.sh
-
-  # TODO(b/156962140): This manually maintained list of excluded examples is
-  # quite error prone.
-  EXCLUDED_EXAMPLE_TESTS := \
-    tensorflow/lite/micro/examples/image_recognition_experimental/Makefile.inc \
-    tensorflow/lite/micro/examples/magic_wand/Makefile.inc \
-    tensorflow/lite/micro/examples/micro_speech/Makefile.inc \
-    tensorflow/lite/micro/examples/network_tester/Makefile.inc \
-    tensorflow/lite/micro/examples/person_detection/Makefile.inc \
-    tensorflow/lite/micro/examples/person_detection_experimental/Makefile.inc
-  MICRO_LITE_EXAMPLE_TESTS := $(filter-out $(EXCLUDED_EXAMPLE_TESTS), $(MICRO_LITE_EXAMPLE_TESTS))
-
-endif
diff --git a/tensorflow/lite/micro/xtensa_hifimini_staging/debug_log.cc b/tensorflow/lite/micro/xtensa_hifimini_staging/debug_log.cc
deleted file mode 100644
index 45d9317..0000000
--- a/tensorflow/lite/micro/xtensa_hifimini_staging/debug_log.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Reference implementation of the DebugLog() function that's required for a
-// platform to support the TensorFlow Lite for Microcontrollers library. This is
-// the only function that's absolutely required to be available on a target
-// device, since it's used for communicating test results back to the host so
-// that we can verify the implementation is working correctly.
-// It's designed to be as easy as possible to supply an implementation though.
-// On platforms that have a POSIX stack or C library, it can be written as a
-// single call to `fprintf(stderr, "%s", s)` to output a string to the error
-// stream of the console, but if there's no OS or C library available, there's
-// almost always an equivalent way to write out a string to some serial
-// interface that can be used instead. For example on Arm M-series MCUs, calling
-// the `bkpt #0xAB` assembler instruction will output the string in r1 to
-// whatever debug serial connection is available. If you're running mbed, you
-// can do the same by creating `Serial pc(USBTX, USBRX)` and then calling
-// `pc.printf("%s", s)`.
-// To add an equivalent function for your own platform, create your own
-// implementation file, and place it in a subfolder named after the OS you're
-// targeting. For example, see the Cortex M bare metal version in
-// tensorflow/lite/micro/bluepill/debug_log.cc or the mbed one in
-// tensorflow/lite/micro/mbed/debug_log.cc.
-
-#include "tensorflow/lite/micro/debug_log.h"
-
-#ifndef TF_LITE_STRIP_ERROR_STRINGS
-#include <cstdio>
-#endif
-
-extern "C" void DebugLog(const char* s) {
-#ifndef TF_LITE_STRIP_ERROR_STRINGS
-  // Reusing TF_LITE_STRIP_ERROR_STRINGS to disable DebugLog completely to get
-  // maximum reduction in binary size. This is because we have DebugLog calls
-  // via TF_LITE_CHECK that are not stubbed out by TF_LITE_REPORT_ERROR.
-  fprintf(stderr, "%s", s);
-#endif
-}
diff --git a/tensorflow/lite/micro/xtensa_hifimini_staging/micro_time.cc b/tensorflow/lite/micro/xtensa_hifimini_staging/micro_time.cc
deleted file mode 100644
index 6f3844c..0000000
--- a/tensorflow/lite/micro/xtensa_hifimini_staging/micro_time.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Xtensa implementation of micro_time.
-// To include this with make, add TAGS=xtensa-xpg.
-#include "tensorflow/lite/micro/micro_time.h"
-
-#include <time.h>
-
-namespace tflite {
-
-int32_t ticks_per_second() { return CLOCKS_PER_SEC; }
-
-int32_t GetCurrentTimeTicks() { return clock(); }
-
-}  // namespace tflite