Remove xtensa_hifimini_staging (in favor of consolidated xtensa directory).
See http://b/173043817 for more details.
PiperOrigin-RevId: 343329539
Change-Id: I0b38539f197b924f2f9f58f5f84ed5bc612ae7a2
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/fully_connected.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/fully_connected.cc
deleted file mode 100644
index f9b49a2..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/fully_connected.cc
+++ /dev/null
@@ -1,197 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
-
-#include <xtensa/tie/xt_hifi2.h>
-
-#include "tensorflow/lite/c/builtin_op_data.h"
-#include "tensorflow/lite/c/common.h"
-#include "tensorflow/lite/kernels/internal/common.h"
-#include "tensorflow/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
-#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xtensa_tf_micro_common.h"
-namespace tflite {
-namespace ops {
-namespace micro {
-
-namespace fully_connected {
-namespace {
-
-struct OpData {
- // The scaling factor from input to output (aka the 'real multiplier') can
- // be represented as a fixed point multiplier plus a left shift.
- int32_t output_multiplier;
- int output_shift;
- // The range of the fused activation layer. For example for kNone and
- // uint8_t these would be 0 and 255.
- int32_t output_activation_min;
- int32_t output_activation_max;
- // The index of the temporary tensor where the quantized inputs are cached.
- int input_quantized_index;
-};
-
-constexpr int kInputTensor = 0;
-constexpr int kWeightsTensor = 1;
-constexpr int kBiasTensor = 2;
-constexpr int kOutputTensor = 0;
-
-TfLiteStatus CalculateOpData(TfLiteContext* context,
- TfLiteFusedActivation activation,
- TfLiteType data_type, const TfLiteTensor* input,
- const TfLiteTensor* filter,
- const TfLiteTensor* bias, TfLiteTensor* output,
- OpData* data) {
- if (data_type != kTfLiteInt8) {
- TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
- TfLiteTypeGetName(data_type), data_type);
- return kTfLiteError;
- }
-
- double real_multiplier = 0.0;
- TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
- context, input, filter, bias, output, &real_multiplier));
- xtensa::hifimini::QuantizeMultiplier(
- real_multiplier, &data->output_multiplier, &data->output_shift);
- return CalculateActivationRangeQuantized(context, activation, output,
- &data->output_activation_min,
- &data->output_activation_max);
-}
-
-} // namespace
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
- TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
- return context->AllocatePersistentBuffer(context, sizeof(OpData));
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
- TFLITE_DCHECK(node->user_data != nullptr);
- TFLITE_DCHECK(node->builtin_data != nullptr);
-
- OpData* data = static_cast<OpData*>(node->user_data);
- const auto* params =
- reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
-
- const TfLiteTensor* input = GetInput(context, node, kInputTensor);
- const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
- const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
- TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
- return CalculateOpData(context, params->activation, input->type, input,
- filter, bias, output, data);
-}
-
-TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
- const OpData& data, const TfLiteTensor* input,
- const TfLiteTensor* filter,
- const TfLiteTensor* bias, TfLiteTensor* output) {
- // TODO(b/154032858): Investigate removing extra copies.
- FullyConnectedParams op_params;
- op_params.input_offset = -input->params.zero_point;
- op_params.weights_offset = -filter->params.zero_point;
- op_params.output_offset = output->params.zero_point;
- op_params.output_multiplier = data.output_multiplier;
- op_params.output_shift = data.output_shift;
- op_params.quantized_activation_min = data.output_activation_min;
- op_params.quantized_activation_max = data.output_activation_max;
-
- {
- int ret, b, weight_depth, out_depth, batches;
- int8_t* p_out = GetTensorData<int8_t>(output);
- weight_depth = GetTensorShape(filter).Dims(
- GetTensorShape(filter).DimensionsCount() - 1);
- out_depth = GetTensorShape(output).Dims(
- GetTensorShape(output).DimensionsCount() - 1);
- batches = FlatSizeSkipDim(GetTensorShape(output),
- GetTensorShape(output).DimensionsCount() - 1);
-
- // TODO: Use xa_nn_fully_connected_sym8xasym8s_asym8s? the kernel tests fail
- // with it.
- for (b = 0; b < batches; b++) {
- ret = xa_nn_fully_connected_asym8sxasym8s_asym8s(
- (GetTensorData<int8_t>(output) + b * out_depth),
- GetTensorData<int8_t>(filter),
- (GetTensorData<int8_t>(input) + b * weight_depth),
- GetTensorData<int32_t>(bias), weight_depth, out_depth,
- op_params.weights_offset, op_params.input_offset,
- (op_params.output_multiplier << 8), op_params.output_shift,
- op_params.output_offset);
- CHECK_ERR_HIFI_NNLIB_KER(
- ret, "xa_nn_fully_connected_sym8xasym8s_asym8s failed");
- }
- ret = xa_nn_vec_activation_min_max_asym8s_asym8s(
- p_out, p_out, data.output_activation_min, data.output_activation_max,
- batches * out_depth);
- CHECK_ERR_HIFI_NNLIB_KER(
- ret,
- "fully_connected: xa_nn_vec_activation_min_max_asym8s_asym8s failed");
- }
- return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
- TFLITE_DCHECK(node->user_data != nullptr);
- const OpData& data = *(static_cast<const OpData*>(node->user_data));
-
- const TfLiteTensor* input = GetInput(context, node, kInputTensor);
- const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
- const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
- TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
- TFLITE_DCHECK(filter->type == kTfLiteInt8);
- return EvalQuantizedInt8(context, node, data, input, filter, bias, output);
-}
-
-} // namespace fully_connected
-
-TfLiteRegistration Register_FULLY_CONNECTED() {
- return {/*init=*/fully_connected::Init,
- /*free=*/nullptr,
- /*prepare=*/fully_connected::Prepare,
- /*invoke=*/fully_connected::Eval,
- /*profiling_string=*/nullptr,
- /*builtin_code=*/0,
- /*custom_name=*/nullptr,
- /*version=*/0};
-}
-
-} // namespace micro
-} // namespace ops
-} // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/quantize.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/quantize.cc
deleted file mode 100644
index 13c19cc..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/quantize.cc
+++ /dev/null
@@ -1,172 +0,0 @@
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/kernels/internal/reference/quantize.h"
-
-#include <xtensa/tie/xt_hifi2.h>
-
-#include "tensorflow/lite/c/common.h"
-#include "tensorflow/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
-
-namespace tflite {
-namespace ops {
-namespace micro {
-
-namespace xtensa {
-namespace hifimini {
-
-void AffineQuantize(int scale_multiplier,
- const tflite::QuantizationParams& op_params,
- const RuntimeShape& input_shape, const int16_t* input_data,
- const RuntimeShape& output_shape, int8_t* output_data) {
- const int32_t zero_point = op_params.zero_point;
- const int flat_size = MatchingFlatSize(input_shape, output_shape);
- ae_q56s min_val_56 = AE_CVTQ48A32S(INT16_MIN);
- ae_q56s max_val_56 = AE_CVTQ48A32S(INT16_MAX);
- ae_q56s zero_point_56 = AE_CVTQ48A32S(zero_point);
-
- const ae_p16x2s* input_data_ptr = (const ae_p16x2s*)(input_data - 2);
-
- ae_p24x2s scale_multiplier_24x2 = AE_MOVPA24(scale_multiplier);
-
- int iters = flat_size / 2;
- for (int i = 0; i < iters; i++) {
- // Load two 16bit pairs into the 2x24bit register PR:
- // Values need to be right shifted 8 bits to align from upper 16bits to a
- // 24bit value:
- ae_p24x2s inputs_24x2;
- AE_LP16X2F_IU(inputs_24x2, input_data_ptr, 4);
- inputs_24x2 = AE_P24X2S_SRAI(inputs_24x2, 8);
-
- // Q0.23 * Q16.0 == Q16.23
- {
- ae_q56s sum_56 = AE_MULP24S_HH(scale_multiplier_24x2, inputs_24x2);
-
- // Q16.23 -> Q16.0
- // Shift right only 7 bits (23 - 16). This truncated shift aligns the
- // 16bit value at the truncation line for 32bit in the QR register. The
- // lower 16 bits will be used for rounding in AE_ROUNDSQ32SYM.
- sum_56 = AE_Q56S_SRAI(sum_56, 7);
-
- // Round and truncate 32 bits
- sum_56 = AE_ROUNDSQ32SYM(sum_56);
-
- // Add offset (zero_point_56 is already aligned at 32bits.
- sum_56 = AE_ADDQ56(sum_56, zero_point_56);
-
- // Saturate:
- sum_56 = AE_MINQ56S(sum_56, max_val_56);
- sum_56 = AE_MAXQ56S(sum_56, min_val_56);
-
- output_data[i * 2] = static_cast<int16_t>(AE_TRUNCA32Q48(sum_56));
- }
- {
- ae_q56s sum_56 = AE_MULP24S_LL(scale_multiplier_24x2, inputs_24x2);
-
- // Q16.23 -> Q16.0
- // Shift right only 7 bits (23 - 16). This truncated shift aligns the
- // 16bit value at the truncation line for 32bit in the QR register. The
- // lower 16 bits will be used for rounding in AE_ROUNDSQ32SYM.
- sum_56 = AE_Q56S_SRAI(sum_56, 23 - 16);
-
- // Round and truncate 32 bits
- sum_56 = AE_ROUNDSQ32SYM(sum_56);
-
- // Add offset (zero_point_56 is already aligned at 32bits.
- sum_56 = AE_ADDQ56(sum_56, zero_point_56);
-
- // Saturate:
- sum_56 = AE_MINQ56S(sum_56, max_val_56);
- sum_56 = AE_MAXQ56S(sum_56, min_val_56);
-
- output_data[i * 2 + 1] = static_cast<int16_t>(AE_TRUNCA32Q48(sum_56));
- }
- }
-}
-
-} // namespace hifimini
-} // namespace xtensa
-
-namespace quantize {
-
-struct OpData {
- int scale_multiplier = 0;
-};
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
- TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
- return context->AllocatePersistentBuffer(context, sizeof(OpData));
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
- TFLITE_DCHECK(node->user_data != nullptr);
- auto* op_data = static_cast<OpData*>(node->user_data);
-
- TfLiteTensor* output = GetOutput(context, node, 0);
- const TfLiteTensor* input = GetInput(context, node, 0);
-
- // TODO(b/155682734): Fix dangerous input/output scale ratio assumptions.
- op_data->scale_multiplier = xtensa::hifimini::CreateQConstantForInt24(
- 0, input->params.scale / output->params.scale);
-
- return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
- TFLITE_DCHECK(node->user_data != nullptr);
- auto* op_data = static_cast<OpData*>(node->user_data);
-
- const TfLiteTensor* input = GetInput(context, node, 0);
- TfLiteTensor* output = GetOutput(context, node, 0);
-
- tflite::QuantizationParams op_params;
- op_params.zero_point = output->params.zero_point;
-
- if (input->type != kTfLiteInt16 && output->type != kTfLiteInt8) {
- TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.",
- TfLiteTypeGetName(input->type),
- TfLiteTypeGetName(output->type));
- return kTfLiteError;
- }
-
- xtensa::hifimini::AffineQuantize(
- op_data->scale_multiplier, op_params, GetTensorShape(input),
- GetTensorData<int16_t>(input), GetTensorShape(output),
- GetTensorData<int8_t>(output));
- return kTfLiteOk;
-}
-
-} // namespace quantize
-
-// This Op (QUANTIZE) quantizes the input and produces quantized output.
-// AffineQuantize takes scale and zero point and quantizes the float value to
-// quantized output, in int8_t or uint8_t format.
-TfLiteRegistration Register_QUANTIZE() {
- return {/*init=*/quantize::Init,
- /*free=*/nullptr,
- /*prepare=*/quantize::Prepare,
- /*invoke=*/quantize::Eval,
- /*profiling_string=*/nullptr,
- /*builtin_code=*/0,
- /*custom_name=*/nullptr,
- /*version=*/0};
-}
-
-} // namespace micro
-} // namespace ops
-} // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/softmax.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/softmax.cc
deleted file mode 100644
index 3e5ef19..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/softmax.cc
+++ /dev/null
@@ -1,189 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/kernels/internal/reference/softmax.h"
-
-#include "tensorflow/lite/c/builtin_op_data.h"
-#include "tensorflow/lite/c/common.h"
-#include "tensorflow/lite/kernels/internal/common.h"
-#include "tensorflow/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xtensa_tf_micro_common.h"
-
-namespace tflite {
-namespace ops {
-namespace micro {
-namespace activations {
-namespace {
-
-struct OpData {
- int32_t input_multiplier;
- int32_t input_left_shift;
- int32_t diff_min;
- int scratch_tensor_index;
-};
-
-} // namespace
-
-TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context,
- const TfLiteTensor* input,
- TfLiteTensor* output,
- const TfLiteSoftmaxParams* params,
- OpData* op_data) {
- if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
- if (input->type == kTfLiteUInt8) {
- TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
- } else {
- if (output->type == kTfLiteInt16) {
- TF_LITE_ENSURE_EQ(context, output->params.zero_point,
- std::numeric_limits<int16_t>::min());
- // NOTE: Current int16_t softmax output does not require symmetric
- // scaling
- // - so no need to verify scale here.
- } else {
- TF_LITE_ENSURE_EQ(context, output->params.zero_point,
- std::numeric_limits<int8_t>::min());
- TF_LITE_ENSURE(context, output->params.scale == 1.f / 256);
- }
- }
-
- static const int kScaledDiffIntegerBits = 5;
-
- int input_left_shift;
- tflite::PreprocessSoftmaxScaling(
- static_cast<double>(params->beta),
- static_cast<double>(input->params.scale), kScaledDiffIntegerBits,
- &op_data->input_multiplier, &input_left_shift);
- op_data->input_left_shift = input_left_shift;
- op_data->diff_min =
- -1.0 * tflite::CalculateInputRadius(kScaledDiffIntegerBits,
- op_data->input_left_shift);
- }
- return kTfLiteOk;
-}
-
-void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) {
- TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
- return context->AllocatePersistentBuffer(context, sizeof(OpData));
-}
-
-TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
- auto* params = static_cast<TfLiteSoftmaxParams*>(node->builtin_data);
-
- TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
- TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
- const TfLiteTensor* input = GetInput(context, node, 0);
- TfLiteTensor* output = GetOutput(context, node, 0);
- TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
-
- TFLITE_DCHECK(node->user_data != nullptr);
- OpData* op_data = static_cast<OpData*>(node->user_data);
-
- const RuntimeShape& input_shape = GetTensorShape(input);
- const RuntimeShape& output_shape = GetTensorShape(output);
- const int trailing_dim = input_shape.DimensionsCount() - 1;
- const int depth =
- MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
- int scratch_size =
- xa_nn_get_softmax_scratch_size(PREC_SYM8S, PREC_SYM8S, depth);
-
- const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
- context, scratch_size, &(op_data->scratch_tensor_index));
- TF_LITE_ENSURE_OK(context, scratch_status);
- // Allocate an array to precompute exponents over all int8_t inputs, applying
- // the scale and beta before calculating exp. It is mandatory to apply beta
- // and scale here, since each softmax op may have different beta and scale
- // values. Beta and scale will remain constant for a given softmax op.
-
- TF_LITE_ENSURE_STATUS(
- CalculateSoftmaxOpData(context, input, output, params, op_data));
-
- return kTfLiteOk;
-}
-
-TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
- auto* op_data = static_cast<OpData*>(node->user_data);
-
- const TfLiteTensor* input = GetInput(context, node, 0);
- TfLiteTensor* output = GetOutput(context, node, 0);
-
- if (input->type == kTfLiteInt8 && output->type == kTfLiteInt16) {
- const RuntimeShape& input_shape = GetTensorShape(input);
- const int8_t* input_data = GetTensorData<int8_t>(input);
- const RuntimeShape& output_shape = GetTensorShape(output);
- int16_t* output_data = GetTensorData<int16_t>(output);
- const int trailing_dim = input_shape.DimensionsCount() - 1;
- const int outer_size =
- MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
- const int depth =
- MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
-
- void* p_scratch = static_cast<void*>(
- context->GetScratchBuffer(context, op_data->scratch_tensor_index));
- TFLITE_DCHECK(p_scratch != nullptr);
-
- for (int i = 0; i < outer_size; ++i) {
- int err = xa_nn_vec_softmax_asym8s_16(
- &output_data[i * depth], &input_data[i * depth], op_data->diff_min,
- op_data->input_left_shift, op_data->input_multiplier, depth,
- p_scratch);
- CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_softmax_asym8s_16 failed");
- }
- return kTfLiteOk;
- } else {
- TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
- TfLiteTypeGetName(input->type), input->type);
- return kTfLiteError;
- }
-}
-} // namespace activations
-
-TfLiteRegistration Register_SOFTMAX() {
- return {/*init=*/activations::SoftmaxInit,
- /*free=*/nullptr,
- /*prepare=*/activations::SoftmaxPrepare,
- /*invoke=*/activations::SoftmaxEval,
- /*profiling_string=*/nullptr,
- /*builtin_code=*/0,
- /*custom_name=*/nullptr,
- /*version=*/0};
-}
-
-} // namespace micro
-} // namespace ops
-} // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/svdf.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/svdf.cc
deleted file mode 100644
index 05256f3..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/svdf.cc
+++ /dev/null
@@ -1,356 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <math.h>
-#include <xtensa/tie/xt_hifi2.h>
-
-#include "tensorflow/lite/c/builtin_op_data.h"
-#include "tensorflow/lite/c/common.h"
-#include "tensorflow/lite/kernels/internal/common.h"
-#include "tensorflow/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/kernels/op_macros.h"
-#include "tensorflow/lite/micro/kernels/activation_utils.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xtensa_tf_micro_common.h"
-
-namespace tflite {
-namespace ops {
-namespace micro {
-namespace svdf {
-namespace {
-
-struct OpData {
- int32_t effective_scale_1_a;
- int32_t effective_scale_2_a;
- // b versions of each scale are kept at int since the numbers are just the
- // shift value - typically between [-32, 32].
- int effective_scale_1_b;
- int effective_scale_2_b;
- int scratch_tensor_index;
- int scratch_output_tensor_index;
-};
-
-// Input tensors.
-constexpr int kInputTensor = 0;
-constexpr int kWeightsFeatureTensor = 1;
-constexpr int kWeightsTimeTensor = 2;
-constexpr int kBiasTensor = 3;
-// This is a variable tensor, and will be modified by this op.
-constexpr int kInputActivationStateTensor = 4;
-
-// Output tensor.
-constexpr int kOutputTensor = 0;
-
-/**
- * This version of SVDF is specific to TFLite Micro. It contains only a full
- * integer receipe with optimizations for the Xtensa HiFiMini platform.
- *
- * Note: passing OpData by value might seem like an oversight but it helps
- * reduce the latency. See b/155656675 for more details.
- */
-TfLiteStatus EvalIntegerSVDF(TfLiteContext* context, TfLiteNode* node,
- const TfLiteTensor* input_tensor,
- const TfLiteTensor* weights_feature_tensor,
- const TfLiteTensor* weights_time_tensor,
- const TfLiteTensor* bias_tensor,
- const TfLiteSVDFParams* params,
- TfLiteTensor* activation_state_tensor,
- TfLiteTensor* output_tensor, OpData data,
- int32_t input_zp, int32_t output_zp) {
- const int n_rank = params->rank;
- const int n_batch = input_tensor->dims->data[0];
- const int n_input = input_tensor->dims->data[1];
- const int n_filter = weights_feature_tensor->dims->data[0];
- const int n_unit = n_filter / n_rank;
- const int n_memory = weights_time_tensor->dims->data[1];
-
- TFLITE_DCHECK(context != nullptr);
- TFLITE_DCHECK(context->GetScratchBuffer != nullptr);
-
- int32_t* scratch_tensor = static_cast<int32_t*>(
- context->GetScratchBuffer(context, data.scratch_tensor_index));
- TFLITE_DCHECK(scratch_tensor != nullptr);
- int32_t* scratch_output_tensor = static_cast<int32_t*>(
- context->GetScratchBuffer(context, data.scratch_output_tensor_index));
- TFLITE_DCHECK(scratch_output_tensor != nullptr);
-
- // Shift states.
- int16_t* const state_ptr = GetTensorData<int16_t>(activation_state_tensor);
-
- // Left shift the activation_state.
-
- // 4-byte alignment check for state_ptr
- if (((reinterpret_cast<int>(state_ptr)) & 0x3) == 0) {
- // 4-bytes aligned processing
- ae_p16x2s* new_state_start = (ae_p16x2s*)(state_ptr - 2);
- const ae_p16x2s* old_state_start = (ae_p16x2s*)(state_ptr - 2);
- int loopcnt = (n_batch * n_filter * n_memory) - 1;
- ae_p24x2s dstate, dtmp, dout;
-
- AE_LP16X2F_IU(dtmp, old_state_start, 4);
- AE_LP16X2F_IU(dstate, old_state_start, 4);
- for (int i = 0; i < (loopcnt >> 1); i++) {
- dout = AE_SELP24_LH(dtmp, dstate);
- dtmp = dstate;
- AE_LP16X2F_IU(dstate, old_state_start, 4);
- AE_SP16X2F_IU(dout, new_state_start, 4);
- }
- if (loopcnt & 0x1) {
- AE_SP16F_L_I(dtmp, (ae_p16s*)new_state_start, 4);
- }
- } else {
- // 2-bytes aligned processing
- ae_p16s* new_state_start = (ae_p16s*)(state_ptr - 1);
- const ae_p16s* old_state_start = (ae_p16s*)(state_ptr);
- int loopcnt = (n_batch * n_filter * n_memory) - 1;
- ae_p24x2s dstate;
- for (int i = 0; i < loopcnt; i++) {
- AE_LP16F_IU(dstate, old_state_start, 2);
- AE_SP16F_L_IU(dstate, new_state_start, 2);
- }
- }
- // Note: no need to clear the latest activation, matmul is not accumulative.
-
- // Feature matmul.
- {
- int16_t* state = GetTensorData<int16_t>(activation_state_tensor);
- const int8_t* input = GetTensorData<int8_t>(input_tensor);
- const int8_t* weight_feature =
- GetTensorData<int8_t>(weights_feature_tensor);
- int16_t* result_in_batch = state + (n_memory - 1);
- int err = 0;
-
- for (int b = 0; b < n_batch; b++) {
- err = xa_nn_matXvec_out_stride_sym8sxasym8s_16(
- &result_in_batch[b * n_filter * n_memory], weight_feature,
- &input[b * n_input], NULL, n_filter, n_input, n_input, n_memory,
- -input_zp, (data.effective_scale_1_a << 8), data.effective_scale_1_b);
- CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_vec_matXvec_sym8sxasym8s_16 failed");
- }
- }
-
- // Time.
- {
- for (int b = 0; b < n_batch; ++b) {
- int8_t* output_ptr = GetTensorData<int8_t>(output_tensor) + b * n_unit;
-
- const int16_t* vector1_ptr = GetTensorData<int16_t>(weights_time_tensor);
- const int16_t* vector2_ptr =
- GetTensorData<int16_t>(activation_state_tensor) +
- b * n_memory * n_filter;
- int err = 0;
- const int32_t* bias_ptr = GetTensorData<int32_t>(bias_tensor);
- err = xa_nn_dot_prod_16x16_asym8s(
- output_ptr, vector1_ptr, vector2_ptr, bias_ptr, n_memory * n_rank,
- (data.effective_scale_2_a << 8), data.effective_scale_2_b, output_zp,
- n_unit);
- CHECK_ERR_HIFI_NNLIB_KER(err, "xa_nn_dot_prod_16x16_asym8s failed");
- }
- }
- return kTfLiteOk;
-}
-
-} // namespace
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
- TFLITE_DCHECK(context != nullptr);
- TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr);
- return context->AllocatePersistentBuffer(context, sizeof(OpData));
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
- TFLITE_DCHECK(node->builtin_data != nullptr);
- const auto* params = static_cast<const TfLiteSVDFParams*>(node->builtin_data);
-
- // Validate Tensor Inputs (dtype depends on quantization):
- // [0] = Input, {2, batch_size, input_size}
- // [1] = Weights Feature, {2, num_filters, input_size}
- // [2] = Weights Time, {2, num_filters, memory_size}
- // [3] = Bias (optional), {1, num_units}
- // [4] = Activation State (variable),
- // {2, batch_size, memory_size * num_filters}
- const TfLiteTensor* input = GetInput(context, node, kInputTensor);
- const TfLiteTensor* weights_feature =
- GetInput(context, node, kWeightsFeatureTensor);
- const TfLiteTensor* weights_time =
- GetInput(context, node, kWeightsTimeTensor);
- const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
- const TfLiteTensor* activation_state =
- GetInput(context, node, kInputActivationStateTensor);
-
- // Define input constants based on input tensor definition above:
- const int rank = params->rank;
- const int input_size = input->dims->data[1];
- const int batch_size = input->dims->data[0];
- // Ensure the input size is a multiple of two. This is necessary since
- // optimized kernels access the memory in chunks of two, and all accesses
- // must be aligned to 16 bits.
- // TODO(b/153202598): Remove when padding is allowed in TFLite tensors.
- TF_LITE_ENSURE_EQ(context, input_size % 2, 0);
-
- const int num_filters = weights_feature->dims->data[0];
- TF_LITE_ENSURE_EQ(context, num_filters % rank, 0);
- const int num_units = num_filters / rank;
- const int memory_size = weights_time->dims->data[1];
-
- if (input->type != kTfLiteInt8) {
- TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.",
- TfLiteTypeGetName(input->type), input->type);
- return kTfLiteError;
- }
-
- // Validate Input Tensor:
- TF_LITE_ENSURE(context, input->type == kTfLiteInt8);
- TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2);
-
- // Validate Tensor Output:
- // [0] = float/int8_t, {2, batch_size, num_units}
- TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
- TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
- TF_LITE_ENSURE_EQ(context, NumDimensions(output), 2);
- TF_LITE_ENSURE_EQ(context, output->dims->data[0], batch_size);
- TF_LITE_ENSURE_EQ(context, output->dims->data[1], num_units);
-
- // Validate Weights Feature Input Tensor:
- TF_LITE_ENSURE_EQ(context, NumDimensions(weights_feature), 2);
- TF_LITE_ENSURE_EQ(context, weights_feature->dims->data[1], input_size);
-
- // Validate Weights Time Input Tensor:
- TF_LITE_ENSURE_EQ(context, NumDimensions(weights_time), 2);
- TF_LITE_ENSURE_EQ(context, weights_time->dims->data[0], num_filters);
- TF_LITE_ENSURE_EQ(context, weights_time->dims->data[1], memory_size);
-
- // Validate Optional Bias Input Tensor:
- if (bias != nullptr) {
- TF_LITE_ENSURE_EQ(context, bias->dims->data[0], num_units);
- TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
- }
-
- // Validate Activation State Input Tensor:
- TF_LITE_ENSURE_EQ(context, NumDimensions(activation_state), 2);
- TF_LITE_ENSURE_EQ(context, activation_state->dims->data[0], batch_size);
- TF_LITE_ENSURE_EQ(context, activation_state->dims->data[1],
- memory_size * num_filters);
-
- TF_LITE_ENSURE_EQ(context, node->inputs->size, 5);
- TF_LITE_ENSURE_EQ(context, weights_feature->type, kTfLiteInt8);
- TF_LITE_ENSURE_EQ(context, weights_time->type, kTfLiteInt16);
- TF_LITE_ENSURE_EQ(context, activation_state->type, kTfLiteInt16);
-
- // Validate output tensor:
- TF_LITE_ENSURE_EQ(context, output->type, kTfLiteInt8);
-
- // Calculate effective scales.
- auto* input_params =
- static_cast<TfLiteAffineQuantization*>(input->quantization.params);
- auto* weights_feature_params = static_cast<TfLiteAffineQuantization*>(
- weights_feature->quantization.params);
- auto* state_params = static_cast<TfLiteAffineQuantization*>(
- activation_state->quantization.params);
- auto* weight_time_params =
- static_cast<TfLiteAffineQuantization*>(weights_time->quantization.params);
- auto* output_params =
- static_cast<TfLiteAffineQuantization*>(output->quantization.params);
- const float effective_scale_1 = input_params->scale->data[0] *
- weights_feature_params->scale->data[0] /
- state_params->scale->data[0];
- const float effective_scale_2 = state_params->scale->data[0] *
- weight_time_params->scale->data[0] /
- output_params->scale->data[0];
-
- TFLITE_DCHECK(node->user_data != nullptr);
- OpData* data = static_cast<OpData*>(node->user_data);
-
- xtensa::hifimini::QuantizeMultiplier(effective_scale_1,
- &data->effective_scale_1_a,
- &data->effective_scale_1_b);
- xtensa::hifimini::QuantizeMultiplier(effective_scale_2,
- &data->effective_scale_2_a,
- &data->effective_scale_2_b);
-
- const TfLiteStatus scratch_status = context->RequestScratchBufferInArena(
- context, batch_size * num_filters * sizeof(int32_t),
- &(data->scratch_tensor_index));
- TF_LITE_ENSURE_OK(context, scratch_status);
- const TfLiteStatus scratch_output_status =
- context->RequestScratchBufferInArena(
- context, batch_size * num_units * sizeof(int32_t),
- &(data->scratch_output_tensor_index));
- TF_LITE_ENSURE_OK(context, scratch_output_status);
-
- return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
- auto* params = static_cast<TfLiteSVDFParams*>(node->builtin_data);
-
- const TfLiteTensor* input = GetInput(context, node, kInputTensor);
- const TfLiteTensor* weights_feature =
- GetInput(context, node, kWeightsFeatureTensor);
- const TfLiteTensor* weights_time =
- GetInput(context, node, kWeightsTimeTensor);
- const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
- TfLiteTensor* activation_state =
- GetVariableInput(context, node, kInputActivationStateTensor);
- TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
- TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActRelu);
-
- TFLITE_DCHECK(node->user_data != nullptr);
- const OpData& data = *(static_cast<const OpData*>(node->user_data));
-
- return EvalIntegerSVDF(context, node, input, weights_feature, weights_time,
- bias, params, activation_state, output, data,
- input->params.zero_point, output->params.zero_point);
-}
-
-} // namespace svdf
-
-TfLiteRegistration Register_SVDF() {
- return {/*init=*/svdf::Init,
- /*free=*/nullptr,
- /*prepare=*/svdf::Prepare,
- /*invoke=*/svdf::Eval,
- /*profiling_string=*/nullptr,
- /*builtin_code=*/0,
- /*custom_name=*/nullptr,
- /*version=*/0};
-}
-
-} // namespace micro
-} // namespace ops
-} // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_api_defs.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_api_defs.h
deleted file mode 100644
index a3eac67..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_api_defs.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_API_DEFS_H__
-#define __XA_API_DEFS_H__
-
-/*****************************************************************************/
-/* Constant hash defines */
-/*****************************************************************************/
-/* A constant to let API copy small strings to buffers outside */
-#define XA_API_STR_LEN 30
-#define XA_APIVERSION_MAJOR 1
-#define XA_APIVERSION_MINOR 0
-
-/* last compatible version */
-/* sometimes a new API version is just for a bugfix, or a added feature in */
-/* this case it is better to use a newer version even though a library was */
-/* made for an older version, library API can then be upgraded to newer API */
-/* version after checking for compatibility or by adding features */
-#define XA_LASTCOMP_APIVERSION_MAJOR 1
-#define XA_LASTCOMP_APIVERSION_MINOR 0
-
-#define XA_STR(str) #str
-#define XA_MAKE_VERSION_STR(maj, min) XA_STR(maj) "." XA_STR(min)
-#define XA_APIVERSION \
- XA_MAKE_VERSION_STR(XA_APIVERSION_MAJOR, XA_APIVERSION_MINOR)
-
-#define XA_LAST_COMP_APIVERSION \
- XA_MAKE_VERSION_STR(XA_LASTCOMP_APIVERSION_MAJOR, \
- XA_LASTCOMP_APIVERSION_MINOR)
-
-#endif /* __XA_API_DEFS_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_common.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_common.h
deleted file mode 100644
index 71e6682..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_common.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_NNLIB_COMMON_H__
-#define __XA_NNLIB_COMMON_H__
-
-#include <inttypes.h>
-#include <stddef.h>
-#include <xtensa/config/core-isa.h>
-#include <xtensa/tie/xt_core.h>
-#include <xtensa/tie/xt_hifi2.h>
-#include <xtensa/tie/xt_misc.h>
-#if XCHAL_HAVE_HIFI4_VFPU
-#include <xtensa/tie/xt_FP.h>
-#endif
-
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_standards.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/xa_type_def.h"
-
-#endif /* __XA_NNLIB_COMMON_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_common_macros.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_common_macros.h
deleted file mode 100644
index d04752b..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_common_macros.h
+++ /dev/null
@@ -1,921 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_NNLIB_COMMON_MACROS_H__
-#define __XA_NNLIB_COMMON_MACROS_H__
-
-#ifndef NULL
-#define NULL (void *)0
-#endif /* NULL */
-
-#define ALIGNMENT 8
-
-/* Macro for zero value */
-#define ZERO64 AE_MOVINT64_FROMINT32X2(AE_MOVDA32(0))
-#define ZERO16X4 AE_MOVDA16(0)
-#define ZERO16 (0)
-#define ZERO32 (0)
-
-/* Macro for 1 */
-#define ONE16X4 AE_MOVDA16(1)
-
-/* Value of ROW_UNROLL currently supported are 1,2,4,8 only */
-#ifndef ROW_UNROLL
-#define ROW_UNROLL 8
-#endif
-#define VEC_UNROLL 2
-
-#define ACC_LSH_AFTER_FIRST_MATXVEC 0
-
-/* Increment in bytes required for particular load
- * instructions. */
-#define INCREMENT_IN_BYTES_FOR_WORD8 1
-#define INCREMENT_IN_BYTES_FOR_INT16 2
-#define INCREMENT_IN_BYTES_FOR_INT32 (INCREMENT_IN_BYTES_FOR_INT16 * 2)
-#define INCREMENT_IN_BYTES_FOR_WORD8X4 (INCREMENT_IN_BYTES_FOR_WORD8 * 4)
-#define INCREMENT_IN_BYTES_FOR_INT16X4 (INCREMENT_IN_BYTES_FOR_INT16 * 4)
-#define INCREMENT_IN_BYTES_FOR_INT64 INCREMENT_IN_BYTES_FOR_INT16X4
-#define INCREMENT_IN_BYTES_FOR_FLOAT32 4
-#define INCREMENT_IN_BYTES_FOR_FLOAT32x2 (INCREMENT_IN_BYTES_FOR_FLOAT32 * 2)
-
-#define HF2_AE_ADDCIRC16X4_XC(ptr, offset) \
- ptr = ptr + offset; \
- if (ptr >= p_end) ptr = ptr - size;
-
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(q_out, inp, out_multiplier, \
- left_shift, right_shift) \
- { \
- ae_q56s d1; \
- ae_p24x2s d_mul; \
- d_mul = AE_CVTP24A16X2_HL(out_multiplier, out_multiplier); \
- d1 = AE_CVTQ48A32S(inp); \
- d1 = AE_SLLAQ56(d1, left_shift); \
- q_out = AE_MULFQ32SP16U_L(d1, d_mul); \
- q_out = AE_SRAIQ56(q_out, 16); \
- AE_MULAFQ32SP16S_H(q_out, d1, d_mul); \
- q_out = AE_SRAAQ56(q_out, right_shift); \
- q_out = AE_ROUNDSQ32SYM(q_out); \
- }
-
-/* Limit effective bias_shift and acc_shift to [-63 ... 63] */
-#define LIMIT_VARIABLE(_var, _left_limit, _right_limit) \
- _var = _var > _right_limit ? _right_limit \
- : _var < _left_limit ? _left_limit : _var;
-
-#define LIMIT_ACC_LSH LIMIT_VARIABLE(acc_shift, -63, 63);
-
-#define LIMIT_BIAS_LSH LIMIT_VARIABLE(bias_shift, -63, 63);
-
-#define BW(_datatype) sizeof(_datatype)
-
-#define ADJUST_VAR_AxB(A, B) (((8 * (4 - (BW(A) + BW(B))))))
-
-#define ADJUST_VAR_C(C) (((64 - (8 * BW(C)))))
-
-#define ADJUST_ACC_LSH_AxB_C(A, B, C) \
- acc_shift = acc_shift + 32; \
- LIMIT_ACC_LSH;
-
-#define ADJUST_BIAS_LSH_AxB(A, B) LIMIT_BIAS_LSH;
-
-#define ADJUST_ACC_LSH_AND_BIAS_LSH_AxB_C(A, B, C) \
- ADJUST_ACC_LSH_AxB_C(A, B, C); \
- ADJUST_BIAS_LSH_AxB(A, B);
-
-/* ====================================================================================================
- */
-#define SETUP_BIAS_f32 \
- xtfloat _xtfloat_bias = (xtfloat)0.0f; \
- xtfloat *_xtfloat_p_bias = (xtfloat *)p_bias;
-
-#define SETUP_BIAS_ASYM8b \
- WORD32 _WORD32_bias; \
- ae_int64 _ae_int64_sat_bias = ZERO64; \
- WORD32 *_WORD32_p_bias = (WORD32 *)p_bias;
-
-#define SETUP_BIAS_8b \
- WORD8 _WORD8_bias; \
- UWORD32 _UWORD32_bias; \
- ae_int64 _ae_int64_bias = ZERO64; \
- ae_int64 _ae_int64_sat_bias = ZERO64; \
- WORD8 *_WORD8_p_bias = (WORD8 *)p_bias;
-
-#define SETUP_BIAS_8b_BATCH \
- WORD8 _WORD8_bias; \
- WORD16 _WORD16_bias; \
- ae_int16 _ae_int16_bias = ZERO16; \
- ae_int16 *_ae_int16_p_bias = &_ae_int16_bias; \
- ae_int64 _ae_int64_sat_bias = ZERO64; \
- WORD8 *_WORD8_p_bias = (WORD8 *)p_bias;
-
-#define SETUP_BIAS_32b \
- ae_int32 _ae_int32_bias = ZERO32; \
- ae_int64 _ae_int64_sat_bias = ZERO64; \
- ae_int32 *_ae_int32_p_bias = (ae_int32 *)p_bias;
-
-#define SETUP_BIAS_16b \
- ae_int16 _ae_int16_bias = ZERO16; \
- ae_int64 _ae_int64_sat_bias = ZERO64; \
- ae_int16 *_ae_int16_p_bias = (ae_int16 *)p_bias;
-
-#define SETUP_BIAS_64b \
- ae_int64 _ae_int64_bias = ZERO64; \
- ae_int64 _ae_int64_sat_bias = ZERO64; \
- ae_int64 *_ae_int64_p_bias = (ae_int64 *)p_bias;
-
-#define SETUP_ACC_FOR_8bx8b(idx) SETUP_ACC_64b(idx)
-#define SETUP_ACC_FOR_8bx16b(idx) SETUP_ACC_64b(idx)
-#define SETUP_ACC_FOR_16bx8b(idx) SETUP_ACC_64b(idx)
-#define SETUP_ACC_FOR_16bx16b(idx) SETUP_ACC_64b(idx)
-#define SETUP_ACC_FOR_ASYM8bxASYM8b(idx) SETUP_ACC_64b(idx)
-
-/*------------------ time batching macros ----------------- */
-
-#define SETUP_ACC_BATCH_ROW_FOR_16bx8b SETUP_ACC_BATCH_ROW_FOR_16bx16b
-#define SETUP_ACC_BATCH_ROW_FOR_8bx16b SETUP_ACC_BATCH_ROW_FOR_16bx16b
-#define SETUP_ACC_BATCH_ROW_FOR_8bx8b SETUP_ACC_BATCH_ROW_FOR_16bx16b
-#define SETUP_ACC_BATCH_ROW_FOR_ASYM8bxASYM8b SETUP_ACC_BATCH_ROW_FOR_16bx16b
-
-#define SETUP_ACC_BATCH_FOR_16bx8b SETUP_ACC_BATCH_FOR_16bx16b
-#define SETUP_ACC_BATCH_FOR_8bx16b SETUP_ACC_BATCH_FOR_16bx16b
-#define SETUP_ACC_BATCH_FOR_8bx8b SETUP_ACC_BATCH_FOR_16bx16b
-#define SETUP_ACC_BATCH_FOR_ASYM8bxASYM8b SETUP_ACC_BATCH_FOR_16bx16b
-
-#define SETUP_ACC_BATCH_ROW_FOR_16bx16b(idx_row) \
- SETUP_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define SETUP_ACC_BATCH_FOR_16bx16b(idx_row, idx_vec) \
- ae_int64 _ae_int64_acc_##idx_row##_##idx_vec = ZERO64;
-
-#define SETUP_ACC_BATCH_ROW_FOR_f32(idx_row) \
- SETUP_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define SETUP_ACC_BATCH_FOR_f32(idx_row, idx_vec) \
- xtfloatx2 _xtfloatx2_acc_##idx_row##_##idx_vec = (xtfloatx2)0.0f; \
- xtfloat _xtfloat_acc_##idx_row##_##idx_vec = (xtfloat)0.0f; \
- /*---------------------------------------------------------*/
-
-#define SETUP_ACC_64b(idx) ae_int64 _ae_int64_acc_##idx = ZERO64;
-
-#define SETUP_VEC1_8b \
- ae_int16x4 _ae_int16x4_vec1 = ZERO16X4; \
- WORD8 *_WORD8_p_vec1 = (WORD8 *)p_vec1;
-
-#define SETUP_VEC2_8b \
- ae_int16x4 _ae_int16x4_vec2 = ZERO16X4; \
- WORD8 *_WORD8_p_vec2 = (WORD8 *)p_vec2;
-
-#define SETUP_VEC1_16b \
- ae_int16x4 _ae_int16x4_vec1 = ZERO16X4; \
- ae_int16x4 *_ae_int16x4_p_vec1 = (ae_int16x4 *)p_vec1;
-
-#define SETUP_VEC2_16b \
- ae_int16x4 _ae_int16x4_vec2 = ZERO16X4; \
- ae_int16x4 *_ae_int16x4_p_vec2 = (ae_int16x4 *)p_vec2;
-
-#define SETUP_VEC1_ASYM8b SETUP_VEC1_8b
-#define SETUP_VEC2_ASYM8b SETUP_VEC2_8b
-/*------------------ time batching macros ----------------- */
-
-#define SETUP_VEC_BATCH_8b(idx_vec) \
- ae_int16x4 _ae_int16x4_vec_batch_##idx_vec = ZERO16X4; \
- WORD8 *_WORD8_p_vec_batch_##idx_vec = (WORD8 *)(p_vec1[vec_itr + idx_vec]);
-
-#define SETUP_VEC_BATCH_16b(idx_vec) \
- ae_int16x4 _ae_int16x4_vec_batch_##idx_vec = ZERO16X4; \
- ae_int16x4 *_ae_int16x4_p_vec_batch_##idx_vec = \
- (ae_int16x4 *)(p_vec1[vec_itr + idx_vec]);
-
-#define SETUP_VEC_OFFSET_BATCH_16b(idx_vec) \
- ae_int16x4 _ae_int16x4_vec_batch_##idx_vec = ZERO16X4; \
- ae_int16x4 *_ae_int16x4_p_vec_batch_##idx_vec = \
- (ae_int16x4 *)(p_vec1 + (vec_itr + idx_vec) * vec_offset);
-
-#define SETUP_VEC_BATCH_f32(idx_vec) \
- xtfloatx2 _xtfloatx2_vec_batch_##idx_vec = (xtfloatx2)0.0f; \
- xtfloatx2 *_xtfloatx2_p_vec_batch_##idx_vec = \
- (xtfloatx2 *)(p_vec1[vec_itr + idx_vec]);
-
-#define SETUP_VEC_BATCH_ASYM8b SETUP_VEC_BATCH_8b
-/*---------------------------------------------------------*/
-
-#define SETUP_MAT1_8b(idx) \
- ae_int16x4 _ae_int16x4_mat1_##idx = ZERO16X4; \
- WORD8 *_WORD8_p_mat1_##idx = (WORD8 *)&p_mat1[(m_itr + idx) * row_stride1];
-
-#define SETUP_MAT2_8b(idx) \
- ae_int16x4 _ae_int16x4_mat2_##idx = ZERO16X4; \
- WORD8 *_WORD8_p_mat2_##idx = (WORD8 *)&p_mat2[(m_itr + idx) * row_stride2];
-
-#define SETUP_MAT1_16b(idx) \
- ae_int16x4 _ae_int16x4_mat1_##idx = ZERO16X4; \
- ae_int16x4 *_ae_int16x4_p_mat1_##idx = \
- (ae_int16x4 *)&p_mat1[(m_itr + idx) * row_stride1];
-
-#define SETUP_MAT2_16b(idx) \
- ae_int16x4 _ae_int16x4_mat2_##idx = ZERO16X4; \
- ae_int16x4 *_ae_int16x4_p_mat2_##idx = \
- (ae_int16x4 *)&p_mat2[(m_itr + idx) * row_stride2];
-
-#define SETUP_MAT1_f32(idx) \
- xtfloatx2 _xtfloatx2_mat1_##idx = (xtfloatx2)0.0f; \
- xtfloatx2 *_xtfloatx2_p_mat1_##idx = \
- (xtfloatx2 *)&p_mat1[(m_itr + idx) * row_stride1];
-
-#define SETUP_MAT1_ASYM8b SETUP_MAT1_8b
-#define SETUP_MAT2_ASYM8b SETUP_MAT2_8b
-/* ====================================================================== */
-
-#define LOAD_VEC1_8b \
- AE_L8X4F_IP(_ae_int16x4_vec1, _WORD8_p_vec1, INCREMENT_IN_BYTES_FOR_WORD8X4);
-
-#define LOAD_VEC2_8b \
- AE_L8X4F_IP(_ae_int16x4_vec2, _WORD8_p_vec2, INCREMENT_IN_BYTES_FOR_WORD8X4);
-
-#define LOAD_VEC1_16b \
- AE_L16X4_IP(_ae_int16x4_vec1, _ae_int16x4_p_vec1, \
- INCREMENT_IN_BYTES_FOR_INT16X4);
-
-#define LOAD_VEC2_16b \
- AE_L16X4_IP(_ae_int16x4_vec2, _ae_int16x4_p_vec2, \
- INCREMENT_IN_BYTES_FOR_INT16X4);
-
-#define LOAD_VEC1_ASYM8b \
- AE_L8X4F_IP(_ae_int16x4_vec1, _WORD8_p_vec1, \
- INCREMENT_IN_BYTES_FOR_WORD8X4); \
- _ae_int16x4_vec1 = AE_MOVF16X4_FROMF64( \
- AE_SRLI64(AE_MOVF64_FROMF16X4(_ae_int16x4_vec1), 8)); \
- _ae_int16x4_vec1 = AE_ADD16(_ae_int16x4_vec1, AE_MOVDA16(vec1_zero_bias));
-
-#define LOAD_VEC2_ASYM8b \
- AE_L8X4F_IP(_ae_int16x4_vec2, _WORD8_p_vec2, \
- INCREMENT_IN_BYTES_FOR_WORD8X4); \
- _ae_int16x4_vec2 = AE_MOVF16X4_FROMF64( \
- AE_SRLI64(AE_MOVF64_FROMF16X4(_ae_int16x4_vec2), 8)); \
- _ae_int16x4_vec2 = AE_ADD16(_ae_int16x4_vec2, AE_MOVDA16(vec2_zero_bias)); \
-/*------------------ time batching macros ----------------- */
-#define LOAD_VEC_BATCH_f32(idx_vec) \
- XT_LSX2IP(_xtfloatx2_vec_batch_##idx_vec, _xtfloatx2_p_vec_batch_##idx_vec, \
- INCREMENT_IN_BYTES_FOR_FLOAT32x2);
-
-#define LOAD_VEC_BATCH_8b(idx_vec) \
- AE_L8X4F_IP(_ae_int16x4_vec_batch_##idx_vec, _WORD8_p_vec_batch_##idx_vec, \
- INCREMENT_IN_BYTES_FOR_WORD8X4);
-
-#define LOAD_VEC_BATCH_16b(idx_vec) \
- AE_L16X4_IP(_ae_int16x4_vec_batch_##idx_vec, \
- _ae_int16x4_p_vec_batch_##idx_vec, \
- INCREMENT_IN_BYTES_FOR_INT16X4);
-
-#define LOAD_VEC_BATCH_ASYM8b(idx_vec) \
- AE_L8X4F_IP(_ae_int16x4_vec_batch_##idx_vec, _WORD8_p_vec_batch_##idx_vec, \
- INCREMENT_IN_BYTES_FOR_WORD8X4); \
- _ae_int16x4_vec_batch_##idx_vec = AE_MOVF16X4_FROMF64( \
- AE_SRLI64(AE_MOVF64_FROMF16X4(_ae_int16x4_vec_batch_##idx_vec), 8)); \
- _ae_int16x4_vec_batch_##idx_vec = \
- AE_ADD16(_ae_int16x4_vec_batch_##idx_vec, AE_MOVDA16(vec1_zero_bias));
-
-#define LOAD_BIAS_8b_FOR_8bx8b \
- _WORD8_bias = *_WORD8_p_bias++; \
- _WORD16_bias = _WORD8_bias; \
- *((WORD16 *)_ae_int16_p_bias) = _WORD16_bias; \
- _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int16_bias), bias_shift);
-
-#define LOAD_BIAS_16b_FOR_8bx16b \
- ae_int16_loadip(_ae_int16_bias, _ae_int16_p_bias, \
- INCREMENT_IN_BYTES_FOR_INT16); \
- _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int16_bias), bias_shift);
-
-#define LOAD_BIAS_16b_FOR_16bx8b LOAD_BIAS_16b_FOR_8bx16b
-
-#define LOAD_BIAS_16b_FOR_16bx16b \
- ae_int16_loadip(_ae_int16_bias, _ae_int16_p_bias, \
- INCREMENT_IN_BYTES_FOR_INT16); \
- _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int16_bias), bias_shift);
-
-#define LOAD_BIAS_f32 \
- XT_LSIP(_xtfloat_bias, _xtfloat_p_bias, INCREMENT_IN_BYTES_FOR_FLOAT32);
-
-#define LOAD_BIAS_ASYM8b \
- _WORD32_bias = *_WORD32_p_bias++; \
- _ae_int64_sat_bias = \
- AE_SRAI64(AE_MOVINT64_FROMINT32X2(AE_MOVDA32(_WORD32_bias)), 32); \
-/*---------------------------------------------------------*/
-#define LOAD_ROW_MAT1_8b(idx) \
- AE_L8X4F_IP(_ae_int16x4_mat1_##idx, _WORD8_p_mat1_##idx, \
- INCREMENT_IN_BYTES_FOR_WORD8X4);
-
-#define LOAD_ROW_MAT2_8b(idx) \
- AE_L8X4F_IP(_ae_int16x4_mat2_##idx, _WORD8_p_mat2_##idx, \
- INCREMENT_IN_BYTES_FOR_WORD8X4);
-
-#define LOAD_ROW_MAT1_16b(idx) \
- AE_L16X4_IP(_ae_int16x4_mat1_##idx, _ae_int16x4_p_mat1_##idx, \
- INCREMENT_IN_BYTES_FOR_INT16X4);
-
-#define LOAD_ROW_MAT2_16b(idx) \
- AE_L16X4_IP(_ae_int16x4_mat2_##idx, _ae_int16x4_p_mat2_##idx, \
- INCREMENT_IN_BYTES_FOR_INT16X4);
-
-#define LOAD_ROW_MAT1_f32(idx) \
- XT_LSX2IP(_xtfloatx2_mat1_##idx, _xtfloatx2_p_mat1_##idx, \
- INCREMENT_IN_BYTES_FOR_FLOAT32x2);
-
-#define LOAD_ROW_MAT1_ASYM8b(idx) \
- AE_L8X4F_IP(_ae_int16x4_mat1_##idx, _WORD8_p_mat1_##idx, \
- INCREMENT_IN_BYTES_FOR_WORD8X4); \
- _ae_int16x4_mat1_##idx = AE_MOVF16X4_FROMF64( \
- AE_SRLI64(AE_MOVF64_FROMF16X4(_ae_int16x4_mat1_##idx), 8)); \
- _ae_int16x4_mat1_##idx = \
- AE_ADD16(_ae_int16x4_mat1_##idx, AE_MOVDA16(mat1_zero_bias));
-
-#define LOAD_ROW_MAT2_ASYM8b(idx) \
- AE_L8X4F_IP(_ae_int16x4_mat2_##idx, _WORD8_p_mat2_##idx, \
- INCREMENT_IN_BYTES_FOR_WORD8X4); \
- _ae_int16x4_mat2_##idx = AE_MOVF16X4_FROMF64( \
- AE_SRLI64(AE_MOVF64_FROMF16X4(_ae_int16x4_mat2_##idx), 8)); \
- _ae_int16x4_mat2_##idx = \
- AE_ADD16(_ae_int16x4_mat2_##idx, AE_MOVDA16(mat2_zero_bias));
-
-#define KERNEL_MAT1_VEC1_8b_8b(idx) \
- LOAD_ROW_MAT1_8b(idx); \
- AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec1, _ae_int16x4_mat1_##idx);
-
-#define KERNEL_MAT2_VEC2_8b_8b(idx) \
- LOAD_ROW_MAT2_8b(idx); \
- AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec2, _ae_int16x4_mat2_##idx);
-
-#define KERNEL_MAT1_VEC1_16b_8b(idx) \
- LOAD_ROW_MAT1_16b(idx); \
- AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec1, _ae_int16x4_mat1_##idx);
-
-#define KERNEL_MAT2_VEC2_16b_8b(idx) \
- LOAD_ROW_MAT2_16b(idx); \
- AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec2, _ae_int16x4_mat2_##idx);
-
-#define KERNEL_MAT1_VEC1_8b_16b(idx) \
- LOAD_ROW_MAT1_8b(idx); \
- AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec1, _ae_int16x4_mat1_##idx);
-
-#define KERNEL_MAT2_VEC2_8b_16b(idx) \
- LOAD_ROW_MAT2_8b(idx); \
- AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec2, _ae_int16x4_mat2_##idx);
-
-#define KERNEL_MAT1_VEC1_16b_16b(idx) \
- LOAD_ROW_MAT1_16b(idx); \
- AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec1, _ae_int16x4_mat1_##idx);
-
-#define KERNEL_MAT2_VEC2_16b_16b(idx) \
- LOAD_ROW_MAT2_16b(idx); \
- AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec2, _ae_int16x4_mat2_##idx);
-
-#define KERNEL_MAT1_VEC1_ASYM8b_ASYM8b(idx) \
- LOAD_ROW_MAT1_ASYM8b(idx); \
- AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec1, _ae_int16x4_mat1_##idx);
-
-#define KERNEL_MAT2_VEC2_ASYM8b_ASYM8b(idx) \
- LOAD_ROW_MAT2_ASYM8b(idx); \
- AE_MULAAAAQ16(_ae_int64_acc_##idx, _ae_int16x4_vec2, _ae_int16x4_mat2_##idx);
-
-/*------------------ time batching macros ----------------- */
-
-#define KERNEL_MAT1_VEC_BATCH_ROW_8b_8b KERNEL_MAT1_VEC_BATCH_ROW_16b_16b
-#define KERNEL_MAT1_VEC_BATCH_ROW_16b_8b KERNEL_MAT1_VEC_BATCH_ROW_16b_16b
-#define KERNEL_MAT1_VEC_BATCH_ROW_8b_16b KERNEL_MAT1_VEC_BATCH_ROW_16b_16b
-#define KERNEL_MAT1_VEC_BATCH_ROW_ASYM8b_ASYM8b \
- KERNEL_MAT1_VEC_BATCH_ROW_16b_16b
-#define KERNEL_MAT1_VEC_BATCH_8b_8b KERNEL_MAT1_VEC_BATCH_16b_16b
-#define KERNEL_MAT1_VEC_BATCH_16b_8b KERNEL_MAT1_VEC_BATCH_16b_16b
-#define KERNEL_MAT1_VEC_BATCH_8b_16b KERNEL_MAT1_VEC_BATCH_16b_16b
-#define KERNEL_MAT1_VEC_BATCH_ASYM8b_ASYM8b KERNEL_MAT1_VEC_BATCH_16b_16b
-
-#define KERNEL_MAT1_VEC_BATCH_ROW_16b_16b(idx_row) \
- KERNEL_MAT1_VEC_BATCH_VEC_UNROLL(idx_row);
-
-#define KERNEL_MAT1_VEC_BATCH_16b_16b(idx_row, idx_vec) \
- AE_MULAAAAQ16(_ae_int64_acc_##idx_row##_##idx_vec, \
- _ae_int16x4_vec_batch_##idx_vec, _ae_int16x4_mat1_##idx_row);
-
-#define KERNEL_MAT1_VEC_BATCH_ROW_f32(idx_row) \
- KERNEL_MAT1_VEC_BATCH_VEC_UNROLL(idx_row);
-
-#define KERNEL_MAT1_VEC_BATCH_f32(idx_row, idx_vec) \
- XT_MADD_SX2(_xtfloatx2_acc_##idx_row##_##idx_vec, \
- _xtfloatx2_vec_batch_##idx_vec, _xtfloatx2_mat1_##idx_row);
-
-/*---------------------------------------------------------*/
-#define ADD_BIAS_8b_ACC_FOR_8bx8b(idx) \
- /* Load 8b bias */ \
- _WORD8_bias = *_WORD8_p_bias++; \
- /* Copy 8-bits to unsigned 32-bits */ \
- _UWORD32_bias = _WORD8_bias; \
- /*Move unsigned 32 bit value to DR register*/ \
- _ae_int64_bias = AE_MOVINT64_FROMINT32X2((AE_MOVDA32X2(_UWORD32_bias, 0))); \
- _ae_int64_bias = AE_SRAA64(_ae_int64_bias, 32); \
- _ae_int64_sat_bias = AE_SLAA64S(_ae_int64_bias, bias_shift); \
- _ae_int64_acc_##idx = AE_SRAA64(_ae_int64_acc_##idx, 16); \
- _ae_int64_acc_##idx = AE_ADD64S(_ae_int64_acc_##idx, _ae_int64_sat_bias);
-
-#define ADD_BIAS_32b_ACC_FOR_8bx8b(idx) \
- ae_int32_loadip(_ae_int32_bias, _ae_int32_p_bias, \
- INCREMENT_IN_BYTES_FOR_INT32); \
- _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int32_bias), bias_shift); \
- _ae_int64_acc_##idx = AE_SRAA64(_ae_int64_acc_##idx, 16); \
- _ae_int64_acc_##idx = AE_ADD64S(_ae_int64_acc_##idx, _ae_int64_sat_bias);
-
-#define ADD_BIAS_16b_ACC_FOR_8bx16b(idx) \
- ae_int16_loadip(_ae_int16_bias, _ae_int16_p_bias, \
- INCREMENT_IN_BYTES_FOR_INT16); \
- /* Saturate 16b bias after shift to 64b */ \
- _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int16_bias), bias_shift); \
- _ae_int64_acc_##idx = AE_SRAA64(_ae_int64_acc_##idx, 8); \
- _ae_int64_acc_##idx = AE_ADD64S(_ae_int64_acc_##idx, _ae_int64_sat_bias);
-
-#define ADD_BIAS_16b_ACC_FOR_16bx8b ADD_BIAS_16b_ACC_FOR_8bx16b
-
-#define ADD_BIAS_64b_ACC_FOR_8bx16b(idx) \
- ae_int64_loadip(_ae_int64_bias, _ae_int64_p_bias, \
- INCREMENT_IN_BYTES_FOR_INT64); \
- /* Saturate 64b bias after shift to 64b */ \
- _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int64_bias), bias_shift); \
- _ae_int64_acc_##idx = AE_SRAA64(_ae_int64_acc_##idx, 8); \
- _ae_int64_acc_##idx = AE_ADD64S(_ae_int64_acc_##idx, _ae_int64_sat_bias);
-
-#define ADD_BIAS_16b_ACC_FOR_16bx16b(idx) \
- ae_int16_loadip(_ae_int16_bias, _ae_int16_p_bias, \
- INCREMENT_IN_BYTES_FOR_INT16); \
- /* Saturate 16b bias after shift to 64b */ \
- _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int16_bias), bias_shift); \
- _ae_int64_acc_##idx = AE_ADD64S(_ae_int64_acc_##idx, _ae_int64_sat_bias);
-
-#define ADD_BIAS_64b_ACC_FOR_16bx16b(idx) \
- ae_int64_loadip(_ae_int64_bias, _ae_int64_p_bias, \
- INCREMENT_IN_BYTES_FOR_INT64); \
- /* Saturate 64b bias after shift to 64b */ \
- _ae_int64_sat_bias = AE_SLAA64S(((ae_int64)_ae_int64_bias), bias_shift); \
- _ae_int64_acc_##idx = AE_ADD64S(_ae_int64_acc_##idx, _ae_int64_sat_bias);
-
-#define ADD_BIAS_ASYM8b_ACC_FOR_ASYM8bxASYM8b(idx) \
- /* Load 32b bias */ \
- _WORD32_bias = *_WORD32_p_bias++; \
- _ae_int64_sat_bias = \
- AE_SRAI64(AE_MOVINT64_FROMINT32X2(AE_MOVDA32(_WORD32_bias)), 32); \
- _ae_int64_acc_##idx = AE_ADD64S(_ae_int64_acc_##idx, _ae_int64_sat_bias);
-
-/*------------------ time batching macros ----------------- */
-#define ADD_BIAS_BATCH_ROW_8b_ACC_FOR_8bx8b(idx_row) \
- LOAD_BIAS_8b_FOR_8bx8b; \
- ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row);
-
-#define ADD_BIAS_BATCH_ROW_16b_ACC_FOR_8bx16b(idx_row) \
- LOAD_BIAS_16b_FOR_8bx16b; \
- ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row);
-
-#define ADD_BIAS_BATCH_ROW_16b_ACC_FOR_16bx8b(idx_row) \
- LOAD_BIAS_16b_FOR_16bx8b; \
- ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row);
-
-#define ADD_BIAS_BATCH_ROW_16b_ACC_FOR_16bx16b(idx_row) \
- LOAD_BIAS_16b_FOR_16bx16b; \
- ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row);
-
-#define ADD_BIAS_BATCH_ROW_ASYM8b_ACC_FOR_ASYM8bxASYM8b(idx_row) \
- LOAD_BIAS_ASYM8b ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row);
-
-#define ADD_BIAS_BATCH_8b_ACC_FOR_8bx8b(idx_row, idx_vec) \
- _ae_int64_acc_##idx_row##_##idx_vec = \
- AE_SRAA64(_ae_int64_acc_##idx_row##_##idx_vec, 16); \
- _ae_int64_acc_##idx_row##_##idx_vec = \
- AE_ADD64S(_ae_int64_acc_##idx_row##_##idx_vec, _ae_int64_sat_bias);
-
-#define ADD_BIAS_BATCH_16b_ACC_FOR_8bx16b(idx_row, idx_vec) \
- _ae_int64_acc_##idx_row##_##idx_vec = \
- AE_SRAA64(_ae_int64_acc_##idx_row##_##idx_vec, 8); \
- _ae_int64_acc_##idx_row##_##idx_vec = \
- AE_ADD64S(_ae_int64_acc_##idx_row##_##idx_vec, _ae_int64_sat_bias);
-
-#define ADD_BIAS_BATCH_16b_ACC_FOR_16bx16b(idx_row, idx_vec) \
- _ae_int64_acc_##idx_row##_##idx_vec = \
- AE_ADD64S(_ae_int64_acc_##idx_row##_##idx_vec, _ae_int64_sat_bias);
-
-#define ADD_BIAS_BATCH_16b_ACC_FOR_16bx8b ADD_BIAS_BATCH_16b_ACC_FOR_8bx16b
-#define ADD_BIAS_BATCH_ASYM8b_ACC_FOR_ASYM8bxASYM8b \
- ADD_BIAS_BATCH_16b_ACC_FOR_16bx16b
-
-#define ADD_BIAS_BATCH_ROW_ACC_FOR_f32(idx_row) \
- LOAD_BIAS_f32; \
- ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row);
-
-#define ADD_BIAS_BATCH_ACC_FOR_f32(idx_row, idx_vec) \
- _xtfloat_acc_##idx_row##_##idx_vec = \
- XT_RADD_SX2(_xtfloatx2_acc_##idx_row##_##idx_vec); \
- _xtfloat_acc_##idx_row##_##idx_vec = \
- XT_ADD_S(_xtfloat_acc_##idx_row##_##idx_vec, _xtfloat_bias);
-
-#define STORE_ACC_8bx8b_AT_SCRATCH_32b(idx) \
- (*((ae_int32 *)p_scratch + m_itr + idx)) = \
- AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift));
-
-#define STORE_ACC_8bx8b_AT_OUT_8b(idx) \
- ae_int32 _ae_int32_tmp_var_##idx; \
- ae_f32x2 _ae_f32x2_tmp_var_##idx = AE_SLAA32S( \
- AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift)), 24); \
- _ae_int32_tmp_var_##idx = AE_SLAA32S(_ae_f32x2_tmp_var_##idx, -24); \
- (*((WORD8 *)p_out + m_itr + idx)) = (*((UWORD32 *)&_ae_int32_tmp_var_##idx));
-
-#define STORE_ACC_8bx8b_AT_OUT_16b(idx) \
- ae_int32 _ae_int32_tmp_var_##idx; \
- ae_f32x2 _ae_f32x2_tmp_var_##idx = AE_SLAA32S( \
- AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift)), 16); \
- _ae_int32_tmp_var_##idx = AE_SLAA32S(_ae_f32x2_tmp_var_##idx, -16); \
- (*((WORD16 *)p_out + m_itr + idx)) = (*((UWORD32 *)&_ae_int32_tmp_var_##idx));
-
-#define STORE_ACC_8bx8b_AT_OUT_32b(idx) \
- (*((ae_int32 *)p_out + m_itr + idx)) = \
- AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift));
-
-#define STORE_ACC_ASYM8bxASYM8b_AT_OUT_ASYM8b(idx) \
- _ae_int32x2_acc_##idx = AE_MIN32( \
- AE_MAX32(_ae_int32x2_acc_##idx, AE_MOVDA32(0)), AE_MOVDA32(255)); \
- (*((UWORD8 *)p_out + m_itr + idx)) = \
- (UWORD8)AE_MOVAD32_L(_ae_int32x2_acc_##idx);
-
-/* ====================================================================================================
- */
-#define STORE_ACC_8bx16b_AT_SCRATCH_32b(idx) \
- (*((ae_int32 *)p_scratch + m_itr + idx)) = \
- AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift));
-
-#define STORE_ACC_8bx16b_AT_OUT_16b(idx) \
- ae_int32 _ae_int32_tmp_var_##idx; \
- ae_f32x2 _ae_f32x2_tmp_var_##idx = AE_SLAA32S( \
- AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift)), 16); \
- _ae_int32_tmp_var_##idx = AE_SLAA32S(_ae_f32x2_tmp_var_##idx, -16); \
- (*((WORD16 *)p_out + m_itr + idx)) = (*((UWORD32 *)&_ae_int32_tmp_var_##idx));
-
-#define STORE_ACC_16bx8b_AT_OUT_16b STORE_ACC_8bx16b_AT_OUT_16b
-
-#define STORE_ACC_8bx16b_AT_OUT_32b(idx) \
- (*((ae_int32 *)p_out + m_itr + idx)) = \
- AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift));
-
-#define STORE_ACC_8bx16b_AT_OUT_64b(idx) \
- (*((ae_int64 *)p_out + m_itr + idx)) = \
- AE_SLAA64S(_ae_int64_acc_##idx, acc_shift);
-
-/* ====================================================================================================
- */
-#define STORE_ACC_16bx16b_AT_SCRATCH_32b(idx) \
- (*((ae_int32 *)p_scratch + m_itr + idx)) = \
- AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift));
-
-#define STORE_ACC_16bx16b_AT_OUT_16b(idx) \
- ae_int32 _ae_int32_tmp_var_##idx; \
- ae_f32x2 _ae_f32x2_tmp_var_##idx = AE_SLAA32S( \
- AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift)), 16); \
- _ae_int32_tmp_var_##idx = AE_SLAA32S(_ae_f32x2_tmp_var_##idx, -16); \
- (*((WORD16 *)p_out + m_itr + idx)) = (*((UWORD32 *)&_ae_int32_tmp_var_##idx));
-
-#define STORE_ACC_16bx16b_AT_OUT_32b(idx) \
- (*((ae_int32 *)p_out + m_itr + idx)) = \
- AE_ROUND32F64SSYM(AE_SLAA64S(_ae_int64_acc_##idx, acc_shift));
-
-#define STORE_ACC_16bx16b_AT_OUT_64b(idx) \
- (*((ae_int64 *)p_out + m_itr + idx)) = \
- AE_SLAA64S(_ae_int64_acc_##idx, acc_shift);
-
-/*------------------ time batching macros ----------------- */
-#define STORE_ACC_BATCH_ROW_8bx8b_AT_OUT_32b(idx_row) \
- STORE_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define STORE_ACC_BATCH_ROW_8bx8b_AT_OUT_8b(idx_row) \
- STORE_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define STORE_ACC_BATCH_8bx8b_AT_OUT_32b(idx_row, idx_vec) \
- (*((ae_int32 *)p_out[vec_itr + idx_vec] + m_itr + idx_row)) = \
- AE_ROUND32F64SSYM( \
- AE_SLAA64S(_ae_int64_acc_##idx_row##_##idx_vec, acc_shift));
-
-#define STORE_ACC_BATCH_8bx8b_AT_OUT_8b(idx_row, idx_vec) \
- ae_int32 _ae_int32_tmp_var_##idx_row##_##idx_vec; \
- ae_f32x2 _ae_f32x2_tmp_var_##idx_row##_##idx_vec = \
- AE_SLAA32S(AE_ROUND32F64SSYM(AE_SLAA64S( \
- _ae_int64_acc_##idx_row##_##idx_vec, acc_shift)), \
- 24); \
- _ae_int32_tmp_var_##idx_row##_##idx_vec = \
- AE_SLAA32S(_ae_f32x2_tmp_var_##idx_row##_##idx_vec, -24); \
- (*((WORD8 *)p_out[vec_itr + idx_vec] + m_itr + idx_row)) = \
- (*((UWORD32 *)&_ae_int32_tmp_var_##idx_row##_##idx_vec));
-
-#define STORE_ACC_BATCH_ROW_8bx16b_AT_OUT_64b(idx_row) \
- STORE_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define STORE_ACC_BATCH_ROW_16bx8b_AT_OUT_16b \
- STORE_ACC_BATCH_ROW_8bx16b_AT_OUT_64b
-
-#define STORE_ACC_BATCH_ROW_8bx16b_AT_OUT_16b \
- STORE_ACC_BATCH_ROW_8bx16b_AT_OUT_64b
-
-#define STORE_ACC_BATCH_8bx16b_AT_OUT_64b(idx_row, idx_vec) \
- (*((ae_int64 *)p_out[vec_itr + idx_vec] + m_itr + idx_row)) = \
- AE_SLAA64S(_ae_int64_acc_##idx_row##_##idx_vec, acc_shift);
-
-#define STORE_ACC_BATCH_8bx16b_AT_OUT_16b(idx_row, idx_vec) \
- STORE_ACC_BATCH_16bx16b_AT_OUT_16b(idx_row, idx_vec);
-
-#define STORE_ACC_BATCH_ROW_16bx16b_AT_OUT_64b(idx_row) \
- STORE_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define STORE_ACC_BATCH_ROW_16bx16b_AT_OUT_16b \
- STORE_ACC_BATCH_ROW_16bx16b_AT_OUT_64b
-
-#define STORE_ACC_BATCH_16bx16b_AT_OUT_64b(idx_row, idx_vec) \
- (*((ae_int64 *)p_out[vec_itr + idx_vec] + m_itr + idx_row)) = \
- AE_SLAA64S(_ae_int64_acc_##idx_row##_##idx_vec, acc_shift);
-
-#define STORE_STRIDE_ACC_BATCH_16bx16b_AT_OUT_16b(idx_row, idx_vec) \
- ae_int32 _ae_int32_tmp_var_##idx_row##_##idx_vec; \
- ae_f32x2 _ae_f32x2_tmp_var_##idx_row##_##idx_vec = \
- AE_SLAA32S(AE_ROUND32F64SSYM(AE_SLAA64S( \
- _ae_int64_acc_##idx_row##_##idx_vec, acc_shift)), \
- 16); \
- _ae_int32_tmp_var_##idx_row##_##idx_vec = \
- AE_SLAA32S(_ae_f32x2_tmp_var_##idx_row##_##idx_vec, -16); \
- (*((WORD16 *)p_out + (vec_itr + idx_vec) * out_offset + \
- (m_itr + idx_row) * out_stride)) = \
- (*((UWORD32 *)&_ae_int32_tmp_var_##idx_row##_##idx_vec));
-
-#define STORE_ACC_BATCH_ROW_AT_OUT_f32(idx_row) \
- STORE_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define STORE_ACC_BATCH_AT_OUT_f32(idx_row, idx_vec) \
- /*p_out value stored in a tmp pointer to make it inout for ISA */ \
- p_out_tmp = (p_out[vec_itr + idx_vec] + m_itr + idx_row); \
- XT_SSIP(_xtfloat_acc_##idx_row##_##idx_vec, p_out_tmp, 0);
-
-#define STORE_ACC_BATCH_ROW_ASYM8bxASYM8b_AT_OUT_ASYM8b(idx_row) \
- STORE_ACC_BATCH_VEC_UNROLL(idx_row);
-
-#define STORE_ACC_BATCH_ASYM8bxASYM8b_AT_OUT_ASYM8b(idx_row, idx_vec) \
- _ae_int32x2_acc_##idx_row##_##idx_vec = \
- AE_MIN32(AE_MAX32(_ae_int32x2_acc_##idx_row##_##idx_vec, AE_MOVDA32(0)), \
- AE_MOVDA32(255)); \
- (*((UWORD8 *)(p_out[vec_itr + idx_vec] + m_itr + idx_row))) = \
- (UWORD8)AE_MOVAD32_L(_ae_int32x2_acc_##idx_row##_##idx_vec);
-
-/*---------------------------------------------------------*/
-/* Specific macros needed for extra calculations involved
- for ASYM8b */
-
-/* This is written to match with Tensorflow */
-#define ADJUST_ACC_ASYM8b(idx) \
- /* Multiply accumulator with 'out_multiplier', same as Tensorflow */ \
- ae_int32x2 _ae_int32x2_acc_##idx = \
- AE_SLAA32(AE_MOVINT32X2_FROMINT64(_ae_int64_acc_##idx), left_shift); \
- _ae_int32x2_acc_##idx = \
- AE_MULFP32X2RAS(_ae_int32x2_acc_##idx, AE_MOVDA32(out_multiplier)); \
- /* Shift by out_shift, same as Tensorflow */ \
- _ae_int64_acc_##idx = \
- AE_SLAI64(AE_MOVINT64_FROMINT32X2(_ae_int32x2_acc_##idx), 32); \
- _ae_int64_acc_##idx = AE_SRAA64(_ae_int64_acc_##idx, right_shift); \
- _ae_int32x2_acc_##idx = AE_ROUND32F64SSYM(_ae_int64_acc_##idx); \
- /* Add output zero point */ \
- (_ae_int32x2_acc_##idx) = \
- AE_ADD32S(_ae_int32x2_acc_##idx, AE_MOVDA32(out_zero_bias));
-
-/* For time batching */
-#define ADJUST_ACC_BATCH_ROW_ASYM8b(idx_row) \
- ADJUST_ACC_BATCH_VEC_UNROLL(idx_row);
-
-/* For time batching */
-#define ADJUST_ACC_BATCH_ASYM8b(idx_row, idx_vec) \
- /* Multiply accumulator with 'out_multiplier', same as Tensorflow */ \
- ae_int32x2 _ae_int32x2_acc_##idx_row##_##idx_vec = \
- AE_SLAA32(AE_MOVINT32X2_FROMINT64(_ae_int64_acc_##idx_row##_##idx_vec), \
- left_shift); \
- _ae_int32x2_acc_##idx_row##_##idx_vec = AE_MULFP32X2RAS( \
- _ae_int32x2_acc_##idx_row##_##idx_vec, AE_MOVDA32(out_multiplier)); \
- /* Shift by out_shift, same as Tensorflow */ \
- _ae_int64_acc_##idx_row##_##idx_vec = AE_SLAI64( \
- AE_MOVINT64_FROMINT32X2(_ae_int32x2_acc_##idx_row##_##idx_vec), 32); \
- _ae_int64_acc_##idx_row##_##idx_vec = \
- AE_SRAA64(_ae_int64_acc_##idx_row##_##idx_vec, right_shift); \
- _ae_int32x2_acc_##idx_row##_##idx_vec = \
- AE_ROUND32F64SSYM(_ae_int64_acc_##idx_row##_##idx_vec); \
- /* Add output zero point */ \
- (_ae_int32x2_acc_##idx_row##_##idx_vec) = AE_ADD32S( \
- _ae_int32x2_acc_##idx_row##_##idx_vec, AE_MOVDA32(out_zero_bias));
-
-/*---------------------------------------------------------*/
-/* ====================================================================================================
- */
-#if (ROW_UNROLL == 1)
-#define SETUP_ACC UNROLL_SETUP_ACC(0)
-#define SETUP_MAT1 UNROLL_SETUP_MAT1(0)
-#define SETUP_MAT2 UNROLL_SETUP_MAT2(0)
-#define KERNEL_MAT1_VEC1 UNROLL_KERNEL_MAT1_VEC1(0)
-#define KERNEL_MAT2_VEC2 UNROLL_KERNEL_MAT2_VEC2(0)
-#define ADD_BIAS_ACC UNROLL_ADD_BIAS_ACC(0)
-#define ADJUST_ACC UNROLL_ADJUST_ACC(0)
-#define STORE_ACC UNROLL_STORE_ACC(0)
-
-#elif (ROW_UNROLL == 2)
-#define SETUP_ACC UNROLL_SETUP_ACC(0) UNROLL_SETUP_ACC(1)
-#define SETUP_MAT1 UNROLL_SETUP_MAT1(0) UNROLL_SETUP_MAT1(1)
-#define SETUP_MAT2 UNROLL_SETUP_MAT2(0) UNROLL_SETUP_MAT2(1)
-#define KERNEL_MAT1_VEC1 UNROLL_KERNEL_MAT1_VEC1(0) UNROLL_KERNEL_MAT1_VEC1(1)
-#define KERNEL_MAT2_VEC2 UNROLL_KERNEL_MAT2_VEC2(0) UNROLL_KERNEL_MAT2_VEC2(1)
-#define ADD_BIAS_ACC UNROLL_ADD_BIAS_ACC(0) UNROLL_ADD_BIAS_ACC(1)
-#define ADJUST_ACC UNROLL_ADJUST_ACC(0) UNROLL_ADJUST_ACC(1)
-#define STORE_ACC UNROLL_STORE_ACC(0) UNROLL_STORE_ACC(1)
-
-#elif (ROW_UNROLL == 4)
-#define SETUP_ACC \
- UNROLL_SETUP_ACC(0) \
- UNROLL_SETUP_ACC(1) UNROLL_SETUP_ACC(2) UNROLL_SETUP_ACC(3)
-#define SETUP_MAT1 \
- UNROLL_SETUP_MAT1(0) \
- UNROLL_SETUP_MAT1(1) UNROLL_SETUP_MAT1(2) UNROLL_SETUP_MAT1(3)
-#define SETUP_MAT2 \
- UNROLL_SETUP_MAT2(0) \
- UNROLL_SETUP_MAT2(1) UNROLL_SETUP_MAT2(2) UNROLL_SETUP_MAT2(3)
-#define KERNEL_MAT1_VEC1 \
- UNROLL_KERNEL_MAT1_VEC1(0) \
- UNROLL_KERNEL_MAT1_VEC1(1) \
- UNROLL_KERNEL_MAT1_VEC1(2) UNROLL_KERNEL_MAT1_VEC1(3)
-#define KERNEL_MAT2_VEC2 \
- UNROLL_KERNEL_MAT2_VEC2(0) \
- UNROLL_KERNEL_MAT2_VEC2(1) \
- UNROLL_KERNEL_MAT2_VEC2(2) UNROLL_KERNEL_MAT2_VEC2(3)
-#define ADD_BIAS_ACC \
- UNROLL_ADD_BIAS_ACC(0) \
- UNROLL_ADD_BIAS_ACC(1) UNROLL_ADD_BIAS_ACC(2) UNROLL_ADD_BIAS_ACC(3)
-#define ADJUST_ACC \
- UNROLL_ADJUST_ACC(0) \
- UNROLL_ADJUST_ACC(1) UNROLL_ADJUST_ACC(2) UNROLL_ADJUST_ACC(3)
-#define STORE_ACC \
- UNROLL_STORE_ACC(0) \
- UNROLL_STORE_ACC(1) UNROLL_STORE_ACC(2) UNROLL_STORE_ACC(3)
-
-#elif (ROW_UNROLL == 8)
-#define SETUP_ACC \
- UNROLL_SETUP_ACC(0) \
- UNROLL_SETUP_ACC(1) \
- UNROLL_SETUP_ACC(2) \
- UNROLL_SETUP_ACC(3) \
- UNROLL_SETUP_ACC(4) \
- UNROLL_SETUP_ACC(5) UNROLL_SETUP_ACC(6) UNROLL_SETUP_ACC(7)
-#define SETUP_MAT1 \
- UNROLL_SETUP_MAT1(0) \
- UNROLL_SETUP_MAT1(1) \
- UNROLL_SETUP_MAT1(2) \
- UNROLL_SETUP_MAT1(3) \
- UNROLL_SETUP_MAT1(4) \
- UNROLL_SETUP_MAT1(5) UNROLL_SETUP_MAT1(6) UNROLL_SETUP_MAT1(7)
-#define SETUP_MAT2 \
- UNROLL_SETUP_MAT2(0) \
- UNROLL_SETUP_MAT2(1) \
- UNROLL_SETUP_MAT2(2) \
- UNROLL_SETUP_MAT2(3) \
- UNROLL_SETUP_MAT2(4) \
- UNROLL_SETUP_MAT2(5) UNROLL_SETUP_MAT2(6) UNROLL_SETUP_MAT2(7)
-#define KERNEL_MAT1_VEC1 \
- UNROLL_KERNEL_MAT1_VEC1(0) \
- UNROLL_KERNEL_MAT1_VEC1(1) \
- UNROLL_KERNEL_MAT1_VEC1(2) \
- UNROLL_KERNEL_MAT1_VEC1(3) \
- UNROLL_KERNEL_MAT1_VEC1(4) \
- UNROLL_KERNEL_MAT1_VEC1(5) \
- UNROLL_KERNEL_MAT1_VEC1(6) UNROLL_KERNEL_MAT1_VEC1(7)
-#define KERNEL_MAT2_VEC2 \
- UNROLL_KERNEL_MAT2_VEC2(0) \
- UNROLL_KERNEL_MAT2_VEC2(1) \
- UNROLL_KERNEL_MAT2_VEC2(2) \
- UNROLL_KERNEL_MAT2_VEC2(3) \
- UNROLL_KERNEL_MAT2_VEC2(4) \
- UNROLL_KERNEL_MAT2_VEC2(5) \
- UNROLL_KERNEL_MAT2_VEC2(6) UNROLL_KERNEL_MAT2_VEC2(7)
-#define ADD_BIAS_ACC \
- UNROLL_ADD_BIAS_ACC(0) \
- UNROLL_ADD_BIAS_ACC(1) \
- UNROLL_ADD_BIAS_ACC(2) \
- UNROLL_ADD_BIAS_ACC(3) \
- UNROLL_ADD_BIAS_ACC(4) \
- UNROLL_ADD_BIAS_ACC(5) UNROLL_ADD_BIAS_ACC(6) UNROLL_ADD_BIAS_ACC(7)
-#define ADJUST_ACC \
- UNROLL_ADJUST_ACC(0) \
- UNROLL_ADJUST_ACC(1) \
- UNROLL_ADJUST_ACC(2) \
- UNROLL_ADJUST_ACC(3) \
- UNROLL_ADJUST_ACC(4) \
- UNROLL_ADJUST_ACC(5) UNROLL_ADJUST_ACC(6) UNROLL_ADJUST_ACC(7)
-#define STORE_ACC \
- UNROLL_STORE_ACC(0) \
- UNROLL_STORE_ACC(1) \
- UNROLL_STORE_ACC(2) \
- UNROLL_STORE_ACC(3) \
- UNROLL_STORE_ACC(4) \
- UNROLL_STORE_ACC(5) UNROLL_STORE_ACC(6) UNROLL_STORE_ACC(7)
-
-#endif /* (ROW_UNROLL == 1) */
-
-#if (ROW_UNROLL == 4 && VEC_UNROLL == 2)
-
-#define SETUP_VEC_BATCH UNROLL_SETUP_VEC_BATCH(0) UNROLL_SETUP_VEC_BATCH(1)
-
-#define SETUP_ACC_BATCH \
- UNROLL_ROW_SETUP_ACC_BATCH(0) \
- UNROLL_ROW_SETUP_ACC_BATCH(1) \
- UNROLL_ROW_SETUP_ACC_BATCH(2) UNROLL_ROW_SETUP_ACC_BATCH(3)
-#define SETUP_ACC_BATCH_VEC_UNROLL(idx_row) \
- UNROLL_SETUP_ACC_BATCH(idx_row, 0) UNROLL_SETUP_ACC_BATCH(idx_row, 1)
-#define SETUP_ACC_BATCH_TAIL \
- UNROLL_SETUP_ACC_BATCH(0, 0) \
- UNROLL_SETUP_ACC_BATCH(1, 0) \
- UNROLL_SETUP_ACC_BATCH(2, 0) UNROLL_SETUP_ACC_BATCH(3, 0)
-
-#define LOAD_VEC_BATCH UNROLL_LOAD_VEC_BATCH(0) UNROLL_LOAD_VEC_BATCH(1)
-#define LOAD_MAT1 \
- UNROLL_LOAD_ROW_MAT1(0) \
- UNROLL_LOAD_ROW_MAT1(1) UNROLL_LOAD_ROW_MAT1(2) UNROLL_LOAD_ROW_MAT1(3)
-
-#define KERNEL_MAT1_VEC_BATCH \
- UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(0) \
- UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(1) \
- UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(2) UNROLL_ROW_KERNEL_MAT1_VEC_BATCH(3)
-#define KERNEL_MAT1_VEC_BATCH_VEC_UNROLL(idx_row) \
- UNROLL_KERNEL_MAT1_VEC_BATCH(idx_row, 0) \
- UNROLL_KERNEL_MAT1_VEC_BATCH(idx_row, 1)
-#define KERNEL_MAT1_VEC_BATCH_TAIL \
- UNROLL_KERNEL_MAT1_VEC_BATCH(0, 0) \
- UNROLL_KERNEL_MAT1_VEC_BATCH(1, 0) \
- UNROLL_KERNEL_MAT1_VEC_BATCH(2, 0) UNROLL_KERNEL_MAT1_VEC_BATCH(3, 0)
-
-#define ADD_BIAS_ACC_BATCH \
- UNROLL_ROW_ADD_BIAS_ACC(0) \
- UNROLL_ROW_ADD_BIAS_ACC(1) \
- UNROLL_ROW_ADD_BIAS_ACC(2) UNROLL_ROW_ADD_BIAS_ACC(3)
-#define ADD_BIAS_BATCH_ACC_VEC_UNROLL(idx_row) \
- UNROLL_ADD_BIAS_ACC_BATCH(idx_row, 0) UNROLL_ADD_BIAS_ACC_BATCH(idx_row, 1)
-#define ADD_BIAS_ACC_BATCH_TAIL \
- LOAD_BIAS UNROLL_ADD_BIAS_ACC_BATCH(0, 0) \
- LOAD_BIAS UNROLL_ADD_BIAS_ACC_BATCH(1, 0) \
- LOAD_BIAS UNROLL_ADD_BIAS_ACC_BATCH(2, 0) \
- LOAD_BIAS UNROLL_ADD_BIAS_ACC_BATCH(3, 0)
-
-#define STORE_ACC_BATCH \
- UNROLL_ROW_STORE_ACC(0) \
- UNROLL_ROW_STORE_ACC(1) UNROLL_ROW_STORE_ACC(2) UNROLL_ROW_STORE_ACC(3)
-#define STORE_ACC_BATCH_VEC_UNROLL(idx_row) \
- UNROLL_STORE_ACC_BATCH(idx_row, 0) UNROLL_STORE_ACC_BATCH(idx_row, 1)
-#define STORE_ACC_BATCH_TAIL \
- UNROLL_STORE_ACC_BATCH(0, 0) \
- UNROLL_STORE_ACC_BATCH(1, 0) \
- UNROLL_STORE_ACC_BATCH(2, 0) UNROLL_STORE_ACC_BATCH(3, 0)
-
-#define ADJUST_ACC_BATCH_TAIL \
- UNROLL_ADJUST_ACC_BATCH(0, 0) \
- UNROLL_ADJUST_ACC_BATCH(1, 0) \
- UNROLL_ADJUST_ACC_BATCH(2, 0) UNROLL_ADJUST_ACC_BATCH(3, 0)
-#define ADJUST_ACC_BATCH \
- UNROLL_ROW_ADJUST_ACC(0) \
- UNROLL_ROW_ADJUST_ACC(1) UNROLL_ROW_ADJUST_ACC(2) UNROLL_ROW_ADJUST_ACC(3)
-#define ADJUST_ACC_BATCH_VEC_UNROLL(idx_row) \
- UNROLL_ADJUST_ACC_BATCH(idx_row, 0) UNROLL_ADJUST_ACC_BATCH(idx_row, 1)
-
-#endif /* (ROW_UNROLL == 4 && VEC_UNROLL == 2)*/
-
-#endif /* __XA_NNLIB_COMMON_MACROS_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_definitions.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_definitions.h
deleted file mode 100644
index 7199887..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_definitions.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_OPUS_CODEC_DEFINITIONS_H__
-#define __XA_OPUS_CODEC_DEFINITIONS_H__
-
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_api_defs.h"
-
-/* Identification Strings */
-#define LIBNAME "HiFi Mini Neural Network Library"
-#define LIBVERSION "0.6.0"
-
-#define LIB_APIVERSION_MAJOR 1
-#define LIB_APIVERSION_MINOR 0
-
-#if LIB_APIVERSION_MAJOR != XA_APIVERSION_MAJOR || \
- LIB_APIVERSION_MINOR != XA_APIVERSION_MINOR
-// #error "Version Mismatch"
-#endif
-
-#define LIB_APIVERSION \
- XA_MAKE_VERSION_STR(LIB_APIVERSION_MAJOR, LIB_APIVERSION_MINOR)
-
-#endif /* __XA_OPUS_CODEC_DEFINITIONS_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h
deleted file mode 100644
index 8508e54..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_NNLIB_ERR_CHK_H__
-#define __XA_NNLIB_ERR_CHK_H__
-
-#ifndef NULL
-#define NULL (void *)0
-#endif /* NULL */
-
-#ifndef DISABLE_ARG_CHK
-
-#define XA_NNLIB_ARG_CHK_PTR(_ptr, _err) \
- do { \
- if ((_ptr) == NULL) return (_err); \
- } while (0)
-
-#define XA_NNLIB_ARG_CHK_ALIGN(_ptr, _align, _err) \
- do { \
- if (((unsigned int)(_ptr) & ((_align)-1)) != 0) return (_err); \
- } while (0)
-
-#define XA_NNLIB_ARG_CHK_COND(_cond, _err) \
- do { \
- if ((_cond)) return (_err); \
- } while (0)
-
-#else /* DISABLE_ARG_CHK */
-
-#define XA_NNLIB_ARG_CHK_PTR(_ptr, _err)
-#define XA_NNLIB_ARG_CHK_ALIGN(_ptr, _align, _err)
-#define XA_NNLIB_ARG_CHK_COND(_cond, _err)
-
-#endif /* DISABLE_ARG_CHK */
-
-#define XA_NNLIB_CHK_PTR(_ptr, _err) \
- do { \
- if ((_ptr) == NULL) return (_err); \
- } while (0)
-
-#define XA_NNLIB_CHK_ALIGN(_ptr, _align, _err) \
- do { \
- if (((unsigned int)(_ptr) & ((_align)-1)) != 0) return (_err); \
- } while (0)
-
-#define XA_NNLIB_CHK_COND(_cond, _err) \
- do { \
- if ((_cond)) return (_err); \
- } while (0)
-
-#endif /* __XA_NNLIB_ERR_CHK_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_activations_asym8s_asym8s.c b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_activations_asym8s_asym8s.c
deleted file mode 100644
index 060b706..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_activations_asym8s_asym8s.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xa_nnlib_common.h"
-
-#define ALIGNMENT 8 /* 8 bytes alignment */
-
-#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1)))
-
-#define LIMIT(out, inp, min, max) \
- { \
- out = min; \
- out = AE_MAXP24S(inp, min); \
- out = AE_MINP24S(out, max); \
- }
-
-#define STORE_8X2_FROM_24X2(out_ptr, val) \
- { \
- int o1, o2; \
- o1 = AE_MOVAP24S_H(val); \
- o2 = AE_MOVAP24S_L(val); \
- *out_ptr++ = (WORD8)o1; \
- *out_ptr++ = (WORD8)o2; \
- }
-
-/*
- * inp: p_vec: 4 byte aligned input pointer
- * out: p_out: no alignment needed for output pointer*/
-WORD32 xa_nn_vec_activation_min_max_asym8s_asym8s(
- WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_vec,
- int activation_min, int activation_max, WORD32 vec_length) {
- int i;
- ae_p24x2s x, y, min, max;
-
- /* NULL pointer checks */
- XA_NNLIB_ARG_CHK_PTR(p_out, -1);
- XA_NNLIB_ARG_CHK_PTR(p_vec, -1);
-
- /* Basic Parameter checks */
- XA_NNLIB_ARG_CHK_COND((vec_length <= 0), -1);
-
- /* Basic Parameter checks */
- XA_NNLIB_ARG_CHK_COND((activation_max < activation_min), -1);
-
- WORD8 *p_o = p_out;
- WORD8 *p_v = (WORD8 *)p_vec;
-
- min = AE_SRAIP24(AE_CVTP24A16(activation_min), 8);
- max = AE_SRAIP24(AE_CVTP24A16(activation_max), 8);
-
- int pre_loop_count = 0;
- // pre loop, active when input ptr is not 4 byte aligned
- pre_loop_count = (int)((unsigned)ALIGN_PTR(p_v, 4) - (unsigned)p_v);
- pre_loop_count = (pre_loop_count < vec_length) ? pre_loop_count : vec_length;
-
- vec_length = vec_length - pre_loop_count;
- vec_length = (vec_length < 0) ? 0 : vec_length;
-
- for (i = 0; i < pre_loop_count; i++) {
- int i1;
- i1 = ((WORD8)*p_v++);
- x = AE_MOVPA24(i1);
- LIMIT(y, x, min, max)
- i1 = AE_MOVAP24S_H(y);
- *p_o++ = (WORD8)i1;
- }
-
- if ((activation_max >= (int)127) && (activation_min <= (int)-128)) {
- p_v = p_v - 2;
- for (i = 0; i < (vec_length >> 1); i++) {
- AE_LP8X2F_IU(x, (WORD8 *)p_v, 2 * sizeof(WORD8));
- y = AE_SRAIP24(x, 16);
-
- STORE_8X2_FROM_24X2(p_o, y)
- }
- if (vec_length & 1) {
- p_v = p_v + 2;
- int i1;
- i1 = (WORD8)p_v[0];
- *p_o++ = (WORD8)i1;
- }
- } else if ((activation_max < (int)127) && (activation_min <= (int)-128)) {
- p_v = p_v - 2;
- for (i = 0; i < (vec_length >> 1); i++) {
- AE_LP8X2F_IU(x, (WORD8 *)p_v, 2 * sizeof(WORD8));
- y = AE_SRAIP24(x, 16);
-
- y = AE_MINP24S(y, max);
-
- STORE_8X2_FROM_24X2(p_o, y)
- }
- if (vec_length & 1) {
- p_v = p_v + 2;
- int i1;
- i1 = (WORD8)p_v[0];
- y = AE_MOVPA24(i1);
-
- y = AE_MINP24S(y, max);
-
- i1 = AE_MOVAP24S_H(y);
- *p_o++ = (WORD8)i1;
- }
- } else if ((activation_max >= (int)127) && (activation_min > (int)-128)) {
- p_v = p_v - 2;
- for (i = 0; i < (vec_length >> 1); i++) {
- AE_LP8X2F_IU(x, (WORD8 *)p_v, 2 * sizeof(WORD8));
- y = AE_SRAIP24(x, 16);
-
- y = AE_MAXP24S(y, min);
-
- STORE_8X2_FROM_24X2(p_o, y)
- }
- if (vec_length & 1) {
- p_v = p_v + 2;
- int i1;
- i1 = (WORD8)p_v[0];
- y = AE_MOVPA24(i1);
-
- y = AE_MAXP24S(y, min);
-
- i1 = AE_MOVAP24S_H(y);
- *p_o++ = (WORD8)i1;
- }
- } else {
- p_v = p_v - 2;
- for (i = 0; i < (vec_length >> 1); i++) {
- AE_LP8X2F_IU(x, (WORD8 *)p_v, 2 * sizeof(WORD8));
- x = AE_SRAIP24(x, 16);
- LIMIT(y, x, min, max)
- STORE_8X2_FROM_24X2(p_o, y)
- }
- if (vec_length & 1) {
- p_v = p_v + 2;
- int i1;
- i1 = (WORD8)p_v[0];
- x = AE_MOVPA24(i1);
- LIMIT(y, x, min, max)
- i1 = AE_MOVAP24S_H(y);
- *p_o++ = (WORD8)i1;
- }
- }
- return 0;
-}
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_softmax_asym8_asym8.c b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_softmax_asym8_asym8.c
deleted file mode 100644
index 4f7dce8..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_softmax_asym8_asym8.c
+++ /dev/null
@@ -1,1005 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xa_nnlib_common.h"
-
-#define ALIGNMENT 8 /* 8 bytes alignment */
-#define ALIGNED_SIZE(x, bytes) (((x) + (bytes - 1)) & (~(bytes - 1)))
-#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1)))
-
-#ifndef AE_LP8X2F_IU
-#define AE_LP8X2F_IU(p_x, p_in, x) \
- AE_LP16F_IU(p_x, (ae_p16s *)p_in, x); \
- ae_p24x2s p_tmp1 = AE_SLLIP24(p_x, 8); \
- ae_p24x2s p_tmp2 = AE_ANDP48(p_x, AE_MOVPA24(0xFFFF0000)); \
- p_x = AE_SELP24_LL(p_tmp2, p_tmp1);
-
-#endif
-
-#define NSA64_T(y, x) \
- { \
- ae_q56s q_tmp = *(ae_q56s *)&x; \
- y = AE_NSAQ56S(q_tmp) + 8; \
- }
-
-#define MULFP32X2RAS_T(result, a, b) \
- { \
- ae_q56s q_a = AE_CVTQ48A32S(a); \
- ae_p24x2s p_b = AE_CVTP24A16X2_HL(b, b); \
- ae_q56s q_out = AE_MULFQ32SP16U_L(q_a, p_b); \
- q_out = AE_SRAIQ56(q_out, 16); \
- AE_MULAFQ32SP16S_H(q_out, q_a, p_b); \
- q_out = AE_ROUNDSQ32ASYM(q_out); \
- *(ae_q32s *)&result = q_out; \
- }
-
-#define MULFP32X2RS_T(result, a, b) \
- { \
- ae_q56s q_a = AE_CVTQ48A32S(a); \
- ae_p24x2s p_b = AE_CVTP24A16X2_HL(b, b); \
- ae_q56s q_out = AE_MULFQ32SP16U_L(q_a, p_b); \
- q_out = AE_SRAIQ56(q_out, 16); \
- AE_MULAFQ32SP16S_H(q_out, q_a, p_b); \
- q_out = AE_ROUNDSQ32SYM(q_out); \
- *(ae_q32s *)&result = q_out; \
- }
-#define ADD32S_T(result, a, b) \
- { \
- ae_q56s q_a = AE_CVTQ48A32S(a); \
- ae_q56s q_b = AE_CVTQ48A32S(b); \
- ae_q56s q_out = AE_ADDSQ56S(q_a, q_b); \
- q_out = AE_SATQ48S(q_out); \
- *(ae_q32s *)&result = q_out; \
- }
-
-#define SUB32S_T(result, a, b) \
- { \
- ae_q56s q_a = AE_CVTQ48A32S(a); \
- ae_q56s q_b = AE_CVTQ48A32S(b); \
- ae_q56s q_out = AE_SUBSQ56S(q_a, q_b); \
- q_out = AE_SATQ48S(q_out); \
- *(ae_q32s *)&result = q_out; \
- }
-
-#define SLAI32S_T(result, a, b) \
- { \
- ae_q56s q_a = AE_CVTQ48A32S(a); \
- ae_q56s q_out = AE_SLLIQ56(q_a, b); \
- q_out = AE_SATQ48S(q_out); \
- *(ae_q32s *)&result = q_out; \
- }
-
-#define SRAA32RS_T(result, a, b) \
- { \
- ae_q56s q_a = AE_CVTQ48A32S(a); \
- ae_q56s q_out = AE_SLAASQ56S(q_a, (-b)); \
- q_out = AE_ROUNDSQ32ASYM(q_out); \
- *(ae_q32s *)&result = q_out; \
- }
-
-#define SRAI32R_T(result, a, b) \
- { \
- ae_q56s q_a = AE_CVTQ48A32S(a); \
- ae_q56s q_out = AE_SRAIQ56(q_a, b); \
- q_out = AE_ROUNDSQ32ASYM(q_out); \
- *(ae_q32s *)&result = q_out; \
- }
-
-static const int CONSTANT_TERM = (0x70f5a894);
-static const int CONSTANT_1_OVER_3 = (0x2aaaaaab);
-static const int CONSTANT_1_OVER_8 = (0x10000000);
-static const int ONE_QUATER_Q26 = (0x1000000); // Q6.26
-static const int MASK = (0xffffff);
-static const int Q31 = 0x7fffffff;
-static const int constant_48_over_17 = 1515870810;
-static const int constant_neg_32_over_17 = -1010580540; // Q29
-static const int F2_ONE = 0x20000000;
-
-static const int constant_neg_32_over_17_Q21 = -3947580; // Q21
-static const int constant_48_over_17_Q21 = 5921370; // Q21
-
-static ae_p24x2s GetReciprocal(ae_q56s q_x, int x_integerbits, int *lsh) {
- int headroom_plus_one;
- ae_p24x2s p_x;
- ae_q56s q_tmp;
- ae_p24x2s p_half_den;
- int i;
-
- headroom_plus_one = AE_NSAQ56S(q_x) + 8;
- headroom_plus_one = headroom_plus_one - 31;
- *lsh = x_integerbits - headroom_plus_one;
-
- q_x = (q_x << (headroom_plus_one + 15));
- p_half_den = AE_ROUNDSP24Q48SYM(q_x);
-
- q_tmp = AE_CVTQ48A32S(constant_48_over_17);
- AE_MULAFP24S_LL(q_tmp, p_half_den, AE_MOVPA24(constant_neg_32_over_17_Q21));
- p_x = AE_ROUNDSP24Q48SYM(q_tmp);
-
- for (i = 0; i < 3; i++) {
- q_tmp = AE_CVTQ48A32S(F2_ONE);
- AE_MULSFP24S_LL(q_tmp, p_x, p_half_den);
- ae_p24x2s p_one_minus_half_denominator_times_x = AE_ROUNDSP24Q48SYM(q_tmp);
-
- q_tmp = AE_MULFP24S_LL(p_x, p_one_minus_half_denominator_times_x);
- ae_p24x2s p_m = AE_ROUNDSP24Q48SYM(q_tmp);
- p_m = AE_SLLISP24S(p_m, 2);
- p_x = AE_ADDSP24S(p_x, p_m);
- }
-
- p_x = AE_SLLISP24S(p_x, 1);
-
- return p_x;
-}
-
-static const int MASK_16BITS = (0xffff);
-static const int ONE_QUATER_Q18 = (0x10000); // Q18
-static const int CONSTANT_1_OVER_8_Q23 = (0x100000); // Q23
-static const int CONSTANT_1_OVER_3_Q23 = (0x2aaaaa); // Q23
-static const int CONSTANT_TERM_Q23 = (0x70f5a8); // Q23
-static const int Q23 = 0x7fffff;
-
-#define GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_in_out, exponent, \
- FixedPointMultiplier, p_remainder) \
- { \
- ae_p24x2s p_out; \
- \
- ae_p24x2s p_zero = AE_ZEROP48(); \
- \
- ae_p24x2s p_scale = AE_MOVPA24(1 << (18 + exponent)); \
- ae_p24x2s p_mask = p_remainder & p_scale; \
- \
- ae_p24x2s p_FixedPointMultiplier = AE_MOVPA24(FixedPointMultiplier >> 8); \
- \
- ae_q56s q_tmp1 = AE_MULFP24S_HH(p_in_out, p_FixedPointMultiplier); \
- ae_q56s q_tmp2 = AE_MULFP24S_LL(p_in_out, p_FixedPointMultiplier); \
- ae_p24x2s p_t1 = AE_ROUNDSP24Q48SYM(q_tmp1); \
- ae_p24x2s p_t2 = AE_ROUNDSP24Q48SYM(q_tmp2); \
- p_out = AE_SELP24_LL(p_t1, p_t2); \
- \
- xtbool2 flag_le = AE_LTP24S(p_zero, p_mask); \
- AE_MOVTP24X2(p_in_out, p_out, flag_le); \
- }
-
-#define EXP_Q26_II(p_exp_y, p_inp_t) \
- { \
- ae_p24x2s p_x1_in, p_x2, p_x3, p_x4, p_x4_by_4, p_y1, p_y2, p_y3, p_y4, \
- p_y5, p_y6, p_y; \
- \
- p_x2 = p_inp_t & AE_MOVPA24(MASK_16BITS); \
- ae_p24x2s p_a_mod_quater_minus_q_1_by_4 = \
- p_x2 - AE_MOVPA24(ONE_QUATER_Q18); \
- ae_p24x2s p_x_in = p_a_mod_quater_minus_q_1_by_4 << 5; \
- ae_p24x2s p_remainder = p_a_mod_quater_minus_q_1_by_4 - p_inp_t; \
- \
- p_x1_in = AE_ADDSP24S(p_x_in, AE_MOVPA24(CONSTANT_1_OVER_8_Q23)); \
- \
- ae_q56s q_tmp1 = AE_MULFP24S_HH(p_x1_in, p_x1_in); \
- ae_q56s q_tmp2 = AE_MULFP24S_LL(p_x1_in, p_x1_in); \
- ae_p24x2s p_t1 = AE_ROUNDSP24Q48SYM(q_tmp1); \
- ae_p24x2s p_t2 = AE_ROUNDSP24Q48SYM(q_tmp2); \
- p_x2 = AE_SELP24_LL(p_t1, p_t2); \
- \
- q_tmp1 = AE_MULFP24S_HH(p_t1, p_x1_in); \
- q_tmp2 = AE_MULFP24S_LL(p_t2, p_x1_in); \
- p_t1 = AE_ROUNDSP24Q48SYM(q_tmp1); \
- p_t2 = AE_ROUNDSP24Q48SYM(q_tmp2); \
- p_x3 = AE_SELP24_LL(p_t1, p_t2); \
- \
- q_tmp1 = AE_MULFP24S_HH(p_x2, p_x2); \
- q_tmp2 = AE_MULFP24S_LL(p_x2, p_x2); \
- p_t1 = AE_ROUNDSP24Q48SYM(q_tmp1); \
- p_t2 = AE_ROUNDSP24Q48SYM(q_tmp2); \
- p_x4 = AE_SELP24_LL(p_t1, p_t2); \
- p_x4_by_4 = p_x4 >> 2; \
- \
- p_y1 = AE_ADDSP24S(p_x4_by_4, p_x3); \
- \
- ae_p24x2s p_const = AE_MOVPA24(CONSTANT_1_OVER_3_Q23); \
- q_tmp1 = AE_MULFP24S_HH(p_y1, p_const); \
- q_tmp2 = AE_MULFP24S_LL(p_y1, p_const); \
- p_t1 = AE_ROUNDSP24Q48SYM(q_tmp1); \
- p_t2 = AE_ROUNDSP24Q48SYM(q_tmp2); \
- p_y2 = AE_SELP24_LL(p_t1, p_t2); \
- \
- p_y3 = AE_ADDSP24S(p_y2, p_x2); \
- p_y4 = p_y3 >> 1; \
- \
- p_y5 = AE_ADDSP24S(p_x1_in, p_y4); /* ADD32S_T(y5, x1_in, y4); */ \
- \
- p_const = AE_MOVPA24(CONSTANT_TERM_Q23); \
- q_tmp1 = AE_MULFP24S_HH(p_y5, p_const); \
- q_tmp2 = AE_MULFP24S_LL(p_y5, p_const); \
- p_t1 = AE_ROUNDSP24Q48SYM(q_tmp1); \
- p_t2 = AE_ROUNDSP24Q48SYM(q_tmp2); \
- p_y6 = AE_SELP24_LL(p_t1, p_t2); \
- p_y = AE_ADDSP24S(p_y6, p_const); \
- \
- { \
- GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_y, -2, 1672461947, p_remainder); \
- GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_y, -1, 1302514674, p_remainder); \
- GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_y, 0, 790015084, p_remainder); \
- GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_y, 1, 290630308, p_remainder); \
- GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_y, 2, 39332535, p_remainder); \
- GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_y, 3, 720401, p_remainder); \
- GEMMLOWP_EXP_BARREL_SHIFTER_OPT_II(p_y, 4, 242, p_remainder); \
- } \
- p_exp_y = p_y; \
- p_const = AE_MOVPA24(Q23); \
- xtbool2 flag_eq = AE_EQP24(p_inp_t, AE_ZEROP48()); \
- AE_MOVTP24X2(p_exp_y, p_const, flag_eq); \
- }
-
-#define GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_in_out, exponent, \
- FixedPointMultiplier, p_remainder) \
- { \
- ae_p24x2s p_out; \
- \
- ae_p24x2s p_zero = AE_ZEROP48(); \
- \
- ae_p24x2s p_scale = AE_MOVPA24(1 << (18 + exponent)); \
- ae_p24x2s p_mask = p_remainder & p_scale; \
- \
- ae_p24x2s p_FixedPointMultiplier = AE_MOVPA24(FixedPointMultiplier >> 8); \
- \
- ae_q56s q_tmp1 = AE_MULFP24S_HH(p_in_out, p_FixedPointMultiplier); \
- p_out = AE_ROUNDSP24Q48SYM(q_tmp1); \
- \
- xtbool2 flag_le = AE_LTP24S(p_zero, p_mask); \
- AE_MOVTP24X2(p_in_out, p_out, flag_le); \
- }
-
-#define EXP_Q26_I(p_exp_y, p_inp_t) \
- { \
- ae_p24x2s p_x1_in, p_x2, p_x3, p_x4, p_x4_by_4, p_y1, p_y2, p_y3, p_y4, \
- p_y5, p_y6, p_y; \
- \
- p_x2 = p_inp_t & AE_MOVPA24(MASK_16BITS); \
- ae_p24x2s p_a_mod_quater_minus_q_1_by_4 = \
- p_x2 - AE_MOVPA24(ONE_QUATER_Q18); \
- ae_p24x2s p_x_in = p_a_mod_quater_minus_q_1_by_4 << 5; \
- ae_p24x2s p_remainder = p_a_mod_quater_minus_q_1_by_4 - p_inp_t; \
- \
- p_x1_in = AE_ADDSP24S(p_x_in, AE_MOVPA24(CONSTANT_1_OVER_8_Q23)); \
- \
- ae_q56s q_tmp1 = AE_MULFP24S_HH(p_x1_in, p_x1_in); \
- p_x2 = AE_ROUNDSP24Q48SYM(q_tmp1); \
- \
- q_tmp1 = AE_MULFP24S_HH(p_x2, p_x1_in); \
- p_x3 = AE_ROUNDSP24Q48SYM(q_tmp1); \
- \
- q_tmp1 = AE_MULFP24S_HH(p_x2, p_x2); \
- p_x4 = AE_ROUNDSP24Q48SYM(q_tmp1); \
- p_x4_by_4 = p_x4 >> 2; \
- \
- p_y1 = AE_ADDSP24S(p_x4_by_4, p_x3); \
- \
- ae_p24x2s p_const = AE_MOVPA24(CONSTANT_1_OVER_3_Q23); \
- q_tmp1 = AE_MULFP24S_HH(p_y1, p_const); \
- p_y2 = AE_ROUNDSP24Q48SYM(q_tmp1); \
- \
- p_y3 = AE_ADDSP24S(p_y2, p_x2); \
- p_y4 = p_y3 >> 1; \
- \
- p_y5 = AE_ADDSP24S(p_x1_in, p_y4); /* ADD32S_T(y5, x1_in, y4); */ \
- \
- p_const = AE_MOVPA24(CONSTANT_TERM_Q23); \
- q_tmp1 = AE_MULFP24S_HH(p_y5, p_const); \
- p_y6 = AE_ROUNDSP24Q48SYM(q_tmp1); \
- p_y = AE_ADDSP24S(p_y6, p_const); \
- \
- { \
- GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_y, -2, 1672461947, p_remainder); \
- GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_y, -1, 1302514674, p_remainder); \
- GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_y, 0, 790015084, p_remainder); \
- GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_y, 1, 290630308, p_remainder); \
- GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_y, 2, 39332535, p_remainder); \
- GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_y, 3, 720401, p_remainder); \
- GEMMLOWP_EXP_BARREL_SHIFTER_OPT_I(p_y, 4, 242, p_remainder); \
- } \
- p_exp_y = p_y; \
- p_const = AE_MOVPA24(Q23); \
- xtbool2 flag_eq = AE_EQP24(p_inp_t, AE_ZEROP48()); \
- AE_MOVTP24X2(p_exp_y, p_const, flag_eq); \
- }
-
-WORD32 xa_nn_vec_softmax_asym8u_8(UWORD8 *__restrict__ pOut,
- const UWORD8 *__restrict__ pVec,
- WORD32 diffmin, WORD32 input_beta_left_shift,
- WORD32 input_beta_multiplier,
- WORD32 vec_length, pVOID pScratch) {
- /* NULL pointer checks */
- XA_NNLIB_ARG_CHK_PTR(pOut, -1);
- XA_NNLIB_ARG_CHK_PTR(pVec, -1);
- XA_NNLIB_ARG_CHK_PTR(pScratch, -1);
- /* Pointer alignment checks */
- /* No alignment (1-byte) needed for any pointer */
- /* Basic Parameter checks */
- XA_NNLIB_ARG_CHK_COND((vec_length <= 0), -1);
- XA_NNLIB_ARG_CHK_COND(
- ((input_beta_left_shift < -31) || (input_beta_left_shift > 31)), -1);
- XA_NNLIB_ARG_CHK_COND((input_beta_multiplier < 0), -1);
-
- int i;
- int shift_bits_reciprocal;
- UWORD8 *p_in;
- WORD32 *__restrict pExp = (WORD32 *)ALIGN_PTR(pScratch, ALIGNMENT);
- ae_p24f *__restrict pTmpScratch = (ae_p24f *)pExp;
- int max;
- ae_p24x2s p_x;
- ae_p24x2s p_max = AE_MOVPA24(0xFF800000);
- ae_p24x2s p_recip_sum_exp;
- int pre_loop_count;
- int main_loop_count;
- int post_loop_count;
-
- if (vec_length > 1) {
- pre_loop_count = (int)pVec & 0x1;
- main_loop_count = vec_length - pre_loop_count;
- post_loop_count = (main_loop_count & 1);
- main_loop_count = main_loop_count >> 1;
- } else {
- pre_loop_count = 0;
- main_loop_count = 0;
- post_loop_count = vec_length;
- }
-
- /* Calculating Max */
- {
- p_in = (UWORD8 *)pVec;
-
- if (pre_loop_count) {
- p_x = AE_MOVPA24(*p_in++);
- p_max = AE_MAXP24S(p_max, p_x);
- }
-
- p_in -= 2;
- for (i = 0; i < main_loop_count; i++) {
- AE_LP8X2F_IU(p_x, p_in, 2 * sizeof(WORD8));
- p_x = AE_SRLIP24(p_x, 16);
- p_max = AE_MAXP24S(p_max, p_x);
- }
-
- if (post_loop_count) {
- p_in += 2;
- p_x = AE_MOVPA24(*p_in);
- p_max = AE_MAXP24S(p_max, p_x);
- }
- p_max = AE_MAXP24S(p_max, AE_SELP24_LH(p_max, p_max));
- max = AE_MOVAP24S_L(p_max);
- }
-
- /* Calculate exponents */
- {
- ae_q56s q_sum_exp = AE_ZEROQ56();
- ae_p24x2s p_rem_x, p_y, p_exp_y;
- ae_p24x2s p_zero = AE_ZEROP48();
- ae_p24x2s p_input_beta_multiplier =
- AE_MOVPA24((input_beta_multiplier >> 8));
- ae_p24x2s p_diffmin = AE_MOVPA24(diffmin);
- int input_beta_left_shift_for_24bit = input_beta_left_shift - 8;
-
- p_in = (UWORD8 *)pVec;
- WUR_AE_SAR(input_beta_left_shift_for_24bit);
-
- if (pre_loop_count) {
- p_x = AE_MOVPA24(*p_in++);
- p_rem_x = p_x - p_max;
- p_y = AE_SLLSSP24S(p_rem_x);
-
- ae_q56s q_dequantized_y1 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
- ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-
- EXP_Q26_I(p_exp_y, p_dequantized_y1)
-
- xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
- AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
- *pTmpScratch++ = p_exp_y;
-
- p_exp_y = p_exp_y >> 4;
-
- AE_MULAP24S_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
- }
-
- p_in -= 2;
- for (i = 0; i < main_loop_count; i++) {
- AE_LP8X2F_IU(p_x, p_in, 2 * sizeof(WORD8));
- p_x = AE_SRLIP24(p_x, 16);
- p_rem_x = p_x - p_max;
- p_y = AE_SLLSSP24S(p_rem_x);
-
- ae_q56s q_dequantized_y1 = AE_MULFP24S_HH(p_y, p_input_beta_multiplier);
- ae_q56s q_dequantized_y2 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
- ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
- ae_p24x2s p_dequantized_y2 = AE_ROUNDSP24Q48ASYM(q_dequantized_y2);
-
- ae_p24x2s p_dequantized =
- AE_SELP24_LL(p_dequantized_y1, p_dequantized_y2);
-
- EXP_Q26_II(p_exp_y, p_dequantized)
-
- xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
- AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
- *pTmpScratch++ = AE_SELP24_HH(p_exp_y, p_exp_y);
- *pTmpScratch++ = p_exp_y; /* store lower element */
-
- p_exp_y = p_exp_y >> 4;
-
- AE_MULAAP24S_HH_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
- }
- if (post_loop_count) {
- p_in += 2;
-
- p_x = AE_MOVPA24(*p_in);
- p_rem_x = p_x - p_max;
- p_y = AE_SLLSSP24S(p_rem_x);
-
- ae_q56s q_dequantized_y1 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
- ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-
- EXP_Q26_I(p_exp_y, p_dequantized_y1)
-
- xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
- AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
- *pTmpScratch = p_exp_y;
-
- p_exp_y = p_exp_y >> 4;
-
- AE_MULAP24S_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
- }
- p_recip_sum_exp = GetReciprocal(q_sum_exp, 12, &shift_bits_reciprocal);
- }
-
- /* Calculate output */
- {
- ae_p24x2s p_exp;
-
- int shift_val = -(shift_bits_reciprocal + 31 - 8 - 8);
-
- ae_p24x2s p_min = AE_ZEROP48();
- ae_p24x2s p_max = AE_MOVPA24(255);
-
- for (i = 0; i<vec_length >> 1; i++) {
- int out;
-
- p_exp = *(ae_p24x2f *)&pExp[2 * i];
-
- ae_q56s q_tmp1 = AE_MULFP24S_HH(p_exp, p_recip_sum_exp);
- ae_q56s q_tmp2 = AE_MULFP24S_LL(p_exp, p_recip_sum_exp);
-
- q_tmp1 = AE_SLAASQ56S(q_tmp1, shift_val);
- q_tmp2 = AE_SLAASQ56S(q_tmp2, shift_val);
-
- ae_p24x2s p_out1 = AE_ROUNDSP24Q48ASYM(q_tmp1);
- ae_p24x2s p_out2 = AE_ROUNDSP24Q48ASYM(q_tmp2);
-
- ae_p24x2s p_out = AE_SELP24_LL(p_out1, p_out2);
-
- p_out = AE_MAXP24S(p_out, p_min);
- p_out = AE_MINP24S(p_out, p_max);
-
- out = AE_MOVAP24S_H(p_out);
- *pOut++ = (UWORD8)out;
-
- out = AE_MOVAP24S_L(p_out);
- *pOut++ = (UWORD8)out;
- }
-
- if (vec_length & 0x1) {
- int out;
-
- p_exp = *(ae_p24f *)&pExp[vec_length - 1];
-
- ae_q56s q_tmp1 = AE_MULFP24S_LL(p_exp, p_recip_sum_exp);
-
- q_tmp1 = AE_SLAASQ56S(q_tmp1, shift_val);
-
- ae_p24x2s p_out = AE_ROUNDSP24Q48ASYM(q_tmp1);
-
- p_out = AE_MAXP24S(p_out, p_min);
- p_out = AE_MINP24S(p_out, p_max);
-
- out = AE_MOVAP24S_L(p_out);
- *pOut++ = (UWORD8)out;
- }
- }
-
- return 0;
-}
-
-WORD32 xa_nn_vec_softmax_asym8s_8(WORD8 *__restrict__ pOut,
- const WORD8 *__restrict__ pVec,
- WORD32 diffmin, WORD32 input_beta_left_shift,
- WORD32 input_beta_multiplier,
- WORD32 vec_length, pVOID pScratch) {
- /* NULL pointer checks */
- XA_NNLIB_ARG_CHK_PTR(pOut, -1);
- XA_NNLIB_ARG_CHK_PTR(pVec, -1);
- XA_NNLIB_ARG_CHK_PTR(pScratch, -1);
- /* Pointer alignment checks */
- /* No alignment (1-byte) needed for any pointer */
- /* Basic Parameter checks */
- XA_NNLIB_ARG_CHK_COND((vec_length <= 0), -1);
- XA_NNLIB_ARG_CHK_COND(
- ((input_beta_left_shift < -31) || (input_beta_left_shift > 31)), -1);
- XA_NNLIB_ARG_CHK_COND((input_beta_multiplier < 0), -1);
-
- int i;
- int shift_bits_reciprocal;
- WORD8 *p_in;
- WORD32 *__restrict pExp = (WORD32 *)ALIGN_PTR(pScratch, ALIGNMENT);
- ae_p24x2s p_recip_sum_exp;
- ae_p24x2s p_x;
- ae_p24x2s p_max = AE_MOVPA24(0xFF800000);
-
- int pre_loop_count;
- int main_loop_count;
- int post_loop_count;
-
- if (vec_length > 1) {
- pre_loop_count = (int)pVec & 0x1;
- main_loop_count = vec_length - pre_loop_count;
- post_loop_count = (main_loop_count & 1);
- main_loop_count = main_loop_count >> 1;
- } else {
- pre_loop_count = 0;
- main_loop_count = 0;
- post_loop_count = vec_length;
- }
-
- /* Calculating Max */
- {
- p_in = (WORD8 *)pVec;
-
- if (pre_loop_count) {
- p_x = AE_MOVPA24(*p_in++);
- p_max = AE_MAXP24S(p_max, p_x);
- }
-
- p_in -= 2;
- for (i = 0; i < main_loop_count; i++) {
- AE_LP8X2F_IU(p_x, p_in, 2 * sizeof(WORD8));
- p_max = AE_MAXP24S(p_max, p_x);
- }
- p_max = AE_SRAIP24(p_max, 16);
-
- if (post_loop_count) {
- p_in += 2;
- p_x = AE_MOVPA24(*p_in);
- p_max = AE_MAXP24S(p_max, p_x);
- }
- p_max = AE_MAXP24S(p_max, AE_SELP24_LH(p_max, p_max));
- }
-
- /* Calculate exponents */
- {
- ae_q56s q_sum_exp = AE_ZEROQ56();
- ae_p24x2s p_rem_x, p_y, p_exp_y;
- ae_p24x2s p_zero = AE_ZEROP48();
- ae_p24x2s p_input_beta_multiplier =
- AE_MOVPA24((input_beta_multiplier >> 8));
- ae_p24x2s p_diffmin = AE_MOVPA24(diffmin);
- int input_beta_left_shift_for_24bit = input_beta_left_shift - 8;
-
- p_in = (WORD8 *)pVec;
- WUR_AE_SAR(input_beta_left_shift_for_24bit);
-
- if (pre_loop_count) {
- p_x = AE_MOVPA24(*p_in++);
- p_rem_x = p_x - p_max;
- p_y = AE_SLLSSP24S(p_rem_x);
-
- ae_q56s q_dequantized_y1 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
- ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-
- EXP_Q26_I(p_exp_y, p_dequantized_y1)
-
- xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
- AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
- *(ae_p24f *)&pExp[0] = p_exp_y;
-
- p_exp_y = p_exp_y >> 4;
-
- AE_MULAP24S_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
- }
-
- p_in -= 2;
- for (i = 0; i < main_loop_count; i++) {
- AE_LP8X2F_IU(p_x, p_in, 2 * sizeof(WORD8));
- p_x = AE_SRAIP24(p_x, 16);
- p_rem_x = p_x - p_max;
- p_y = AE_SLLSSP24S(p_rem_x);
-
- ae_q56s q_dequantized_y1 = AE_MULFP24S_HH(p_y, p_input_beta_multiplier);
- ae_q56s q_dequantized_y2 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
- ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
- ae_p24x2s p_dequantized_y2 = AE_ROUNDSP24Q48ASYM(q_dequantized_y2);
-
- ae_p24x2s p_dequantized =
- AE_SELP24_LL(p_dequantized_y1, p_dequantized_y2);
-
- EXP_Q26_II(p_exp_y, p_dequantized)
-
- xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
- AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
- //*(ae_p24x2f *)&pExp[pre_loop_count + 2*i] = p_exp_y;
- *(ae_p24f *)&pExp[pre_loop_count + 2 * i] =
- AE_SELP24_HH(p_exp_y, p_exp_y);
- *(ae_p24f *)&pExp[pre_loop_count + 2 * i + 1] =
- AE_SELP24_LL(p_exp_y, p_exp_y);
- //*(ae_p24f *)&pExp[0] = p_exp_y;
-
- p_exp_y = p_exp_y >> 4;
-
- AE_MULAAP24S_HH_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
- }
-
- if (post_loop_count) {
- p_in += 2;
-
- p_x = AE_MOVPA24(*p_in);
- p_rem_x = p_x - p_max;
- p_y = AE_SLLSSP24S(p_rem_x);
-
- ae_q56s q_dequantized_y1 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
- ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-
- EXP_Q26_I(p_exp_y, p_dequantized_y1)
-
- xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
- AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
- *(ae_p24f *)&pExp[vec_length - 1] = p_exp_y;
-
- p_exp_y = p_exp_y >> 4;
-
- AE_MULAP24S_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
- }
-
- p_recip_sum_exp = GetReciprocal(q_sum_exp, 12, &shift_bits_reciprocal);
- }
-
- /* Calculate output */
- pExp = (WORD32 *)ALIGN_PTR(pScratch, ALIGNMENT);
- {
- ae_p24x2s p_exp;
-
- int shift_val = -(shift_bits_reciprocal + 31 - 8 - 8);
-
- ae_p24x2s p_min = AE_MOVPA24(-128);
- ae_p24x2s p_max = AE_MOVPA24(127);
-
- for (i = 0; i<vec_length >> 1; i++) {
- int out;
-
- p_exp = *(ae_p24x2f *)&pExp[2 * i];
-
- ae_q56s q_tmp1 = AE_MULFP24S_HH(p_exp, p_recip_sum_exp);
- ae_q56s q_tmp2 = AE_MULFP24S_LL(p_exp, p_recip_sum_exp);
-
- q_tmp1 = AE_SLAASQ56S(q_tmp1, shift_val);
- q_tmp2 = AE_SLAASQ56S(q_tmp2, shift_val);
-
- ae_p24x2s p_out1 = AE_ROUNDSP24Q48ASYM(q_tmp1);
- ae_p24x2s p_out2 = AE_ROUNDSP24Q48ASYM(q_tmp2);
-
- ae_p24x2s p_out = AE_SELP24_LL(p_out1, p_out2);
-
- p_out = AE_SUBSP24S(p_out, AE_MOVPA24(128));
- p_out = AE_MAXP24S(p_out, p_min);
- p_out = AE_MINP24S(p_out, p_max);
-
- out = AE_MOVAP24S_H(p_out);
- *pOut++ = (WORD8)out;
-
- out = AE_MOVAP24S_L(p_out);
- *pOut++ = (WORD8)out;
- }
-
- if (vec_length & 0x1) {
- int out;
-
- p_exp = *(ae_p24f *)&pExp[vec_length - 1];
-
- ae_q56s q_tmp1 = AE_MULFP24S_LL(p_exp, p_recip_sum_exp);
-
- q_tmp1 = AE_SLAASQ56S(q_tmp1, shift_val);
-
- ae_p24x2s p_out = AE_ROUNDSP24Q48ASYM(q_tmp1);
-
- p_out = AE_SUBSP24S(p_out, AE_MOVPA24(128));
- p_out = AE_MAXP24S(p_out, p_min);
- p_out = AE_MINP24S(p_out, p_max);
-
- out = AE_MOVAP24S_L(p_out);
- *pOut++ = (WORD8)out;
- }
- }
-
- return 0;
-}
-
-WORD32 xa_nn_vec_softmax_asym8s_16(WORD16 *__restrict__ pOut,
- const WORD8 *__restrict__ pVec,
- WORD32 diffmin, WORD32 input_beta_left_shift,
- WORD32 input_beta_multiplier,
- WORD32 vec_length, pVOID pScratch) {
- /* NULL pointer checks */
- XA_NNLIB_ARG_CHK_PTR(pOut, -1);
- XA_NNLIB_ARG_CHK_PTR(pVec, -1);
- XA_NNLIB_ARG_CHK_PTR(pScratch, -1);
- /* Pointer alignment checks */
- /* No alignment (1-byte) needed for any pointer */
- /* Basic Parameter checks */
- XA_NNLIB_ARG_CHK_COND((vec_length <= 0), -1);
- XA_NNLIB_ARG_CHK_COND(
- ((input_beta_left_shift < -31) || (input_beta_left_shift > 31)), -1);
- XA_NNLIB_ARG_CHK_COND((input_beta_multiplier < 0), -1);
-
- int i;
- int shift_bits_reciprocal;
- WORD8 *p_in;
- WORD32 *__restrict pExp = (WORD32 *)ALIGN_PTR(pScratch, ALIGNMENT);
- ae_p24x2s p_recip_sum_exp;
- ae_p24x2s p_x;
- ae_p24x2s p_max = AE_MOVPA24(0xFF800000);
-
- int pre_loop_count;
- int main_loop_count;
- int post_loop_count;
-
- if (vec_length > 1) {
- pre_loop_count = (int)pVec & 0x1;
- main_loop_count = vec_length - pre_loop_count;
- post_loop_count = (main_loop_count & 1);
- main_loop_count = main_loop_count >> 1;
- } else {
- pre_loop_count = 0;
- main_loop_count = 0;
- post_loop_count = vec_length;
- }
-
- /* Calculating Max */
- {
- p_in = (WORD8 *)pVec;
-
- if (pre_loop_count) {
- p_x = AE_MOVPA24(*p_in++);
- p_max = AE_MAXP24S(p_max, p_x);
- }
-
- p_in -= 2;
- for (i = 0; i < main_loop_count; i++) {
- AE_LP8X2F_IU(p_x, p_in, 2 * sizeof(WORD8));
- p_max = AE_MAXP24S(p_max, p_x);
- }
- p_max = AE_SRAIP24(p_max, 16);
-
- if (post_loop_count) {
- p_in += 2;
- p_x = AE_MOVPA24(*p_in);
- p_max = AE_MAXP24S(p_max, p_x);
- }
- p_max = AE_MAXP24S(p_max, AE_SELP24_LH(p_max, p_max));
- }
-
- /* Calculate exponents */
- {
- ae_q56s q_sum_exp = AE_ZEROQ56();
- ae_p24x2s p_rem_x, p_y, p_exp_y;
- ae_p24x2s p_zero = AE_ZEROP48();
- ae_p24x2s p_input_beta_multiplier =
- AE_MOVPA24((input_beta_multiplier >> 8));
- ae_p24x2s p_diffmin = AE_MOVPA24(diffmin);
- int input_beta_left_shift_for_24bit = input_beta_left_shift - 8;
-
- p_in = (WORD8 *)pVec;
- WUR_AE_SAR(input_beta_left_shift_for_24bit);
-
- if (pre_loop_count) {
- p_x = AE_MOVPA24(*p_in++);
- p_rem_x = p_x - p_max;
- p_y = AE_SLLSSP24S(p_rem_x);
-
- ae_q56s q_dequantized_y1 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
- ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-
- EXP_Q26_I(p_exp_y, p_dequantized_y1)
-
- xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
- AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
- *(ae_p24f *)&pExp[0] = p_exp_y;
-
- p_exp_y = p_exp_y >> 4;
-
- AE_MULAP24S_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
- }
-
- p_in -= 2;
- for (i = 0; i < main_loop_count; i++) {
- AE_LP8X2F_IU(p_x, p_in, 2 * sizeof(WORD8));
- p_x = AE_SRAIP24(p_x, 16);
- p_rem_x = p_x - p_max;
- p_y = AE_SLLSSP24S(p_rem_x);
-
- ae_q56s q_dequantized_y1 = AE_MULFP24S_HH(p_y, p_input_beta_multiplier);
- ae_q56s q_dequantized_y2 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
- ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
- ae_p24x2s p_dequantized_y2 = AE_ROUNDSP24Q48ASYM(q_dequantized_y2);
-
- ae_p24x2s p_dequantized =
- AE_SELP24_LL(p_dequantized_y1, p_dequantized_y2);
-
- EXP_Q26_II(p_exp_y, p_dequantized)
-
- xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
- AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
- *(ae_p24f *)&pExp[pre_loop_count + 2 * i] =
- AE_SELP24_HH(p_exp_y, p_exp_y);
- *(ae_p24f *)&pExp[pre_loop_count + 2 * i + 1] =
- AE_SELP24_LL(p_exp_y, p_exp_y);
-
- p_exp_y = p_exp_y >> 4;
-
- AE_MULAAP24S_HH_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
- }
-
- if (post_loop_count) {
- p_in += 2;
-
- p_x = AE_MOVPA24(*p_in);
- p_rem_x = p_x - p_max;
- p_y = AE_SLLSSP24S(p_rem_x);
-
- ae_q56s q_dequantized_y1 = AE_MULFP24S_LL(p_y, p_input_beta_multiplier);
-
- ae_p24x2s p_dequantized_y1 = AE_ROUNDSP24Q48ASYM(q_dequantized_y1);
-
- EXP_Q26_I(p_exp_y, p_dequantized_y1)
-
- xtbool2 flag_cmp = AE_LTP24S(p_rem_x, p_diffmin);
- AE_MOVTP24X2(p_exp_y, p_zero, flag_cmp);
-
- *(ae_p24f *)&pExp[vec_length - 1] = p_exp_y;
-
- p_exp_y = p_exp_y >> 4;
-
- AE_MULAP24S_LL(q_sum_exp, p_exp_y, AE_MOVPA24(1));
- }
-
- p_recip_sum_exp = GetReciprocal(q_sum_exp, 12, &shift_bits_reciprocal);
- }
-
- /* Calculate output */
- pExp = (WORD32 *)ALIGN_PTR(pScratch, ALIGNMENT);
- {
- ae_p24x2s p_exp;
-
- int shift_val = -(shift_bits_reciprocal + 31 - 8 - 16);
-
- ae_p24x2s p_min = AE_MOVPA24(-32768);
- ae_p24x2s p_max = AE_MOVPA24(32767);
-
- for (i = 0; i<vec_length >> 1; i++) {
- int out;
-
- p_exp = *(ae_p24x2f *)&pExp[2 * i];
-
- ae_q56s q_tmp1 = AE_MULFP24S_HH(p_exp, p_recip_sum_exp);
- ae_q56s q_tmp2 = AE_MULFP24S_LL(p_exp, p_recip_sum_exp);
-
- q_tmp1 = AE_SLAASQ56S(q_tmp1, shift_val);
- q_tmp2 = AE_SLAASQ56S(q_tmp2, shift_val);
-
- ae_p24x2s p_out1 = AE_ROUNDSP24Q48ASYM(q_tmp1);
- ae_p24x2s p_out2 = AE_ROUNDSP24Q48ASYM(q_tmp2);
-
- ae_p24x2s p_out = AE_SELP24_LL(p_out1, p_out2);
-
- p_out = AE_SUBSP24S(p_out, AE_MOVPA24(32768));
- p_out = AE_MAXP24S(p_out, p_min);
- p_out = AE_MINP24S(p_out, p_max);
-
- out = AE_MOVAP24S_H(p_out);
- *pOut++ = (WORD16)out;
-
- out = AE_MOVAP24S_L(p_out);
- *pOut++ = (WORD16)out;
- }
-
- if (vec_length & 0x1) {
- int out;
-
- p_exp = *(ae_p24f *)&pExp[vec_length - 1];
-
- ae_q56s q_tmp1 = AE_MULFP24S_LL(p_exp, p_recip_sum_exp);
-
- q_tmp1 = AE_SLAASQ56S(q_tmp1, shift_val);
-
- ae_p24x2s p_out = AE_ROUNDSP24Q48ASYM(q_tmp1);
-
- p_out = AE_SUBSP24S(p_out, AE_MOVPA24(32768));
- p_out = AE_MAXP24S(p_out, p_min);
- p_out = AE_MINP24S(p_out, p_max);
-
- out = AE_MOVAP24S_L(p_out);
- *pOut++ = (WORD16)out;
- }
- }
-
- return 0;
-}
-
-int xa_nn_get_softmax_scratch_size(int inp_precision, int out_precision,
- int length) {
- int size_of_one_elm_in_bytes, total_bytes;
- (void)out_precision;
-
- /* This function returns scratch size required by softmax implementation in
- bytes scratch memory is needed to save exponents of inputs computed in the
- function, every exponent is computed as 32 bit (4 bytes) number currently*/
- switch (inp_precision) {
- case PREC_ASYM8U:
- size_of_one_elm_in_bytes = 4;
- break;
- case PREC_SYM8S:
- size_of_one_elm_in_bytes = 4;
- break;
- default:
- size_of_one_elm_in_bytes = 4;
- break;
- }
-
- total_bytes = size_of_one_elm_in_bytes * length;
- total_bytes = ALIGNED_SIZE(total_bytes, ALIGNMENT);
-
- return total_bytes;
-}
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/basic/hifi_mini/xa_nn_dot_prod_16x16.c b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/basic/hifi_mini/xa_nn_dot_prod_16x16.c
deleted file mode 100644
index 80697ca..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/basic/hifi_mini/xa_nn_dot_prod_16x16.c
+++ /dev/null
@@ -1,175 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xa_nnlib_common.h"
-#include "xa_nnlib_common_macros.h"
-
-/*----------------------------Main function---------------------------------*/
-WORD32 xa_nn_dot_prod_16x16_asym8s(
- WORD8 *__restrict__ p_out, /* pointer to output */
- const WORD16 *__restrict__ p_inp1_start, /* pointer to input1 */
- const WORD16 *__restrict__ p_inp2_start, /* pointer to input2 */
- const WORD32 *bias_ptr, WORD32 vec_length, WORD32 out_multiplier,
- WORD32 out_shift, WORD32 out_zero_bias, WORD32 vec_count) {
- /* NULL pointer checks */
- XA_NNLIB_ARG_CHK_PTR(p_out, -1);
- XA_NNLIB_ARG_CHK_PTR(p_inp1_start, -1);
- XA_NNLIB_ARG_CHK_PTR(p_inp2_start, -1);
- /* Pointer alignment checks */
- XA_NNLIB_ARG_CHK_ALIGN(p_inp1_start, sizeof(WORD16), -1);
- XA_NNLIB_ARG_CHK_ALIGN(p_inp2_start, sizeof(WORD16), -1);
- /* Basic Parameter checks */
- XA_NNLIB_ARG_CHK_COND((vec_length <= 0), -1);
- XA_NNLIB_ARG_CHK_COND((out_shift < -31 || out_shift > 31), -1);
- XA_NNLIB_ARG_CHK_COND((out_zero_bias < -128 || out_zero_bias > 127), -1);
- int left_shift, right_shift;
- int loopcnt;
- const WORD32 bias_buffer[2] = {0, 0};
- const WORD32 *p_bias_load;
- WORD32 bias_address_increment = sizeof(WORD32);
-
- if (bias_ptr == NULL) {
- p_bias_load = bias_buffer - 1;
- bias_address_increment = 0;
- } else {
- p_bias_load = bias_ptr - 1;
- }
-
- left_shift = out_shift < 0 ? 0 : out_shift;
- right_shift = out_shift > 0 ? 0 : -out_shift;
- /* inp1 4-bytes aligned, inp2 4-bytes aligned and vec_length is multple of 2
- */
- if (((((unsigned)p_inp1_start) & 0x3) == 0) &&
- ((((unsigned)p_inp2_start) & 0x3) == 0) && ((vec_length & 0x1) == 0)) {
- const ae_p16x2s *pt_inp1, *pt_inp2;
- pt_inp1 = (const ae_p16x2s *)&p_inp1_start[-2];
- pt_inp2 = (const ae_p16x2s *)&p_inp2_start[-2];
-
- ae_q56s output_int8_max_56 = AE_CVTQ48A32S(127);
- ae_q56s output_int8_min_56 = AE_CVTQ48A32S(-128);
- for (loopcnt = 0; loopcnt < vec_count; loopcnt++) {
- ae_p24x2s dp_inp1, dp_inp2;
- ae_q32s dq_out32;
- ae_q56s dq_out;
- int i;
-
- AE_LQ32F_XU(dq_out, (ae_q32s *)p_bias_load, bias_address_increment);
-
- for (i = 0; i < (vec_length >> 1); i++) {
- AE_LP16X2F_IU(dp_inp1, pt_inp1, 4);
- AE_LP16X2F_IU(dp_inp2, pt_inp2, 4);
- AE_MULAAP24S_HH_LL(dq_out, dp_inp1, dp_inp2);
- }
-
- dq_out32 = AE_SATQ48S(dq_out);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift, right_shift);
- dq_out = AE_ADDSQ56S(dq_out, AE_CVTQ48A32S(out_zero_bias));
-
- dq_out = AE_MAXQ56S(dq_out, output_int8_min_56);
- dq_out = AE_MINQ56S(dq_out, output_int8_max_56);
- *p_out++ = (WORD8)AE_TRUNCA32Q48(dq_out);
- }
- } else {
-#ifndef DISABLE_NNLIB_UNALIGNED_SUPPORT
- for (loopcnt = 0; loopcnt < vec_count; loopcnt++) {
- ae_p24x2s dp_inp1, dp_inp2;
- ae_q32s dq_out32;
- ae_q56s dq_out;
- int i;
- const WORD16 *p_inp1 = (WORD16 *)&p_inp1_start[loopcnt * vec_length];
- const WORD16 *p_inp2 = (WORD16 *)&p_inp2_start[loopcnt * vec_length];
-
- AE_LQ32F_XU(dq_out, (ae_q32s *)p_bias_load, bias_address_increment);
-
- if (((((unsigned)p_inp1) & 3) != 0 && (((unsigned)p_inp2) & 3) != 0) ||
- ((((unsigned)p_inp1) & 3) == 0 && (((unsigned)p_inp2) & 3) == 0)) {
- int pre_loop_count = ((int)(((unsigned)p_inp1) & 3)) >> 1;
- if (pre_loop_count != 0) {
- dp_inp1 = AE_CVTP24A16X2_LL(*p_inp1++, *p_inp2++);
- AE_MULAP24S_HL(dq_out, dp_inp1, dp_inp1);
- }
- const ae_p16x2s *pt_inp1, *pt_inp2;
- pt_inp1 = (const ae_p16x2s *)(p_inp1 - 2);
- pt_inp2 = (const ae_p16x2s *)(p_inp2 - 2);
- for (i = 0; i < (vec_length - pre_loop_count - 1); i += 2) {
- AE_LP16X2F_IU(dp_inp1, pt_inp1, 4);
- AE_LP16X2F_IU(dp_inp2, pt_inp2, 4);
- AE_MULAAP24S_HH_LL(dq_out, dp_inp1, dp_inp2);
- }
- if ((vec_length - pre_loop_count) & 1) {
- dp_inp1 = AE_CVTP24A16X2_LL(p_inp1[i], p_inp2[i]);
- AE_MULAP24S_HL(dq_out, dp_inp1, dp_inp1);
- }
- } else {
- /* One of the pointers in not aligned to 4 bytes, if it is p_inp1, swap
- * them */
- if ((((unsigned)p_inp1) & 3) != 0) {
- const WORD16 *p_tmp;
- p_tmp = p_inp1;
- p_inp1 = p_inp2;
- p_inp2 = p_tmp;
- }
- const ae_p16x2s *pt_inp1 = (const ae_p16x2s *)(p_inp1 - 2);
- const ae_p16s *pt_inp2 = (const ae_p16s *)(p_inp2 - 1);
- for (i = 0; i < (vec_length - 1); i += 2) {
- ae_p24x2s dp_t0, dp_t1;
- AE_LP16X2F_IU(dp_inp1, pt_inp1, 4);
- AE_LP16F_IU(dp_t0, pt_inp2, 2);
- AE_LP16F_IU(dp_t1, pt_inp2, 2);
- dp_inp2 = AE_SELP24_LL(dp_t0, dp_t1);
- AE_MULAAP24S_HH_LL(dq_out, dp_inp1, dp_inp2);
- }
- if (vec_length & 1) {
- dp_inp1 = AE_CVTP24A16X2_LL(p_inp1[i], p_inp2[i]);
- AE_MULAP24S_HL(dq_out, dp_inp1, dp_inp1);
- }
- }
- dq_out32 = AE_SATQ48S(dq_out);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift, right_shift);
- dq_out = AE_ADDSQ56S(dq_out, AE_CVTQ48A32S(out_zero_bias));
- WORD32 out_i32 = AE_TRUNCA32Q48(AE_SATQ48S(dq_out));
- out_i32 = out_i32 < -128 ? -128 : out_i32;
- out_i32 = out_i32 > 127 ? 127 : out_i32;
- *p_out++ = (WORD8)out_i32;
- }
-#else
- return 1;
-#endif
- }
- return 0;
-}
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/fc/hifi_mini/xa_nn_fully_connected.c b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/fc/hifi_mini/xa_nn_fully_connected.c
deleted file mode 100644
index 0a9325e..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/fc/hifi_mini/xa_nn_fully_connected.c
+++ /dev/null
@@ -1,142 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xa_nnlib_err_chk.h"
-#include "xa_nnlib_kernels_api.h"
-#include "xa_type_def.h"
-
-WORD32 xa_nn_fully_connected_asym8uxasym8u_asym8u(
- UWORD8 *__restrict__ p_out, const UWORD8 *__restrict__ p_weight,
- const UWORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
- WORD32 weight_depth, WORD32 out_depth, WORD32 input_zero_bias,
- WORD32 weight_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
- WORD32 out_zero_bias) {
- /* NULL pointer checks */
- XA_NNLIB_ARG_CHK_PTR(p_out, -1);
- XA_NNLIB_ARG_CHK_PTR(p_weight, -1);
- XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
- XA_NNLIB_ARG_CHK_PTR(p_bias, -1);
- /* Pointer alignment checks */
- XA_NNLIB_ARG_CHK_ALIGN(p_bias, sizeof(WORD32), -1);
- /* Basic Parameter checks */
- XA_NNLIB_ARG_CHK_COND((out_depth <= 0), -1);
- XA_NNLIB_ARG_CHK_COND((input_zero_bias < -255 || input_zero_bias > 0), -1);
- XA_NNLIB_ARG_CHK_COND((weight_zero_bias < -255 || weight_zero_bias > 0), -1);
- XA_NNLIB_ARG_CHK_COND((out_shift < -31 || out_shift > 31), -1);
- XA_NNLIB_ARG_CHK_COND((out_zero_bias < 0 || out_zero_bias > 255), -1);
-
- WORD32 ret = 0;
- ret = xa_nn_matXvec_out_stride_asym8uxasym8u_asym8u(
- p_out, p_weight, p_inp, p_bias, out_depth /* rows */
- ,
- weight_depth /* cols */
- ,
- weight_depth /* row_stride */
- ,
- 1 /* out_stride */
- ,
- weight_zero_bias, input_zero_bias, out_multiplier, out_shift,
- out_zero_bias);
- return ret;
-}
-
-WORD32 xa_nn_fully_connected_sym8sxasym8s_asym8s(
- WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_weight,
- const WORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
- WORD32 weight_depth, WORD32 out_depth, WORD32 input_zero_bias,
- WORD32 out_multiplier, WORD32 out_shift, WORD32 out_zero_bias) {
- /* NULL pointer checks */
- XA_NNLIB_ARG_CHK_PTR(p_out, -1);
- XA_NNLIB_ARG_CHK_PTR(p_weight, -1);
- XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
- XA_NNLIB_ARG_CHK_PTR(p_bias, -1);
- /* Pointer alignment checks */
- XA_NNLIB_ARG_CHK_ALIGN(p_bias, sizeof(WORD32), -1);
- /* Basic Parameter checks */
- XA_NNLIB_ARG_CHK_COND((out_depth <= 0), -1);
- XA_NNLIB_ARG_CHK_COND((input_zero_bias < -127 || input_zero_bias > 128), -1);
- XA_NNLIB_ARG_CHK_COND((out_shift < -31 || out_shift > 31), -1);
- XA_NNLIB_ARG_CHK_COND((out_zero_bias < -128 || out_zero_bias > 127), -1);
-
- WORD32 ret = 0;
- ret = xa_nn_matXvec_out_stride_sym8sxasym8s_asym8s(
- p_out, p_weight, p_inp, p_bias, out_depth /* rows */
- ,
- weight_depth /* cols */
- ,
- weight_depth /* row_stride */
- ,
- 1 /* out_stride */
- ,
- input_zero_bias, out_multiplier, out_shift, out_zero_bias);
- return ret;
-}
-
-WORD32 xa_nn_fully_connected_asym8sxasym8s_asym8s(
- WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_weight,
- const WORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
- WORD32 weight_depth, WORD32 out_depth, WORD32 weight_zero_bias,
- WORD32 input_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
- WORD32 out_zero_bias) {
- /* NULL pointer checks */
- XA_NNLIB_ARG_CHK_PTR(p_out, -1);
- XA_NNLIB_ARG_CHK_PTR(p_weight, -1);
- XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
- XA_NNLIB_ARG_CHK_PTR(p_bias, -1);
- /* Pointer alignment checks */
- XA_NNLIB_ARG_CHK_ALIGN(p_bias, sizeof(WORD32), -1);
- /* Basic Parameter checks */
- XA_NNLIB_ARG_CHK_COND((out_depth <= 0), -1);
- XA_NNLIB_ARG_CHK_COND((weight_zero_bias < -127 || weight_zero_bias > 128),
- -1);
- XA_NNLIB_ARG_CHK_COND((input_zero_bias < -127 || input_zero_bias > 128), -1);
- XA_NNLIB_ARG_CHK_COND((out_shift < -31 || out_shift > 31), -1);
- XA_NNLIB_ARG_CHK_COND((out_zero_bias < -128 || out_zero_bias > 127), -1);
-
- WORD32 ret = 0;
- ret = xa_nn_matXvec_out_stride_asym8sxasym8s_asym8s(
- p_out, p_weight, p_inp, p_bias, out_depth /* rows */
- ,
- weight_depth /* cols */
- ,
- weight_depth /* row_stride */
- ,
- 1 /* out_stride */
- ,
- weight_zero_bias, input_zero_bias, out_multiplier, out_shift,
- out_zero_bias);
- return ret;
-}
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/matXvec/hifi_mini/xa_nn_matXvec_sym8sxasym8s.c b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/matXvec/hifi_mini/xa_nn_matXvec_sym8sxasym8s.c
deleted file mode 100644
index 71af822..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/algo/kernels/matXvec/hifi_mini/xa_nn_matXvec_sym8sxasym8s.c
+++ /dev/null
@@ -1,1053 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xa_nnlib_common.h"
-#include "xa_nnlib_common_macros.h"
-
-#define ADD_OUT_OFFSET_STORE_INT8(ptr, data, out_offset) \
- { \
- data = AE_ADDSQ56S(data, AE_CVTQ48A32S(out_offset)); \
- int out_i32 = AE_TRUNCA32Q48(AE_SATQ48S(data)); \
- out_i32 = out_i32 < -128 ? -128 : out_i32; \
- out_i32 = out_i32 > 127 ? 127 : out_i32; \
- *(ptr) = (WORD8)out_i32; \
- }
-
-WORD32 xa_nn_matXvec_out_stride_sym8sxasym8s_asym8s(
- WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
- const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
- WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
- WORD32 vec1_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
- WORD32 out_zero_bias) {
- /* NULL pointer checks */
- XA_NNLIB_ARG_CHK_PTR(p_out, -1);
- XA_NNLIB_ARG_CHK_PTR(p_mat1, -1);
- XA_NNLIB_ARG_CHK_PTR(p_vec1, -1);
- /* Pointer alignment checks */
- XA_NNLIB_ARG_CHK_ALIGN(p_bias, sizeof(WORD32), -1);
- /* Basic Parameter checks */
- XA_NNLIB_ARG_CHK_COND((rows <= 0), -1);
- XA_NNLIB_ARG_CHK_COND((cols1 <= 0), -1);
- XA_NNLIB_ARG_CHK_COND((row_stride1 < cols1), -1);
- XA_NNLIB_ARG_CHK_COND((vec1_zero_bias < -127 || vec1_zero_bias > 128), -1);
- XA_NNLIB_ARG_CHK_COND((out_shift < -31 || out_shift > 31), -1);
- XA_NNLIB_ARG_CHK_COND((out_zero_bias < -128 || out_zero_bias > 127), -1);
-
- /* Iterators used in for loops */
- int m_itr, c_itr, i;
- /* Assign initial value so this value will be used in trailing loop */
- m_itr = 0;
- /* Shifts to match with Tensorflow */
- int left_shift, right_shift;
-
- left_shift = out_shift < 0 ? 0 : out_shift;
- right_shift = out_shift > 0 ? 0 : -out_shift;
-
- const WORD8 *p_mat1_0, *p_mat1_1, *p_mat1_2, *p_mat1_3;
- const WORD8 *p_vec1_0;
- ae_p24x2s dp_mat1_0, dp_mat1_1, dp_mat1_2, dp_mat1_3, dp_vec1_0;
- ae_p24x2s dp_vec1_zb;
- ae_q56s dq_acc[4];
- ae_q56s dq_out32, dq_out;
-
- dp_vec1_zb = AE_MOVPA24(vec1_zero_bias);
- if (((((unsigned)p_mat1) & 1) == 0) && ((((unsigned)p_vec1) & 1) == 0) &&
- ((row_stride1 & 1) == 0)) {
- for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
- p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1 - 2];
- p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1 - 2];
- p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1 - 2];
- p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1 - 2];
- p_vec1_0 = p_vec1 - 2;
-
- dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
- /* AE_LP8X2F* instruction loads in upper 8 bits of P register, so shifting
- vector right by 16 to get multiplication result in middle 32 bits of Q
- register (lower 16 bits 0) */
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
- AE_LP8X2F_IU(dp_mat1_1, p_mat1_1, 2);
- AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
- AE_LP8X2F_IU(dp_mat1_3, p_mat1_3, 2);
- AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
- }
- /* Pointers are aligned so can do 8X2 loads and ignore L parts of
- * registers */
- if (cols1 & 1) {
- AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
- AE_LP8X2F_IU(dp_mat1_1, p_mat1_1, 2);
- AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
- AE_LP8X2F_IU(dp_mat1_3, p_mat1_3, 2);
- AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAP24S_HH(dq_acc[0], dp_mat1_0, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc[1], dp_mat1_1, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc[2], dp_mat1_2, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc[3], dp_mat1_3, dp_vec1_0);
- }
-
- if (p_bias != NULL) {
- for (i = 0; i < 4; i++)
- dq_acc[i] = AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
- }
-
- for (i = 0; i < 4; i++) {
- dq_out32 = AE_SATQ48S(dq_acc[i]);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift,
- right_shift);
- ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + i) * out_stride], dq_out,
- out_zero_bias);
- }
- }
- for (; m_itr < rows; m_itr++) {
- p_mat1_0 = &p_mat1[m_itr * row_stride1 - 2];
- p_vec1_0 = p_vec1 - 2;
-
- dq_acc[0] = AE_ZEROQ56();
-
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
- AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- }
- /* Pointers are aligned so can do 8X2 loads and ignore L parts of
- * registers */
- if (cols1 & 1) {
- AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
- AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAP24S_HH(dq_acc[0], dp_mat1_0, dp_vec1_0);
- }
-
- if (p_bias != NULL)
- dq_acc[0] = AE_ADDSQ56S(dq_acc[0], *(ae_q32s *)(&p_bias[m_itr]));
-
- dq_out32 = AE_SATQ48S(dq_acc[0]);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift, right_shift);
- ADD_OUT_OFFSET_STORE_INT8(&p_out[m_itr * out_stride], dq_out,
- out_zero_bias);
- }
- } else {
- if ((((unsigned)p_mat1) & 1) == 0) {
- for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
- p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1 - 2];
- p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1];
- p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1 - 2];
- p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1];
- p_vec1_0 = p_vec1;
-
- dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
- /* Matrix elements are kept in upper 8 bits of P registers, vector
- elements are kept in lower 8 bits of P registers, typecasting to UWORD8
- is to avoid extra extui instructions since signed 8-bit load in not
- there in HiFiMini */
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
- dp_mat1_1 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_1[c_itr],
- (UWORD8)p_mat1_1[c_itr + 1]);
- AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
- dp_mat1_3 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_3[c_itr],
- (UWORD8)p_mat1_3[c_itr + 1]);
- dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
- (UWORD8)p_vec1_0[c_itr + 1]);
- dp_mat1_1 = AE_SLLIP24(dp_mat1_1, 8);
- dp_mat1_3 = AE_SLLIP24(dp_mat1_3, 8);
- dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
- }
- if (cols1 & 1) {
- ae_p24x2s dp_mat1_01, dp_mat1_23;
- dp_mat1_01 =
- AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[2], (UWORD8)p_mat1_1[c_itr]);
- dp_mat1_23 =
- AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[2], (UWORD8)p_mat1_3[c_itr]);
- dp_vec1_0 = AE_MOVPA24(p_vec1_0[c_itr]);
- dp_mat1_01 = AE_SLLIP24(dp_mat1_01, 8);
- dp_mat1_23 = AE_SLLIP24(dp_mat1_23, 8);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAP24S_HH(dq_acc[0], dp_mat1_01, dp_vec1_0);
- AE_MULAP24S_LL(dq_acc[1], dp_mat1_01, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc[2], dp_mat1_23, dp_vec1_0);
- AE_MULAP24S_LL(dq_acc[3], dp_mat1_23, dp_vec1_0);
- }
-
- if (p_bias != NULL) {
- for (i = 0; i < 4; i++)
- dq_acc[i] =
- AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
- }
-
- for (i = 0; i < 4; i++) {
- dq_out32 = AE_SATQ48S(dq_acc[i]);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift,
- right_shift);
- ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + i) * out_stride], dq_out,
- out_zero_bias);
- }
- }
- } else {
- for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
- p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1];
- p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1];
- p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1];
- p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1];
- p_vec1_0 = p_vec1;
-
- dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
- /* Matrix elements are kept in upper 8 bits of P registers, vector
- elements are kept in lower 8 bits of P registers, typecasting to UWORD8
- is to avoid extra extui instructions since signed 8-bit load in not
- there in HiFiMini */
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- dp_mat1_0 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
- (UWORD8)p_mat1_0[c_itr + 1]);
- dp_mat1_1 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_1[c_itr],
- (UWORD8)p_mat1_1[c_itr + 1]);
- dp_mat1_2 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[c_itr],
- (UWORD8)p_mat1_2[c_itr + 1]);
- dp_mat1_3 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_3[c_itr],
- (UWORD8)p_mat1_3[c_itr + 1]);
- dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
- (UWORD8)p_vec1_0[c_itr + 1]);
- dp_mat1_0 = AE_SLLIP24(dp_mat1_0, 8);
- dp_mat1_1 = AE_SLLIP24(dp_mat1_1, 8);
- dp_mat1_2 = AE_SLLIP24(dp_mat1_2, 8);
- dp_mat1_3 = AE_SLLIP24(dp_mat1_3, 8);
- dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
- }
- if (cols1 & 1) {
- ae_p24x2s dp_mat1_01, dp_mat1_23;
- dp_mat1_01 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
- (UWORD8)p_mat1_1[c_itr]);
- dp_mat1_23 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[c_itr],
- (UWORD8)p_mat1_3[c_itr]);
- dp_vec1_0 = AE_MOVPA24(p_vec1_0[c_itr]);
- dp_mat1_01 = AE_SLLIP24(dp_mat1_01, 8);
- dp_mat1_23 = AE_SLLIP24(dp_mat1_23, 8);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAP24S_HH(dq_acc[0], dp_mat1_01, dp_vec1_0);
- AE_MULAP24S_LL(dq_acc[1], dp_mat1_01, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc[2], dp_mat1_23, dp_vec1_0);
- AE_MULAP24S_LL(dq_acc[3], dp_mat1_23, dp_vec1_0);
- }
-
- if (p_bias != NULL) {
- for (i = 0; i < 4; i++)
- dq_acc[i] =
- AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
- }
-
- for (i = 0; i < 4; i++) {
- dq_out32 = AE_SATQ48S(dq_acc[i]);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift,
- right_shift);
- ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + i) * out_stride], dq_out,
- out_zero_bias);
- }
- }
- }
- for (; m_itr < rows; m_itr++) {
- p_mat1_0 = &p_mat1[m_itr * row_stride1];
- p_vec1_0 = p_vec1;
-
- dq_acc[0] = AE_ZEROQ56();
-
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- dp_mat1_0 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
- (UWORD8)p_mat1_0[c_itr + 1]);
- dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
- (UWORD8)p_vec1_0[c_itr + 1]);
- dp_mat1_0 = AE_SLLIP24(dp_mat1_0, 8);
- dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- }
- if (cols1 & 1) {
- dp_mat1_0 = AE_CVTP24A16(p_mat1_0[c_itr]);
- dp_vec1_0 = AE_CVTP24A16(p_vec1_0[c_itr]);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, AE_CVTP24A16(vec1_zero_bias));
- AE_MULAP24S_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- }
-
- if (p_bias != NULL)
- dq_acc[0] = AE_ADDSQ56S(dq_acc[0], *(ae_q32s *)(&p_bias[m_itr]));
-
- dq_out32 = AE_SATQ48S(dq_acc[0]);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift, right_shift);
- ADD_OUT_OFFSET_STORE_INT8(&p_out[m_itr * out_stride], dq_out,
- out_zero_bias);
- }
- }
-
- return 0;
-}
-
-WORD32 xa_nn_matXvec_out_stride_asym8sxasym8s_asym8s(
- WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
- const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
- WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
- WORD32 mat1_zero_bias, WORD32 vec1_zero_bias, WORD32 out_multiplier,
- WORD32 out_shift, WORD32 out_zero_bias) {
- /* NULL pointer checks */
- XA_NNLIB_ARG_CHK_PTR(p_out, -1);
- XA_NNLIB_ARG_CHK_PTR(p_mat1, -1);
- XA_NNLIB_ARG_CHK_PTR(p_vec1, -1);
- /* Pointer alignment checks */
- XA_NNLIB_ARG_CHK_ALIGN(p_bias, sizeof(WORD32), -1);
- /* Basic Parameter checks */
- XA_NNLIB_ARG_CHK_COND((rows <= 0), -1);
- XA_NNLIB_ARG_CHK_COND((cols1 <= 0), -1);
- XA_NNLIB_ARG_CHK_COND((row_stride1 < cols1), -1);
- XA_NNLIB_ARG_CHK_COND((mat1_zero_bias < -127 || mat1_zero_bias > 128), -1);
- XA_NNLIB_ARG_CHK_COND((vec1_zero_bias < -127 || vec1_zero_bias > 128), -1);
- XA_NNLIB_ARG_CHK_COND((out_shift < -31 || out_shift > 31), -1);
- XA_NNLIB_ARG_CHK_COND((out_zero_bias < -128 || out_zero_bias > 127), -1);
-
- /* Iterators used in for loops */
- int m_itr, c_itr, i;
- /* Assign initial value so this value will be used in trailing loop */
- m_itr = 0;
- /* Shifts to match with Tensorflow */
- int left_shift, right_shift;
-
- left_shift = out_shift < 0 ? 0 : out_shift;
- right_shift = out_shift > 0 ? 0 : -out_shift;
-
- const WORD8 *p_mat1_0, *p_mat1_1, *p_mat1_2, *p_mat1_3;
- const WORD8 *p_vec1_0;
- ae_p24x2s dp_mat1_0, dp_mat1_1, dp_mat1_2, dp_mat1_3, dp_vec1_0;
- ae_p24x2s dp_vec1_zb, dp_mat1_zb;
- ae_q56s dq_acc_0, dq_acc_1, dq_acc_2, dq_acc_3;
- ae_q56s dq_out32, dq_out;
-
- const WORD32 bias_buffer[1] = {0};
- const WORD32 *p_bias_load;
- WORD32 bias_address_increment = sizeof(WORD32);
-
- dp_mat1_zb = AE_MOVPA24(mat1_zero_bias);
- dp_vec1_zb = AE_MOVPA24(vec1_zero_bias);
-
- /* Check for alignment conditions */
- if (((((unsigned)p_mat1) & 1) == 0) && ((((unsigned)p_vec1) & 1) == 0) &&
- ((row_stride1 & 1) == 0) && ((cols1 & 1) == 0)) {
- /* Calculate partial zero offset adjustment outside the loop */
- WORD32 zero_offset_adjustment;
-
- // Constant part of total zero bias
- ae_q56s dq_zero_bias_sum =
- AE_CVTQ48A32S(vec1_zero_bias * cols1 * mat1_zero_bias);
-
- WORD8 *p_inp = (WORD8 *)p_vec1 - 2;
- for (i = 0; i < (cols1 >> 1); i++) {
- /* Input vector is in MSB 8 bits, matrix zero bias in LSB 8 bits */
- AE_LP8X2F_IU(dp_vec1_0, p_inp, 2);
- AE_MULAAP24S_HH_LL(dq_zero_bias_sum, dp_vec1_0, dp_mat1_zb);
- }
- /* Product is already aligned to bits 16 to 47 in QR register. */
- zero_offset_adjustment = AE_TRUNCA32Q48(dq_zero_bias_sum);
-
- /* If bias is not provided, use a dummy zero value from bias_buffer. */
- if (p_bias == NULL) {
- p_bias_load = bias_buffer - 1;
- bias_address_increment = 0;
- } else {
- p_bias_load = p_bias - 1;
- }
-
- for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
- p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1 - 2];
- p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1 - 2];
- p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1 - 2];
- p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1 - 2];
- p_vec1_0 = p_vec1 - 2;
-
- AE_LQ32F_XU(dq_acc_0, (ae_q32s *)p_bias_load, bias_address_increment);
- AE_LQ32F_XU(dq_acc_1, (ae_q32s *)p_bias_load, bias_address_increment);
- AE_LQ32F_XU(dq_acc_2, (ae_q32s *)p_bias_load, bias_address_increment);
- AE_LQ32F_XU(dq_acc_3, (ae_q32s *)p_bias_load, bias_address_increment);
-
- dq_acc_0 = AE_ADDQ56(dq_acc_0, AE_CVTQ48A32S(zero_offset_adjustment));
- dq_acc_1 = AE_ADDQ56(dq_acc_1, AE_CVTQ48A32S(zero_offset_adjustment));
- dq_acc_2 = AE_ADDQ56(dq_acc_2, AE_CVTQ48A32S(zero_offset_adjustment));
- dq_acc_3 = AE_ADDQ56(dq_acc_3, AE_CVTQ48A32S(zero_offset_adjustment));
-
- /* AE_LP8X2F* instruction loads in upper 8 bits of P register, so shifting
- vector right by 16 to get multiplication result in middle 32 bits of Q
- register (lower 16 bits 0) */
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
- AE_LP8X2F_IU(dp_mat1_1, p_mat1_1, 2);
- AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
- AE_LP8X2F_IU(dp_mat1_3, p_mat1_3, 2);
- AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
-
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
- AE_MULAAP24S_HH_LL(dq_acc_0, dp_mat1_0, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc_1, dp_mat1_1, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc_2, dp_mat1_2, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc_3, dp_mat1_3, dp_vec1_0);
- }
-
- /* Pointers are aligned so can do 8X2 loads and ignore L parts of
- * registers */
- if (cols1 & 1) {
- AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
- AE_LP8X2F_IU(dp_mat1_1, p_mat1_1, 2);
- AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
- AE_LP8X2F_IU(dp_mat1_3, p_mat1_3, 2);
- AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
-
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
- AE_MULAP24S_HH(dq_acc_0, dp_mat1_0, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc_1, dp_mat1_1, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc_2, dp_mat1_2, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc_3, dp_mat1_3, dp_vec1_0);
- }
-
- dq_out32 = AE_SATQ48S(dq_acc_0);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift, right_shift);
- ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + i) * out_stride], dq_out,
- out_zero_bias);
-
- dq_out32 = AE_SATQ48S(dq_acc_1);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift, right_shift);
- ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + i) * out_stride], dq_out,
- out_zero_bias);
-
- dq_out32 = AE_SATQ48S(dq_acc_2);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift, right_shift);
- ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + i) * out_stride], dq_out,
- out_zero_bias);
-
- dq_out32 = AE_SATQ48S(dq_acc_3);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift, right_shift);
- ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + i) * out_stride], dq_out,
- out_zero_bias);
- }
- for (; m_itr < rows; m_itr++) {
- p_mat1_0 = &p_mat1[m_itr * row_stride1 - 2];
- p_vec1_0 = p_vec1 - 2;
-
- AE_LQ32F_XU(dq_acc_0, (ae_q32s *)p_bias_load, bias_address_increment);
- dq_acc_0 = AE_ADDQ56(dq_acc_0, AE_CVTQ48A32S(zero_offset_adjustment));
-
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
- AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
- AE_MULAAP24S_HH_LL(dq_acc_0, dp_mat1_0, dp_vec1_0);
- }
-
- dq_out32 = AE_SATQ48S(dq_acc_0);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift, right_shift);
- ADD_OUT_OFFSET_STORE_INT8(&p_out[m_itr * out_stride], dq_out,
- out_zero_bias);
- }
- } else {
-#ifndef DISABLE_NNLIB_UNALIGNED_SUPPORT
- ae_q56s dq_acc[4];
-
- if ((((unsigned)p_mat1) & 1) == 0) {
- for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
- p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1 - 2];
- p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1];
- p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1 - 2];
- p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1];
- p_vec1_0 = p_vec1;
-
- dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
- /* Matrix elements are kept in upper 8 bits of P registers, vector
- elements are kept in lower 8 bits of P registers, typecasting to UWORD8
- is to avoid extra extui instructions since signed 8-bit load in not
- there in HiFiMini */
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
- dp_mat1_1 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_1[c_itr],
- (UWORD8)p_mat1_1[c_itr + 1]);
- AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
- dp_mat1_3 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_3[c_itr],
- (UWORD8)p_mat1_3[c_itr + 1]);
- dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
- (UWORD8)p_vec1_0[c_itr + 1]);
- dp_mat1_1 = AE_SLLIP24(dp_mat1_1, 8);
- dp_mat1_3 = AE_SLLIP24(dp_mat1_3, 8);
- dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
- dp_mat1_0 = AE_SRAIP24(dp_mat1_0, 16);
- dp_mat1_0 = AE_ADDSP24S(dp_mat1_0, dp_mat1_zb);
- dp_mat1_1 = AE_SRAIP24(dp_mat1_1, 16);
- dp_mat1_1 = AE_ADDSP24S(dp_mat1_1, dp_mat1_zb);
- dp_mat1_2 = AE_SRAIP24(dp_mat1_2, 16);
- dp_mat1_2 = AE_ADDSP24S(dp_mat1_2, dp_mat1_zb);
- dp_mat1_3 = AE_SRAIP24(dp_mat1_3, 16);
- dp_mat1_3 = AE_ADDSP24S(dp_mat1_3, dp_mat1_zb);
-
- AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
- }
- if (cols1 & 1) {
- ae_p24x2s dp_mat1_01, dp_mat1_23;
- dp_mat1_01 =
- AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[2], (UWORD8)p_mat1_1[c_itr]);
- dp_mat1_23 =
- AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[2], (UWORD8)p_mat1_3[c_itr]);
- dp_vec1_0 = AE_MOVPA24(p_vec1_0[c_itr]);
- dp_mat1_01 = AE_SLLIP24(dp_mat1_01, 8);
- dp_mat1_23 = AE_SLLIP24(dp_mat1_23, 8);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
- dp_mat1_01 = AE_SRAIP24(dp_mat1_01, 16);
- dp_mat1_01 = AE_ADDSP24S(dp_mat1_01, dp_mat1_zb);
- dp_mat1_23 = AE_SRAIP24(dp_mat1_23, 16);
- dp_mat1_23 = AE_ADDSP24S(dp_mat1_23, dp_mat1_zb);
-
- AE_MULAP24S_HH(dq_acc[0], dp_mat1_01, dp_vec1_0);
- AE_MULAP24S_LL(dq_acc[1], dp_mat1_01, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc[2], dp_mat1_23, dp_vec1_0);
- AE_MULAP24S_LL(dq_acc[3], dp_mat1_23, dp_vec1_0);
- }
-
- dq_acc[0] = AE_SLLISQ56S(dq_acc[0], 16);
- dq_acc[1] = AE_SLLISQ56S(dq_acc[1], 16);
- dq_acc[2] = AE_SLLISQ56S(dq_acc[2], 16);
- dq_acc[3] = AE_SLLISQ56S(dq_acc[3], 16);
-
- if (p_bias != NULL) {
- for (i = 0; i < 4; i++)
- dq_acc[i] =
- AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
- }
-
- for (i = 0; i < 4; i++) {
- dq_out32 = AE_SATQ48S(dq_acc[i]);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift,
- right_shift);
- ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + i) * out_stride], dq_out,
- out_zero_bias);
- }
- }
- } else {
- for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
- p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1];
- p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1];
- p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1];
- p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1];
- p_vec1_0 = p_vec1;
-
- dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
- /* Matrix elements are kept in upper 8 bits of P registers, vector
- elements are kept in lower 8 bits of P registers, typecasting to UWORD8
- is to avoid extra extui instructions since signed 8-bit load in not
- there in HiFiMini */
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- dp_mat1_0 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
- (UWORD8)p_mat1_0[c_itr + 1]);
- dp_mat1_1 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_1[c_itr],
- (UWORD8)p_mat1_1[c_itr + 1]);
- dp_mat1_2 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[c_itr],
- (UWORD8)p_mat1_2[c_itr + 1]);
- dp_mat1_3 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_3[c_itr],
- (UWORD8)p_mat1_3[c_itr + 1]);
- dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
- (UWORD8)p_vec1_0[c_itr + 1]);
- dp_mat1_0 = AE_SLLIP24(dp_mat1_0, 8);
- dp_mat1_1 = AE_SLLIP24(dp_mat1_1, 8);
- dp_mat1_2 = AE_SLLIP24(dp_mat1_2, 8);
- dp_mat1_3 = AE_SLLIP24(dp_mat1_3, 8);
- dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
- dp_mat1_0 = AE_SRAIP24(dp_mat1_0, 16);
- dp_mat1_0 = AE_ADDSP24S(dp_mat1_0, dp_mat1_zb);
- dp_mat1_1 = AE_SRAIP24(dp_mat1_1, 16);
- dp_mat1_1 = AE_ADDSP24S(dp_mat1_1, dp_mat1_zb);
- dp_mat1_2 = AE_SRAIP24(dp_mat1_2, 16);
- dp_mat1_2 = AE_ADDSP24S(dp_mat1_2, dp_mat1_zb);
- dp_mat1_3 = AE_SRAIP24(dp_mat1_3, 16);
- dp_mat1_3 = AE_ADDSP24S(dp_mat1_3, dp_mat1_zb);
-
- AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
- }
- if (cols1 & 1) {
- ae_p24x2s dp_mat1_01, dp_mat1_23;
- dp_mat1_01 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
- (UWORD8)p_mat1_1[c_itr]);
- dp_mat1_23 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[c_itr],
- (UWORD8)p_mat1_3[c_itr]);
- dp_vec1_0 = AE_MOVPA24(p_vec1_0[c_itr]);
- dp_mat1_01 = AE_SLLIP24(dp_mat1_01, 8);
- dp_mat1_23 = AE_SLLIP24(dp_mat1_23, 8);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
- dp_mat1_01 = AE_SRAIP24(dp_mat1_01, 16);
- dp_mat1_01 = AE_ADDSP24S(dp_mat1_01, dp_mat1_zb);
- dp_mat1_23 = AE_SRAIP24(dp_mat1_23, 16);
- dp_mat1_23 = AE_ADDSP24S(dp_mat1_23, dp_mat1_zb);
-
- AE_MULAP24S_HH(dq_acc[0], dp_mat1_01, dp_vec1_0);
- AE_MULAP24S_LL(dq_acc[1], dp_mat1_01, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc[2], dp_mat1_23, dp_vec1_0);
- AE_MULAP24S_LL(dq_acc[3], dp_mat1_23, dp_vec1_0);
- }
-
- dq_acc[0] = AE_SLLISQ56S(dq_acc[0], 16);
- dq_acc[1] = AE_SLLISQ56S(dq_acc[1], 16);
- dq_acc[2] = AE_SLLISQ56S(dq_acc[2], 16);
- dq_acc[3] = AE_SLLISQ56S(dq_acc[3], 16);
-
- if (p_bias != NULL) {
- for (i = 0; i < 4; i++)
- dq_acc[i] =
- AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
- }
-
- for (i = 0; i < 4; i++) {
- dq_out32 = AE_SATQ48S(dq_acc[i]);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift,
- right_shift);
- ADD_OUT_OFFSET_STORE_INT8(&p_out[(m_itr + i) * out_stride], dq_out,
- out_zero_bias);
- }
- }
- }
- for (; m_itr < rows; m_itr++) {
- p_mat1_0 = &p_mat1[m_itr * row_stride1];
- p_vec1_0 = p_vec1;
-
- dq_acc[0] = AE_ZEROQ56();
-
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- dp_mat1_0 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
- (UWORD8)p_mat1_0[c_itr + 1]);
- dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
- (UWORD8)p_vec1_0[c_itr + 1]);
- dp_mat1_0 = AE_SLLIP24(dp_mat1_0, 8);
- dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
-
- dp_mat1_0 = AE_SRAIP24(dp_mat1_0, 16);
- dp_mat1_0 = AE_ADDSP24S(dp_mat1_0, dp_mat1_zb);
-
- AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- }
- if (cols1 & 1) {
- dp_mat1_0 = AE_CVTP24A16(p_mat1_0[c_itr]);
- dp_vec1_0 = AE_CVTP24A16(p_vec1_0[c_itr]);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, AE_CVTP24A16(vec1_zero_bias));
-
- dp_mat1_0 = AE_SRAIP24(dp_mat1_0, 16);
- dp_mat1_0 = AE_ADDSP24S(dp_mat1_0, dp_mat1_zb);
-
- AE_MULAP24S_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- }
-
- dq_acc[0] = AE_SLLISQ56S(dq_acc[0], 16);
-
- if (p_bias != NULL)
- dq_acc[0] = AE_ADDSQ56S(dq_acc[0], *(ae_q32s *)(&p_bias[m_itr]));
-
- dq_out32 = AE_SATQ48S(dq_acc[0]);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift, right_shift);
- ADD_OUT_OFFSET_STORE_INT8(&p_out[m_itr * out_stride], dq_out,
- out_zero_bias);
- }
-#else
- return 1;
-#endif
- }
-
- return 0;
-}
-
-#define STORE_INT16(ptr, data) \
- { \
- int out_i32 = AE_TRUNCA32Q48(AE_SATQ48S(data)); \
- out_i32 = out_i32 < (int)0xffff8000L ? (int)0xffff8000L : out_i32; \
- out_i32 = out_i32 > (int)0x7fff ? (int)0x7fff : out_i32; \
- *(ptr) = (WORD16)out_i32; \
- }
-
-WORD32 xa_nn_matXvec_out_stride_sym8sxasym8s_16(
- WORD16 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
- const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
- WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
- WORD32 vec1_zero_bias, WORD32 out_multiplier, WORD32 out_shift) {
- /* NULL pointer checks */
- XA_NNLIB_ARG_CHK_PTR(p_out, -1);
- XA_NNLIB_ARG_CHK_PTR(p_mat1, -1);
- XA_NNLIB_ARG_CHK_PTR(p_vec1, -1);
- /* Pointer alignment checks */
- XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD16), -1);
- XA_NNLIB_ARG_CHK_ALIGN(p_bias, sizeof(WORD32), -1);
- /* Basic Parameter checks */
- XA_NNLIB_ARG_CHK_COND((rows <= 0), -1);
- XA_NNLIB_ARG_CHK_COND((cols1 <= 0), -1);
- XA_NNLIB_ARG_CHK_COND((row_stride1 < cols1), -1);
- XA_NNLIB_ARG_CHK_COND((vec1_zero_bias < -127 || vec1_zero_bias > 128), -1);
- XA_NNLIB_ARG_CHK_COND((out_shift < -31 || out_shift > 31), -1);
-
- /* Iterators used in for loops */
- int m_itr, c_itr, i;
- /* Assign initial value so this value will be used in trailing loop */
- m_itr = 0;
- /* Shifts to match with Tensorflow */
- int left_shift, right_shift;
-
- left_shift = out_shift < 0 ? 0 : out_shift;
- right_shift = out_shift > 0 ? 0 : -out_shift;
-
- const WORD8 *p_mat1_0, *p_mat1_1, *p_mat1_2, *p_mat1_3;
- const WORD8 *p_vec1_0;
- ae_p24x2s dp_mat1_0, dp_mat1_1, dp_mat1_2, dp_mat1_3, dp_vec1_0;
- ae_p24x2s dp_vec1_zb;
- ae_q56s dq_acc[4];
- ae_q56s dq_out32, dq_out;
-
- dp_vec1_zb = AE_MOVPA24(vec1_zero_bias);
- if (((((unsigned)p_mat1) & 1) == 0) && ((((unsigned)p_vec1) & 1) == 0) &&
- ((row_stride1 & 1) == 0)) {
- for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
- p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1 - 2];
- p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1 - 2];
- p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1 - 2];
- p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1 - 2];
- p_vec1_0 = p_vec1 - 2;
-
- dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
- /* AE_LP8X2F* instruction loads in upper 8 bits of P register, so shifting
- vector right by 16 to get multiplication result in middle 32 bits of Q
- register (lower 16 bits 0) */
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
- AE_LP8X2F_IU(dp_mat1_1, p_mat1_1, 2);
- AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
- AE_LP8X2F_IU(dp_mat1_3, p_mat1_3, 2);
- AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
- }
- /* Pointers are aligned so can do 8X2 loads and ignore L parts of
- * registers */
- if (cols1 & 1) {
- AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
- AE_LP8X2F_IU(dp_mat1_1, p_mat1_1, 2);
- AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
- AE_LP8X2F_IU(dp_mat1_3, p_mat1_3, 2);
- AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAP24S_HH(dq_acc[0], dp_mat1_0, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc[1], dp_mat1_1, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc[2], dp_mat1_2, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc[3], dp_mat1_3, dp_vec1_0);
- }
-
- if (p_bias != NULL) {
- for (i = 0; i < 4; i++)
- dq_acc[i] = AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
- }
-
- for (i = 0; i < 4; i++) {
- dq_out32 = AE_SATQ48S(dq_acc[i]);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift,
- right_shift);
- STORE_INT16(&p_out[(m_itr + i) * out_stride], dq_out);
- }
- }
- for (; m_itr < rows; m_itr++) {
- p_mat1_0 = &p_mat1[m_itr * row_stride1 - 2];
- p_vec1_0 = p_vec1 - 2;
-
- dq_acc[0] = AE_ZEROQ56();
-
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
- AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- }
- /* Pointers are aligned so can do 8X2 loads and ignore L parts of
- * registers */
- if (cols1 & 1) {
- AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
- AE_LP8X2F_IU(dp_vec1_0, p_vec1_0, 2);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAP24S_HH(dq_acc[0], dp_mat1_0, dp_vec1_0);
- }
-
- if (p_bias != NULL)
- dq_acc[0] = AE_ADDSQ56S(dq_acc[0], *(ae_q32s *)(&p_bias[m_itr]));
-
- dq_out32 = AE_SATQ48S(dq_acc[0]);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift, right_shift);
- STORE_INT16(&p_out[m_itr * out_stride], dq_out);
- }
- } else {
-#ifndef DISABLE_NNLIB_UNALIGNED_SUPPORT
- if ((((unsigned)p_mat1) & 1) == 0) {
- for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
- p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1 - 2];
- p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1];
- p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1 - 2];
- p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1];
- p_vec1_0 = p_vec1;
-
- dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
- /* Matrix elements are kept in upper 8 bits of P registers, vector
- elements are kept in lower 8 bits of P registers, typecasting to UWORD8
- is to avoid extra extui instructions since signed 8-bit load in not
- there in HiFiMini */
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- AE_LP8X2F_IU(dp_mat1_0, p_mat1_0, 2);
- dp_mat1_1 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_1[c_itr],
- (UWORD8)p_mat1_1[c_itr + 1]);
- AE_LP8X2F_IU(dp_mat1_2, p_mat1_2, 2);
- dp_mat1_3 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_3[c_itr],
- (UWORD8)p_mat1_3[c_itr + 1]);
- dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
- (UWORD8)p_vec1_0[c_itr + 1]);
- dp_mat1_1 = AE_SLLIP24(dp_mat1_1, 8);
- dp_mat1_3 = AE_SLLIP24(dp_mat1_3, 8);
- dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
- }
- if (cols1 & 1) {
- ae_p24x2s dp_mat1_01, dp_mat1_23;
- dp_mat1_01 =
- AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[2], (UWORD8)p_mat1_1[c_itr]);
- dp_mat1_23 =
- AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[2], (UWORD8)p_mat1_3[c_itr]);
- dp_vec1_0 = AE_MOVPA24(p_vec1_0[c_itr]);
- dp_mat1_01 = AE_SLLIP24(dp_mat1_01, 8);
- dp_mat1_23 = AE_SLLIP24(dp_mat1_23, 8);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAP24S_HH(dq_acc[0], dp_mat1_01, dp_vec1_0);
- AE_MULAP24S_LL(dq_acc[1], dp_mat1_01, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc[2], dp_mat1_23, dp_vec1_0);
- AE_MULAP24S_LL(dq_acc[3], dp_mat1_23, dp_vec1_0);
- }
-
- if (p_bias != NULL) {
- for (i = 0; i < 4; i++)
- dq_acc[i] =
- AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
- }
-
- for (i = 0; i < 4; i++) {
- dq_out32 = AE_SATQ48S(dq_acc[i]);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift,
- right_shift);
- STORE_INT16(&p_out[(m_itr + i) * out_stride], dq_out);
- }
- }
- } else {
- for (m_itr = 0; m_itr < (rows - 3); m_itr += 4) {
- p_mat1_0 = &p_mat1[(m_itr + 0) * row_stride1];
- p_mat1_1 = &p_mat1[(m_itr + 1) * row_stride1];
- p_mat1_2 = &p_mat1[(m_itr + 2) * row_stride1];
- p_mat1_3 = &p_mat1[(m_itr + 3) * row_stride1];
- p_vec1_0 = p_vec1;
-
- dq_acc[0] = dq_acc[1] = dq_acc[2] = dq_acc[3] = AE_ZEROQ56();
-
- /* Matrix elements are kept in upper 8 bits of P registers, vector
- elements are kept in lower 8 bits of P registers, typecasting to UWORD8
- is to avoid extra extui instructions since signed 8-bit load in not
- there in HiFiMini */
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- dp_mat1_0 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
- (UWORD8)p_mat1_0[c_itr + 1]);
- dp_mat1_1 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_1[c_itr],
- (UWORD8)p_mat1_1[c_itr + 1]);
- dp_mat1_2 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[c_itr],
- (UWORD8)p_mat1_2[c_itr + 1]);
- dp_mat1_3 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_3[c_itr],
- (UWORD8)p_mat1_3[c_itr + 1]);
- dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
- (UWORD8)p_vec1_0[c_itr + 1]);
- dp_mat1_0 = AE_SLLIP24(dp_mat1_0, 8);
- dp_mat1_1 = AE_SLLIP24(dp_mat1_1, 8);
- dp_mat1_2 = AE_SLLIP24(dp_mat1_2, 8);
- dp_mat1_3 = AE_SLLIP24(dp_mat1_3, 8);
- dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[1], dp_mat1_1, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[2], dp_mat1_2, dp_vec1_0);
- AE_MULAAP24S_HH_LL(dq_acc[3], dp_mat1_3, dp_vec1_0);
- }
- if (cols1 & 1) {
- ae_p24x2s dp_mat1_01, dp_mat1_23;
- dp_mat1_01 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
- (UWORD8)p_mat1_1[c_itr]);
- dp_mat1_23 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_2[c_itr],
- (UWORD8)p_mat1_3[c_itr]);
- dp_vec1_0 = AE_MOVPA24(p_vec1_0[c_itr]);
- dp_mat1_01 = AE_SLLIP24(dp_mat1_01, 8);
- dp_mat1_23 = AE_SLLIP24(dp_mat1_23, 8);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAP24S_HH(dq_acc[0], dp_mat1_01, dp_vec1_0);
- AE_MULAP24S_LL(dq_acc[1], dp_mat1_01, dp_vec1_0);
- AE_MULAP24S_HH(dq_acc[2], dp_mat1_23, dp_vec1_0);
- AE_MULAP24S_LL(dq_acc[3], dp_mat1_23, dp_vec1_0);
- }
-
- if (p_bias != NULL) {
- for (i = 0; i < 4; i++)
- dq_acc[i] =
- AE_ADDSQ56S(dq_acc[i], *(ae_q32s *)(&p_bias[m_itr + i]));
- }
-
- for (i = 0; i < 4; i++) {
- dq_out32 = AE_SATQ48S(dq_acc[i]);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift,
- right_shift);
- STORE_INT16(&p_out[(m_itr + i) * out_stride], dq_out);
- }
- }
- }
- for (; m_itr < rows; m_itr++) {
- p_mat1_0 = &p_mat1[m_itr * row_stride1];
- p_vec1_0 = p_vec1;
-
- dq_acc[0] = AE_ZEROQ56();
-
- for (c_itr = 0; c_itr < (cols1 - 1); c_itr += 2) {
- dp_mat1_0 = AE_CVTP24A16X2_LL((UWORD8)p_mat1_0[c_itr],
- (UWORD8)p_mat1_0[c_itr + 1]);
- dp_vec1_0 = AE_CVTP24A16X2_LL((UWORD8)p_vec1_0[c_itr],
- (UWORD8)p_vec1_0[c_itr + 1]);
- dp_mat1_0 = AE_SLLIP24(dp_mat1_0, 8);
- dp_vec1_0 = AE_SLLIP24(dp_vec1_0, 8);
- dp_vec1_0 = AE_SRAIP24(dp_vec1_0, 16);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, dp_vec1_zb);
- AE_MULAAP24S_HH_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- }
- if (cols1 & 1) {
- dp_mat1_0 = AE_CVTP24A16(p_mat1_0[c_itr]);
- dp_vec1_0 = AE_CVTP24A16(p_vec1_0[c_itr]);
- dp_vec1_0 = AE_ADDSP24S(dp_vec1_0, AE_CVTP24A16(vec1_zero_bias));
- AE_MULAP24S_LL(dq_acc[0], dp_mat1_0, dp_vec1_0);
- }
-
- if (p_bias != NULL)
- dq_acc[0] = AE_ADDSQ56S(dq_acc[0], *(ae_q32s *)(&p_bias[m_itr]));
-
- dq_out32 = AE_SATQ48S(dq_acc[0]);
- MULTIPLY_BY_QUANTIZED_MULTIPLIER(dq_out, AE_TRUNCA32Q48(dq_out32),
- out_multiplier, left_shift, right_shift);
- STORE_INT16(&p_out[m_itr * out_stride], dq_out);
- }
-#else
- return 1;
-#endif
- }
-
- return 0;
-}
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_api.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_api.h
deleted file mode 100644
index e499e1e..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_api.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_NNLIB_API_H__
-#define __XA_NNLIB_API_H__
-
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/xa_type_def.h"
-
-#endif /* __XA_NNLIB_API_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h
deleted file mode 100644
index d3a5e29..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h
+++ /dev/null
@@ -1,300 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_NNLIB_KERNELS_API_H__
-#define __XA_NNLIB_KERNELS_API_H__
-
-/**
- * @file xa_nnlib_kernels_api.h
- * @brief This file gives the API definition for the HiFi NNLIB
- *
- * matXvec KERNELS API NAMING CONVENTION <br>
- * <br>
- * xa_nn_matXvec_<batch>_[m]x[n]_[p]_<activation>, where
- * - <batch>: Optional 'batch' tag to indicate time batching routine
- * - [m]: Matrix precision in bits
- * - [n]: Vector (and bias for non-activation routines) precision in bits
- * - [p]: Output precision in bits
- * - <activation>: optional activation tag 'sigmoid' / 'tanh'
- *
- * These set of kernels perform dual matXvec followed by optional
- * activation function. There are several variants based on the input,
- * output precision and use of activation functions.
- *
- * Restriction,
- * - All pointers (p_out, p_mat1, p_mat2, p_vec1, p_vec2, p_bias, p_scratch)
- * must be SIMD (64-bit) aligned and should not overlap.
- * - p_mat2, p_vec2 can be 'NULL', but other pointers cannot be 'NULL'
- * - Variables cols1, cols2, row_stride1, row_stride2 must be multiple of 4
- *
- * Usage of few critical variables,
- * - acc_shift:
- * -# In case of valid activation tag i.e. <activation>: shift to be
- * applied on accumulator to match accumulator's Q format with activation
- * function's input's Q format
- * -# In case of bypass i.e. no activation tag: shift to be applied on
- * accumulator.
- * -# Positive value denotes left shift, and negative value denotes right
- * shift.
- * - bias_shift: shift which is to be applied on bias to match bias's
- * Q format with accumulator's Q format. Positive value denotes left shift,
- * and negative value denotes right shift.
- * - bias_precision: This represents bias precision
- * -# For 16x16, and 8x16 apis, valid values are '16' and '64'
- * -# For 8x8 apis, valid values are '8' and '32'
- *
- * Output 8b, 16b, 32b of fixed point apis (only for bypass variants) is
- * extracted from 64b accumulator with symmetric rounding. Output 64b of fixed
- * point apis (only for bypass variants) is extracted from 64b accumulator.
- * Output 8b, 16b of fixed point apis (only for activation variants) is
- * symmetrically rounded.
- *
- * matXvec 16x16 Kernels,
- * - Bypass kernels with 16, 32, 64 bit output: 3
- * - Fused kernel with 2 activation variants: 2
- * - Time batching kernel: 1 (Not implemented)
- * - Total: 6
- *
- * matXvec 8x16 Kernels,
- * - Bypass kernels with 16, 32, 64 bit output: 3
- * - Fused kernel with 2 activation variants: 2
- * - Time batching kernel: 1 (Not implemented)
- * - Total: 6
- *
- * matXvec 8x8 Kernels,
- * - Bypass kernels with 8, 16, 32 bit output: 3
- * - Fused kernel with 2 activation variants: 2
- * - Time batching kernel: 1 (Not implemented)
- * - Total: 6
- *
- * matXvec float32 x float32 Kernels,
- * - Bypass kernels 32 bit output: 1
- * - Fused kernel with 2 activation variants: 2
- * - Time batching kernel: 1 (Not implemented)
- * - Total: 4
- *
- * ACTIVATION KERNELS API NAMING CONVENTION <br>
- * <br>
- * xa_nn_vec_[activation]_[n]_[p] for fixed point <br>
- * xa_nn_vec_[activation]_f32_f32 for floating point, where
- * - [activation]: One of activations - sigmoid/tanh/relu/relu1/relu6/softmax
- * - [n]: Input precision in bits
- * - [p]: Output precision in bits
- *
- * Possible values,
- * - 'n' takes value '32', and expects input in Q6.25 format.
- * - 'p' takes values '32' and '16', gives output in Q16.15 and Q0.15 formats
- * respectively.
- *
- * There is WORD32 datatype variable 'threshold' for 'relu' related apis, which
- * expects value in Q16.15 format.
- *
- * Restriction,
- * - All pointers (p_out, p_vec) must be 32-bit aligned and should not overlap.
- *
- * activation 32_32 kernels,
- * - Vector activation kernels: 6
- * - Total: 6
- *
- * activation f32_f32 kernels,
- * - Vector activation kernels: 6
- * - Total: 6
- *
- * activation 32_16 kernels,
- * - Vector activation kernels: 2
- * - Total: 2
- */
-
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/xa_type_def.h"
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-WORD32 xa_nn_conv2d_depthwise_getsize(
- WORD32 input_height, WORD32 input_width, WORD32 input_channels,
- WORD32 kernel_height, WORD32 kernel_width, WORD32 channels_multiplier,
- WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
- WORD32 output_height, WORD32 output_width, WORD32 circ_buf_precision,
- WORD32 inp_data_format);
-
-WORD32 xa_nn_vec_activation_min_max_asym8u_asym8u(
- UWORD8 *__restrict__ p_out, const UWORD8 *__restrict__ p_vec,
- int activation_min, int activation_max, WORD32 vec_length);
-
-WORD32 xa_nn_vec_activation_min_max_asym8s_asym8s(
- WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_vec,
- int activation_min, int activation_max, WORD32 vec_length);
-
-WORD32 xa_nn_conv2d_std_getsize(WORD32 input_height, WORD32 input_channels,
- WORD32 kernel_height, WORD32 kernel_width,
- WORD32 y_stride, WORD32 y_padding,
- WORD32 out_height, WORD32 input_precision);
-
-WORD32 xa_nn_conv2d_std_asym8uxasym8u(
- UWORD8 *__restrict__ p_out, const UWORD8 *__restrict__ p_inp,
- const UWORD8 *__restrict__ p_kernel, const WORD32 *__restrict__ p_bias,
- WORD32 input_height, WORD32 input_width, WORD32 input_channels,
- WORD32 kernel_height, WORD32 kernel_width, WORD32 out_channels,
- WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
- WORD32 out_height, WORD32 out_width, WORD32 input_zero_bias,
- WORD32 kernel_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
- WORD32 out_zero_bias, WORD32 out_data_format, VOID *p_scratch);
-
-WORD32 xa_nn_conv2d_std_per_chan_sym8sxasym8s(
- WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_inp,
- const WORD8 *__restrict__ p_kernel, const WORD32 *__restrict__ p_bias,
- WORD32 input_height, WORD32 input_width, WORD32 input_channels,
- WORD32 kernel_height, WORD32 kernel_width, WORD32 out_channels,
- WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
- WORD32 out_height, WORD32 out_width, WORD32 input_zero_bias,
- WORD32 *p_out_multiplier, WORD32 *p_out_shift, WORD32 out_zero_bias,
- WORD32 out_data_format, VOID *p_scratch);
-
-WORD32 xa_nn_conv2d_depthwise_asym8uxasym8u(
- pUWORD8 __restrict__ p_out, const UWORD8 *__restrict__ p_kernel,
- const UWORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
- WORD32 input_height, WORD32 input_width, WORD32 input_channels,
- WORD32 kernel_height, WORD32 kernel_width, WORD32 channels_multiplier,
- WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
- WORD32 out_height, WORD32 out_width, WORD32 input_zero_bias,
- WORD32 kernel_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
- WORD32 out_zero_bias, WORD32 inp_data_format, WORD32 out_data_format,
- pVOID p_scratch);
-
-WORD32 xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
- WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_kernel,
- const WORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
- WORD32 input_height, WORD32 input_width, WORD32 input_channels,
- WORD32 kernel_height, WORD32 kernel_width, WORD32 channels_multiplier,
- WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
- WORD32 out_height, WORD32 out_width, WORD32 input_zero_bias,
- const WORD32 *p_out_multiplier, const WORD32 *p_out_shift,
- WORD32 out_zero_bias, WORD32 inp_data_format, WORD32 out_data_format,
- pVOID p_scratch);
-
-WORD32 xa_nn_fully_connected_asym8uxasym8u_asym8u(
- pUWORD8 __restrict__ p_out, const UWORD8 *__restrict__ p_weight,
- const UWORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
- WORD32 weight_depth, WORD32 out_depth, WORD32 input_zero_bias,
- WORD32 weight_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
- WORD32 out_zero_bias);
-
-WORD32 xa_nn_fully_connected_sym8sxasym8s_asym8s(
- pWORD8 __restrict__ p_out, const WORD8 *__restrict__ p_weight,
- const WORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
- WORD32 weight_depth, WORD32 out_depth, WORD32 input_zero_bias,
- WORD32 out_multiplier, WORD32 out_shift, WORD32 out_zero_bias);
-
-WORD32 xa_nn_fully_connected_asym8sxasym8s_asym8s(
- WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_weight,
- const WORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
- WORD32 weight_depth, WORD32 out_depth, WORD32 weight_zero_bias,
- WORD32 input_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
- WORD32 out_zero_bias);
-
-WORD32 xa_nn_vec_softmax_asym8u_8(UWORD8 *__restrict__ p_out,
- const UWORD8 *__restrict__ p_vec,
- WORD32 diffmin, WORD32 input_left_shift,
- WORD32 input_multiplier, WORD32 vec_length,
- pVOID p_scratch);
-
-WORD32 xa_nn_vec_softmax_asym8s_16(WORD16 *__restrict__ p_out,
- const WORD8 *__restrict__ p_vec,
- WORD32 diffmin, WORD32 input_left_shift,
- WORD32 input_multiplier, WORD32 vec_length,
- pVOID p_scratch);
-
-WORD32 xa_nn_vec_softmax_asym8s_8(WORD8 *__restrict__ p_out,
- const WORD8 *__restrict__ p_vec,
- WORD32 diffmin, WORD32 input_left_shift,
- WORD32 input_multiplier, WORD32 vec_length,
- pVOID p_scratch);
-
-int xa_nn_get_softmax_scratch_size(int inp_precision, int out_precision,
- int length);
-
-WORD32 xa_nn_matXvec_out_stride_asym8uxasym8u_asym8u(
- UWORD8 *__restrict__ p_out, const UWORD8 *__restrict__ p_mat1,
- const UWORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
- WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
- WORD32 mat1_zero_bias, WORD32 vec1_zero_bias, WORD32 out_multiplier,
- WORD32 out_shift, WORD32 out_zero_bias);
-
-WORD32 xa_nn_matXvec_out_stride_sym8sxasym8s_asym8s(
- WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
- const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
- WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
- WORD32 vec1_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
- WORD32 out_zero_bias);
-
-WORD32 xa_nn_matXvec_out_stride_asym8sxasym8s_asym8s(
- WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
- const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
- WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
- WORD32 mat1_zero_bias, WORD32 vec1_zero_bias, WORD32 out_multiplier,
- WORD32 out_shift, WORD32 out_zero_bias);
-
-WORD32 xa_nn_matXvec_out_stride_sym8sxasym8s_16(
- WORD16 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
- const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
- WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
- WORD32 vec1_zero_bias, WORD32 out_multiplier, WORD32 out_shift);
-
-WORD32 xa_nn_dot_prod_16x16_asym8s(
- WORD8 *__restrict__ p_out, /* pointer to output */
- const WORD16 *__restrict__ p_inp1_start, /* pointer to input1 */
- const WORD16 *__restrict__ p_inp2_start, /* pointer to input2 */
- const WORD32 *bias_ptr, WORD32 vec_length, WORD32 out_multiplier,
- WORD32 out_shift, WORD32 out_zero_bias, WORD32 vec_count);
-
-/* Mapping the function names from the previous naming convention for backward
- * compatibility */
-#define xa_nn_vec_activation_min_max_asym8_asym8 \
- xa_nn_vec_activation_min_max_asym8u_asym8u
-#define xa_nn_conv2d_std_asym8xasym8 xa_nn_conv2d_std_asym8uxasym8u
-#define xa_nn_conv2d_depthwise_asym8xasym8 xa_nn_conv2d_depthwise_asym8uxasym8u
-#define xa_nn_fully_connected_asym8xasym8_asym8 \
- xa_nn_fully_connected_asym8uxasym8u_asym8u
-#define xa_nn_vec_softmax_asym8_asym8 xa_nn_vec_softmax_asym8u_asym8u
-#define xa_nn_dot_prod_asym8xasym8_asym8 xa_nn_dot_prod_asym8uxasym8u_asym8u
-#define xa_nn_matXvec_out_stride_asym8xasym8_asym8 \
- xa_nn_matXvec_out_stride_asym8uxasym8u_asym8u
-
-#if defined(__cplusplus)
-}
-#endif
-#endif /* __XA_NNLIB_KERNELS_API_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_standards.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_standards.h
deleted file mode 100644
index 36ea75d..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_standards.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __STANDARDS_H__
-#define __STANDARDS_H__
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-typedef double flt64;
-typedef char Int4;
-typedef char Int8;
-typedef int16_t Int16;
-typedef int Int32;
-typedef int Int24;
-typedef int64_t Int64;
-typedef int Bool;
-typedef float Flt32;
-
-#ifdef MODEL_FLT64
-typedef double vect_t;
-typedef double coeff_t;
-typedef double accu_t;
-
-#elif MODEL_INT16
-typedef int16_t vect_t;
-typedef int16_t coeff_t;
-typedef signed char coeff8_t;
-typedef int64_t accu_t;
-typedef float coefff32_t;
-#endif
-
-typedef struct xa_nnlib_opaque {
- Int32 _;
-} * xa_nnlib_handle_t;
-
-typedef enum _xa_nnlib_prec_t {
- PREC_8 = 8,
- PREC_16 = 16,
- PREC_32 = 32,
- PREC_F32 = -1,
- PREC_F16 = -2,
- PREC_ASYM8U = -3,
- PREC_ASYM8S = -4,
- PREC_SYM8S = -5
-} xa_nnlib_prec_t;
-
-typedef enum _xa_nnlib_shape_type_t {
- SHAPE_UNKNOWN_T = 0,
- SHAPE_VECTOR_T = 1,
- SHAPE_MATRIX_T = 2,
- SHAPE_CUBE_DWH_T = 3,
- SHAPE_CUBE_WHD_T = 4
-} xa_nnlib_shape_type_t;
-
-typedef struct _xa_nnlib_shape_t {
- xa_nnlib_shape_type_t shape_type;
- Int32 n_shapes;
- Int32 shape_offset; // Offset between current shape and next shape
- union {
- struct {
- Int32 height;
- Int32 height_offset;
- Int32 width;
- Int32 width_offset;
- Int32 depth;
- Int32 depth_offset;
- } cube;
-
- struct {
- Int32 length;
- } vector;
- struct {
- Int32 rows;
- Int32 row_offset; // Offset between current row and next row
- Int32 cols;
- } matrix;
- } dim;
-} xa_nnlib_shape_t;
-
-/*****************************************************************************/
-/* Constant hash defines */
-/*****************************************************************************/
-#define XA_NNLIB_NO_ERROR 0
-/* error handling 'AND' definition */
-#define XA_FATAL_ERROR 0x80000000
-
-enum xa_error_severity {
- xa_severity_nonfatal = 0,
- xa_severity_fatal = (int)0xffffffff
-};
-
-enum xa_error_class {
- xa_class_nnlib = 0,
- xa_class_config = 1,
- xa_class_execute = 2
-};
-
-#define XA_NNLIB_GENERIC 0
-
-#define XA_ERROR_CODE(severity, class, codec, index) \
- (((severity) << 31) | ((class) << 12) | ((codec) << 7) | (index))
-#define XA_ERROR_SEVERITY(code) (((code)&XA_FATAL_ERROR) != 0)
-#define XA_ERROR_CLASS(code) (((code) >> 12) & 0x0f)
-#define XA_ERROR_CODEC(code) (((code) >> 7) & 0x1f)
-#define XA_ERROR_SUBCODE(code) (((code) >> 0) & 0x3f)
-
-/* Our convention is that only nnlib-class errors can be generic ones. */
-
-/*****************************************************************************/
-/* Class 0: NNLib Errors */
-/*****************************************************************************/
-/* Non Fatal Errors */
-/* (none) */
-/* Fatal Errors */
-enum xa_error_fatal_nnlib_generic {
- XA_NNLIB_FATAL_MEM_ALLOC =
- XA_ERROR_CODE(xa_severity_fatal, xa_class_nnlib, XA_NNLIB_GENERIC, 0),
- XA_NNLIB_FATAL_MEM_ALIGN =
- XA_ERROR_CODE(xa_severity_fatal, xa_class_nnlib, XA_NNLIB_GENERIC, 1),
- XA_NNLIB_FATAL_INVALID_SHAPE =
- XA_ERROR_CODE(xa_severity_fatal, xa_class_nnlib, XA_NNLIB_GENERIC, 3)
-};
-
-/*****************************************************************************/
-/* NNLib Startup Functions */
-/*****************************************************************************/
-const Int8* xa_nnlib_get_lib_name_string(void);
-const Int8* xa_nnlib_get_lib_version_string(void);
-const Int8* xa_nnlib_get_lib_api_version_string(void);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* __STANDARDS_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/xa_type_def.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/xa_type_def.h
deleted file mode 100644
index 13a7469..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/xa_type_def.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2019-2020 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XA_TYPE_DEF_H__
-#define __XA_TYPE_DEF_H__
-
-#include <stdint.h>
-
-/****************************************************************************/
-/* types type define prefix examples bytes */
-/************************ *********** ****** **************** ***** */
-typedef signed char WORD8; /* b WORD8 b_name 1 */
-typedef signed char* pWORD8; /* pb pWORD8 pb_name 1 */
-typedef unsigned char UWORD8; /* ub UWORD8 ub_count 1 */
-typedef unsigned char* pUWORD8; /* pub pUWORD8 pub_count 1 */
-
-typedef int16_t WORD16; /* s WORD16 s_count 2 */
-typedef int16_t* pWORD16; /* ps pWORD16 ps_count 2 */
-typedef uint16_t UWORD16; /* us UWORD16 us_count 2 */
-typedef uint16_t* pUWORD16; /* pus pUWORD16 pus_count 2 */
-
-typedef signed int WORD24; /* k WORD24 k_count 3 */
-typedef signed int* pWORD24; /* pk pWORD24 pk_count 3 */
-typedef unsigned int UWORD24; /* uk UWORD24 uk_count 3 */
-typedef unsigned int* pUWORD24; /* puk pUWORD24 puk_count 3 */
-
-typedef signed int WORD32; /* i WORD32 i_count 4 */
-typedef signed int* pWORD32; /* pi pWORD32 pi_count 4 */
-typedef unsigned int UWORD32; /* ui UWORD32 ui_count 4 */
-typedef unsigned int* pUWORD32; /* pui pUWORD32 pui_count 4 */
-
-typedef int64_t WORD40; /* m WORD40 m_count 5 */
-typedef int64_t* pWORD40; /* pm pWORD40 pm_count 5 */
-typedef uint64_t UWORD40; /* um UWORD40 um_count 5 */
-typedef uint64_t* pUWORD40; /* pum pUWORD40 pum_count 5 */
-
-typedef int64_t WORD64; /* h WORD64 h_count 8 */
-typedef int64_t* pWORD64; /* ph pWORD64 ph_count 8 */
-typedef uint64_t UWORD64; /* uh UWORD64 uh_count 8 */
-typedef uint64_t* pUWORD64; /* puh pUWORD64 puh_count 8 */
-
-typedef float FLOAT32; /* f FLOAT32 f_count 4 */
-typedef float* pFLOAT32; /* pf pFLOAT32 pf_count 4 */
-typedef double FLOAT64; /* d FLOAT64 d_count 8 */
-typedef double* pFlOAT64; /* pd pFLOAT64 pd_count 8 */
-
-typedef void VOID; /* v VOID v_flag 4 */
-typedef void* pVOID; /* pv pVOID pv_flag 4 */
-
-/* variable size types: platform optimized implementation */
-typedef signed int BOOL; /* bool BOOL bool_true */
-typedef unsigned int UBOOL; /* ubool UBOOL ubool_true */
-typedef signed int FLAG; /* flag FLAG flag_false */
-typedef unsigned int UFLAG; /* uflag UFLAG uflag_false */
-typedef signed int LOOPIDX; /* lp LOOPIDX lp_index */
-typedef unsigned int ULOOPIDX; /* ulp ULOOPIDX ulp_index */
-typedef signed int WORD; /* lp LOOPIDX lp_index */
-typedef unsigned int UWORD; /* ulp SLOOPIDX ulp_index */
-
-typedef LOOPIDX LOOPINDEX; /* lp LOOPIDX lp_index */
-typedef ULOOPIDX ULOOPINDEX; /* ulp SLOOPIDX ulp_index */
-
-#define PLATFORM_INLINE __inline
-
-typedef struct xa_codec_opaque {
- WORD32 _;
-} * xa_codec_handle_t;
-
-typedef int XA_ERRORCODE;
-
-typedef XA_ERRORCODE xa_codec_func_t(xa_codec_handle_t p_xa_module_obj,
- WORD32 i_cmd, WORD32 i_idx,
- pVOID pv_value);
-
-#endif /* __XA_TYPE_DEF_H__ */
diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xtensa_tf_micro_common.h b/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xtensa_tf_micro_common.h
deleted file mode 100644
index 81847b6..0000000
--- a/tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xtensa_tf_micro_common.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/******************************************************************************
- * Copyright (C) 2019 Cadence Design Systems, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to use this Software with Cadence processor cores only and
- * not with any other processors and platforms, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- ******************************************************************************/
-
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef __XTENSA_TF_MICRO_COMMON__
-#define __XTENSA_TF_MICRO_COMMON__
-
-#if defined HIFI_NNLIB_OPT || defined HIFI_MINI_NNLIB_OPT
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_api.h"
-#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/nnlib/xa_nnlib_standards.h"
-
-#define CHECK_ERR_HIFI_NNLIB_KER(ret, err_msg) \
- if (ret != 0) { \
- TF_LITE_KERNEL_LOG(context, err_msg); \
- return kTfLiteError; \
- }
-
-#ifndef XTENSA_NNLIB_MAX_SCRATCH_SIZE
-#define XTENSA_NNLIB_MAX_SCRATCH_SIZE (70 * 1024)
-#endif
-
-#define ALLOCATE_XTENSA_NNLIB_SCRATCH_MEM \
- uint8_t xtensa_nnlib_scratch_buf[XTENSA_NNLIB_MAX_SCRATCH_SIZE];
-
-#define MIN(a, b) (((a) < (b)) ? (a) : (b))
-#define MAX(a, b) (((a) > (b)) ? (a) : (b))
-
-#define ACTIVATION_MIN_MAX(data_type, out, inp, min, max) \
- { \
- data_type temp = MAX(inp, min); \
- out = MIN(temp, max); \
- }
-
-#define ACTIVATION_MIN_MAX_F32(out, inp, min, max) \
- { \
- float temp = MAX(inp, min); \
- out = MIN(temp, max); \
- }
-
-#define ACTIVATION_MIN_MAX_ASYM8(out, inp, min, max) \
- { \
- int32_t temp = MAX((int32_t)inp, min); \
- out = (uint8_t)MIN(temp, max); \
- }
-
-#define ALIGNED_SIZE(x, bytes) (((x) + ((bytes)-1)) & (~((bytes)-1)))
-#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + ((bytes)-1)) & (~((bytes)-1)))
-
-#define PRINT_VAR(var) \
- printf("%s = %d\n", #var, var); \
- fflush(stdout); \
- fflush(stderr);
-
-#endif /* HIFI_NNLIB_OPT */
-
-#endif /* __XTENSA_TF_MICRO_COMMON__ */
diff --git a/tensorflow/lite/micro/tools/make/ext_libs/xtensa_hifimini_staging_nn_library.inc b/tensorflow/lite/micro/tools/make/ext_libs/xtensa_hifimini_staging_nn_library.inc
deleted file mode 100644
index df7d308..0000000
--- a/tensorflow/lite/micro/tools/make/ext_libs/xtensa_hifimini_staging_nn_library.inc
+++ /dev/null
@@ -1,30 +0,0 @@
-ifneq ($(filter xtensa_hifimini_staging, $(ALL_TAGS)),)
-
- XTENSA_PATH = $(MAKEFILE_DIR)/../../kernels/xtensa_hifimini_staging
-
- ifneq (,$(filter xtensa_hifimini%, $(ALL_TAGS)))
-
- CCFLAGS += -DHIFI_MINI_NNLIB_OPT \
- -DDISABLE_NNLIB_UNALIGNED_SUPPORT \
- -DXTENSA_NNLIB_MAX_SCRATCH_SIZE=1024
-
- CXXFLAGS += -DHIFI_MINI_NNLIB_OPT \
- -DDISABLE_NNLIB_UNALIGNED_SUPPORT \
- -DXTENSA_NNLIB_MAX_SCRATCH_SIZE=1024
-
- MICROLITE_CC_SRCS += \
- $(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_activations_asym8s_asym8s.c \
- $(XTENSA_PATH)/xa_nnlib/algo/kernels/activations/hifi_mini/xa_nn_softmax_asym8_asym8.c \
- $(XTENSA_PATH)/xa_nnlib/algo/kernels/basic/hifi_mini/xa_nn_dot_prod_16x16.c \
- $(XTENSA_PATH)/xa_nnlib/algo/kernels/fc/hifi_mini/xa_nn_fully_connected.c \
- $(XTENSA_PATH)/xa_nnlib/algo/kernels/matXvec/hifi_mini/xa_nn_matXvec_sym8sxasym8s.c \
-
-
- INCLUDES += -I$(XTENSA_PATH)/xa_nnlib/algo/kernels/ \
- -I$(XTENSA_PATH)/xa_nnlib/include/nnlib/ \
- -I$(XTENSA_PATH)/xa_nnlib/include/ \
- -I$(XTENSA_PATH)/xa_nnlib/algo/common/include/ \
-
- endif
-
-endif
diff --git a/tensorflow/lite/micro/tools/make/targets/xtensa_hifimini_staging_makefile.inc b/tensorflow/lite/micro/tools/make/targets/xtensa_hifimini_staging_makefile.inc
deleted file mode 100644
index 557b8f6..0000000
--- a/tensorflow/lite/micro/tools/make/targets/xtensa_hifimini_staging_makefile.inc
+++ /dev/null
@@ -1,62 +0,0 @@
-# Settings for Xtensa toolchain for the hifimini kernels.
-# REQUIRED:
-# Environment variables:
-# - XTENSA_BASE must be set to location of
-# the Xtensa developer tools installation directory.
-# Command line arguments:
-# - XTENSA_TOOLS_VERSION: For example: RI-2019.2-linux
-# - XTENSA_CORE: The name of the Xtensa core to use
-# For example: hifimini
-
-ifeq ($(TARGET), xtensa_hifimini_staging)
- TARGET_ARCH := xtensa_hifimini_staging
-
- ifndef XTENSA_BASE
- $(error XTENSA_BASE is undefined)
- endif
-
- ifndef XTENSA_TOOLS_VERSION
- $(error XTENSA_TOOLS_VERSION is undefined)
- endif
-
- ifndef XTENSA_CORE
- $(error XTENSA_CORE is undefined)
- endif
-
- PLATFORM_ARGS = \
- -DTF_LITE_MCU_DEBUG_LOG \
- --xtensa-core=$(XTENSA_CORE) \
- -mcoproc \
- -DXTENSA -DMAX_RFFT_PWR=9 -DMIN_RFFT_PWR=MAX_RFFT_PWR \
- -fdata-sections \
- -ffunction-sections \
- -fno-exceptions \
- -fno-unwind-tables \
- -fno-use-cxa-atexit \
- -fmessage-length=0 \
- -fno-threadsafe-statics
-
- export PATH := $(XTENSA_BASE)/tools/$(XTENSA_TOOLS_VERSION)/XtensaTools/bin:$(PATH)
- TARGET_TOOLCHAIN_PREFIX := xt-
- CXX_TOOL := clang++
- CC_TOOL := clang
-
- CXXFLAGS += $(PLATFORM_ARGS)
- CCFLAGS += $(PLATFORM_ARGS)
-
- LDFLAGS += -Wl,-gc-sections
-
- TEST_SCRIPT := tensorflow/lite/micro/testing/test_xtensa_hifimini_staging_binary.sh
-
- # TODO(b/156962140): This manually maintained list of excluded examples is
- # quite error prone.
- EXCLUDED_EXAMPLE_TESTS := \
- tensorflow/lite/micro/examples/image_recognition_experimental/Makefile.inc \
- tensorflow/lite/micro/examples/magic_wand/Makefile.inc \
- tensorflow/lite/micro/examples/micro_speech/Makefile.inc \
- tensorflow/lite/micro/examples/network_tester/Makefile.inc \
- tensorflow/lite/micro/examples/person_detection/Makefile.inc \
- tensorflow/lite/micro/examples/person_detection_experimental/Makefile.inc
- MICRO_LITE_EXAMPLE_TESTS := $(filter-out $(EXCLUDED_EXAMPLE_TESTS), $(MICRO_LITE_EXAMPLE_TESTS))
-
-endif
diff --git a/tensorflow/lite/micro/xtensa_hifimini_staging/debug_log.cc b/tensorflow/lite/micro/xtensa_hifimini_staging/debug_log.cc
deleted file mode 100644
index 45d9317..0000000
--- a/tensorflow/lite/micro/xtensa_hifimini_staging/debug_log.cc
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Reference implementation of the DebugLog() function that's required for a
-// platform to support the TensorFlow Lite for Microcontrollers library. This is
-// the only function that's absolutely required to be available on a target
-// device, since it's used for communicating test results back to the host so
-// that we can verify the implementation is working correctly.
-// It's designed to be as easy as possible to supply an implementation though.
-// On platforms that have a POSIX stack or C library, it can be written as a
-// single call to `fprintf(stderr, "%s", s)` to output a string to the error
-// stream of the console, but if there's no OS or C library available, there's
-// almost always an equivalent way to write out a string to some serial
-// interface that can be used instead. For example on Arm M-series MCUs, calling
-// the `bkpt #0xAB` assembler instruction will output the string in r1 to
-// whatever debug serial connection is available. If you're running mbed, you
-// can do the same by creating `Serial pc(USBTX, USBRX)` and then calling
-// `pc.printf("%s", s)`.
-// To add an equivalent function for your own platform, create your own
-// implementation file, and place it in a subfolder named after the OS
-// you're targeting. For example, see the Cortex M bare metal version in
-// tensorflow/lite/micro/bluepill/debug_log.cc or the mbed one on
-// tensorflow/lite/micro/mbed/debug_log.cc.
-
-#include "tensorflow/lite/micro/debug_log.h"
-
-#ifndef TF_LITE_STRIP_ERROR_STRINGS
-#include <cstdio>
-#endif
-
-extern "C" void DebugLog(const char* s) {
-#ifndef TF_LITE_STRIP_ERROR_STRINGS
- // Reusing TF_LITE_STRIP_ERROR_STRINGS to disable DebugLog completely to get
- // maximum reduction in binary size. This is because we have DebugLog calls
- // via TF_LITE_CHECK that are not stubbed out by TF_LITE_REPORT_ERROR.
- fprintf(stderr, "%s", s);
-#endif
-}
diff --git a/tensorflow/lite/micro/xtensa_hifimini_staging/micro_time.cc b/tensorflow/lite/micro/xtensa_hifimini_staging/micro_time.cc
deleted file mode 100644
index 6f3844c..0000000
--- a/tensorflow/lite/micro/xtensa_hifimini_staging/micro_time.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Xtensa implementation of micro_timer.
-// To include this with make, add TAGS=xtensa-xpg.
-#include "tensorflow/lite/micro/micro_time.h"
-
-#include <time.h>
-
-namespace tflite {
-
-int32_t ticks_per_second() { return CLOCKS_PER_SEC; }
-
-int32_t GetCurrentTimeTicks() { return clock(); }
-
-} // namespace tflite