/*******************************************************************************
* Copyright (c) 2019-2020 Cadence Design Systems, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to use this Software with Cadence processor cores only and
* not with any other processors and platforms, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
******************************************************************************/
/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef __XA_NNLIB_KERNELS_API_H__
#define __XA_NNLIB_KERNELS_API_H__
/**
* @file xa_nnlib_kernels_api.h
* @brief This file gives the API definition for the HiFi NNLIB
*
* matXvec KERNELS API NAMING CONVENTION <br>
* <br>
* xa_nn_matXvec_<batch>_[m]x[n]_[p]_<activation>, where
* - <batch>: Optional 'batch' tag to indicate time batching routine
* - [m]: Matrix precision in bits
* - [n]: Vector (and bias for non-activation routines) precision in bits
* - [p]: Output precision in bits
* - <activation>: optional activation tag 'sigmoid' / 'tanh'
*
 * This set of kernels performs a dual matXvec followed by an optional
 * activation function. There are several variants based on the input,
 * output precision and use of activation functions.
*
* Restriction,
* - All pointers (p_out, p_mat1, p_mat2, p_vec1, p_vec2, p_bias, p_scratch)
* must be SIMD (64-bit) aligned and should not overlap.
* - p_mat2, p_vec2 can be 'NULL', but other pointers cannot be 'NULL'
* - Variables cols1, cols2, row_stride1, row_stride2 must be multiple of 4
*
* Usage of few critical variables,
* - acc_shift:
* -# In case of valid activation tag i.e. <activation>: shift to be
* applied on accumulator to match accumulator's Q format with activation
* function's input's Q format
* -# In case of bypass i.e. no activation tag: shift to be applied on
* accumulator.
* -# Positive value denotes left shift, and negative value denotes right
* shift.
* - bias_shift: shift which is to be applied on bias to match bias's
* Q format with accumulator's Q format. Positive value denotes left shift,
* and negative value denotes right shift.
* - bias_precision: This represents bias precision
* -# For 16x16, and 8x16 apis, valid values are '16' and '64'
* -# For 8x8 apis, valid values are '8' and '32'
*
* Output 8b, 16b, 32b of fixed point apis (only for bypass variants) is
* extracted from 64b accumulator with symmetric rounding. Output 64b of fixed
* point apis (only for bypass variants) is extracted from 64b accumulator.
* Output 8b, 16b of fixed point apis (only for activation variants) is
* symmetrically rounded.
*
* matXvec 16x16 Kernels,
* - Bypass kernels with 16, 32, 64 bit output: 3
* - Fused kernel with 2 activation variants: 2
* - Time batching kernel: 1 (Not implemented)
* - Total: 6
*
* matXvec 8x16 Kernels,
* - Bypass kernels with 16, 32, 64 bit output: 3
* - Fused kernel with 2 activation variants: 2
* - Time batching kernel: 1 (Not implemented)
* - Total: 6
*
* matXvec 8x8 Kernels,
* - Bypass kernels with 8, 16, 32 bit output: 3
* - Fused kernel with 2 activation variants: 2
* - Time batching kernel: 1 (Not implemented)
* - Total: 6
*
* matXvec float32 x float32 Kernels,
* - Bypass kernels 32 bit output: 1
* - Fused kernel with 2 activation variants: 2
* - Time batching kernel: 1 (Not implemented)
* - Total: 4
*
* ACTIVATION KERNELS API NAMING CONVENTION <br>
* <br>
* xa_nn_vec_[activation]_[n]_[p] for fixed point <br>
* xa_nn_vec_[activation]_f32_f32 for floating point, where
* - [activation]: One of activations - sigmoid/tanh/relu/relu1/relu6/softmax
* - [n]: Input precision in bits
* - [p]: Output precision in bits
*
* Possible values,
* - 'n' takes value '32', and expects input in Q6.25 format.
* - 'p' takes values '32' and '16', gives output in Q16.15 and Q0.15 formats
* respectively.
*
* There is WORD32 datatype variable 'threshold' for 'relu' related apis, which
* expects value in Q16.15 format.
*
* Restriction,
* - All pointers (p_out, p_vec) must be 32-bit aligned and should not overlap.
*
* activation 32_32 kernels,
* - Vector activation kernels: 6
* - Total: 6
*
* activation f32_f32 kernels,
* - Vector activation kernels: 6
* - Total: 6
*
* activation 32_16 kernels,
* - Vector activation kernels: 2
* - Total: 2
*/
#include "tensorflow/lite/micro/kernels/xtensa_hifimini_staging/xa_nnlib/include/xa_type_def.h"
#if defined(__cplusplus)
extern "C" {
#endif
/* Returns the scratch-buffer size (in bytes) needed by the depthwise conv2d
 * kernels for the given tensor geometry, strides, padding and precision.
 * NOTE(review): the "size query" semantics are inferred from the *_getsize
 * naming used throughout this library and from the p_scratch parameter on the
 * depthwise kernels below — confirm against the implementation. */
WORD32 xa_nn_conv2d_depthwise_getsize(
    WORD32 input_height, WORD32 input_width, WORD32 input_channels,
    WORD32 kernel_height, WORD32 kernel_width, WORD32 channels_multiplier,
    WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
    WORD32 output_height, WORD32 output_width, WORD32 circ_buf_precision,
    WORD32 inp_data_format);
/* Element-wise activation over a vector of asymmetric-quantized uint8 values;
 * given the [activation_min, activation_max] pair this presumably clamps each
 * of the vec_length elements of p_vec into that range and writes the result to
 * p_out — TODO confirm against the implementation.
 * Returns a WORD32 status code (0 on success by library convention — verify). */
WORD32 xa_nn_vec_activation_min_max_asym8u_asym8u(
    UWORD8 *__restrict__ p_out, const UWORD8 *__restrict__ p_vec,
    int activation_min, int activation_max, WORD32 vec_length);
/* Signed-int8 counterpart of xa_nn_vec_activation_min_max_asym8u_asym8u:
 * presumably clamps each of the vec_length elements of p_vec into
 * [activation_min, activation_max] and writes them to p_out — TODO confirm. */
WORD32 xa_nn_vec_activation_min_max_asym8s_asym8s(
    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_vec,
    int activation_min, int activation_max, WORD32 vec_length);
/* Returns the scratch-buffer size (in bytes) needed by the standard conv2d
 * kernels (xa_nn_conv2d_std_*) for the given geometry and input precision.
 * Note this query depends only on the height/vertical parameters and channel
 * count, not on widths — presumably the scratch is a row-oriented circular
 * buffer; confirm against the implementation. */
WORD32 xa_nn_conv2d_std_getsize(WORD32 input_height, WORD32 input_channels,
                                WORD32 kernel_height, WORD32 kernel_width,
                                WORD32 y_stride, WORD32 y_padding,
                                WORD32 out_height, WORD32 input_precision);
/* Standard 2-D convolution: asymmetric-quantized uint8 input x uint8 kernel
 * with 32-bit bias. Requantization is per-tensor (scalar out_multiplier /
 * out_shift) with zero-point corrections taken from input_zero_bias,
 * kernel_zero_bias and out_zero_bias.
 * p_scratch must be sized via xa_nn_conv2d_std_getsize().
 * Returns a WORD32 status code (0 on success by library convention — verify). */
WORD32 xa_nn_conv2d_std_asym8uxasym8u(
    UWORD8 *__restrict__ p_out, const UWORD8 *__restrict__ p_inp,
    const UWORD8 *__restrict__ p_kernel, const WORD32 *__restrict__ p_bias,
    WORD32 input_height, WORD32 input_width, WORD32 input_channels,
    WORD32 kernel_height, WORD32 kernel_width, WORD32 out_channels,
    WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
    WORD32 out_height, WORD32 out_width, WORD32 input_zero_bias,
    WORD32 kernel_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
    WORD32 out_zero_bias, WORD32 out_data_format, VOID *p_scratch);
/* Standard 2-D convolution: symmetric int8 kernel x asymmetric int8 input
 * (TFLite int8 quantization scheme). Requantization is per output channel —
 * p_out_multiplier / p_out_shift are arrays, presumably of out_channels
 * entries each (TODO confirm). No kernel zero point (symmetric weights).
 * p_scratch must be sized via xa_nn_conv2d_std_getsize(). */
WORD32 xa_nn_conv2d_std_per_chan_sym8sxasym8s(
    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_inp,
    const WORD8 *__restrict__ p_kernel, const WORD32 *__restrict__ p_bias,
    WORD32 input_height, WORD32 input_width, WORD32 input_channels,
    WORD32 kernel_height, WORD32 kernel_width, WORD32 out_channels,
    WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
    WORD32 out_height, WORD32 out_width, WORD32 input_zero_bias,
    WORD32 *p_out_multiplier, WORD32 *p_out_shift, WORD32 out_zero_bias,
    WORD32 out_data_format, VOID *p_scratch);
/* Depthwise 2-D convolution: asym-quantized uint8 input x uint8 kernel,
 * 32-bit bias, channels_multiplier output channels per input channel.
 * Per-tensor requantization (scalar out_multiplier / out_shift) with
 * input/kernel/output zero-point corrections.
 * p_scratch must be sized via xa_nn_conv2d_depthwise_getsize(). */
WORD32 xa_nn_conv2d_depthwise_asym8uxasym8u(
    pUWORD8 __restrict__ p_out, const UWORD8 *__restrict__ p_kernel,
    const UWORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
    WORD32 input_height, WORD32 input_width, WORD32 input_channels,
    WORD32 kernel_height, WORD32 kernel_width, WORD32 channels_multiplier,
    WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
    WORD32 out_height, WORD32 out_width, WORD32 input_zero_bias,
    WORD32 kernel_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
    WORD32 out_zero_bias, WORD32 inp_data_format, WORD32 out_data_format,
    pVOID p_scratch);
/* Depthwise 2-D convolution: symmetric int8 kernel x asymmetric int8 input
 * (TFLite int8 scheme). Requantization is per output channel —
 * p_out_multiplier / p_out_shift are arrays, presumably with
 * input_channels * channels_multiplier entries (TODO confirm). No kernel
 * zero point (symmetric weights).
 * p_scratch must be sized via xa_nn_conv2d_depthwise_getsize(). */
WORD32 xa_nn_conv2d_depthwise_per_chan_sym8sxasym8s(
    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_kernel,
    const WORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
    WORD32 input_height, WORD32 input_width, WORD32 input_channels,
    WORD32 kernel_height, WORD32 kernel_width, WORD32 channels_multiplier,
    WORD32 x_stride, WORD32 y_stride, WORD32 x_padding, WORD32 y_padding,
    WORD32 out_height, WORD32 out_width, WORD32 input_zero_bias,
    const WORD32 *p_out_multiplier, const WORD32 *p_out_shift,
    WORD32 out_zero_bias, WORD32 inp_data_format, WORD32 out_data_format,
    pVOID p_scratch);
/* Fully-connected layer: asym-quantized uint8 weights x uint8 input with
 * 32-bit bias, producing out_depth uint8 outputs from a weight_depth input
 * vector. Per-tensor requantization via out_multiplier / out_shift with
 * input/weight/output zero-point corrections. */
WORD32 xa_nn_fully_connected_asym8uxasym8u_asym8u(
    pUWORD8 __restrict__ p_out, const UWORD8 *__restrict__ p_weight,
    const UWORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
    WORD32 weight_depth, WORD32 out_depth, WORD32 input_zero_bias,
    WORD32 weight_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
    WORD32 out_zero_bias);
/* Fully-connected layer: symmetric int8 weights x asymmetric int8 input
 * (TFLite int8 scheme) with 32-bit bias. Per-tensor requantization via
 * out_multiplier / out_shift; no weight zero point (symmetric weights). */
WORD32 xa_nn_fully_connected_sym8sxasym8s_asym8s(
    pWORD8 __restrict__ p_out, const WORD8 *__restrict__ p_weight,
    const WORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
    WORD32 weight_depth, WORD32 out_depth, WORD32 input_zero_bias,
    WORD32 out_multiplier, WORD32 out_shift, WORD32 out_zero_bias);
/* Fully-connected layer: asymmetric int8 weights x asymmetric int8 input
 * with 32-bit bias (both operands carry zero points, unlike the sym8s
 * variant above). Per-tensor requantization via out_multiplier / out_shift. */
WORD32 xa_nn_fully_connected_asym8sxasym8s_asym8s(
    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_weight,
    const WORD8 *__restrict__ p_inp, const WORD32 *__restrict__ p_bias,
    WORD32 weight_depth, WORD32 out_depth, WORD32 weight_zero_bias,
    WORD32 input_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
    WORD32 out_zero_bias);
/* Softmax over a vec_length vector of asym-quantized uint8 values, 8-bit
 * output. diffmin / input_left_shift / input_multiplier parameterize the
 * fixed-point exp approximation (TFLite-style softmax parameters —
 * NOTE(review): confirm exact semantics against the implementation).
 * p_scratch must be sized via xa_nn_get_softmax_scratch_size(). */
WORD32 xa_nn_vec_softmax_asym8u_8(UWORD8 *__restrict__ p_out,
                                  const UWORD8 *__restrict__ p_vec,
                                  WORD32 diffmin, WORD32 input_left_shift,
                                  WORD32 input_multiplier, WORD32 vec_length,
                                  pVOID p_scratch);
/* Softmax over a vec_length vector of asym-quantized int8 values with 16-bit
 * output; same softmax parameterization as xa_nn_vec_softmax_asym8u_8.
 * p_scratch must be sized via xa_nn_get_softmax_scratch_size(). */
WORD32 xa_nn_vec_softmax_asym8s_16(WORD16 *__restrict__ p_out,
                                   const WORD8 *__restrict__ p_vec,
                                   WORD32 diffmin, WORD32 input_left_shift,
                                   WORD32 input_multiplier, WORD32 vec_length,
                                   pVOID p_scratch);
/* Softmax over a vec_length vector of asym-quantized int8 values with 8-bit
 * output; same softmax parameterization as xa_nn_vec_softmax_asym8u_8.
 * p_scratch must be sized via xa_nn_get_softmax_scratch_size(). */
WORD32 xa_nn_vec_softmax_asym8s_8(WORD8 *__restrict__ p_out,
                                  const WORD8 *__restrict__ p_vec,
                                  WORD32 diffmin, WORD32 input_left_shift,
                                  WORD32 input_multiplier, WORD32 vec_length,
                                  pVOID p_scratch);
/* Returns the scratch-buffer size (in bytes) required by the softmax kernels
 * above for the given input/output precisions and vector length. */
int xa_nn_get_softmax_scratch_size(int inp_precision, int out_precision,
                                   int length);
/* Matrix x vector multiply, asym-quantized uint8 matrix and vector with
 * 32-bit bias and uint8 output; successive outputs are written out_stride
 * elements apart in p_out. row_stride1 is the matrix row pitch in elements.
 * Per-tensor requantization via out_multiplier / out_shift with matrix,
 * vector and output zero-point corrections. */
WORD32 xa_nn_matXvec_out_stride_asym8uxasym8u_asym8u(
    UWORD8 *__restrict__ p_out, const UWORD8 *__restrict__ p_mat1,
    const UWORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
    WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
    WORD32 mat1_zero_bias, WORD32 vec1_zero_bias, WORD32 out_multiplier,
    WORD32 out_shift, WORD32 out_zero_bias);
/* Matrix x vector multiply, symmetric int8 matrix x asymmetric int8 vector
 * with 32-bit bias and int8 output, strided output writes. No matrix zero
 * point (symmetric weights); per-tensor requantization via out_multiplier /
 * out_shift. */
WORD32 xa_nn_matXvec_out_stride_sym8sxasym8s_asym8s(
    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
    const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
    WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
    WORD32 vec1_zero_bias, WORD32 out_multiplier, WORD32 out_shift,
    WORD32 out_zero_bias);
/* Matrix x vector multiply, asymmetric int8 matrix x asymmetric int8 vector
 * with 32-bit bias and int8 output, strided output writes. Both operands
 * carry zero points (mat1_zero_bias, vec1_zero_bias); per-tensor
 * requantization via out_multiplier / out_shift. */
WORD32 xa_nn_matXvec_out_stride_asym8sxasym8s_asym8s(
    WORD8 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
    const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
    WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
    WORD32 mat1_zero_bias, WORD32 vec1_zero_bias, WORD32 out_multiplier,
    WORD32 out_shift, WORD32 out_zero_bias);
/* Matrix x vector multiply, symmetric int8 matrix x asymmetric int8 vector
 * with 32-bit bias and 16-bit output, strided output writes. Note: unlike
 * the 8-bit-output variants above, there is no out_zero_bias parameter —
 * presumably the 16-bit output is symmetric (zero point 0); confirm. */
WORD32 xa_nn_matXvec_out_stride_sym8sxasym8s_16(
    WORD16 *__restrict__ p_out, const WORD8 *__restrict__ p_mat1,
    const WORD8 *__restrict__ p_vec1, const WORD32 *__restrict__ p_bias,
    WORD32 rows, WORD32 cols1, WORD32 row_stride1, WORD32 out_stride,
    WORD32 vec1_zero_bias, WORD32 out_multiplier, WORD32 out_shift);
/* Batched dot product: computes vec_count dot products of 16-bit vector pairs
 * (each of vec_length elements, presumably laid out contiguously in
 * p_inp1_start / p_inp2_start — TODO confirm layout), adds the per-vector
 * bias from bias_ptr and requantizes each result to an asym int8 output. */
WORD32 xa_nn_dot_prod_16x16_asym8s(
    WORD8 *__restrict__ p_out,               /* pointer to output */
    const WORD16 *__restrict__ p_inp1_start, /* pointer to input1 */
    const WORD16 *__restrict__ p_inp2_start, /* pointer to input2 */
    const WORD32 *bias_ptr, WORD32 vec_length, WORD32 out_multiplier,
    WORD32 out_shift, WORD32 out_zero_bias, WORD32 vec_count);
/* Mapping of function names from the previous naming convention, kept for
 * backward compatibility with older callers. */
#define xa_nn_vec_activation_min_max_asym8_asym8 \
  xa_nn_vec_activation_min_max_asym8u_asym8u
#define xa_nn_conv2d_std_asym8xasym8 xa_nn_conv2d_std_asym8uxasym8u
#define xa_nn_conv2d_depthwise_asym8xasym8 xa_nn_conv2d_depthwise_asym8uxasym8u
#define xa_nn_fully_connected_asym8xasym8_asym8 \
  xa_nn_fully_connected_asym8uxasym8u_asym8u
/* NOTE(review): the next two aliases expand to names that are NOT declared in
 * this header — the softmax/dot-prod variants declared above are
 * xa_nn_vec_softmax_asym8u_8 and xa_nn_dot_prod_16x16_asym8s. Code using
 * these compatibility names will fail to link unless the expanded names are
 * declared/defined elsewhere in the library — confirm before relying on them. */
#define xa_nn_vec_softmax_asym8_asym8 xa_nn_vec_softmax_asym8u_asym8u
#define xa_nn_dot_prod_asym8xasym8_asym8 xa_nn_dot_prod_asym8uxasym8u_asym8u
#define xa_nn_matXvec_out_stride_asym8xasym8_asym8 \
  xa_nn_matXvec_out_stride_asym8uxasym8u_asym8u
#if defined(__cplusplus)
}
#endif
#endif /* __XA_NNLIB_KERNELS_API_H__ */