blob: 2eb079f0b0c295055b9cc29d9c3cd4bb27de2438 [file]
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <unistd.h>
#include <cmath>
#include <limits>
#include <vector>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#ifdef __aarch64__
#include <arm_neon.h>
#endif
namespace executorch {
namespace backends {
namespace xnnpack {
namespace utils {
/// Affine quantization parameters. quantize_val below maps a real value r to
/// round(r / scale) + zero_point, so dequantization is
/// r ~= (q - zero_point) * scale.
struct QuantizationParams {
  double scale;
  int32_t zero_point;
};
/// Chooses affine quantization parameters that map the observed real range
/// [min, max] onto the integer range [qmin, qmax], writing the resulting
/// scale/zero_point into `result`. Declaration only — the exact semantics of
/// the flags live in the definition:
///  - preserve_sparsity: presumably keeps 0.0f exactly representable — confirm
///  - force_scale_power_of_two: presumably snaps scale to a power of two — confirm
///  - reduce_range: presumably narrows the usable [qmin, qmax] range — confirm
/// Returns an Error status rather than asserting.
executorch::runtime::Error ChooseQuantizationParams(
    float min,
    float max,
    int32_t qmin,
    int32_t qmax,
    QuantizationParams& result,
    bool preserve_sparsity,
    bool force_scale_power_of_two,
    bool reduce_range);
/// Round to nearest integer under the current FP rounding mode (default:
/// round-half-to-even). Older Android NDKs lack std::nearbyint, so the global
/// C functions are used there instead.
#if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
template <class T>
inline float Round(const float x) {
  return ::nearbyintf(x);
}
inline double Round(const double x) {
  return ::nearbyint(x);
}
#else
template <class T>
inline T Round(const T x) {
  return std::nearbyint(x);
}
#endif

/// Quantizes a single float: q = clamp(round(value / scale) + zero_point)
/// to the representable range of T.
///
/// Rounding goes through Round() above, i.e. std::nearbyint, which honors the
/// current rounding mode (round-to-even by default on x86/ARM). That is both
/// faster than std::round's half-away-from-zero and consistent with SIMD
/// conversions such as x86's _mm512_cvtps_epi32 / _MM_FROUND_CUR_DIRECTION,
/// so scalar and vector paths agree.
template <typename T>
T quantize_val(double scale, int64_t zero_point, float value) {
  constexpr int64_t kQMin = std::numeric_limits<T>::min();
  constexpr int64_t kQMax = std::numeric_limits<T>::max();
  // Multiply by the reciprocal once; matches the vectorized implementations.
  const float reciprocal_scale = 1.0f / static_cast<float>(scale);
  const int64_t unclamped =
      static_cast<int64_t>(zero_point + Round(value * reciprocal_scale));
  // Saturate into T's range before the narrowing cast.
  return static_cast<T>(std::clamp<int64_t>(unclamped, kQMin, kQMax));
}
#ifdef __aarch64__
template <typename Tx8>
Tx8 vqmov(int16x8_t vraw);
template <typename T, typename Tx8>
void vst1(T* out, Tx8 vout);
// Quantizes N floats from `in` into 8-bit integers in `out`:
//   out[i] = saturate(round(in[i] / scale) + zero_point).
// The main loop processes 8 elements per iteration with NEON; the scalar tail
// uses quantize_val, so both paths round identically (vcvtnq_s32_f32 and
// std::nearbyint both round to nearest, ties to even, by default).
template <typename underlying_t, typename underlying_x8_t>
void quantize_tensor_arm64_q8(
    const float* __restrict__ in,
    underlying_t* __restrict__ out,
    const int64_t N,
    const float scale,
    const int32_t zero_point) {
  const float inv_scale = 1.0f / scale;
  // Fix: int64_t index (was uint32_t) — a 32-bit index wraps around and mixes
  // signedness against the int64_t N when N does not fit in 32 bits.
  int64_t i = 0;
  // `out` already has the element type; the original reinterpret_cast of it
  // was a no-op and has been dropped.
  underlying_t* out_underlying = out;
  const float32x4_t vinv_scale = vdupq_n_f32(inv_scale);
  // zero_point fits in 16 bits for 8-bit quantization; it is added after the
  // 32->16 narrowing, with saturation, below.
  const int16x8_t vzero_point = vdupq_n_s16((int16_t)(uint16_t)zero_point);
  for (i = 0; i + 8 <= N; i += 8) {
    const float32x4_t vin0123 = vld1q_f32(in);
    in += 4;
    const float32x4_t vin4567 = vld1q_f32(in);
    in += 4;
    // Round-to-nearest-even conversion of value * (1/scale) to int32.
    const int32x4_t v0123_rounded =
        vcvtnq_s32_f32(vmulq_f32(vin0123, vinv_scale));
    const int32x4_t v4567_rounded =
        vcvtnq_s32_f32(vmulq_f32(vin4567, vinv_scale));
    // Saturating narrow both halves to int16 lanes, then saturating add of
    // the zero point.
    const int16x8_t v01234567_packed = vqaddq_s16(
        vqmovn_high_s32(vqmovn_s32(v0123_rounded), v4567_rounded), vzero_point);
    // Narrow to the final 8-bit lanes via the vqmov specialization and store.
    const underlying_x8_t vout01234567 =
        vqmov<underlying_x8_t>(v01234567_packed);
    vst1<underlying_t, underlying_x8_t>(out_underlying, vout01234567);
    out_underlying += 8;
  }
  // Scalar tail for the remaining N % 8 elements.
  for (; i < N; ++i) {
    (*out_underlying++) =
        quantize_val<underlying_t>(scale, zero_point, (*in++));
  }
}
// Type-dispatching front end for quantize_tensor_arm64_q8: quantizes N floats
// from `in` into `out` with the given scale/zero_point. Declaration only —
// the definition elsewhere selects the matching <underlying_t, underlying_x8_t>
// instantiation (e.g. int8 vs. uint8 lanes) for T.
template <typename T>
void quantize_tensor_arm64_q8_wrapper(
    const float* __restrict__ in,
    T* __restrict__ out,
    const int64_t N,
    const float scale,
    const int32_t zero_point);
#endif /* __aarch64__ */
/// Per-tensor quantization: fills `qtensor` element-wise with
/// quantize_val<T>(scale, zero_point, rtensor[i]) — i.e.
/// q[i] = clamp(round(r[i] / scale) + zero_point) to T's range.
///
/// T must be uint8_t or int8_t (checked at runtime so callers get an Error,
/// not an abort), and `qtensor` must have at least as many elements as
/// `rtensor`. Uses the NEON fast path on aarch64, a scalar loop otherwise.
/// Returns Error::Ok on success.
template <typename T = uint8_t>
executorch::runtime::Error QuantizePerTensor(
    const executorch::aten::Tensor& rtensor,
    executorch::aten::Tensor& qtensor,
    double scale,
    int zero_point) {
  ET_CHECK_OR_RETURN_ERROR(
      (std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value),
      Internal,
      "Expecting quantized output tensor of dtype uint8_t or int8_t");
  // Fix: the condition requires the output to be at least as large as the
  // input; the original message said "same or smaller", inverting the
  // requirement.
  ET_CHECK_OR_RETURN_ERROR(
      rtensor.numel() <= qtensor.numel(),
      Internal,
      "Expecting quantized output tensor of same or larger size as input, %zd vs. %zd",
      qtensor.numel(),
      rtensor.numel());
  const float* rdata = rtensor.const_data_ptr<float>();
  // Fix: int64_t (was int) so very large tensors do not truncate the element
  // count; this also matches the int64_t N taken by the NEON wrapper.
  const int64_t numel = rtensor.numel();
  T* qdata = qtensor.mutable_data_ptr<T>();
#if defined(__aarch64__)
  quantize_tensor_arm64_q8_wrapper<T>(rdata, qdata, numel, scale, zero_point);
#else
  for (int64_t i = 0; i < numel; ++i) {
    qdata[i] = quantize_val<T>(scale, zero_point, rdata[i]);
  }
#endif
  return executorch::runtime::Error::Ok;
}
/// Computes requantization scales from per-channel weight scales and the
/// input/output activation scales, writing them into `requant_scales`.
/// Declaration only — presumably each entry is
/// input_scale * weight_scale[i] / output_scale; confirm against the
/// definition. Returns an Error status.
executorch::runtime::Error GenerateRequantizationScale(
    const executorch::aten::Tensor& weight_scales,
    float input_scale,
    float output_scale,
    std::vector<float>& requant_scales);
/// Returns the extrema of the float values in `ft` — NOTE(review): presumed
/// order is (min, max); confirm against the definition.
std::pair<float, float> GetMinMax(const executorch::aten::Tensor& ft);
} // namespace utils
} // namespace xnnpack
} // namespace backends
} // namespace executorch