| #include <ATen/native/quantized/AffineQuantizerBase.h> |
| #include <c10/util/irange.h> |
| #include <cfenv> |
| #include <climits> |
| |
| #ifdef USE_FBGEMM |
| #include <fbgemm/QuantUtils.h> |
| #endif |
| #ifdef __ARM_NEON__ |
| #include <arm_neon.h> |
| #endif |
| |
| namespace at { |
| namespace native { |
| |
| namespace { |
| |
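| // Checks that zero_point fits into the underlying integer type T. For |
| // example, checkZeroPoint<uint8_t>("quantize_vec", 300) throws, since 300 |
| // exceeds std::numeric_limits<uint8_t>::max() == 255. |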
| template <typename T> |
| void checkZeroPoint(const std::string& fn_name, int64_t zero_point) { |
| TORCH_CHECK( |
| zero_point <= std::numeric_limits<T>::max(), |
| fn_name, |
| " zero_point ", |
| zero_point, |
| " is out of range."); |
| TORCH_CHECK( |
| zero_point >= std::numeric_limits<T>::min(), |
| fn_name, |
| " zero_point ", |
| zero_point, |
| " is out of range."); |
| } |
| |
| } // anonymous namespace |
| |
| #ifdef USE_FBGEMM |
| // Note: quantize_val is only explicitly used in tests outside of this file. |
| template <typename T> |
| T quantize_val(double scale, int64_t zero_point, float value) { |
| // Internally, fbgemm::Quantize uses std::nearbyint. |
| // std::nearbyint rounds to the nearest integer according to the current |
| // rounding mode; the default mode rounds half-way cases to even on the |
| // most popular processor architectures, such as x86 and ARM. This is |
| // typically faster than alternatives like std::round, which rounds |
| // half-way cases away from zero, and it can be consistent with SIMD |
| // implementations, for example _mm512_cvtps_epi32 on x86, or |
| // _mm512_round_ps with the _MM_FROUND_CUR_DIRECTION option, both of which |
| // also follow the current rounding mode. |
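| // For example, under the default round-to-even mode: |
| //   std::nearbyint(2.5f) == 2.0f and std::nearbyint(3.5f) == 4.0f, |
| // while std::round(2.5f) == 3.0f (half-way case away from zero). |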
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| int32_t qvalue; |
| // NOLINTNEXTLINE(bugprone-signed-char-misuse) |
| qvalue = fbgemm::Quantize<typename T::underlying, false /*LEGACY*/>( |
| value, |
| static_cast<int32_t>(zero_point), |
| static_cast<float>(scale), |
| /*result_precision=*/CHAR_BIT * sizeof(typename T::underlying)); |
| return static_cast<T>(qvalue); |
| } |
| |
| template <typename T, int precision> |
| void quantize_vec( |
| double scale, |
| int64_t zero_point, |
| const float* src, |
| T* dst, |
| size_t count) { |
| fbgemm::Quantize<typename T::underlying, false /*LEGACY*/>( |
| src, |
| (typename T::underlying*)dst, |
| count, |
| fbgemm::TensorQuantizationParams{ |
| (float)scale, (int32_t)zero_point, precision}); |
| } |
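|
| // Illustrative usage (values are hypothetical): quantizing three floats to |
| // quint8 with scale 0.1 and zero_point 10: |
| //   float src[3] = {0.0f, 1.0f, 2.5f}; |
| //   c10::quint8 dst[3]; |
| //   quantize_vec<c10::quint8>(/*scale=*/0.1, /*zero_point=*/10, src, dst, 3); |
| // dst ends up as {10, 20, 35}, i.e. nearbyint(x / 0.1) + 10 per element. |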
| |
| #if defined(__ARM_NEON__) || defined(__aarch64__) |
| // For use when compiling FBGEMM on aarch64 but still supporting x86 |
| // intrinsics via simde |
| template <typename T> |
| T quantize_val_arm( |
| const float scale, |
| const int32_t zero_point, |
| const float value) { |
| constexpr int32_t qmin = std::numeric_limits<T>::min(); |
| constexpr int32_t qmax = std::numeric_limits<T>::max(); |
| float inv_scale = 1.0f / scale; |
| auto r = zero_point + static_cast<int32_t>(std::nearbyint(value * inv_scale)); |
| r = std::max(r, qmin); |
| r = std::min(r, qmax); |
| return static_cast<T>(r); |
| } |
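|
| // For example (illustrative): quantize_val_arm<uint8_t>(0.5f, 0, 200.0f) |
| // computes nearbyint(200.0f / 0.5f) = 400, which is clamped to qmax = 255. |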
| |
| template uint8_t quantize_val_arm<uint8_t>( |
| const float scale, |
| const int32_t zero_point, |
| const float value); |
| template int8_t quantize_val_arm<int8_t>( |
| const float scale, |
| const int32_t zero_point, |
| const float value); |
| #endif |
| |
| template <typename T> |
| inline float dequantize_val(double scale, int64_t zero_point, T value) { |
| // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) |
| fbgemm::TensorQuantizationParams qparams; |
| qparams.scale = static_cast<float>(scale); |
| qparams.zero_point = static_cast<int32_t>(zero_point); |
| return fbgemm::Dequantize<typename T::underlying>(value.val_, qparams); |
| } |
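|
| // fbgemm::Dequantize computes scale * (value - zero_point); for example, |
| // with scale 0.1 and zero_point 10, a stored value of 35 maps back to 2.5f. |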
| #else // USE_FBGEMM |
| |
| #if defined(__ANDROID__) && !defined(__NDK_MAJOR__) |
| // Older Android NDKs do not provide std::nearbyint in <cmath>, so fall |
| // back to the C library's ::nearbyintf / ::nearbyint. |
| template <class T> |
| inline float Round(const float x) { |
| return ::nearbyintf(x); |
| } |
| inline double Round(const double x) { |
| return ::nearbyint(x); |
| } |
| #else |
| template <class T> |
| inline T Round(const T x) { |
| return std::nearbyint(x); |
| } |
| #endif |
| |
| template <typename T> |
| T quantize_val(double scale, int64_t zero_point, float value) { |
| // std::nearbyint rounds to the nearest integer according to the current |
| // rounding mode; the default mode rounds half-way cases to even on the |
| // most popular processor architectures, such as x86 and ARM. This is |
| // typically faster than alternatives like std::round, which rounds |
| // half-way cases away from zero, and it can be consistent with SIMD |
| // implementations, for example _mm512_cvtps_epi32 on x86, or |
| // _mm512_round_ps with the _MM_FROUND_CUR_DIRECTION option, both of which |
| // also follow the current rounding mode. |
| int64_t qvalue; |
| constexpr int64_t qmin = std::numeric_limits<typename T::underlying>::min(); |
| constexpr int64_t qmax = std::numeric_limits<typename T::underlying>::max(); |
| float inv_scale = 1.0f / static_cast<float>(scale); |
| qvalue = static_cast<int64_t>(zero_point + Round(value * inv_scale)); |
| qvalue = std::max<int64_t>(qvalue, qmin); |
| qvalue = std::min<int64_t>(qvalue, qmax); |
| return static_cast<T>(qvalue); |
| } |
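|
| // For example (illustrative): quantize_val<c10::quint8>(0.1, 10, 2.5f) |
| // computes nearbyint(2.5f * 10.0f) + 10 = 35, which lies within [0, 255]. |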
| |
| template <typename T> |
| T quantize_val_arm( |
| const float scale, |
| const int32_t zero_point, |
| const float value) { |
| constexpr int32_t qmin = std::numeric_limits<T>::min(); |
| constexpr int32_t qmax = std::numeric_limits<T>::max(); |
| float inv_scale = 1.0f / scale; |
| #ifndef _MSC_VER |
| auto r = static_cast<int32_t>(Round(value * inv_scale)); |
| // __builtin_add_overflow() returns true on overflow. |
| if (__builtin_add_overflow(zero_point, r, &r)) { |
| // zero_point is non-negative and lies within [qmin, qmax], so the sum |
| // can only overflow upward; saturate to qmax in that case. |
| r = qmax; |
| } |
| #else |
| auto r = zero_point + static_cast<int32_t>(Round(value * inv_scale)); |
| #endif |
| r = std::max(r, qmin); |
| r = std::min(r, qmax); |
| return static_cast<T>(r); |
| } |
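|
| // For example (illustrative): if Round(value * inv_scale) lands near |
| // INT32_MAX, adding a positive zero_point overflows a 32-bit int; the |
| // __builtin_add_overflow path above saturates to qmax instead of wrapping. |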
| |
| template <typename T, int precision> |
| void quantize_vec( |
| double scale, |
| int64_t zero_point, |
| const float* src, |
| T* dst, |
| size_t count) { |
| checkZeroPoint<typename T::underlying>("quantize_vec", zero_point); |
| for (const auto i : c10::irange(count)) { |
| dst[i] = quantize_val<T>(scale, zero_point, src[i]); |
| } |
| } |
| |
| template uint8_t quantize_val_arm<uint8_t>( |
| const float scale, |
| const int32_t zero_point, |
| const float value); |
| template int8_t quantize_val_arm<int8_t>( |
| const float scale, |
| const int32_t zero_point, |
| const float value); |
|
| template <typename T> |
| TORCH_API float dequantize_val(double scale, int64_t zero_point, T value) { |
| return static_cast<float>(scale) * (value.val_ - static_cast<int32_t>(zero_point)); |
| } |
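|
| // For example (illustrative): dequantize_val<c10::quint8>(0.1, 10, q) with |
| // q.val_ == 35 returns 0.1f * (35 - 10) = 2.5f, the inverse of quantize_val. |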
| #endif // USE_FBGEMM |
| |
| /* |
| * Quantize a value based on the following equation: |
| *   Xq = Round(Xf * inv_scale + zero_point) |
| * where zero_point is a float. |
| * |
| * Note: for embedding quantization we set zero_point to (-Xmin/scale), |
| * where Xmin is the min value of the input tensor row. |
| */ |
| int quantize_val_float_qparams(float scale, float zero_point, float value, int qmin, int qmax) { |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| int qvalue; |
| |
| float inv_scale = scale == 0 ? 1.0f : 1.0f / scale; |
| // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) |
| qvalue = lrintf(value * inv_scale + zero_point); |
| qvalue = std::max(qmin, std::min(qvalue, qmax)); |
| return qvalue; |
| } |
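|
| // For example (illustrative, embedding-style qparams): for a row with min |
| // value Xmin = -1.0 and scale = 0.01, zero_point = -Xmin / scale = 100, and |
| // quantize_val_float_qparams(0.01f, 100.0f, -1.0f, 0, 255) returns |
| // lrintf(-1.0f * 100.0f + 100.0f) = 0. |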
| |
| template <typename SRC_T, typename DST_T> |
| DST_T requantize_val( |
| double src_scale, |
| int64_t src_zero_point, |
| double dst_scale, |
| int64_t dst_zero_point, |
| SRC_T src) { |
| const auto dq = dequantize_val<SRC_T>(src_scale, src_zero_point, src); |
| return quantize_val<DST_T>(dst_scale, dst_zero_point, dq); |
| } |
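|
| // For example (illustrative): requantizing a quint8 value of 35 from |
| // (scale=0.1, zero_point=10) to (scale=0.05, zero_point=0) first |
| // dequantizes to 2.5f, then requantizes to nearbyint(2.5f / 0.05f) = 50. |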
| |
| template <typename DST_T> |
| DST_T requantize_from_int(double multiplier, int64_t zero_point, int64_t src) { |
| int64_t quantize_down = |
| zero_point + lrintf(src * static_cast<float>(multiplier)); |
| // NOLINTNEXTLINE(bugprone-signed-char-misuse) |
| int32_t min = std::numeric_limits<typename DST_T::underlying>::min(); |
| int32_t max = std::numeric_limits<typename DST_T::underlying>::max(); |
| return static_cast<DST_T>( |
| std::min<int64_t>(std::max<int64_t>(quantize_down, min), max)); |
| } |
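|
| // For example (illustrative): requantize_from_int<c10::quint8>(0.5, 10, 100) |
| // computes 10 + lrintf(100 * 0.5f) = 60, then clamps to [0, 255]. |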
| |
| template TORCH_API qint8 |
| quantize_val<qint8>(double scale, int64_t zero_point, float value); |
| template TORCH_API quint8 |
| quantize_val<quint8>(double scale, int64_t zero_point, float value); |
| template TORCH_API qint32 |
| quantize_val<qint32>(double scale, int64_t zero_point, float value); |
| template TORCH_API void quantize_vec<c10::qint8>( |
| double scale, |
| int64_t zero_point, |
| const float* src, |
| c10::qint8* dst, |
| size_t count); |
| template TORCH_API void quantize_vec<c10::quint8>( |
| double scale, |
| int64_t zero_point, |
| const float* src, |
| c10::quint8* dst, |
| size_t count); |
| template TORCH_API void quantize_vec<c10::qint32, 32>( |
| double scale, |
| int64_t zero_point, |
| const float* src, |
| c10::qint32* dst, |
| size_t count); |
| |
| template TORCH_API float dequantize_val<qint8>( |
| double scale, |
| int64_t zero_point, |
| qint8 value); |
| template TORCH_API float dequantize_val<quint8>( |
| double scale, |
| int64_t zero_point, |
| quint8 value); |
| template TORCH_API float dequantize_val<qint32>( |
| double scale, |
| int64_t zero_point, |
| qint32 value); |
| |
| template TORCH_API qint8 |
| requantize_val<qint8, qint8>(double, int64_t, double, int64_t, qint8); |
| template TORCH_API quint8 |
| requantize_val<qint8, quint8>(double, int64_t, double, int64_t, qint8); |
| template TORCH_API qint32 |
| requantize_val<qint8, qint32>(double, int64_t, double, int64_t, qint8); |
| template TORCH_API qint8 |
| requantize_val<quint8, qint8>(double, int64_t, double, int64_t, quint8); |
| template TORCH_API quint8 |
| requantize_val<quint8, quint8>(double, int64_t, double, int64_t, quint8); |
| template TORCH_API qint32 |
| requantize_val<quint8, qint32>(double, int64_t, double, int64_t, quint8); |
| template TORCH_API qint8 |
| requantize_val<qint32, qint8>(double, int64_t, double, int64_t, qint32); |
| template TORCH_API quint8 |
| requantize_val<qint32, quint8>(double, int64_t, double, int64_t, qint32); |
| template TORCH_API qint32 |
| requantize_val<qint32, qint32>(double, int64_t, double, int64_t, qint32); |
| |
| template TORCH_API qint8 requantize_from_int<qint8>(double, int64_t, int64_t); |
| template TORCH_API quint8 |
| requantize_from_int<quint8>(double, int64_t, int64_t); |
| template TORCH_API qint32 |
| requantize_from_int<qint32>(double, int64_t, int64_t); |
| |
| } // namespace native |
| } // namespace at |