Add optimized quantize function for ARM (#26867)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/26867
Use caffe2::Int8Quantize for PyTorch mobile. Currently this is only implemented for uint8 tensors and runs using NEON intrinsics.
All other cases fall back to the naive PyTorch quantize_val implementation.
Previously the naive quantize_val implementation was slow on mobile, taking up more than 50% of the execution time.
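For reference, here is a minimal scalar sketch (not part of the patch; the helper names quantize_ref and magic_round are illustrative) of the two per-element computations involved: the plain round-and-clamp that quantize_val_arm performs, and the magic-float rounding trick the NEON inner loop relies on.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>

// Plain per-element quantization, mirroring quantize_val_arm below:
// zero_point + round(value / scale), clamped to the uint8 range.
inline uint8_t quantize_ref(float scale, int32_t zero_point, float value) {
  int32_t q = zero_point + static_cast<int32_t>(std::nearbyint(value / scale));
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// Scalar equivalent of the magic-float rounding used by the NEON path:
// adding 2**23 + 2**22 (= 12582912.0f) forces the sum to round to an integer
// held in the low mantissa bits; subtracting the bit pattern of 12582912.0f
// (0x4B400000) recovers round(f). Valid for -2**22 < f < 2**22.
inline int32_t magic_round(float f) {
  float shifted = f + 12582912.0f;
  int32_t bits;
  std::memcpy(&bits, &shifted, sizeof(bits));
  return bits - 0x4B400000;
}

The vectorized loop uses the magic-float form because the basic NEON float-to-int conversion (vcvtq_s32_f32) truncates toward zero rather than rounding to nearest.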
Results
                           Before      After
aten::quantize_per_tensor  42.893 ms   0.340 ms
Total model runtime        70.5 ms     27.5 ms
Test Plan:
Verified the existing Python tests pass: python test/test_quantized.py TestQNNPackOps
Also tested a quantized MobileNetV2 on mobile and compared the output.
Imported from OSS
Differential Revision: D17638732
fbshipit-source-id: 76445d1e415e6e502d05ba5b900e5e1d875fc1b0
diff --git a/aten/src/ATen/quantized/Quantizer.cpp b/aten/src/ATen/quantized/Quantizer.cpp
index d548061..69956aa 100644
--- a/aten/src/ATen/quantized/Quantizer.cpp
+++ b/aten/src/ATen/quantized/Quantizer.cpp
@@ -6,12 +6,14 @@
#include <ATen/native/TensorFactories.h>
#include <ATen/quantized/QTensorImpl.h>
#include <ATen/core/Tensor.h>
-
#include <typeinfo>
#ifdef USE_FBGEMM
#include <fbgemm/QuantUtils.h>
#endif
+#ifdef __ARM_NEON__
+#include <arm_neon.h>
+#endif
namespace at {
@@ -159,6 +161,21 @@
}
#else // USE_FBGEMM
+#if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
+template <class T>
+inline float Round(const float x) {
+ return ::nearbyintf(x);
+}
+inline double Round(const double x) {
+ return ::nearbyint(x);
+}
+#else
+template <class T>
+inline T Round(const T x) {
+ return std::nearbyint(x);
+}
+#endif
+
template <typename T>
T quantize_val(double scale, int64_t zero_point, float value) {
// std::nearbyint results in nearest integer value according to the current
@@ -171,8 +188,7 @@
int64_t qvalue;
constexpr int64_t qmin = std::numeric_limits<typename T::underlying>::min();
constexpr int64_t qmax = std::numeric_limits<typename T::underlying>::max();
- checkZeroPoint<typename T::underlying>("quantize_val", zero_point);
- qvalue = static_cast<int64_t>(std::nearbyint(value / scale + zero_point));
+ qvalue = static_cast<int64_t>(Round(value / scale + zero_point));
qvalue = std::max<int64_t>(qvalue, qmin);
qvalue = std::min<int64_t>(qvalue, qmax);
return static_cast<T>(qvalue);
@@ -180,18 +196,106 @@
template <typename T, int precision>
void quantize_vec(double scale, int64_t zero_point, const float *src, T *dst, size_t count) {
+ checkZeroPoint<typename T::underlying>("quantize_val", zero_point);
for (int64_t i = 0; i < count; ++i) {
dst[i] = quantize_val<T>(scale, zero_point, src[i]);
}
}
+// TODO combine this with quantize_val once the numerics for ARM are aligned with it
+inline uint8_t quantize_val_arm(const float scale, const int32_t zero_point, const float value) {
+ const int32_t qmin = std::numeric_limits<uint8_t>::min();
+ const int32_t qmax = std::numeric_limits<uint8_t>::max();
+ auto r = zero_point + static_cast<int32_t>(Round(value / scale));
+ r = std::max(r, qmin);
+ r = std::min(r, qmax);
+ return static_cast<uint8_t>(r);
+}
+
+#ifdef __ARM_NEON__
+// Generic template defaults to naive quantize implementation
+template <typename T>
+void quantize_tensor_arm(
+ const float* in,
+ Tensor qtensor,
+ const int64_t N,
+ const float scale,
+ const int32_t zero_point) {
+ auto out = qtensor.data_ptr<T>();
+ for (int i = 0; i < N; ++i) {
+ out[i] = quantize_val<T>(scale, zero_point, in[i]);
+ }
+}
+
+// Specialized implementation from caffe2::Int8Quantize.
+// There may be a slight accuracy difference between this and the implementation of quantize_val.
+// TODO Update quantize_tensor_arm implementation to follow quantize_val,
+// i.e. f = Round(value/scale + zero_point)
+// TODO Make quantize_tensor_arm work for other datatypes too (int8, int32).
+template <>
+void quantize_tensor_arm<c10::quint8>(
+ const float* in,
+ Tensor qtensor,
+ const int64_t N,
+ const float scale,
+ const int32_t zero_point) {
+ const float inv_scale = 1.0f / scale;
+ uint32_t i = 0;
+ auto out = (uint8_t*)qtensor.data_ptr<c10::quint8>();
+ const float32x4_t vinv_scale = vdupq_n_f32(inv_scale);
+ // magic float and magic int to take care of rounding
+ // int magic_round(float f): interpret_int32(f + 12582912.0f) - 0x4B400000
+ // Some detail:
+ // 12582912.0f is 2**23 + 2**22. The trick is based on the fact that when you
+ // add a small number to a large number, the result rounds to the precision of
+ // the least significant bit of the large number. An IEEE-754
+ // single-precision mantissa has 23 bits, so adding 2**23 causes rounding
+ // to the nearest integer (ties to even). Then we cast to int and subtract the
+ // same number (0x4B400000 is the integer representation of 12582912.0f) to
+ // get only the mantissa. This works if -2**22 < x < 2**22, but preserves the
+ // sign for negative numbers.
+ const int32x4_t voffset = vdupq_n_s32(zero_point - 0x4B400000);
+ const float32x4_t vmagic_float = vdupq_n_f32(12582912.0f);
+ for (i = 0; i + 8 < N; i += 8) {
+ const float32x4_t vin0123 = vld1q_f32(in);
+ in += 4;
+ const float32x4_t vin4567 = vld1q_f32(in);
+ in += 4;
+ const int32x4_t vraw0123 = vaddq_s32(
+ voffset,
+ vreinterpretq_s32_f32(
+ vaddq_f32(vmagic_float, vmulq_f32(vin0123, vinv_scale))));
+ const int32x4_t vraw4567 = vaddq_s32(
+ voffset,
+ vreinterpretq_s32_f32(
+ vaddq_f32(vmagic_float, vmulq_f32(vin4567, vinv_scale))));
+ const int16x8_t vraw01234567 =
+ vcombine_s16(vqmovn_s32(vraw0123), vqmovn_s32(vraw4567));
+ const uint8x8_t vout01234567 = vqmovun_s16(vraw01234567);
+ vst1_u8(out, vout01234567);
+ out += 8;
+ }
+ for (; i < N; ++i) {
+ (*out++) = quantize_val_arm(scale, zero_point, (*in++));
+ }
+}
+#endif // __ARM_NEON__
+
template <typename T>
Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, double scale, int64_t zero_point) {
auto fn_name = "quantize_tensor";
checkFloatCPUTensor(fn_name, rtensor);
checkQuantizedCPUTensor<T>(fn_name, qtensor);
checkZeroPoint<typename T::underlying>(fn_name, zero_point);
- const float* rdata = rtensor.data_ptr<float>();
+ TORCH_CHECK(rtensor.is_contiguous(), "Float tensor should be contiguous");
+ const float* const rdata = rtensor.data_ptr<float>();
+ // If QEngine is set to QNNPACK, use caffe2 specialized Int8Quantize implementation on ARM
+#if defined(__ARM_NEON__)
+ if (at::globalContext().qEngine() == at::QEngine::QNNPACK) {
+ quantize_tensor_arm<T>(rdata, qtensor, rtensor.numel(), scale, zero_point);
+ return qtensor;
+ }
+#endif
auto qdata = qtensor.data_ptr<T>();
for (int i = 0; i < rtensor.numel(); ++i) {
qdata[i] = quantize_val<T>(scale, zero_point, rdata[i]);