Add optimized quantize function for ARM (#26867)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/26867

Use caffe2::Int8Quantize for PyTorch mobile. Currently this is implemented only for uint8 tensors and runs using NEON intrinsics.
For all other cases it falls back to the naive PyTorch quantize_val implementation.

Previously, the naive quantize_val implementation was slow on mobile, accounting for more than 50% of the total execution time.
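
The fast path is taken when the QNNPACK engine is selected and a float tensor is quantized to quint8; other dtypes keep the scalar fallback. A rough usage sketch (illustrative only, not part of this change; the scale/zero_point values are arbitrary, and the NEON path only applies on ARM builds):

  import torch

  torch.backends.quantized.engine = 'qnnpack'
  x = torch.randn(1, 3, 224, 224)
  # quint8 + QNNPACK hits the vectorized path; other dtypes fall back to quantize_val
  xq = torch.quantize_per_tensor(x, scale=0.1, zero_point=128, dtype=torch.quint8)  # arbitrary scale/zero_point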

Results:
Before:
  aten::quantize_per_tensor: 42.893 ms
  Total model runtime: 70.5 ms

After:
  aten::quantize_per_tensor: 0.340 ms
  Total model runtime: 27.5 ms

Test Plan:
Verified that the existing Python tests pass: python test/test_quantized.py TestQNNPackOps
Also ran a quantized MobileNetV2 on mobile and compared the output.

Imported from OSS

Differential Revision: D17638732

fbshipit-source-id: 76445d1e415e6e502d05ba5b900e5e1d875fc1b0
diff --git a/aten/src/ATen/quantized/Quantizer.cpp b/aten/src/ATen/quantized/Quantizer.cpp
index d548061..69956aa 100644
--- a/aten/src/ATen/quantized/Quantizer.cpp
+++ b/aten/src/ATen/quantized/Quantizer.cpp
@@ -6,12 +6,14 @@
 #include <ATen/native/TensorFactories.h>
 #include <ATen/quantized/QTensorImpl.h>
 #include <ATen/core/Tensor.h>
-
 #include <typeinfo>
 
 #ifdef USE_FBGEMM
 #include <fbgemm/QuantUtils.h>
 #endif
+#ifdef __ARM_NEON__
+#include <arm_neon.h>
+#endif
 
 namespace at {
 
@@ -159,6 +161,21 @@
 }
 #else  // USE_FBGEMM
 
+#if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
+template <class T>
+inline float Round(const float x) {
+  return ::nearbyintf(x);
+}
+inline double Round(const double x) {
+  return ::nearbyint(x);
+}
+#else
+template <class T>
+inline T Round(const T x) {
+  return std::nearbyint(x);
+}
+#endif
+
 template <typename T>
 T quantize_val(double scale, int64_t zero_point, float value) {
   // std::nearbyint results in nearest integer value according to the current
@@ -171,8 +188,7 @@
   int64_t qvalue;
   constexpr int64_t qmin = std::numeric_limits<typename T::underlying>::min();
   constexpr int64_t qmax = std::numeric_limits<typename T::underlying>::max();
-  checkZeroPoint<typename T::underlying>("quantize_val", zero_point);
-  qvalue = static_cast<int64_t>(std::nearbyint(value / scale + zero_point));
+  qvalue = static_cast<int64_t>(Round(value / scale + zero_point));
   qvalue = std::max<int64_t>(qvalue, qmin);
   qvalue = std::min<int64_t>(qvalue, qmax);
   return static_cast<T>(qvalue);
@@ -180,18 +196,106 @@
 
 template <typename T, int precision>
 void quantize_vec(double scale, int64_t zero_point, const float *src, T *dst, size_t count) {
+  checkZeroPoint<typename T::underlying>("quantize_val", zero_point);
   for (int64_t i = 0; i < count; ++i) {
     dst[i] = quantize_val<T>(scale, zero_point, src[i]);
   }
 }
 
+// TODO combine this with quantize_val once the numerics for ARM are aligned with it
+inline uint8_t quantize_val_arm(const float scale, const int32_t zero_point, const float value) {
+  const int32_t qmin = std::numeric_limits<uint8_t>::min();
+  const int32_t qmax = std::numeric_limits<uint8_t>::max();
+  auto r = zero_point + static_cast<int32_t>(Round(value / scale));
+  r = std::max(r, qmin);
+  r = std::min(r, qmax);
+  return static_cast<uint8_t>(r);
+}
+
+#ifdef __ARM_NEON__
+// Generic template defaults to naive quantize implementation
+template <typename T>
+void quantize_tensor_arm(
+    const float* in,
+    Tensor qtensor,
+    const int64_t N,
+    const float scale,
+    const int32_t zero_point) {
+  auto out = qtensor.data_ptr<T>();
+  for (int i = 0; i < N; ++i) {
+    out[i] = quantize_val<T>(scale, zero_point, in[i]);
+  }
+}
+
+// Specialized implementation from caffe2::Int8Quantize.
+// There may be a slight accuracy difference between this and the implementation of quantize_val.
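+// (For example, with round-to-nearest-even, value/scale = 0.5 and zero_point = 1
+// give zero_point + Round(0.5) = 1 here, while quantize_val computes Round(1.5) = 2.)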
+// TODO Update quantize_tensor_arm implementation to follow quantize_val,
+// i.e. f = Round(value/scale + zero_point)
+// TODO Make quantize_tensor_arm work for other datatypes too (int8, int32).
+template <>
+void quantize_tensor_arm<c10::quint8>(
+    const float* in,
+    Tensor qtensor,
+    const int64_t N,
+    const float scale,
+    const int32_t zero_point) {
+  const float inv_scale = 1.0f / scale;
+  uint32_t i = 0;
+  auto out = (uint8_t*)qtensor.data_ptr<c10::quint8>();
+  const float32x4_t vinv_scale = vdupq_n_f32(inv_scale);
+  // magic float and magic int to take care of rounding
+  // int magic_round(float f): interpret_int32(f + 12582912.0f) - 0x4B400000
+  // Some detail:
+  // 12582912.0f is 2**23 + 2**22. The trick is based on the fact that when you
+  // add a small number to a large number, the result rounds to the precision of
+  // the least significant bit of the large number. An IEEE-754 single-precision
+  // float has a 23-bit mantissa, so adding 2**23 forces rounding to the nearest
+  // integer (ties to even). Then we cast to int and subtract the same number
+  // (0x4B400000 is the integer representation of 12582912.0f) to extract the
+  // rounded value from the mantissa bits. This works for -2**22 < x < 2**22 and
+  // preserves the sign of negative numbers.
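+  // For example, magic_round(3.4f): 3.4f + 12582912.0f rounds to the float
+  // 12582915.0f (bit pattern 0x4B400003); subtracting 0x4B400000 yields 3.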
+  const int32x4_t voffset = vdupq_n_s32(zero_point - 0x4B400000);
+  const float32x4_t vmagic_float = vdupq_n_f32(12582912.0f);
+  for (i = 0; i + 8 < N; i += 8) {
+    const float32x4_t vin0123 = vld1q_f32(in);
+    in += 4;
+    const float32x4_t vin4567 = vld1q_f32(in);
+    in += 4;
+    const int32x4_t vraw0123 = vaddq_s32(
+        voffset,
+        vreinterpretq_s32_f32(
+            vaddq_f32(vmagic_float, vmulq_f32(vin0123, vinv_scale))));
+    const int32x4_t vraw4567 = vaddq_s32(
+        voffset,
+        vreinterpretq_s32_f32(
+            vaddq_f32(vmagic_float, vmulq_f32(vin4567, vinv_scale))));
+    const int16x8_t vraw01234567 =
+        vcombine_s16(vqmovn_s32(vraw0123), vqmovn_s32(vraw4567));
+    const uint8x8_t vout01234567 = vqmovun_s16(vraw01234567);
+    vst1_u8(out, vout01234567);
+    out += 8;
+  }
+  for (; i < N; ++i) {
+    (*out++) = quantize_val_arm(scale, zero_point, (*in++));
+  }
+}
+#endif // __ARM_NEON__
+
 template <typename T>
 Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, double scale, int64_t zero_point) {
   auto fn_name = "quantize_tensor";
   checkFloatCPUTensor(fn_name, rtensor);
   checkQuantizedCPUTensor<T>(fn_name, qtensor);
   checkZeroPoint<typename T::underlying>(fn_name, zero_point);
-  const float* rdata = rtensor.data_ptr<float>();
+  TORCH_CHECK(rtensor.is_contiguous(), "Float tensor should be contiguous");
+  const float* const rdata = rtensor.data_ptr<float>();
+  // If QEngine is set to QNNPACK, use caffe2 specialized Int8Quantize implementation on ARM
+#if defined(__ARM_NEON__)
+  if (at::globalContext().qEngine() == at::QEngine::QNNPACK) {
+    quantize_tensor_arm<T>(rdata, qtensor, rtensor.numel(), scale, zero_point);
+    return qtensor;
+  }
+#endif
   auto qdata = qtensor.data_ptr<T>();
   for (int i = 0; i < rtensor.numel(); ++i) {
     qdata[i] = quantize_val<T>(scale, zero_point, rdata[i]);