Add 8-bit NEON impl of ComputeSuperRes.

The upscaling filters are converted to uint8_t type to support vmull_u8.
The range of tap-multiplied Pixels interferes with the sign bit for
int16.

Average 2.5x speedup

PiperOrigin-RevId: 288029963
PiperOrigin-RevId: 288929346
Change-Id: Iad054dbb10251212bef34de31b5a826caba3ac2b
diff --git a/src/dsp/arm/super_res_neon.cc b/src/dsp/arm/super_res_neon.cc
new file mode 100644
index 0000000..d69dfb9
--- /dev/null
+++ b/src/dsp/arm/super_res_neon.cc
@@ -0,0 +1,91 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/super_res.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+
+namespace low_bitdepth {
+namespace {
+
+void ComputeSuperRes_NEON(const void* source, const int upscaled_width,
+                          const int initial_subpixel_x, const int step,
+                          void* const dest) {
+  const auto* src = reinterpret_cast<const uint8_t*>(source);
+  auto* dst = reinterpret_cast<uint8_t*>(dest);
+  src -= kSuperResFilterTaps >> 1;
+
+  int p = initial_subpixel_x;
+  uint16x8_t weighted_src[8];
+  for (int x = 0; x < upscaled_width; x += 8) {
+    for (int i = 0; i < kSuperResFilterTaps; ++i, p += step) {
+      const uint8x8_t src_x = vld1_u8(&src[p >> kSuperResScaleBits]);
+      const int remainder = p & kSuperResScaleMask;
+      const uint8x8_t filter =
+          vld1_u8(kUpscaleFilterUnsigned[remainder >> kSuperResExtraBits]);
+      weighted_src[i] = vmull_u8(src_x, filter);
+    }
+    Transpose8x8(weighted_src);
+
+    // Maximum sum of positive taps: 171 = 7 + 86 + 71 + 7
+    // Maximum sum: 255*171 == 0xAA55
+    // The sum is clipped to [0, 255], so adding all positive and then
+    // subtracting all negative with saturation is sufficient.
+    //           0 1 2 3 4 5 6 7
+    // tap sign: - + - + + - + -
+    uint16x8_t res = weighted_src[1];
+    res = vaddq_u16(res, weighted_src[3]);
+    res = vaddq_u16(res, weighted_src[4]);
+    res = vaddq_u16(res, weighted_src[6]);
+    res = vqsubq_u16(res, weighted_src[0]);
+    res = vqsubq_u16(res, weighted_src[2]);
+    res = vqsubq_u16(res, weighted_src[5]);
+    res = vqsubq_u16(res, weighted_src[7]);
+    vst1_u8(&dst[x], vqrshrn_n_u16(res, kFilterBits));
+  }
+}
+
+void Init8bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(8);
+  dsp->super_res_row = ComputeSuperRes_NEON;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void SuperResInit_NEON() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_NEON
+
+namespace libgav1 {
+namespace dsp {
+
+void SuperResInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/arm/super_res_neon.h b/src/dsp/arm/super_res_neon.h
new file mode 100644
index 0000000..f51785d
--- /dev/null
+++ b/src/dsp/arm/super_res_neon.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::super_res. This function is not thread-safe.
+void SuperResInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SuperResClip LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
diff --git a/src/dsp/dsp.cc b/src/dsp/dsp.cc
index 8f97238..e655f3f 100644
--- a/src/dsp/dsp.cc
+++ b/src/dsp/dsp.cc
@@ -108,6 +108,7 @@
     LoopRestorationInit_NEON();
     MaskBlendInit_NEON();
     ObmcInit_NEON();
+    SuperResInit_NEON();
     WarpInit_NEON();
     WeightMaskInit_NEON();
 #endif  // LIBGAV1_ENABLE_NEON
diff --git a/src/dsp/libgav1_dsp.cmake b/src/dsp/libgav1_dsp.cmake
index bba425f..2489e35 100644
--- a/src/dsp/libgav1_dsp.cmake
+++ b/src/dsp/libgav1_dsp.cmake
@@ -88,6 +88,8 @@
             "${libgav1_source}/dsp/arm/mask_blend_neon.h"
             "${libgav1_source}/dsp/arm/obmc_neon.cc"
             "${libgav1_source}/dsp/arm/obmc_neon.h"
+            "${libgav1_source}/dsp/arm/super_res_neon.cc"
+            "${libgav1_source}/dsp/arm/super_res_neon.h"
             "${libgav1_source}/dsp/arm/warp_neon.cc"
             "${libgav1_source}/dsp/arm/warp_neon.h"
             "${libgav1_source}/dsp/arm/weight_mask_neon.cc"
diff --git a/src/dsp/super_res.cc b/src/dsp/super_res.cc
index a2a1a66..bdbe320 100644
--- a/src/dsp/super_res.cc
+++ b/src/dsp/super_res.cc
@@ -37,9 +37,15 @@
     const Pixel* const src_x = &src[subpixel_x >> kSuperResScaleBits];
     const int src_x_subpixel =
         (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
-    for (int i = 0; i < kSuperResFilterTaps; ++i) {
-      sum += src_x[i] * kUpscaleFilter[src_x_subpixel][i];
-    }
+    // The sign of each tap is: - + - + + - + -
+    sum -= src_x[0] * kUpscaleFilterUnsigned[src_x_subpixel][0];
+    sum += src_x[1] * kUpscaleFilterUnsigned[src_x_subpixel][1];
+    sum -= src_x[2] * kUpscaleFilterUnsigned[src_x_subpixel][2];
+    sum += src_x[3] * kUpscaleFilterUnsigned[src_x_subpixel][3];
+    sum += src_x[4] * kUpscaleFilterUnsigned[src_x_subpixel][4];
+    sum -= src_x[5] * kUpscaleFilterUnsigned[src_x_subpixel][5];
+    sum += src_x[6] * kUpscaleFilterUnsigned[src_x_subpixel][6];
+    sum -= src_x[7] * kUpscaleFilterUnsigned[src_x_subpixel][7];
     dst[x] =
         Clip3(RightShiftWithRounding(sum, kFilterBits), 0, (1 << bitdepth) - 1);
     subpixel_x += step;
diff --git a/src/dsp/super_res.h b/src/dsp/super_res.h
index eb97ea0..33f6142 100644
--- a/src/dsp/super_res.h
+++ b/src/dsp/super_res.h
@@ -17,6 +17,16 @@
 #ifndef LIBGAV1_SRC_DSP_SUPER_RES_H_
 #define LIBGAV1_SRC_DSP_SUPER_RES_H_
 
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/super_res_neon.h"
+
+// IWYU pragma: end_exports
+
 namespace libgav1 {
 namespace dsp {
 
diff --git a/src/utils/constants.cc b/src/utils/constants.cc
index 736a0a8..e6e6b20 100644
--- a/src/utils/constants.cc
+++ b/src/utils/constants.cc
@@ -189,40 +189,43 @@
 
 const int8_t kWienerTapsMax[3] = {10, 8, 46};
 
-alignas(16) const int16_t
-    kUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
-        {0, 0, 0, 128, 0, 0, 0, 0},        {0, 0, -1, 128, 2, -1, 0, 0},
-        {0, 1, -3, 127, 4, -2, 1, 0},      {0, 1, -4, 127, 6, -3, 1, 0},
-        {0, 2, -6, 126, 8, -3, 1, 0},      {0, 2, -7, 125, 11, -4, 1, 0},
-        {-1, 2, -8, 125, 13, -5, 2, 0},    {-1, 3, -9, 124, 15, -6, 2, 0},
-        {-1, 3, -10, 123, 18, -6, 2, -1},  {-1, 3, -11, 122, 20, -7, 3, -1},
-        {-1, 4, -12, 121, 22, -8, 3, -1},  {-1, 4, -13, 120, 25, -9, 3, -1},
-        {-1, 4, -14, 118, 28, -9, 3, -1},  {-1, 4, -15, 117, 30, -10, 4, -1},
-        {-1, 5, -16, 116, 32, -11, 4, -1}, {-1, 5, -16, 114, 35, -12, 4, -1},
-        {-1, 5, -17, 112, 38, -12, 4, -1}, {-1, 5, -18, 111, 40, -13, 5, -1},
-        {-1, 5, -18, 109, 43, -14, 5, -1}, {-1, 6, -19, 107, 45, -14, 5, -1},
-        {-1, 6, -19, 105, 48, -15, 5, -1}, {-1, 6, -19, 103, 51, -16, 5, -1},
-        {-1, 6, -20, 101, 53, -16, 6, -1}, {-1, 6, -20, 99, 56, -17, 6, -1},
-        {-1, 6, -20, 97, 58, -17, 6, -1},  {-1, 6, -20, 95, 61, -18, 6, -1},
-        {-2, 7, -20, 93, 64, -18, 6, -2},  {-2, 7, -20, 91, 66, -19, 6, -1},
-        {-2, 7, -20, 88, 69, -19, 6, -1},  {-2, 7, -20, 86, 71, -19, 6, -1},
-        {-2, 7, -20, 84, 74, -20, 7, -2},  {-2, 7, -20, 81, 76, -20, 7, -1},
-        {-2, 7, -20, 79, 79, -20, 7, -2},  {-1, 7, -20, 76, 81, -20, 7, -2},
-        {-2, 7, -20, 74, 84, -20, 7, -2},  {-1, 6, -19, 71, 86, -20, 7, -2},
-        {-1, 6, -19, 69, 88, -20, 7, -2},  {-1, 6, -19, 66, 91, -20, 7, -2},
-        {-2, 6, -18, 64, 93, -20, 7, -2},  {-1, 6, -18, 61, 95, -20, 6, -1},
-        {-1, 6, -17, 58, 97, -20, 6, -1},  {-1, 6, -17, 56, 99, -20, 6, -1},
-        {-1, 6, -16, 53, 101, -20, 6, -1}, {-1, 5, -16, 51, 103, -19, 6, -1},
-        {-1, 5, -15, 48, 105, -19, 6, -1}, {-1, 5, -14, 45, 107, -19, 6, -1},
-        {-1, 5, -14, 43, 109, -18, 5, -1}, {-1, 5, -13, 40, 111, -18, 5, -1},
-        {-1, 4, -12, 38, 112, -17, 5, -1}, {-1, 4, -12, 35, 114, -16, 5, -1},
-        {-1, 4, -11, 32, 116, -16, 5, -1}, {-1, 4, -10, 30, 117, -15, 4, -1},
-        {-1, 3, -9, 28, 118, -14, 4, -1},  {-1, 3, -9, 25, 120, -13, 4, -1},
-        {-1, 3, -8, 22, 121, -12, 4, -1},  {-1, 3, -7, 20, 122, -11, 3, -1},
-        {-1, 2, -6, 18, 123, -10, 3, -1},  {0, 2, -6, 15, 124, -9, 3, -1},
-        {0, 2, -5, 13, 125, -8, 2, -1},    {0, 1, -4, 11, 125, -7, 2, 0},
-        {0, 1, -3, 8, 126, -6, 2, 0},      {0, 1, -3, 6, 127, -4, 1, 0},
-        {0, 1, -2, 4, 127, -3, 1, 0},      {0, 0, -1, 2, 128, -1, 0, 0},
+// This was modified from Upscale_Filter as defined in AV1 Section 7.16, in
+// order to support 16-bit packed SIMD operations.
+// The sign of each tap is: - + - + + - + -
+alignas(16) const uint8_t
+    kUpscaleFilterUnsigned[kSuperResFilterShifts][kSuperResFilterTaps] = {
+        {0, 0, 0, 128, 0, 0, 0, 0},    {0, 0, 1, 128, 2, 1, 0, 0},
+        {0, 1, 3, 127, 4, 2, 1, 0},    {0, 1, 4, 127, 6, 3, 1, 0},
+        {0, 2, 6, 126, 8, 3, 1, 0},    {0, 2, 7, 125, 11, 4, 1, 0},
+        {1, 2, 8, 125, 13, 5, 2, 0},   {1, 3, 9, 124, 15, 6, 2, 0},
+        {1, 3, 10, 123, 18, 6, 2, 1},  {1, 3, 11, 122, 20, 7, 3, 1},
+        {1, 4, 12, 121, 22, 8, 3, 1},  {1, 4, 13, 120, 25, 9, 3, 1},
+        {1, 4, 14, 118, 28, 9, 3, 1},  {1, 4, 15, 117, 30, 10, 4, 1},
+        {1, 5, 16, 116, 32, 11, 4, 1}, {1, 5, 16, 114, 35, 12, 4, 1},
+        {1, 5, 17, 112, 38, 12, 4, 1}, {1, 5, 18, 111, 40, 13, 5, 1},
+        {1, 5, 18, 109, 43, 14, 5, 1}, {1, 6, 19, 107, 45, 14, 5, 1},
+        {1, 6, 19, 105, 48, 15, 5, 1}, {1, 6, 19, 103, 51, 16, 5, 1},
+        {1, 6, 20, 101, 53, 16, 6, 1}, {1, 6, 20, 99, 56, 17, 6, 1},
+        {1, 6, 20, 97, 58, 17, 6, 1},  {1, 6, 20, 95, 61, 18, 6, 1},
+        {2, 7, 20, 93, 64, 18, 6, 2},  {2, 7, 20, 91, 66, 19, 6, 1},
+        {2, 7, 20, 88, 69, 19, 6, 1},  {2, 7, 20, 86, 71, 19, 6, 1},
+        {2, 7, 20, 84, 74, 20, 7, 2},  {2, 7, 20, 81, 76, 20, 7, 1},
+        {2, 7, 20, 79, 79, 20, 7, 2},  {1, 7, 20, 76, 81, 20, 7, 2},
+        {2, 7, 20, 74, 84, 20, 7, 2},  {1, 6, 19, 71, 86, 20, 7, 2},
+        {1, 6, 19, 69, 88, 20, 7, 2},  {1, 6, 19, 66, 91, 20, 7, 2},
+        {2, 6, 18, 64, 93, 20, 7, 2},  {1, 6, 18, 61, 95, 20, 6, 1},
+        {1, 6, 17, 58, 97, 20, 6, 1},  {1, 6, 17, 56, 99, 20, 6, 1},
+        {1, 6, 16, 53, 101, 20, 6, 1}, {1, 5, 16, 51, 103, 19, 6, 1},
+        {1, 5, 15, 48, 105, 19, 6, 1}, {1, 5, 14, 45, 107, 19, 6, 1},
+        {1, 5, 14, 43, 109, 18, 5, 1}, {1, 5, 13, 40, 111, 18, 5, 1},
+        {1, 4, 12, 38, 112, 17, 5, 1}, {1, 4, 12, 35, 114, 16, 5, 1},
+        {1, 4, 11, 32, 116, 16, 5, 1}, {1, 4, 10, 30, 117, 15, 4, 1},
+        {1, 3, 9, 28, 118, 14, 4, 1},  {1, 3, 9, 25, 120, 13, 4, 1},
+        {1, 3, 8, 22, 121, 12, 4, 1},  {1, 3, 7, 20, 122, 11, 3, 1},
+        {1, 2, 6, 18, 123, 10, 3, 1},  {0, 2, 6, 15, 124, 9, 3, 1},
+        {0, 2, 5, 13, 125, 8, 2, 1},   {0, 1, 4, 11, 125, 7, 2, 0},
+        {0, 1, 3, 8, 126, 6, 2, 0},    {0, 1, 3, 6, 127, 4, 1, 0},
+        {0, 1, 2, 4, 127, 3, 1, 0},    {0, 0, 1, 2, 128, 1, 0, 0},
 };
 
 alignas(8) const int8_t
diff --git a/src/utils/constants.h b/src/utils/constants.h
index 31db3f2..88b164b 100644
--- a/src/utils/constants.h
+++ b/src/utils/constants.h
@@ -700,7 +700,8 @@
 
 extern const int8_t kWienerTapsMax[3];
 
-extern const int16_t kUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps];
+extern const uint8_t kUpscaleFilterUnsigned[kSuperResFilterShifts]
+                                           [kSuperResFilterTaps];
 
 // An int8_t version of the kWarpedFilters array.
 // Note: The array could be removed with a performance penalty.