arm64: filmgrain16: Guard against out of range pixels in the gather function

In 16 bpc, the pixels are 16 bit integers, but valid pixels are only up to 12 bits, and the scaling buffer only contains 4096 elements. The src pixels are normally supposed to be valid pixels, but when processing blocks of 32 pixels at a time, the function can operate on uninitialized pixels past the right edge.

Before:                  Cortex A53      A72      A73   Apple M1
fgy_32x32xn_16bpc_neon:     10372.5   8194.4   8612.1       24.2
After:
fgy_32x32xn_16bpc_neon:     10837.9   8469.5   8885.1       24.6

commit: 3aac025204602810c5bf33cbad6ac1bf157487cc [log] [tgz]
author: Martin Storsjö <martin@martin.st> Thu May 13 09:33:21 2021 +0300
committer: Jean-Baptiste Kempf <jb@videolan.org> Thu May 13 08:49:02 2021 +0000
tree: 54bcb2f2c39f7e4d0377f51c5bdbc952ed89a214
parent: 1cf1b309bb2c40e25582f2ace6917d84e45f354b [diff]
diff --git a/src/arm/64/film_grain16.S b/src/arm/64/film_grain16.S
index be40388..a72164c 100644
--- a/src/arm/64/film_grain16.S
+++ b/src/arm/64/film_grain16.S

@@ -188,6 +188,7 @@
 .macro fgy ox, oy
 L(loop_\ox\oy):
 1:
+        mov             w16, #0xfff
         ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x1],  x2 // src
 .if \ox
         ld1             {v20.4h},                         [x4],  x9 // grain_lut old
@@ -198,8 +199,15 @@
 .if \ox && \oy
         ld1             {v14.4h},                         [x8],  x9 // grain_lut top old
 .endif
+        dup             v4.8h,   w16
         ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5],  x9 // grain_lut
 
+        // Make sure that uninitialized pixels out of range past the right
+        // edge are in range; their actual values shouldn't matter.
+        and             v0.16b,  v0.16b,  v4.16b
+        and             v1.16b,  v1.16b,  v4.16b
+        and             v2.16b,  v2.16b,  v4.16b
+        and             v3.16b,  v3.16b,  v4.16b
         bl              gather_neon
 
 .if \ox

diff --git a/tests/checkasm/filmgrain.c b/tests/checkasm/filmgrain.c
index eadf2ad..3db2f61 100644
--- a/tests/checkasm/filmgrain.c
+++ b/tests/checkasm/filmgrain.c

@@ -183,10 +183,6 @@
         generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points,
                          fg_data[0].num_y_points, scaling);
 
-        for (int y = 0; y < 32; y++)
-            for (int x = 0; x < 128; x++)
-                src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
-
         fg_data[0].clip_to_restricted_range = rnd() & 1;
         fg_data[0].scaling_shift = (rnd() & 3) + 8;
         for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
@@ -204,6 +200,14 @@
                     row_num = rnd() & 0x7ff;
                 }
 
+                for (int y = 0; y < 32; y++) {
+                    // Src pixels past the right edge can be uninitialized
+                    for (int x = 0; x < 128; x++)
+                        src[y * PXSTRIDE(stride) + x] = rnd();
+                    for (int x = 0; x < w; x++)
+                        src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
+                }
+
                 CLEAR_PIXEL_RECT(c_dst);
                 CLEAR_PIXEL_RECT(a_dst);
                 call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut, h,
@@ -275,12 +279,6 @@
                 dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0],
                                                    fg_data, uv_pl HIGHBD_TAIL_SUFFIX);
 
-                for (int y = 0; y < 32; y++)
-                    for (int x = 0; x < 128; x++)
-                        src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
-                for (int y = 0; y < 32; y++)
-                    for (int x = 0; x < 128; x++)
-                        luma_src[y * PXSTRIDE(lstride) + x] = rnd() & bitdepth_max;
                 if (csfl) {
                     fg_data[0].num_y_points = 2 + (rnd() % 13);
                     const int pad = 0xff / fg_data[0].num_y_points;
@@ -325,6 +323,18 @@
                             row_num = rnd() & 0x7ff;
                         }
 
+                        for (int y = 0; y < 32; y++) {
+                            // Src pixels past the right edge can be uninitialized
+                            for (int x = 0; x < 128; x++) {
+                                src[y * PXSTRIDE(stride) + x] = rnd();
+                                luma_src[y * PXSTRIDE(lstride) + x] = rnd();
+                            }
+                            for (int x = 0; x < w; x++)
+                                src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
+                            for (int x = 0; x < (w << ss_x); x++)
+                                luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max;
+                        }
+
                         CLEAR_PIXEL_RECT(c_dst);
                         CLEAR_PIXEL_RECT(a_dst);
                         call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut[1], h,
commit	3aac025204602810c5bf33cbad6ac1bf157487cc	[log] [tgz]
author	Martin Storsjö <martin@martin.st>	Thu May 13 09:33:21 2021 +0300
committer	Jean-Baptiste Kempf <jb@videolan.org>	Thu May 13 08:49:02 2021 +0000
tree	54bcb2f2c39f7e4d0377f51c5bdbc952ed89a214
parent	1cf1b309bb2c40e25582f2ace6917d84e45f354b [diff]