arm64: filmgrain16: Guard against out of range pixels in the gather function
In 16 bpc, pixels are stored as 16-bit integers, but valid pixel
values are at most 12 bits wide, and the scaling buffer only contains
4096 elements.
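The src pixels are normally supposed to be valid, but when processing
blocks of 32 pixels at a time, the function can operate on
uninitialized pixels past the right edge. Mask the loaded pixels to
12 bits (0xfff) before calling the gather function, so that such
pixels are always in range.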
Before:                   Cortex A53     A72     A73  Apple M1
fgy_32x32xn_16bpc_neon:      10372.5  8194.4  8612.1      24.2
After:
fgy_32x32xn_16bpc_neon:      10837.9  8469.5  8885.1      24.6
diff --git a/src/arm/64/film_grain16.S b/src/arm/64/film_grain16.S
index be40388..a72164c 100644
--- a/src/arm/64/film_grain16.S
+++ b/src/arm/64/film_grain16.S
@@ -188,6 +188,7 @@
.macro fgy ox, oy
L(loop_\ox\oy):
1:
+ mov w16, #0xfff
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 // src
.if \ox
ld1 {v20.4h}, [x4], x9 // grain_lut old
@@ -198,8 +199,15 @@
.if \ox && \oy
ld1 {v14.4h}, [x8], x9 // grain_lut top old
.endif
+ dup v4.8h, w16
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x9 // grain_lut
+ // Mask the pixels to 12 bits so that any uninitialized pixels past the
+ // right edge are in range; their actual values shouldn't matter.
+ and v0.16b, v0.16b, v4.16b
+ and v1.16b, v1.16b, v4.16b
+ and v2.16b, v2.16b, v4.16b
+ and v3.16b, v3.16b, v4.16b
bl gather_neon
.if \ox
diff --git a/tests/checkasm/filmgrain.c b/tests/checkasm/filmgrain.c
index eadf2ad..3db2f61 100644
--- a/tests/checkasm/filmgrain.c
+++ b/tests/checkasm/filmgrain.c
@@ -183,10 +183,6 @@
generate_scaling(bitdepth_from_max(bitdepth_max), fg_data[0].y_points,
fg_data[0].num_y_points, scaling);
- for (int y = 0; y < 32; y++)
- for (int x = 0; x < 128; x++)
- src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
-
fg_data[0].clip_to_restricted_range = rnd() & 1;
fg_data[0].scaling_shift = (rnd() & 3) + 8;
for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
@@ -204,6 +200,14 @@
row_num = rnd() & 0x7ff;
}
+ for (int y = 0; y < 32; y++) {
+ // Src pixels past the right edge can be uninitialized
+ for (int x = 0; x < 128; x++)
+ src[y * PXSTRIDE(stride) + x] = rnd();
+ for (int x = 0; x < w; x++)
+ src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
+ }
+
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut, h,
@@ -275,12 +279,6 @@
dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0],
fg_data, uv_pl HIGHBD_TAIL_SUFFIX);
- for (int y = 0; y < 32; y++)
- for (int x = 0; x < 128; x++)
- src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
- for (int y = 0; y < 32; y++)
- for (int x = 0; x < 128; x++)
- luma_src[y * PXSTRIDE(lstride) + x] = rnd() & bitdepth_max;
if (csfl) {
fg_data[0].num_y_points = 2 + (rnd() % 13);
const int pad = 0xff / fg_data[0].num_y_points;
@@ -325,6 +323,18 @@
row_num = rnd() & 0x7ff;
}
+ for (int y = 0; y < 32; y++) {
+ // Src pixels past the right edge can be uninitialized
+ for (int x = 0; x < 128; x++) {
+ src[y * PXSTRIDE(stride) + x] = rnd();
+ luma_src[y * PXSTRIDE(lstride) + x] = rnd();
+ }
+ for (int x = 0; x < w; x++)
+ src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
+ for (int x = 0; x < (w << ss_x); x++)
+ luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max;
+ }
+
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);
call_ref(c_dst, src, stride, fg_data, w, scaling, grain_lut[1], h,