arm: ipred: Make a SIMD pixel_set function for padding
For 8 bpc, there's probably not much difference to a decent memset,
but for 16 bpc, there might be a bigger difference.
diff --git a/src/arm/64/ipred.S b/src/arm/64/ipred.S
index 86b4106..dab6757 100644
--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -1597,6 +1597,17 @@
ret
endfunc
+// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px,
+// const int n);
+function ipred_pixel_set_8bpc_neon, export=1
+ dup v0.16b, w1 // replicate the low byte of w1 (px) into all 16 byte lanes of v0
+1: // the count is only tested after the store, so at least one 16-byte store always happens
+ subs w2, w2, #16 // w2 (n) -= 16, setting the flags consumed by b.gt below
+ st1 {v0.16b}, [x0], #16 // store 16 bytes at x0 (out), then advance x0 by 16
+ b.gt 1b // loop while the remaining count is > 0; stores are whole 16-byte chunks, so up to 15 bytes past out + n may be written
+ ret
+endfunc
+
// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const top,
// const int width, const int height,
diff --git a/src/arm/ipred.h b/src/arm/ipred.h
index fa22f3d..793ce51 100644
--- a/src/arm/ipred.h
+++ b/src/arm/ipred.h
@@ -57,6 +57,8 @@
void BF(dav1d_ipred_z1_filter_edge, neon)(pixel *out, const int sz,
const pixel *const in,
const int end, const int strength);
+void BF(dav1d_ipred_pixel_set, neon)(pixel *out, const pixel px,
+ const int n);
void BF(dav1d_ipred_z1_fill1, neon)(pixel *dst, ptrdiff_t stride,
const pixel *const top, const int width,
const int height, const int dx,
@@ -76,7 +78,7 @@
const int enable_intra_edge_filter = angle >> 10;
angle &= 511;
int dx = dav1d_dr_intra_derivative[angle >> 1];
- pixel top_out[64 + 64 + (64+15)*2];
+ pixel top_out[64 + 64 + (64+15)*2 + 16];
int max_base_x;
const int upsample_above = enable_intra_edge_filter ?
get_upsample(width + height, 90 - angle, is_sm) : 0;
@@ -102,7 +104,8 @@
}
const int base_inc = 1 + upsample_above;
int pad_pixels = width + 15; // max(dx >> 6) == 15
- pixel_set(&top_out[max_base_x + 1], top_out[max_base_x], pad_pixels * base_inc);
+ BF(dav1d_ipred_pixel_set, neon)(&top_out[max_base_x + 1],
+ top_out[max_base_x], pad_pixels * base_inc);
if (upsample_above)
BF(dav1d_ipred_z1_fill2, neon)(dst, stride, top_out, width, height,
dx, max_base_x);
@@ -172,7 +175,8 @@
// the other implementation can read height + max(dy >> 6) past the end.
int pad_pixels = imax(64 - max_base_y - 1, height + 15);
- pixel_set(&left_out[max_base_y + 1], left_out[max_base_y], pad_pixels * base_inc);
+ BF(dav1d_ipred_pixel_set, neon)(&left_out[max_base_y + 1],
+ left_out[max_base_y], pad_pixels * base_inc);
if (upsample_left)
BF(dav1d_ipred_z3_fill2, neon)(dst, stride, left_out, width, height,
dy, max_base_y);