arm: ipred: Make a SIMD pixel_set function for padding

For 8 bpc, there's probably not much difference to a decent memset,
but for 16 bpc, there might be a bigger difference.
diff --git a/src/arm/64/ipred.S b/src/arm/64/ipred.S
index 86b4106..dab6757 100644
--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -1597,6 +1597,17 @@
         ret
 endfunc
 
+// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px,
+//                                const int n);
+function ipred_pixel_set_8bpc_neon, export=1
+        dup             v0.16b,  w1             // broadcast px into all 16 byte lanes
+1:
+        subs            w2,  w2,  #16           // n -= 16; sets the flags tested by b.gt below
+        st1             {v0.16b}, [x0], #16     // always a full 16-byte store (may write up to 15 bytes past n), out += 16
+        b.gt            1b                      // repeat while n > 0; the body has already run once, even for n <= 0
+        ret
+endfunc
+
 // void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
 //                               const pixel *const top,
 //                               const int width, const int height,
diff --git a/src/arm/ipred.h b/src/arm/ipred.h
index fa22f3d..793ce51 100644
--- a/src/arm/ipred.h
+++ b/src/arm/ipred.h
@@ -57,6 +57,8 @@
 void BF(dav1d_ipred_z1_filter_edge, neon)(pixel *out, const int sz,
                                           const pixel *const in,
                                           const int end, const int strength);
+void BF(dav1d_ipred_pixel_set, neon)(pixel *out, const pixel px,
+                                     const int n);
 void BF(dav1d_ipred_z1_fill1, neon)(pixel *dst, ptrdiff_t stride,
                                     const pixel *const top, const int width,
                                     const int height, const int dx,
@@ -76,7 +78,7 @@
     const int enable_intra_edge_filter = angle >> 10;
     angle &= 511;
     int dx = dav1d_dr_intra_derivative[angle >> 1];
-    pixel top_out[64 + 64 + (64+15)*2];
+    pixel top_out[64 + 64 + (64+15)*2 + 16];
     int max_base_x;
     const int upsample_above = enable_intra_edge_filter ?
         get_upsample(width + height, 90 - angle, is_sm) : 0;
@@ -102,7 +104,8 @@
     }
     const int base_inc = 1 + upsample_above;
     int pad_pixels = width + 15; // max(dx >> 6) == 15
-    pixel_set(&top_out[max_base_x + 1], top_out[max_base_x], pad_pixels * base_inc);
+    BF(dav1d_ipred_pixel_set, neon)(&top_out[max_base_x + 1],
+                                    top_out[max_base_x], pad_pixels * base_inc);
     if (upsample_above)
         BF(dav1d_ipred_z1_fill2, neon)(dst, stride, top_out, width, height,
                                        dx, max_base_x);
@@ -172,7 +175,8 @@
     // the other implementation can read height + max(dy >> 6) past the end.
     int pad_pixels = imax(64 - max_base_y - 1, height + 15);
 
-    pixel_set(&left_out[max_base_y + 1], left_out[max_base_y], pad_pixels * base_inc);
+    BF(dav1d_ipred_pixel_set, neon)(&left_out[max_base_y + 1],
+                                    left_out[max_base_y], pad_pixels * base_inc);
     if (upsample_left)
         BF(dav1d_ipred_z3_fill2, neon)(dst, stride, left_out, width, height,
                                        dy, max_base_y);