/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
// void dav1d_wiener_filter_h_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges);
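//
// Applies the 7-tap horizontal Wiener filter to two rows at a time and
// writes an int16 intermediate ("mid") buffer for the vertical pass.
// Roughly, ignoring the exact saturation behaviour, each output is
// (scalar sketch, 8bpc assumed):
//
//   mid[x] = (sum(fh[k] * src[x - 3 + k], k = 0..6) + 128 * src[x] + 4) >> 3
//
// with the left/right edge pixels replicated or taken from *left according
// to edges.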
function wiener_filter_h_neon, export=1
mov w8, w5
ld1 {v0.8h}, [x4]
mov w9, #(1 << 14) - (1 << 2)
dup v30.8h, w9
movi v31.8h, #8, lsl #8
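// v30 = (1 << 14) - (1 << 2) and v31 = 1 << 11 implement the biasing in the
// filter macro below: subtracting v30 (together with the centre pixel << 7)
// keeps the 16-bit intermediate from overflowing in the saturating add, the
// 1 << 2 part provides rounding for the >> 3, and adding v31 afterwards
// restores the (1 << 14) >> 3 offset.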
// Calculate mid_stride
add w10, w5, #7
bic w10, w10, #7
lsl w10, w10, #1
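// mid_stride = ((w + 7) & ~7) * 2 bytes: the width rounded up to a multiple
// of 8 pixels, times sizeof(int16_t).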
// Clear the last unused element of v0, to allow filtering a single
// pixel with one plain mul+addv.
ins v0.h[7], wzr
// Set up pointers for reading/writing alternate rows
add x12, x0, x10
lsl w10, w10, #1
add x13, x2, x3
lsl x3, x3, #1
// Subtract the width from mid_stride
sub x10, x10, w5, uxtw #1
// For w >= 8, we read ((w+5)&~7)+8 pixels, for w < 8 we read 16 pixels.
cmp w5, #8
add w11, w5, #13
bic w11, w11, #7
b.ge 1f
mov w11, #16
1:
sub x3, x3, w11, uxtw
// Set up the src pointers to include the left edge; for LR_HAVE_LEFT with
// left == NULL, the left edge pixels are read from the src pointer itself.
tst w7, #1 // LR_HAVE_LEFT
b.eq 2f
// LR_HAVE_LEFT
cbnz x1, 0f
// left == NULL
sub x2, x2, #3
sub x13, x13, #3
b 1f
0: // LR_HAVE_LEFT, left != NULL
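// The stride increase below is needed in this case too, since these rows
// move the src pointers back 3 bytes after each load in the loop below.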
2: // !LR_HAVE_LEFT, increase the stride.
// For this case we don't read the left 3 pixels from the src pointer,
// but shift it as if we had done that.
add x3, x3, #3
1: // Loop vertically
ld1 {v3.16b}, [x2], #16
ld1 {v5.16b}, [x13], #16
tst w7, #1 // LR_HAVE_LEFT
b.eq 0f
cbz x1, 2f
// LR_HAVE_LEFT, left != NULL
ld1 {v2.s}[3], [x1], #4
// Move x2/x13 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub x2, x2, #3
sub x13, x13, #3
ld1 {v4.s}[3], [x1], #4
ext v3.16b, v2.16b, v3.16b, #13
ext v5.16b, v4.16b, v5.16b, #13
b 2f
0:
// !LR_HAVE_LEFT, fill v2 with the leftmost byte
// and shift v3 to have 3x the first byte at the front.
dup v2.16b, v3.b[0]
dup v4.16b, v5.b[0]
// Move x2 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub x2, x2, #3
sub x13, x13, #3
ext v3.16b, v2.16b, v3.16b, #13
ext v5.16b, v4.16b, v5.16b, #13
2:
uxtl v2.8h, v3.8b
uxtl2 v3.8h, v3.16b
uxtl v4.8h, v5.8b
uxtl2 v5.8h, v5.16b
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
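// In all three left-edge setups x2/x13 end up 13 bytes past the start of
// the row at this point, so the last valid pixel, src[w-1], is at
// x2 + (w - 14).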
sub w9, w5, #14
ldr b28, [x2, w9, sxtw]
ldr b29, [x13, w9, sxtw]
// Fill v28/v29 with the right padding pixel
dup v28.8b, v28.b[0]
dup v29.8b, v29.b[0]
uxtl v28.8h, v28.8b
uxtl v29.8h, v29.8b
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
cmp w5, #11
b.ge 4f // If w >= 11, all used input pixels are valid
cmp w5, #7
b.ge 5f // If w >= 7, we can filter 4 pixels
b 6f
4: // Loop horizontally
.macro filter wd
// Interleaving the mul/mla chains actually hurts performance significantly
// on Cortex A53, so the mul/mla chains are kept tightly grouped like this.
ext v16.16b, v2.16b, v3.16b, #2
ext v17.16b, v2.16b, v3.16b, #4
ext v18.16b, v2.16b, v3.16b, #6
ext v19.16b, v2.16b, v3.16b, #8
ext v20.16b, v2.16b, v3.16b, #10
ext v21.16b, v2.16b, v3.16b, #12
mul v6\wd, v2\wd, v0.h[0]
mla v6\wd, v16\wd, v0.h[1]
mla v6\wd, v17\wd, v0.h[2]
mla v6\wd, v18\wd, v0.h[3]
mla v6\wd, v19\wd, v0.h[4]
mla v6\wd, v20\wd, v0.h[5]
mla v6\wd, v21\wd, v0.h[6]
ext v22.16b, v4.16b, v5.16b, #2
ext v23.16b, v4.16b, v5.16b, #4
ext v24.16b, v4.16b, v5.16b, #6
ext v25.16b, v4.16b, v5.16b, #8
ext v26.16b, v4.16b, v5.16b, #10
ext v27.16b, v4.16b, v5.16b, #12
mul v7\wd, v4\wd, v0.h[0]
mla v7\wd, v22\wd, v0.h[1]
mla v7\wd, v23\wd, v0.h[2]
mla v7\wd, v24\wd, v0.h[3]
mla v7\wd, v25\wd, v0.h[4]
mla v7\wd, v26\wd, v0.h[5]
mla v7\wd, v27\wd, v0.h[6]
shl v18\wd, v18\wd, #7
shl v24\wd, v24\wd, #7
sub v18\wd, v18\wd, v30\wd
sub v24\wd, v24\wd, v30\wd
sqadd v6\wd, v6\wd, v18\wd
sqadd v7\wd, v7\wd, v24\wd
sshr v6\wd, v6\wd, #3
sshr v7\wd, v7\wd, #3
add v6\wd, v6\wd, v31\wd
add v7\wd, v7\wd, v31\wd
.endm
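// The filter macro leaves in v6/v7 (one register per row) the int16
// horizontal filter output: the 7-tap sum plus 128 * the centre pixel,
// rounded and shifted right by 3, computed via the v30/v31 bias so the
// intermediate can saturate safely.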
filter .8h
st1 {v6.8h}, [x0], #16
st1 {v7.8h}, [x12], #16
subs w5, w5, #8
b.le 9f
tst w7, #2 // LR_HAVE_RIGHT
mov v2.16b, v3.16b
mov v4.16b, v5.16b
ld1 {v3.8b}, [x2], #8
ld1 {v5.8b}, [x13], #8
uxtl v3.8h, v3.8b
uxtl v5.8h, v5.8b
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
5: // Filter 4 pixels, 7 <= w < 11
filter .4h
st1 {v6.4h}, [x0], #8
st1 {v7.4h}, [x12], #8
subs w5, w5, #4 // 3 <= w < 7
ext v2.16b, v2.16b, v3.16b, #8
ext v3.16b, v3.16b, v3.16b, #8
ext v4.16b, v4.16b, v5.16b, #8
ext v5.16b, v5.16b, v5.16b, #8
6: // Pad the right edge and filter the last few pixels.
// w < 7, w+3 pixels valid in v2-v3
cmp w5, #5
b.lt 7f
b.gt 8f
// w == 5, 8 pixels valid in v2, v3 invalid
mov v3.16b, v28.16b
mov v5.16b, v29.16b
b 88f
7: // 1 <= w < 5, 4-7 pixels valid in v2
sub w9, w5, #1
// w9 = (pixels valid - 4)
adr x11, L(variable_shift_tbl)
ldrh w9, [x11, w9, uxtw #1]
sub x11, x11, w9, uxth
mov v3.16b, v28.16b
mov v5.16b, v29.16b
br x11
// Shift v2 right, shifting out invalid pixels,
// shift v2 left to the original offset, shifting in padding pixels.
44: // 4 pixels valid
ext v2.16b, v2.16b, v2.16b, #8
ext v2.16b, v2.16b, v3.16b, #8
ext v4.16b, v4.16b, v4.16b, #8
ext v4.16b, v4.16b, v5.16b, #8
b 88f
55: // 5 pixels valid
ext v2.16b, v2.16b, v2.16b, #10
ext v2.16b, v2.16b, v3.16b, #6
ext v4.16b, v4.16b, v4.16b, #10
ext v4.16b, v4.16b, v5.16b, #6
b 88f
66: // 6 pixels valid
ext v2.16b, v2.16b, v2.16b, #12
ext v2.16b, v2.16b, v3.16b, #4
ext v4.16b, v4.16b, v4.16b, #12
ext v4.16b, v4.16b, v5.16b, #4
b 88f
77: // 7 pixels valid
ext v2.16b, v2.16b, v2.16b, #14
ext v2.16b, v2.16b, v3.16b, #2
ext v4.16b, v4.16b, v4.16b, #14
ext v4.16b, v4.16b, v5.16b, #2
b 88f
L(variable_shift_tbl):
.hword L(variable_shift_tbl) - 44b
.hword L(variable_shift_tbl) - 55b
.hword L(variable_shift_tbl) - 66b
.hword L(variable_shift_tbl) - 77b
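// Each entry is the distance from L(variable_shift_tbl) back to its target
// label; the dispatch above loads it as a u16 and subtracts it from the
// table address.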
8: // w > 5, i.e. w == 6: 9 pixels valid in v2-v3, 1 pixel valid in v3
ins v28.h[0], v3.h[0]
ins v29.h[0], v5.h[0]
mov v3.16b, v28.16b
mov v5.16b, v29.16b
88:
// w < 7, v2-v3 padded properly
cmp w5, #4
b.lt 888f
// w >= 4, filter 4 pixels
filter .4h
st1 {v6.4h}, [x0], #8
st1 {v7.4h}, [x12], #8
subs w5, w5, #4 // 0 <= w < 4
ext v2.16b, v2.16b, v3.16b, #8
ext v4.16b, v4.16b, v5.16b, #8
b.eq 9f
888: // 1 <= w < 4, filter 1 pixel at a time
mul v6.8h, v2.8h, v0.8h
mul v7.8h, v4.8h, v0.8h
addv h6, v6.8h
addv h7, v7.8h
dup v16.4h, v2.h[3]
dup v17.4h, v4.h[3]
shl v16.4h, v16.4h, #7
shl v17.4h, v17.4h, #7
sub v16.4h, v16.4h, v30.4h
sub v17.4h, v17.4h, v30.4h
sqadd v6.4h, v6.4h, v16.4h
sqadd v7.4h, v7.4h, v17.4h
sshr v6.4h, v6.4h, #3
sshr v7.4h, v7.4h, #3
add v6.4h, v6.4h, v31.4h
add v7.4h, v7.4h, v31.4h
st1 {v6.h}[0], [x0], #2
st1 {v7.h}[0], [x12], #2
subs w5, w5, #1
ext v2.16b, v2.16b, v3.16b, #2
ext v4.16b, v4.16b, v5.16b, #2
b.gt 888b
9:
subs w6, w6, #2
b.le 0f
// Advance to the next pair of rows and run the horizontal loop again
add x0, x0, x10
add x12, x12, x10
add x2, x2, x3
add x13, x13, x3
mov w5, w8
b 1b
0:
ret
.purgem filter
endfunc
// void dav1d_wiener_filter_v_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride);
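//
// Applies the 7-tap vertical Wiener filter to the int16 "mid" buffer,
// 8 columns at a time, and writes 8-bit pixels. Roughly, per output
// (scalar sketch, 8bpc assumed, saturation details omitted):
//
//   dst[y] = clip_u8((sum(fv[k] * mid[y - 3 + k], k = 0..6)
//                     + 128 * mid[y] + (1 << 10)) >> 11)
//
// where rows above/below the unit are either real edge rows (LR_HAVE_TOP/
// LR_HAVE_BOTTOM) or replicated from the nearest row.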
function wiener_filter_v_neon, export=1
mov w8, w4
ld1 {v0.8h}, [x5]
movi v1.8h, #128
add v1.8h, v1.8h, v0.8h
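// v1.h[3] = fv[3] + 128 gives the centre row an extra weight of 128,
// mirroring the 128 * centre-pixel term in the horizontal pass; with taps
// that sum to 128 per direction, the combined 3 + 11 bit shifts divide the
// 128 * 128 gain back out.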
// Calculate the number of rows x2 advances per vertical slice, so it can
// be moved back up when looping horizontally
mov w11, w4
tst w6, #4 // LR_HAVE_TOP
b.eq 0f
sub x2, x2, x7, lsl #1
add w11, w11, #2
0:
tst w6, #8 // LR_HAVE_BOTTOM
b.eq 1f
add w11, w11, #2
1: // Start of horizontal loop; start one vertical filter slice.
// Load rows into v16-v19 and pad properly.
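// v16-v22 hold a sliding window of 7 input rows, with the centre row in
// v19; rows outside the unit are filled from edge rows or by replication,
// depending on edges.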
tst w6, #4 // LR_HAVE_TOP
ld1 {v16.8h}, [x2], x7
b.eq 2f
// LR_HAVE_TOP
ld1 {v18.8h}, [x2], x7
mov v17.16b, v16.16b
ld1 {v19.8h}, [x2], x7
b 3f
2: // !LR_HAVE_TOP
mov v17.16b, v16.16b
mov v18.16b, v16.16b
mov v19.16b, v16.16b
3:
cmp w4, #4
b.lt 5f
// Start filtering normally; fill in v20-v22 with unique rows.
ld1 {v20.8h}, [x2], x7
ld1 {v21.8h}, [x2], x7
ld1 {v22.8h}, [x2], x7
4:
.macro filter compare
subs w4, w4, #1
// Interleaving the mul/mla chains actually hurts performance significantly
// on Cortex A53, so the mul/mla chains are kept tightly grouped like this.
smull v2.4s, v16.4h, v0.h[0]
smlal v2.4s, v17.4h, v0.h[1]
smlal v2.4s, v18.4h, v0.h[2]
smlal v2.4s, v19.4h, v1.h[3]
smlal v2.4s, v20.4h, v0.h[4]
smlal v2.4s, v21.4h, v0.h[5]
smlal v2.4s, v22.4h, v0.h[6]
smull2 v3.4s, v16.8h, v0.h[0]
smlal2 v3.4s, v17.8h, v0.h[1]
smlal2 v3.4s, v18.8h, v0.h[2]
smlal2 v3.4s, v19.8h, v1.h[3]
smlal2 v3.4s, v20.8h, v0.h[4]
smlal2 v3.4s, v21.8h, v0.h[5]
smlal2 v3.4s, v22.8h, v0.h[6]
sqrshrun v2.4h, v2.4s, #11
sqrshrun2 v2.8h, v3.4s, #11
sqxtun v2.8b, v2.8h
st1 {v2.8b}, [x0], x1
.if \compare
cmp w4, #4
.else
b.le 9f
.endif
mov v16.16b, v17.16b
mov v17.16b, v18.16b
mov v18.16b, v19.16b
mov v19.16b, v20.16b
mov v20.16b, v21.16b
mov v21.16b, v22.16b
.endm
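// One output row per invocation: widening multiply-accumulates over the
// 7-row window, rounding narrowing shift by 11 (sqrshrun), saturation to
// 8 bit (sqxtun), store, then shift the row window up by one register.
// With \compare set, the remaining row count is compared against 4 instead
// of exiting at 9f.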
filter 1
b.lt 7f
ld1 {v22.8h}, [x2], x7
b 4b
5: // Less than 4 rows in total; v20-v22 have not been filled yet.
tst w6, #8 // LR_HAVE_BOTTOM
b.eq 6f
// LR_HAVE_BOTTOM
cmp w4, #2
// We load at least 2 rows in all cases.
ld1 {v20.8h}, [x2], x7
ld1 {v21.8h}, [x2], x7
b.gt 53f // 3 rows in total
b.eq 52f // 2 rows in total
51: // 1 row in total, v19 already loaded, load edge into v20-v22.
mov v22.16b, v21.16b
b 8f
52: // 2 rows in total, v19 already loaded, load v20 with content data
// and 2 rows of edge.
ld1 {v22.8h}, [x2], x7
mov v23.16b, v22.16b
b 8f
53:
// 3 rows in total, v19 already loaded, load v20 and v21 with content
// and 2 rows of edge.
ld1 {v22.8h}, [x2], x7
ld1 {v23.8h}, [x2], x7
mov v24.16b, v23.16b
b 8f
6:
// !LR_HAVE_BOTTOM
cmp w4, #2
b.gt 63f // 3 rows in total
b.eq 62f // 2 rows in total
61: // 1 row in total, v19 already loaded, pad that into v20-v22.
mov v20.16b, v19.16b
mov v21.16b, v19.16b
mov v22.16b, v19.16b
b 8f
62: // 2 rows in total, v19 already loaded, load v20 and pad that into v21-v23.
ld1 {v20.8h}, [x2], x7
mov v21.16b, v20.16b
mov v22.16b, v20.16b
mov v23.16b, v20.16b
b 8f
63:
// 3 rows in total, v19 already loaded, load v20 and v21 and pad v21 into v22-v24.
ld1 {v20.8h}, [x2], x7
ld1 {v21.8h}, [x2], x7
mov v22.16b, v21.16b
mov v23.16b, v21.16b
mov v24.16b, v21.16b
b 8f
7:
// All registers up to v21 are filled already and 3 valid rows are left;
// fill in padding and filter the last few rows.
tst w6, #8 // LR_HAVE_BOTTOM
b.eq 71f
// LR_HAVE_BOTTOM; load 2 rows of edge.
ld1 {v22.8h}, [x2], x7
ld1 {v23.8h}, [x2], x7
mov v24.16b, v23.16b
b 8f
71:
// !LR_HAVE_BOTTOM, pad 3 rows
mov v22.16b, v21.16b
mov v23.16b, v21.16b
mov v24.16b, v21.16b
8: // At this point, the registers up to v22, v23 or v24 are loaded with
// edge/padding rows (depending on how many rows are left).
filter 0 // This branches to 9f when done
mov v22.16b, v23.16b
mov v23.16b, v24.16b
b 8b
9: // End of one vertical slice.
subs w3, w3, #8
b.le 0f
// Move pointers back up to the top and loop horizontally.
msub x0, x1, x8, x0
msub x2, x7, x11, x2
add x0, x0, #8
add x2, x2, #16
mov w4, w8
b 1b
0:
ret
.purgem filter
endfunc
// void dav1d_copy_narrow_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
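//
// Copies a narrow block (w = 1..7 bytes wide at 8bpc) from a tightly packed
// source (w bytes per row, no padding) to dst with the given stride,
// dispatching on w through a jump table. Roughly (scalar sketch):
//
//   for (int y = 0; y < h; y++)
//       memcpy(dst + y * stride, src + y * w, w);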
function copy_narrow_neon, export=1
adr x5, L(copy_narrow_tbl)
ldrh w6, [x5, w3, uxtw #1]
sub x5, x5, w6, uxth
br x5
10:
add x7, x0, x1
lsl x1, x1, #1
18:
cmp w4, #8
b.lt 110f
subs w4, w4, #8
ld1 {v0.8b}, [x2], #8
st1 {v0.b}[0], [x0], x1
st1 {v0.b}[1], [x7], x1
st1 {v0.b}[2], [x0], x1
st1 {v0.b}[3], [x7], x1
st1 {v0.b}[4], [x0], x1
st1 {v0.b}[5], [x7], x1
st1 {v0.b}[6], [x0], x1
st1 {v0.b}[7], [x7], x1
b.le 0f
b 18b
110:
asr x1, x1, #1
11:
subs w4, w4, #1
ld1 {v0.b}[0], [x2], #1
st1 {v0.b}[0], [x0], x1
b.gt 11b
0:
ret
20:
add x7, x0, x1
lsl x1, x1, #1
24:
cmp w4, #4
b.lt 210f
subs w4, w4, #4
ld1 {v0.4h}, [x2], #8
st1 {v0.h}[0], [x0], x1
st1 {v0.h}[1], [x7], x1
st1 {v0.h}[2], [x0], x1
st1 {v0.h}[3], [x7], x1
b.le 0f
b 24b
210:
asr x1, x1, #1
22:
subs w4, w4, #1
ld1 {v0.h}[0], [x2], #2
st1 {v0.h}[0], [x0], x1
b.gt 22b
0:
ret
30:
ldrh w5, [x2]
ldrb w6, [x2, #2]
add x2, x2, #3
subs w4, w4, #1
strh w5, [x0]
strb w6, [x0, #2]
add x0, x0, x1
b.gt 30b
ret
40:
add x7, x0, x1
lsl x1, x1, #1
42:
cmp w4, #2
b.lt 41f
subs w4, w4, #2
ld1 {v0.2s}, [x2], #8
st1 {v0.s}[0], [x0], x1
st1 {v0.s}[1], [x7], x1
b.le 0f
b 42b
41:
ld1 {v0.s}[0], [x2]
st1 {v0.s}[0], [x0]
0:
ret
50:
ldr w5, [x2]
ldrb w6, [x2, #4]
add x2, x2, #5
subs w4, w4, #1
str w5, [x0]
strb w6, [x0, #4]
add x0, x0, x1
b.gt 50b
ret
60:
ldr w5, [x2]
ldrh w6, [x2, #4]
add x2, x2, #6
subs w4, w4, #1
str w5, [x0]
strh w6, [x0, #4]
add x0, x0, x1
b.gt 60b
ret
70:
ldr w5, [x2]
ldrh w6, [x2, #4]
ldrb w7, [x2, #6]
add x2, x2, #7
subs w4, w4, #1
str w5, [x0]
strh w6, [x0, #4]
strb w7, [x0, #6]
add x0, x0, x1
b.gt 70b
ret
L(copy_narrow_tbl):
.hword 0
.hword L(copy_narrow_tbl) - 10b
.hword L(copy_narrow_tbl) - 20b
.hword L(copy_narrow_tbl) - 30b
.hword L(copy_narrow_tbl) - 40b
.hword L(copy_narrow_tbl) - 50b
.hword L(copy_narrow_tbl) - 60b
.hword L(copy_narrow_tbl) - 70b
endfunc