/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
| |
| #include "src/arm/asm.S" |
| #include "util.S" |
| |
// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
//
// DC_128 prediction: fill the width x height block with the constant
// mid-gray value 128 (for 8 bits per component); no edge pixels are read.
// The width (a power of two, 4..64) selects one of the store loops below
// through the .hword offset table at L(ipred_dc_128_tbl).
function ipred_dc_128_8bpc_neon, export=1
        clz             w3,  w3                 // clz(width); 64 -> 25 ... 4 -> 29
        adr             x5,  L(ipred_dc_128_tbl)
        sub             w3,  w3, #25            // table index 0 (w=64) .. 4 (w=4)
        ldrh            w3,  [x5, w3, uxtw #1]  // backwards offset of the case label
        movi            v0.16b, #128            // constant fill value
        sub             x5,  x5, w3, uxtw       // resolve branch target
        add             x6,  x0, x1             // x6 = second output row
        lsl             x1,  x1, #1             // x0/x6 each advance 2 rows per store
        br              x5
4:      // width == 4: 4 rows of 4 bytes per iteration
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4, #4             // height -= 4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
8:      // width == 8
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4, #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
16:     // width == 16
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
320:    // width == 32: replicate the value into a second register for 32-byte stores
        AARCH64_VALID_JUMP_TARGET
        movi            v1.16b, #128
32:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            32b
        ret
640:    // width == 64: four 16-byte registers per row
        AARCH64_VALID_JUMP_TARGET
        movi            v1.16b, #128
        movi            v2.16b, #128
        movi            v3.16b, #128
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            64b
        ret

L(ipred_dc_128_tbl):
        // Entries in descending width order, matching index = clz(width) - 25.
        .hword L(ipred_dc_128_tbl) - 640b
        .hword L(ipred_dc_128_tbl) - 320b
        .hword L(ipred_dc_128_tbl) -  16b
        .hword L(ipred_dc_128_tbl) -   8b
        .hword L(ipred_dc_128_tbl) -   4b
endfunc
| |
// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
//
// Vertical prediction: load the row of `width` pixels above the block
// (topleft + 1) once and replicate it down all `height` output rows.
// Width-based dispatch works exactly like ipred_dc_128 above.
function ipred_v_8bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_v_tbl)
        sub             w3,  w3, #25            // index 0 (w=64) .. 4 (w=4)
        ldrh            w3,  [x5, w3, uxtw #1]
        add             x2,  x2, #1             // x2 = top row (skip the topleft pixel)
        sub             x5,  x5, w3, uxtw
        add             x6,  x0, x1             // x6 = second output row
        lsl             x1,  x1, #1             // step 2 rows per pointer
        br              x5
40:     // width == 4: load the 4 top pixels once, then store 4 rows/iteration
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.s}[0],  [x2]
4:
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4, #4             // height -= 4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
80:     // width == 8
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]
8:
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4, #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
160:    // width == 16
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
16:
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
320:    // width == 32
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b}, [x2]
32:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            32b
        ret
640:    // width == 64
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            64b
        ret

L(ipred_v_tbl):
        // Descending width order; index = clz(width) - 25.
        .hword L(ipred_v_tbl) - 640b
        .hword L(ipred_v_tbl) - 320b
        .hword L(ipred_v_tbl) - 160b
        .hword L(ipred_v_tbl) -  80b
        .hword L(ipred_v_tbl) -  40b
endfunc
| |
// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
//
// Horizontal prediction: each output row is filled with the left-edge
// pixel of that row. The left edge sits immediately below `topleft` in
// memory (decreasing addresses going down), so we read 4 left pixels at
// a time with ld4r (each byte broadcast into its own vector) and step
// the pointer backwards by 4 (x7 == -4) per group of 4 rows.
// Within a 4-pixel load, byte 3 is the topmost row, hence v3 is stored
// first and v0 last.
function ipred_h_8bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_h_tbl)
        sub             w3,  w3, #25            // index 0 (w=64) .. 4 (w=4)
        ldrh            w3,  [x5, w3, uxtw #1]
        sub             x2,  x2, #4             // x2 = 4 bytes below topleft
        sub             x5,  x5, w3, uxtw
        mov             x7,  #-4                // backwards stride for the left edge
        add             x6,  x0, x1             // x6 = second output row
        lsl             x1,  x1, #1             // step 2 rows per pointer
        br              x5
4:      // width == 4
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
        st1             {v3.s}[0],  [x0], x1    // topmost of the 4 rows first
        st1             {v2.s}[0],  [x6], x1
        subs            w4,  w4, #4             // height -= 4
        st1             {v1.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
8:      // width == 8
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
        st1             {v3.8b},  [x0], x1
        st1             {v2.8b},  [x6], x1
        subs            w4,  w4, #4
        st1             {v1.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
16:     // width == 16
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
        st1             {v3.16b}, [x0], x1
        st1             {v2.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v1.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
32:     // width == 32: str fills bytes 16..31, st1 (with writeback) bytes 0..15
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        st1             {v3.16b}, [x0], x1
        st1             {v2.16b}, [x6], x1
        subs            w4,  w4, #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        st1             {v1.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            32b
        ret
64:     // width == 64: additionally stp fills bytes 32..63
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        st1             {v3.16b}, [x0], x1
        st1             {v2.16b}, [x6], x1
        subs            w4,  w4, #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        st1             {v1.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            64b
        ret

L(ipred_h_tbl):
        // Descending width order; index = clz(width) - 25.
        .hword L(ipred_h_tbl) - 64b
        .hword L(ipred_h_tbl) - 32b
        .hword L(ipred_h_tbl) - 16b
        .hword L(ipred_h_tbl) -  8b
        .hword L(ipred_h_tbl) -  4b
endfunc
| |
// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
//
// DC_TOP prediction: fill the block with the rounded average of the
// `width` pixels above it (topleft + 1). Each case sums the top row with
// uaddlv and divides by the width with a rounding narrowing shift
// (rshrn #log2(width)), then broadcasts the result.
function ipred_dc_top_8bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_dc_top_tbl)
        sub             w3,  w3, #25            // index 0 (w=64) .. 4 (w=4)
        ldrh            w3,  [x5, w3, uxtw #1]
        add             x2,  x2, #1             // x2 = top row
        sub             x5,  x5, w3, uxtw
        add             x6,  x0, x1             // x6 = second output row
        lsl             x1,  x1, #1             // step 2 rows per pointer
        br              x5
40:     // width == 4: ld1r loads the 4 top bytes into both halves, so the
        // 8-byte sum is 2*rowsum; shifting by 3 instead of 2 compensates.
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s},  [x2]
        uaddlv          h0,  v0.8b
        rshrn           v0.8b,  v0.8h,  #3
        dup             v0.8b,  v0.b[0]
4:
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4, #4             // height -= 4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
80:     // width == 8
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]
        uaddlv          h0,  v0.8b              // sum of the 8 top pixels
        rshrn           v0.8b,  v0.8h,  #3      // rounded /8
        dup             v0.8b,  v0.b[0]
8:
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4, #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
160:    // width == 16
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0,  v0.16b
        rshrn           v0.8b,  v0.8h,  #4      // rounded /16
        dup             v0.16b, v0.b[0]
16:
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
320:    // width == 32: sum two 16-byte halves separately, then combine
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b}, [x2]
        uaddlv          h0,  v0.16b
        uaddlv          h1,  v1.16b
        add             v2.4h,  v0.4h,  v1.4h
        rshrn           v2.8b,  v2.8h,  #5      // rounded /32
        dup             v0.16b, v2.b[0]
        dup             v1.16b, v2.b[0]
32:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            32b
        ret
640:    // width == 64: four partial sums
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
        uaddlv          h0,  v0.16b
        uaddlv          h1,  v1.16b
        uaddlv          h2,  v2.16b
        uaddlv          h3,  v3.16b
        add             v4.4h,  v0.4h,  v1.4h
        add             v5.4h,  v2.4h,  v3.4h
        add             v4.4h,  v4.4h,  v5.4h
        rshrn           v4.8b,  v4.8h,  #6      // rounded /64
        dup             v0.16b, v4.b[0]
        dup             v1.16b, v4.b[0]
        dup             v2.16b, v4.b[0]
        dup             v3.16b, v4.b[0]
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            64b
        ret

L(ipred_dc_top_tbl):
        // Descending width order; index = clz(width) - 25.
        .hword L(ipred_dc_top_tbl) - 640b
        .hword L(ipred_dc_top_tbl) - 320b
        .hword L(ipred_dc_top_tbl) - 160b
        .hword L(ipred_dc_top_tbl) -  80b
        .hword L(ipred_dc_top_tbl) -  40b
endfunc
| |
// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
//
// DC_LEFT prediction: fill the block with the rounded average of the
// `height` pixels of the left edge (stored just below `topleft`, i.e. at
// topleft - height .. topleft - 1).
// Double dispatch through one shared table: x5 selects the height case
// (which sums the left edge and broadcasts the DC value into v0), and
// that case then jumps to x3, the width case (which stores the rows).
// The table therefore holds the five h* entries followed by the five w*
// entries, both in descending size order.
function ipred_dc_left_8bpc_neon, export=1
        sub             x2,  x2, w4, uxtw       // x2 = start of the left edge
        clz             w3,  w3
        clz             w7,  w4
        adr             x5,  L(ipred_dc_left_tbl)
        sub             w3,  w3, #20            // 25 leading bits, minus table offset 5
        sub             w7,  w7, #25            // height index 0..4
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w7,  [x5, w7, uxtw #1]
        sub             x3,  x5, w3, uxtw       // x3 = width (store) entry point
        sub             x5,  x5, w7, uxtw       // x5 = height (sum) entry point
        add             x6,  x0, x1             // x6 = second output row
        lsl             x1,  x1, #1             // step 2 rows per pointer
        br              x5

L(ipred_dc_left_h4):
        AARCH64_VALID_JUMP_TARGET
        // ld1r duplicates the 4 left bytes, so uaddlv sums them twice;
        // rshrn #3 (instead of #2) compensates.
        ld1r            {v0.2s},  [x2]
        uaddlv          h0,  v0.8b
        rshrn           v0.8b,  v0.8h,  #3
        dup             v0.16b, v0.b[0]
        br              x3
L(ipred_dc_left_w4):
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4, #4             // height -= 4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            L(ipred_dc_left_w4)
        ret

L(ipred_dc_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]
        uaddlv          h0,  v0.8b              // sum of 8 left pixels
        rshrn           v0.8b,  v0.8h,  #3      // rounded /8
        dup             v0.16b, v0.b[0]
        br              x3
L(ipred_dc_left_w8):
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4, #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            L(ipred_dc_left_w8)
        ret

L(ipred_dc_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0,  v0.16b
        rshrn           v0.8b,  v0.8h,  #4      // rounded /16
        dup             v0.16b, v0.b[0]
        br              x3
L(ipred_dc_left_w16):
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            L(ipred_dc_left_w16)
        ret

L(ipred_dc_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b}, [x2]
        uaddlv          h0,  v0.16b
        uaddlv          h1,  v1.16b
        add             v0.4h,  v0.4h,  v1.4h
        rshrn           v0.8b,  v0.8h,  #5      // rounded /32
        dup             v0.16b, v0.b[0]
        br              x3
L(ipred_dc_left_w32):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b, v0.16b          // widen the DC value to 32 bytes
1:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
        uaddlv          h0,  v0.16b
        uaddlv          h1,  v1.16b
        uaddlv          h2,  v2.16b
        uaddlv          h3,  v3.16b
        add             v0.4h,  v0.4h,  v1.4h
        add             v2.4h,  v2.4h,  v3.4h
        add             v0.4h,  v0.4h,  v2.4h
        rshrn           v0.8b,  v0.8h,  #6      // rounded /64
        dup             v0.16b, v0.b[0]
        br              x3
L(ipred_dc_left_w64):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b, v0.16b          // widen the DC value to 64 bytes
        mov             v2.16b, v0.16b
        mov             v3.16b, v0.16b
1:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_tbl):
        // Height entries (index clz(h)-25), then width entries (index clz(w)-20).
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
endfunc
| |
// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
//
// DC prediction: fill the block with
//     dc = (sum(left) + sum(top) + (w+h)/2) / (w+h).
// Double dispatch like ipred_dc_left: x5 jumps to the height case
// (sums the left edge into v0 and leaves x2 pointing at the top row),
// which then jumps to x3, the width case (adds the top-row sum,
// finishes the division and stores the rows).
//
// The division by w+h is split into two steps: a variable right shift
// by ctz(w+h) (done with ushl and the negative shift count in v17),
// and — when w != h, so that w+h == 3*2^k or 5*2^k — a fixed-point
// sqdmulh by ~1/3 (0x5556/2 = 0x2aab) or ~1/5 (0x3334/2 = 0x199a).
function ipred_dc_8bpc_neon, export=1
        sub             x2,  x2, w4, uxtw       // x2 = start of the left edge
        add             w7,  w3, w4             // width + height
        clz             w3,  w3
        clz             w6,  w4
        dup             v16.8h, w7              // width + height
        adr             x5,  L(ipred_dc_tbl)
        rbit            w7,  w7                 // rbit(width + height)
        sub             w3,  w3, #20            // 25 leading bits, minus table offset 5
        sub             w6,  w6, #25
        clz             w7,  w7                 // ctz(width + height)
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w6,  [x5, w6, uxtw #1]
        neg             w7,  w7                 // -ctz(width + height)
        sub             x3,  x5, w3, uxtw       // x3 = width (store) entry point
        sub             x5,  x5, w6, uxtw       // x5 = height (sum) entry point
        ushr            v16.8h, v16.8h, #1      // (width + height) >> 1 (rounding bias)
        dup             v17.8h, w7              // -ctz(width + height); ushl by a
                                                // negative count shifts right
        add             x6,  x0, x1             // x6 = second output row
        lsl             x1,  x1, #1             // step 2 rows per pointer
        br              x5

L(ipred_dc_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.s}[0],  [x2], #4
        ins             v0.s[1], wzr            // clear the unused upper half
        uaddlv          h0,  v0.8b              // v0 = sum(left)
        add             x2,  x2, #1             // skip topleft; x2 = top row
        br              x3
L(ipred_dc_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.s}[0],  [x2]
        ins             v1.s[1], wzr
        add             v0.4h,  v0.4h,  v16.4h  // + (w+h)/2
        uaddlv          h1,  v1.8b              // sum(top)
        cmp             w4,  #4
        add             v0.4h,  v0.4h,  v1.4h
        ushl            v0.4h,  v0.4h,  v17.4h  // >> ctz(w+h)
        b.eq            1f
        // h = 8/16
        // Pack both fixed-point constants in one register and select by
        // shifting with 2*h; lsr on a w register uses the amount mod 32,
        // so h==16 (shift 32) keeps the low half 0x199a (the 1/5 factor,
        // w+h == 20) while h==8 (shift 16) yields 0x2aab (1/3, w+h == 12).
        mov             w16, #(0x3334/2)
        movk            w16, #(0x5556/2), lsl #16
        add             w17, w4,  w4            // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h, w16
        sqdmulh         v0.4h,  v0.4h,  v16.4h
1:
        dup             v0.8b,  v0.b[0]         // broadcast the DC value
2:
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4, #4             // height -= 4
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2], #8
        uaddlv          h0,  v0.8b              // v0 = sum(left)
        add             x2,  x2, #1             // skip topleft; x2 = top row
        br              x3
L(ipred_dc_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8b},  [x2]
        add             v0.4h,  v0.4h,  v16.4h  // + (w+h)/2
        uaddlv          h1,  v1.8b              // sum(top)
        cmp             w4,  #8
        add             v0.4h,  v0.4h,  v1.4h
        ushl            v0.4h,  v0.4h,  v17.4h  // >> ctz(w+h)
        b.eq            1f
        // h = 4/16/32: w+h is 5*2^k only for h == 32 (w+h == 40)
        cmp             w4,  #32
        mov             w16, #(0x3334/2)        // 1/5 factor
        mov             w17, #(0x5556/2)        // 1/3 factor
        csel            w16, w16, w17, eq
        dup             v16.4h, w16
        sqdmulh         v0.4h,  v0.4h,  v16.4h
1:
        dup             v0.8b,  v0.b[0]
2:
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4, #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2], #16
        uaddlv          h0,  v0.16b             // v0 = sum(left)
        add             x2,  x2, #1             // skip topleft; x2 = top row
        br              x3
L(ipred_dc_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b}, [x2]
        add             v0.4h,  v0.4h,  v16.4h  // + (w+h)/2
        uaddlv          h1,  v1.16b             // sum(top)
        cmp             w4,  #16
        add             v0.4h,  v0.4h,  v1.4h
        ushl            v0.4h,  v0.4h,  v17.4h  // >> ctz(w+h)
        b.eq            1f
        // h = 4/8/32/64: h == 8 or 32 gives w+h == 3*2^k (1/3 factor),
        // h == 4 or 64 gives w+h == 5*2^k (1/5 factor)
        tst             w4,  #(32+16+8)         // 16 added to make a consecutive bitmask
        mov             w16, #(0x3334/2)        // 1/5 factor
        mov             w17, #(0x5556/2)        // 1/3 factor
        csel            w16, w16, w17, eq
        dup             v16.4h, w16
        sqdmulh         v0.4h,  v0.4h,  v16.4h
1:
        dup             v0.16b, v0.b[0]
2:
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b}, [x2], #32
        uaddlv          h0,  v0.16b
        uaddlv          h1,  v1.16b
        add             x2,  x2, #1             // skip topleft; x2 = top row
        add             v0.4h,  v0.4h,  v1.4h   // v0 = sum(left)
        br              x3
L(ipred_dc_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b, v2.16b}, [x2]
        add             v0.4h,  v0.4h,  v16.4h  // + (w+h)/2
        uaddlv          h1,  v1.16b
        uaddlv          h2,  v2.16b
        cmp             w4,  #32
        add             v0.4h,  v0.4h,  v1.4h
        add             v0.4h,  v0.4h,  v2.4h
        ushl            v4.4h,  v0.4h,  v17.4h  // >> ctz(w+h)
        b.eq            1f
        // h = 8/16/64: only h == 8 gives w+h == 5*2^k (40)
        cmp             w4,  #8
        mov             w16, #(0x3334/2)        // 1/5 factor
        mov             w17, #(0x5556/2)        // 1/3 factor
        csel            w16, w16, w17, eq
        dup             v16.4h, w16
        sqdmulh         v4.4h,  v4.4h,  v16.4h
1:
        dup             v0.16b, v4.b[0]
        dup             v1.16b, v4.b[0]
2:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
        uaddlv          h0,  v0.16b
        uaddlv          h1,  v1.16b
        uaddlv          h2,  v2.16b
        uaddlv          h3,  v3.16b
        add             v0.4h,  v0.4h,  v1.4h
        add             v2.4h,  v2.4h,  v3.4h
        add             x2,  x2, #1             // skip topleft; x2 = top row
        add             v0.4h,  v0.4h,  v2.4h   // v0 = sum(left)
        br              x3
L(ipred_dc_w64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
        add             v0.4h,  v0.4h,  v16.4h  // + (w+h)/2
        uaddlv          h1,  v1.16b
        uaddlv          h2,  v2.16b
        uaddlv          h3,  v3.16b
        uaddlv          h4,  v4.16b
        add             v1.4h,  v1.4h,  v2.4h
        add             v3.4h,  v3.4h,  v4.4h
        cmp             w4,  #64
        add             v0.4h,  v0.4h,  v1.4h
        add             v0.4h,  v0.4h,  v3.4h
        ushl            v4.4h,  v0.4h,  v17.4h  // >> ctz(w+h)
        b.eq            1f
        // h = 16/32: select by shifting with h (mod 32): h == 16 yields
        // 0x199a (1/5, w+h == 80), h == 32 (shift 0) yields 0x2aab
        // (1/3, w+h == 96).
        mov             w16, #(0x5556/2)
        movk            w16, #(0x3334/2), lsl #16
        lsr             w16, w16, w4
        dup             v16.4h, w16
        sqdmulh         v4.4h,  v4.4h,  v16.4h
1:
        dup             v0.16b, v4.b[0]
        dup             v1.16b, v4.b[0]
        dup             v2.16b, v4.b[0]
        dup             v3.16b, v4.b[0]
2:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4, #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_tbl):
        // Height entries (index clz(h)-25), then width entries (index clz(w)-20).
        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
endfunc
| |
// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                            const pixel *const topleft,
//                            const int width, const int height, const int a,
//                            const int max_width, const int max_height);
//
// Paeth prediction: for each pixel, with base = left + top - topleft,
// output whichever of {left, top, topleft} is closest to base
// (ties resolved in the order left, top, topleft).
// The base is computed in 16 bits and narrowed with sqxtun (clamping to
// 0..255) so the absolute differences can be taken with 8-bit uabd.
// Registers: v4 = topleft (broadcast), v5 = top row, ld4r pulls 4 left
// pixels at a time walking backwards (x7 == -4) from below topleft.
function ipred_paeth_8bpc_neon, export=1
        clz             w9,  w3
        adr             x5,  L(ipred_paeth_tbl)
        sub             w9,  w9, #25            // index 0 (w=64) .. 4 (w=4)
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.16b},  [x2]         // topleft
        add             x8,  x2, #1             // x8 = top row
        sub             x2,  x2, #4             // x2 = 4 bytes below topleft (left edge)
        sub             x5,  x5, w9, uxtw
        mov             x7,  #-4                // backwards step through the left edge
        add             x6,  x0, x1             // x6 = second output row
        lsl             x1,  x1, #1             // step 2 rows per pointer
        br              x5
40:     // width == 4: 4 rows per iteration, packed into one 16-byte register
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v5.4s},  [x8]          // top, repeated
        usubl           v6.8h,  v5.8b,  v4.8b   // top - topleft
4:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
        zip1            v0.2s,  v0.2s,  v1.2s   // pack 4 left pixels, 4 lanes each
        zip1            v2.2s,  v2.2s,  v3.2s
        uaddw           v16.8h, v6.8h,  v0.8b   // base = left + top - topleft
        uaddw           v17.8h, v6.8h,  v2.8b
        sqxtun          v16.8b, v16.8h          // base, clamped to 0..255
        sqxtun2         v16.16b, v17.8h
        zip1            v0.2d,  v0.2d,  v2.2d
        uabd            v20.16b, v5.16b, v16.16b // tdiff
        uabd            v22.16b, v4.16b, v16.16b // tldiff
        uabd            v16.16b, v0.16b, v16.16b // ldiff
        umin            v18.16b, v20.16b, v22.16b // min(tdiff, tldiff)
        cmhs            v20.16b, v22.16b, v20.16b // tldiff >= tdiff
        cmhs            v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff
        bsl             v20.16b, v5.16b, v4.16b  // tdiff <= tldiff ? top : topleft
        bit             v20.16b, v0.16b, v16.16b // ldiff <= min ? left : ...
        st1             {v20.s}[3],  [x0], x1    // rows were packed top-to-bottom in
        st1             {v20.s}[2],  [x6], x1    // descending lanes
        subs            w4,  w4, #4
        st1             {v20.s}[1],  [x0], x1
        st1             {v20.s}[0],  [x6], x1
        b.gt            4b
        ret
80:     // width == 8: two rows per 16-byte register
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v5.2d},  [x8]
        usubl           v6.8h,  v5.8b,  v4.8b   // top - topleft
8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
        uaddw           v16.8h, v6.8h,  v0.8b   // base per row
        uaddw           v17.8h, v6.8h,  v1.8b
        uaddw           v18.8h, v6.8h,  v2.8b
        uaddw           v19.8h, v6.8h,  v3.8b
        sqxtun          v16.8b, v16.8h          // base, clamped
        sqxtun2         v16.16b, v17.8h
        sqxtun          v18.8b, v18.8h
        sqxtun2         v18.16b, v19.8h
        zip1            v2.2d,  v2.2d,  v3.2d
        zip1            v0.2d,  v0.2d,  v1.2d
        uabd            v21.16b, v5.16b, v18.16b // tdiff
        uabd            v20.16b, v5.16b, v16.16b
        uabd            v23.16b, v4.16b, v18.16b // tldiff
        uabd            v22.16b, v4.16b, v16.16b
        uabd            v17.16b, v2.16b, v18.16b // ldiff
        uabd            v16.16b, v0.16b, v16.16b
        umin            v19.16b, v21.16b, v23.16b // min(tdiff, tldiff)
        umin            v18.16b, v20.16b, v22.16b
        cmhs            v21.16b, v23.16b, v21.16b // tldiff >= tdiff
        cmhs            v20.16b, v22.16b, v20.16b
        cmhs            v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff
        cmhs            v16.16b, v18.16b, v16.16b
        bsl             v21.16b, v5.16b, v4.16b  // tdiff <= tldiff ? top : topleft
        bsl             v20.16b, v5.16b, v4.16b
        bit             v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ...
        bit             v20.16b, v0.16b, v16.16b
        st1             {v21.d}[1],  [x0], x1
        st1             {v21.d}[0],  [x6], x1
        subs            w4,  w4, #4
        st1             {v20.d}[1],  [x0], x1
        st1             {v20.d}[0],  [x6], x1
        b.gt            8b
        ret
160:
320:
640:    // widths 16/32/64 share one path: process 16 columns x 4 rows per
        // inner iteration, walking the top row in 16-byte chunks.
        AARCH64_VALID_JUMP_TARGET
        ld1             {v5.16b},  [x8], #16
        mov             w9,  w3                 // save width for the row loop
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0, x1
        add             x10, x6, x1
        lsl             x1,  x1, #1
        sub             x1,  x1, w3, uxtw       // stride minus the width advanced by
                                                // the post-indexed stores
1:      // per-4-rows: broadcast the 4 left pixels for this band
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
2:      // per-16-columns
        usubl           v6.8h,  v5.8b,  v4.8b   // top - topleft
        usubl2          v7.8h,  v5.16b, v4.16b
        uaddw           v24.8h, v6.8h,  v0.8b   // base = left + top - topleft
        uaddw           v25.8h, v7.8h,  v0.8b
        uaddw           v26.8h, v6.8h,  v1.8b
        uaddw           v27.8h, v7.8h,  v1.8b
        uaddw           v28.8h, v6.8h,  v2.8b
        uaddw           v29.8h, v7.8h,  v2.8b
        uaddw           v30.8h, v6.8h,  v3.8b
        uaddw           v31.8h, v7.8h,  v3.8b
        sqxtun          v17.8b, v26.8h          // base, clamped
        sqxtun2         v17.16b, v27.8h
        sqxtun          v16.8b, v24.8h
        sqxtun2         v16.16b, v25.8h
        sqxtun          v19.8b, v30.8h
        sqxtun2         v19.16b, v31.8h
        sqxtun          v18.8b, v28.8h
        sqxtun2         v18.16b, v29.8h
        uabd            v23.16b, v5.16b, v19.16b // tdiff
        uabd            v22.16b, v5.16b, v18.16b
        uabd            v21.16b, v5.16b, v17.16b
        uabd            v20.16b, v5.16b, v16.16b
        uabd            v27.16b, v4.16b, v19.16b // tldiff
        uabd            v26.16b, v4.16b, v18.16b
        uabd            v25.16b, v4.16b, v17.16b
        uabd            v24.16b, v4.16b, v16.16b
        uabd            v19.16b, v3.16b, v19.16b // ldiff
        uabd            v18.16b, v2.16b, v18.16b
        uabd            v17.16b, v1.16b, v17.16b
        uabd            v16.16b, v0.16b, v16.16b
        umin            v31.16b, v23.16b, v27.16b // min(tdiff, tldiff)
        umin            v30.16b, v22.16b, v26.16b
        umin            v29.16b, v21.16b, v25.16b
        umin            v28.16b, v20.16b, v24.16b
        cmhs            v23.16b, v27.16b, v23.16b // tldiff >= tdiff
        cmhs            v22.16b, v26.16b, v22.16b
        cmhs            v21.16b, v25.16b, v21.16b
        cmhs            v20.16b, v24.16b, v20.16b
        cmhs            v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff
        cmhs            v18.16b, v30.16b, v18.16b
        cmhs            v17.16b, v29.16b, v17.16b
        cmhs            v16.16b, v28.16b, v16.16b
        bsl             v23.16b, v5.16b, v4.16b  // tdiff <= tldiff ? top : topleft
        bsl             v22.16b, v5.16b, v4.16b
        bsl             v21.16b, v5.16b, v4.16b
        bsl             v20.16b, v5.16b, v4.16b
        bit             v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ...
        bit             v22.16b, v2.16b, v18.16b
        bit             v21.16b, v1.16b, v17.16b
        bit             v20.16b, v0.16b, v16.16b
        subs            w3,  w3, #16            // columns remaining
        st1             {v23.16b}, [x0],  #16
        st1             {v22.16b}, [x6],  #16
        st1             {v21.16b}, [x5],  #16
        st1             {v20.16b}, [x10], #16
        b.le            8f
        ld1             {v5.16b},  [x8], #16    // next 16 top pixels
        b               2b
8:
        subs            w4,  w4, #4             // rows remaining
        b.le            9f
        // End of horizontal loop, move pointers to next four rows
        sub             x8,  x8, w9, uxtw       // rewind top-row pointer
        add             x0,  x0, x1
        add             x6,  x6, x1
        // Load the top row as early as possible
        ld1             {v5.16b},  [x8], #16
        add             x5,  x5, x1
        add             x10, x10, x1
        mov             w3,  w9                 // reset column counter
        b               1b
9:
        ret

L(ipred_paeth_tbl):
        // Descending width order; index = clz(width) - 25.
        .hword L(ipred_paeth_tbl) - 640b
        .hword L(ipred_paeth_tbl) - 320b
        .hword L(ipred_paeth_tbl) - 160b
        .hword L(ipred_paeth_tbl) -  80b
        .hword L(ipred_paeth_tbl) -  40b
endfunc
| |
// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
//
// SMOOTH prediction: blend a horizontal interpolation between each row's
// left pixel and the right edge pixel with a vertical interpolation
// between each column's top pixel and the bottom edge pixel, using the
// sm_weights[] table for both directions:
//   hor = right*256 + (left - right) * weights_hor[x]
//   ver = bottom*256 + (top - bottom) * weights_ver[y]
//   out = round((hor + ver) / 512)   // uhadd halves, rshrn #8 rounds
// v4 = bottom pixel, v5 = right pixel; x10/x11 walk the width/height
// weight tables.
function ipred_smooth_8bpc_neon, export=1
        movrel          x10, X(sm_weights)
        add             x11, x10, w4, uxtw      // x11 = weights_ver (indexed by height)
        add             x10, x10, w3, uxtw      // x10 = weights_hor (indexed by width)
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_tbl)
        sub             x12, x2,  w4, uxtw      // bottom-left: topleft - height
        sub             w9,  w9, #25            // index 0 (w=64) .. 4 (w=4)
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.16b},  [x12]        // bottom
        add             x8,  x2,  #1            // x8 = top row
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1            // x6 = second output row
        lsl             x1,  x1,  #1            // step 2 rows per pointer
        br              x5
40:     // width == 4: 4 rows per iteration, two rows per 8h register
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2s},  [x2]          // top
        ld1r            {v7.2s},  [x10]         // weights_hor
        sub             x2,  x2,  #4            // x2 = left edge, walked backwards
        mov             x7,  #-4
        dup             v5.16b, v6.b[3]         // right
        usubl           v6.8h,  v6.8b,  v4.8b   // top-bottom
        uxtl            v7.8h,  v7.8b           // weights_hor
4:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2],  x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
        shll            v20.8h, v5.8b,  #8      // right*256
        shll            v21.8h, v5.8b,  #8
        zip1            v1.2s,  v1.2s,  v0.2s   // left, flipped
        zip1            v0.2s,  v3.2s,  v2.2s
        zip1            v16.2s, v16.2s, v17.2s  // weights_ver
        zip1            v18.2s, v18.2s, v19.2s
        shll            v22.8h, v4.8b,  #8      // bottom*256
        shll            v23.8h, v4.8b,  #8
        usubl           v0.8h,  v0.8b,  v5.8b   // left-right
        usubl           v1.8h,  v1.8b,  v5.8b
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v18.8h, v18.8b
        mla             v20.8h, v0.8h,  v7.8h   // right*256 + (left-right)*weights_hor
        mla             v21.8h, v1.8h,  v7.8h
        mla             v22.8h, v6.8h,  v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v23.8h, v6.8h,  v18.8h
        uhadd           v20.8h, v20.8h, v22.8h  // halved sum of both blends
        uhadd           v21.8h, v21.8h, v23.8h
        rshrn           v20.8b, v20.8h, #8      // rounded /256 => total /512
        rshrn           v21.8b, v21.8h, #8
        st1             {v20.s}[0],  [x0], x1
        st1             {v20.s}[1],  [x6], x1
        subs            w4,  w4, #4
        st1             {v21.s}[0],  [x0], x1
        st1             {v21.s}[1],  [x6], x1
        b.gt            4b
        ret
80:     // width == 8: one row per 8h register
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8b},  [x2]          // top
        ld1             {v7.8b},  [x10]         // weights_hor
        sub             x2,  x2,  #4
        mov             x7,  #-4
        dup             v5.16b, v6.b[7]         // right
        usubl           v6.8h,  v6.8b,  v4.8b   // top-bottom
        uxtl            v7.8h,  v7.8b           // weights_hor
8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2],  x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver
        shll            v20.8h, v5.8b,  #8      // right*256
        shll            v21.8h, v5.8b,  #8
        shll            v22.8h, v5.8b,  #8
        shll            v23.8h, v5.8b,  #8
        usubl           v0.8h,  v0.8b,  v5.8b   // left-right
        usubl           v1.8h,  v1.8b,  v5.8b
        usubl           v2.8h,  v2.8b,  v5.8b
        usubl           v3.8h,  v3.8b,  v5.8b
        shll            v24.8h, v4.8b,  #8      // bottom*256
        shll            v25.8h, v4.8b,  #8
        shll            v26.8h, v4.8b,  #8
        shll            v27.8h, v4.8b,  #8
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b
        mla             v20.8h, v3.8h,  v7.8h   // right*256 + (left-right)*weights_hor
        mla             v21.8h, v2.8h,  v7.8h   // (left flipped)
        mla             v22.8h, v1.8h,  v7.8h
        mla             v23.8h, v0.8h,  v7.8h
        mla             v24.8h, v6.8h,  v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h, v6.8h,  v17.8h
        mla             v26.8h, v6.8h,  v18.8h
        mla             v27.8h, v6.8h,  v19.8h
        uhadd           v20.8h, v20.8h, v24.8h
        uhadd           v21.8h, v21.8h, v25.8h
        uhadd           v22.8h, v22.8h, v26.8h
        uhadd           v23.8h, v23.8h, v27.8h
        rshrn           v20.8b, v20.8h, #8
        rshrn           v21.8b, v21.8h, #8
        rshrn           v22.8b, v22.8h, #8
        rshrn           v23.8b, v23.8h, #8
        st1             {v20.8b},  [x0], x1
        st1             {v21.8b},  [x6], x1
        subs            w4,  w4, #4
        st1             {v22.8b},  [x0], x1
        st1             {v23.8b},  [x6], x1
        b.gt            8b
        ret
160:
320:
640:    // widths 16/32/64: 2 rows x 16 columns per inner iteration
        AARCH64_VALID_JUMP_TARGET
        add             x12, x2,  w3, uxtw      // top-right pixel
        sub             x2,  x2,  #2            // left edge, 2 rows at a time
        mov             x7,  #-2
        ld1r            {v5.16b},  [x12]        // right
        sub             x1,  x1,  w3, uxtw      // stride minus width advanced by stores
        mov             w9,  w3                 // save width for the row loop

1:      // per-2-rows: left pixels and vertical weights for these rows
        ld2r            {v0.8b, v1.8b},   [x2],  x7 // left
        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
        usubl           v0.8h,  v0.8b,  v5.8b   // left-right
        usubl           v1.8h,  v1.8b,  v5.8b
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v17.8h, v17.8b
2:      // per-16-columns
        ld1             {v7.16b},  [x10], #16   // weights_hor
        ld1             {v3.16b},  [x8],  #16   // top
        shll            v20.8h, v5.8b,  #8      // right*256
        shll            v21.8h, v5.8b,  #8
        shll            v22.8h, v5.8b,  #8
        shll            v23.8h, v5.8b,  #8
        uxtl            v6.8h,  v7.8b           // weights_hor
        uxtl2           v7.8h,  v7.16b
        usubl           v2.8h,  v3.8b,  v4.8b   // top-bottom
        usubl2          v3.8h,  v3.16b, v4.16b
        mla             v20.8h, v1.8h,  v6.8h   // right*256 + (left-right)*weights_hor
        mla             v21.8h, v1.8h,  v7.8h   // (left flipped)
        mla             v22.8h, v0.8h,  v6.8h
        mla             v23.8h, v0.8h,  v7.8h
        shll            v24.8h, v4.8b,  #8      // bottom*256
        shll            v25.8h, v4.8b,  #8
        shll            v26.8h, v4.8b,  #8
        shll            v27.8h, v4.8b,  #8
        mla             v24.8h, v2.8h,  v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h, v3.8h,  v16.8h
        mla             v26.8h, v2.8h,  v17.8h
        mla             v27.8h, v3.8h,  v17.8h
        uhadd           v20.8h, v20.8h, v24.8h
        uhadd           v21.8h, v21.8h, v25.8h
        uhadd           v22.8h, v22.8h, v26.8h
        uhadd           v23.8h, v23.8h, v27.8h
        rshrn           v20.8b, v20.8h, #8
        rshrn2          v20.16b, v21.8h, #8
        rshrn           v22.8b, v22.8h, #8
        rshrn2          v22.16b, v23.8h, #8
        subs            w3,  w3, #16            // columns remaining
        st1             {v20.16b}, [x0], #16
        st1             {v22.16b}, [x6], #16
        b.gt            2b
        subs            w4,  w4, #2             // rows remaining
        b.le            9f
        sub             x8,  x8,  w9, uxtw      // rewind top pointer
        sub             x10, x10, w9, uxtw      // rewind weights_hor pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                 // reset column counter
        b               1b
9:
        ret

L(ipred_smooth_tbl):
        // Descending width order; index = clz(width) - 25.
        .hword L(ipred_smooth_tbl) - 640b
        .hword L(ipred_smooth_tbl) - 320b
        .hword L(ipred_smooth_tbl) - 160b
        .hword L(ipred_smooth_tbl) -  80b
        .hword L(ipred_smooth_tbl) -  40b
endfunc
| |
// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
function ipred_smooth_v_8bpc_neon, export=1
        // SMOOTH_V intra prediction, 8 bpc:
        //   dst[y][x] = (bottom*256 + (top[x]-bottom)*weights_ver[y] + 128) >> 8
        // where bottom = topleft[-height], top = topleft + 1 and
        // weights_ver = sm_weights + height. Dispatches on width through
        // the hword offset table at the end of the function.
        movrel          x7,  X(sm_weights)
        add             x7,  x7,  w4, uxtw      // weights_ver = sm_weights + height
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_v_tbl)
        sub             x8,  x2,  w4, uxtw      // &topleft[-height]
        sub             w9,  w9,  #25           // table index derived from clz(width)
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.16b}, [x8]          // bottom
        add             x2,  x2,  #1            // x2 = top row (topleft + 1)
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1            // x6 = second output row
        lsl             x1,  x1,  #1            // stride *= 2 (two rows per pointer)
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2s},  [x2]          // top
        usubl           v6.8h,  v6.8b,  v4.8b   // top-bottom
4:
        // Four rows of width 4 per iteration; one vertical weight per row.
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        shll            v22.8h, v4.8b,  #8      // bottom*256
        shll            v23.8h, v4.8b,  #8
        zip1            v16.2s, v16.2s, v17.2s  // weights_ver
        zip1            v18.2s, v18.2s, v19.2s
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v18.8h, v18.8b
        mla             v22.8h, v6.8h,  v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v23.8h, v6.8h,  v18.8h
        rshrn           v22.8b, v22.8h, #8      // round and narrow back to 8 bit
        rshrn           v23.8b, v23.8h, #8
        st1             {v22.s}[0], [x0], x1
        st1             {v22.s}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v23.s}[0], [x0], x1
        st1             {v23.s}[1], [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8b},  [x2]          // top
        usubl           v6.8h,  v6.8b,  v4.8b   // top-bottom
8:
        // Four rows of width 8 per iteration.
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        shll            v24.8h, v4.8b,  #8      // bottom*256
        shll            v25.8h, v4.8b,  #8
        shll            v26.8h, v4.8b,  #8
        shll            v27.8h, v4.8b,  #8
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b
        mla             v24.8h, v6.8h,  v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h, v6.8h,  v17.8h
        mla             v26.8h, v6.8h,  v18.8h
        mla             v27.8h, v6.8h,  v19.8h
        rshrn           v24.8b, v24.8h, #8
        rshrn           v25.8b, v25.8h, #8
        rshrn           v26.8b, v26.8h, #8
        rshrn           v27.8b, v27.8h, #8
        st1             {v24.8b}, [x0], x1
        st1             {v25.8b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v26.8b}, [x0], x1
        st1             {v27.8b}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        add             x5,  x0,  x1
        add             x8,  x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3, uxtw      // row advance minus the width written inline
        mov             w9,  w3                 // w9 = saved width

1:
        // Load one vertical weight for each of the four rows.
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        uxtl            v16.8h, v16.8b          // weights_ver
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b
        uxtl            v19.8h, v19.8b
2:
        // Blend 16 columns at a time across the four rows.
        ld1             {v3.16b}, [x2], #16     // top
        shll            v20.8h, v4.8b,  #8      // bottom*256
        shll            v21.8h, v4.8b,  #8
        shll            v22.8h, v4.8b,  #8
        shll            v23.8h, v4.8b,  #8
        shll            v24.8h, v4.8b,  #8
        shll            v25.8h, v4.8b,  #8
        shll            v26.8h, v4.8b,  #8
        shll            v27.8h, v4.8b,  #8
        usubl           v2.8h,  v3.8b,  v4.8b   // top-bottom
        usubl2          v3.8h,  v3.16b, v4.16b
        mla             v20.8h, v2.8h,  v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v21.8h, v3.8h,  v16.8h
        mla             v22.8h, v2.8h,  v17.8h
        mla             v23.8h, v3.8h,  v17.8h
        mla             v24.8h, v2.8h,  v18.8h
        mla             v25.8h, v3.8h,  v18.8h
        mla             v26.8h, v2.8h,  v19.8h
        mla             v27.8h, v3.8h,  v19.8h
        rshrn           v20.8b, v20.8h, #8
        rshrn2          v20.16b, v21.8h, #8
        rshrn           v22.8b, v22.8h, #8
        rshrn2          v22.16b, v23.8h, #8
        rshrn           v24.8b, v24.8h, #8
        rshrn2          v24.16b, v25.8h, #8
        rshrn           v26.8b, v26.8h, #8
        rshrn2          v26.16b, v27.8h, #8
        subs            w3,  w3,  #16
        st1             {v20.16b}, [x0], #16
        st1             {v22.16b}, [x6], #16
        st1             {v24.16b}, [x5], #16
        st1             {v26.16b}, [x8], #16
        b.gt            2b
        subs            w4,  w4,  #4            // height -= 4
        b.le            9f
        sub             x2,  x2,  w9, uxtw      // rewind top pointer to the row start
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x8,  x8,  x1
        mov             w3,  w9                 // restore width
        b               1b
9:
        ret

L(ipred_smooth_v_tbl):
        .hword L(ipred_smooth_v_tbl) - 640b
        .hword L(ipred_smooth_v_tbl) - 320b
        .hword L(ipred_smooth_v_tbl) - 160b
        .hword L(ipred_smooth_v_tbl) - 80b
        .hword L(ipred_smooth_v_tbl) - 40b
endfunc
| |
// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
function ipred_smooth_h_8bpc_neon, export=1
        // SMOOTH_H intra prediction, 8 bpc:
        //   dst[y][x] = (right*256 + (left[y]-right)*weights_hor[x] + 128) >> 8
        // where right = topleft[width], left rows are read downwards from
        // topleft[-1], and weights_hor = sm_weights + width.
        movrel          x8,  X(sm_weights)
        add             x8,  x8,  w3, uxtw      // weights_hor = sm_weights + width
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_h_tbl)
        add             x12, x2,  w3, uxtw      // &topleft[width]
        sub             w9,  w9,  #25           // table index derived from clz(width)
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v5.16b}, [x12] // right
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1            // x6 = second output row
        lsl             x1,  x1,  #1            // stride *= 2 (two rows per pointer)
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v7.2s},  [x8]          // weights_hor
        sub             x2,  x2,  #4            // step back to read 4 left pixels
        mov             x7,  #-4                // walk the left edge upwards
        uxtl            v7.8h,  v7.8b           // weights_hor
4:
        // Four rows of width 4 per iteration; left pixels are loaded in
        // reverse order, hence the flipping zips below.
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        shll            v20.8h, v5.8b,  #8      // right*256
        shll            v21.8h, v5.8b,  #8
        zip1            v1.2s,  v1.2s,  v0.2s   // left, flipped
        zip1            v0.2s,  v3.2s,  v2.2s
        usubl           v0.8h,  v0.8b,  v5.8b   // left-right
        usubl           v1.8h,  v1.8b,  v5.8b
        mla             v20.8h, v0.8h,  v7.8h   // right*256 + (left-right)*weights_hor
        mla             v21.8h, v1.8h,  v7.8h
        rshrn           v20.8b, v20.8h, #8      // round and narrow back to 8 bit
        rshrn           v21.8b, v21.8h, #8
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.s}[0], [x0], x1
        st1             {v21.s}[1], [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v7.8b},  [x8]          // weights_hor
        sub             x2,  x2,  #4
        mov             x7,  #-4
        uxtl            v7.8h,  v7.8b           // weights_hor
8:
        // Four rows of width 8 per iteration; v3..v0 hold the rows in
        // flipped (bottom-to-top) order.
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        shll            v20.8h, v5.8b,  #8      // right*256
        shll            v21.8h, v5.8b,  #8
        shll            v22.8h, v5.8b,  #8
        shll            v23.8h, v5.8b,  #8
        usubl           v3.8h,  v3.8b,  v5.8b   // left-right
        usubl           v2.8h,  v2.8b,  v5.8b
        usubl           v1.8h,  v1.8b,  v5.8b
        usubl           v0.8h,  v0.8b,  v5.8b
        mla             v20.8h, v3.8h,  v7.8h   // right*256 + (left-right)*weights_hor
        mla             v21.8h, v2.8h,  v7.8h   // (left flipped)
        mla             v22.8h, v1.8h,  v7.8h
        mla             v23.8h, v0.8h,  v7.8h
        rshrn           v20.8b, v20.8h, #8
        rshrn           v21.8b, v21.8h, #8
        rshrn           v22.8b, v22.8h, #8
        rshrn           v23.8b, v23.8h, #8
        st1             {v20.8b}, [x0], x1
        st1             {v21.8b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8b}, [x0], x1
        st1             {v23.8b}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        sub             x2,  x2,  #4
        mov             x7,  #-4
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3, uxtw      // row advance minus the width written inline
        mov             w9,  w3                 // w9 = saved width

1:
        // Load four left pixels (one per row, flipped order).
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        usubl           v0.8h,  v0.8b,  v5.8b   // left-right
        usubl           v1.8h,  v1.8b,  v5.8b
        usubl           v2.8h,  v2.8b,  v5.8b
        usubl           v3.8h,  v3.8b,  v5.8b
2:
        // Blend 16 columns at a time across the four rows.
        ld1             {v7.16b}, [x8], #16     // weights_hor
        shll            v20.8h, v5.8b,  #8      // right*256
        shll            v21.8h, v5.8b,  #8
        shll            v22.8h, v5.8b,  #8
        shll            v23.8h, v5.8b,  #8
        shll            v24.8h, v5.8b,  #8
        shll            v25.8h, v5.8b,  #8
        shll            v26.8h, v5.8b,  #8
        shll            v27.8h, v5.8b,  #8
        uxtl            v6.8h,  v7.8b           // weights_hor
        uxtl2           v7.8h,  v7.16b
        mla             v20.8h, v3.8h,  v6.8h   // right*256 + (left-right)*weights_hor
        mla             v21.8h, v3.8h,  v7.8h   // (left flipped)
        mla             v22.8h, v2.8h,  v6.8h
        mla             v23.8h, v2.8h,  v7.8h
        mla             v24.8h, v1.8h,  v6.8h
        mla             v25.8h, v1.8h,  v7.8h
        mla             v26.8h, v0.8h,  v6.8h
        mla             v27.8h, v0.8h,  v7.8h
        rshrn           v20.8b, v20.8h, #8
        rshrn2          v20.16b, v21.8h, #8
        rshrn           v22.8b, v22.8h, #8
        rshrn2          v22.16b, v23.8h, #8
        rshrn           v24.8b, v24.8h, #8
        rshrn2          v24.16b, v25.8h, #8
        rshrn           v26.8b, v26.8h, #8
        rshrn2          v26.16b, v27.8h, #8
        subs            w3,  w3,  #16
        st1             {v20.16b}, [x0], #16
        st1             {v22.16b}, [x6], #16
        st1             {v24.16b}, [x5], #16
        st1             {v26.16b}, [x10], #16
        b.gt            2b
        subs            w4,  w4,  #4            // height -= 4
        b.le            9f
        sub             x8,  x8,  w9, uxtw      // rewind weights_hor pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9                 // restore width
        b               1b
9:
        ret

L(ipred_smooth_h_tbl):
        .hword L(ipred_smooth_h_tbl) - 640b
        .hword L(ipred_smooth_h_tbl) - 320b
        .hword L(ipred_smooth_h_tbl) - 160b
        .hword L(ipred_smooth_h_tbl) - 80b
        .hword L(ipred_smooth_h_tbl) - 40b
endfunc
| |
const padding_mask_buf
        // 32 zero bytes immediately preceding padding_mask. Loading 16 or
        // 32 bytes from (padding_mask - n) yields a vector whose first n
        // lanes are 0x00 and remaining lanes 0xff; used with bit/bif to
        // splat a padding pixel over the out-of-range tail of an edge
        // buffer (see ipred_z1_upsample_edge / ipred_z1_filter_edge).
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
padding_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
| |
// void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz,
//                                       const pixel *const in, const int end);
function ipred_z1_upsample_edge_8bpc_neon, export=1
        // 2x upsample of an intra edge: out interleaves the original
        // pixels with values filtered by the 4-tap kernel
        // (-1, 9, 9, -1) / 16, clamped to [0, 255] by sqrshrun.
        // Pixels past in[end] are replaced by in[end] via the mask.
        // NOTE(review): processes a fixed 16 input pixels / 32 outputs;
        // callers presumably guarantee hsz fits — confirm at call sites.
        movrel          x4,  padding_mask
        ld1             {v0.16b}, [x2]          // in[]
        add             x5,  x2,  w3, uxtw      // in[end]
        sub             x4,  x4,  w3, uxtw

        ld1r            {v1.16b}, [x5]          // padding
        ld1             {v3.16b}, [x4]          // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v1.16b,  v3.16b // padded in[]

        ext             v4.16b,  v0.16b,  v1.16b, #1
        ext             v5.16b,  v0.16b,  v1.16b, #2
        ext             v6.16b,  v0.16b,  v1.16b, #3

        uaddl           v16.8h,  v4.8b,  v5.8b  // in[i+1] + in[i+2]
        uaddl2          v17.8h,  v4.16b, v5.16b
        uaddl           v18.8h,  v0.8b,  v6.8b  // in[i+0] + in[i+3]
        uaddl2          v19.8h,  v0.16b, v6.16b
        mul             v16.8h,  v16.8h, v31.8h // 9*(in[i+1] + in[i+2])
        mul             v17.8h,  v17.8h, v31.8h
        sub             v16.8h,  v16.8h, v18.8h // - (in[i+0] + in[i+3])
        sub             v17.8h,  v17.8h, v19.8h

        sqrshrun        v16.8b,  v16.8h, #4     // round, >> 4, clamp to u8
        sqrshrun2       v16.16b, v17.8h, #4

        zip1            v0.16b,  v4.16b, v16.16b // interleave original/filtered
        zip2            v1.16b,  v4.16b, v16.16b

        st1             {v0.16b, v1.16b}, [x0]

        ret
endfunc
| |
const edge_filter
        // 3-tap edge smoothing kernels [a, b, a] / 16 for filter
        // strengths 1 and 2; ipred_z1_filter_edge loads two bytes at
        // offset +1 into each 4-byte row (the a, b pair).
        .byte 0, 4, 8, 0
        .byte 0, 5, 6, 0
        // Leaving out the coeffs for strength=3
        // .byte 2, 4, 4, 0
endconst
| |
// void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz,
//                                     const pixel *const in, const int end,
//                                     const int strength);
function ipred_z1_filter_edge_8bpc_neon, export=1
        // Smooths an intra edge buffer. Strengths 1-2 use a 3-tap kernel
        // from edge_filter above; strength 3 uses the 5-tap kernel
        // (2, 4, 4, 4, 2) / 16 in the L(fivetap) path. Input beyond
        // in[end] is replaced by the last valid pixel before filtering.
        cmp             w4,  #3
        b.eq            L(fivetap)              // if (strength == 3) goto fivetap

        movrel          x5,  edge_filter, -3
        add             x5,  x5,  w4, uxtw #2   // edge_filter + (strength - 1)*4 + 1

        ld1             {v31.h}[0], [x5]        // kernel[1-2]

        ld1             {v0.16b}, [x2], #16

        dup             v30.16b, v31.b[0]       // outer kernel coefficient
        dup             v31.16b, v31.b[1]       // center kernel coefficient
1:
        // in[end], is the last valid pixel. We produce 16 pixels out by
        // using 18 pixels in - the last pixel used is [17] of the ones
        // read/buffered.
        cmp             w3,  #17
        ld1             {v1.16b}, [x2], #16
        b.lt            2f
        ext             v2.16b,  v0.16b,  v1.16b, #1
        ext             v3.16b,  v0.16b,  v1.16b, #2
        umull           v4.8h,   v0.8b,   v30.8b
        umlal           v4.8h,   v2.8b,   v31.8b
        umlal           v4.8h,   v3.8b,   v30.8b
        umull2          v5.8h,   v0.16b,  v30.16b
        umlal2          v5.8h,   v2.16b,  v31.16b
        umlal2          v5.8h,   v3.16b,  v30.16b
        subs            w1,  w1,  #16
        mov             v0.16b,  v1.16b
        rshrn           v4.8b,   v4.8h,   #4
        rshrn2          v4.16b,  v5.8h,   #4
        sub             w3,  w3,  #16
        st1             {v4.16b}, [x0], #16
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead)
        movrel          x5,  padding_mask
        sub             w6,  w3,  #32
        sub             x5,  x5,  w3, uxtw
        add             x6,  x2,  w6, sxtw

        ld1             {v2.16b}, [x5]          // padding_mask

        ld1r            {v1.16b}, [x6]
        bit             v0.16b,  v1.16b,  v2.16b // Pad v0-v1

        // Filter one block
        ext             v2.16b,  v0.16b,  v1.16b, #1
        ext             v3.16b,  v0.16b,  v1.16b, #2
        umull           v4.8h,   v0.8b,   v30.8b
        umlal           v4.8h,   v2.8b,   v31.8b
        umlal           v4.8h,   v3.8b,   v30.8b
        umull2          v5.8h,   v0.16b,  v30.16b
        umlal2          v5.8h,   v2.16b,  v31.16b
        umlal2          v5.8h,   v3.16b,  v30.16b
        subs            w1,  w1,  #16
        rshrn           v4.8b,   v4.8h,   #4
        rshrn2          v4.16b,  v5.8h,   #4
        st1             {v4.16b}, [x0], #16
        b.le            9f
5:
        // After one block, any remaining output would only be filtering
        // padding - thus just store the padding.
        subs            w1,  w1,  #16
        st1             {v1.16b}, [x0], #16
        b.gt            5b
9:
        ret

L(fivetap):
        // Strength 3: 5-tap kernel (2, 4, 4, 4, 2) / 16.
        sub             x2,  x2,  #1            // topleft -= 1
        movi            v29.16b, #2
        ld1             {v0.16b}, [x2], #16
        movi            v30.16b, #4
        movi            v31.16b, #4
        ins             v0.b[0], v0.b[1]        // mirror the first pixel
1:
        // in[end+1], is the last valid pixel. We produce 16 pixels out by
        // using 20 pixels in - the last pixel used is [19] of the ones
        // read/buffered.
        cmp             w3,  #18
        ld1             {v1.16b}, [x2], #16
        b.lt            2f                      // if (end + 1 < 19)
        ext             v2.16b,  v0.16b,  v1.16b, #1
        ext             v3.16b,  v0.16b,  v1.16b, #2
        ext             v4.16b,  v0.16b,  v1.16b, #3
        ext             v5.16b,  v0.16b,  v1.16b, #4
        umull           v6.8h,   v0.8b,   v29.8b
        umlal           v6.8h,   v2.8b,   v30.8b
        umlal           v6.8h,   v3.8b,   v31.8b
        umlal           v6.8h,   v4.8b,   v30.8b
        umlal           v6.8h,   v5.8b,   v29.8b
        umull2          v7.8h,   v0.16b,  v29.16b
        umlal2          v7.8h,   v2.16b,  v30.16b
        umlal2          v7.8h,   v3.16b,  v31.16b
        umlal2          v7.8h,   v4.16b,  v30.16b
        umlal2          v7.8h,   v5.16b,  v29.16b
        subs            w1,  w1,  #16
        mov             v0.16b,  v1.16b
        rshrn           v6.8b,   v6.8h,   #4
        rshrn2          v6.16b,  v7.8h,   #4
        sub             w3,  w3,  #16
        st1             {v6.16b}, [x0], #16
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead)
        movrel          x5,  padding_mask, -1
        sub             w6,  w3,  #31
        sub             x5,  x5,  w3, uxtw
        add             x6,  x2,  w6, sxtw

        ld1             {v2.16b, v3.16b}, [x5]  // padding_mask

        ld1r            {v28.16b}, [x6]
        bit             v0.16b,  v28.16b, v2.16b // Pad v0-v1
        bit             v1.16b,  v28.16b, v3.16b
4:
        // Filter one block
        ext             v2.16b,  v0.16b,  v1.16b, #1
        ext             v3.16b,  v0.16b,  v1.16b, #2
        ext             v4.16b,  v0.16b,  v1.16b, #3
        ext             v5.16b,  v0.16b,  v1.16b, #4
        umull           v6.8h,   v0.8b,   v29.8b
        umlal           v6.8h,   v2.8b,   v30.8b
        umlal           v6.8h,   v3.8b,   v31.8b
        umlal           v6.8h,   v4.8b,   v30.8b
        umlal           v6.8h,   v5.8b,   v29.8b
        umull2          v7.8h,   v0.16b,  v29.16b
        umlal2          v7.8h,   v2.16b,  v30.16b
        umlal2          v7.8h,   v3.16b,  v31.16b
        umlal2          v7.8h,   v4.16b,  v30.16b
        umlal2          v7.8h,   v5.16b,  v29.16b
        subs            w1,  w1,  #16
        mov             v0.16b,  v1.16b
        mov             v1.16b,  v28.16b        // feed pure padding into the next block
        rshrn           v6.8b,   v6.8h,   #4
        rshrn2          v6.16b,  v7.8h,   #4
        sub             w3,  w3,  #16
        st1             {v6.16b}, [x0], #16
        b.le            9f
        // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
        // filter properly once more - aka (w3 >= 0).
        cmp             w3,  #0
        b.ge            4b
5:
        // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
        // last valid pixel - thus just output that without filtering.
        subs            w1,  w1,  #16
        st1             {v1.16b}, [x0], #16
        b.gt            5b
9:
        ret
endfunc
| |
// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px,
//                                const int n);
function ipred_pixel_set_8bpc_neon, export=1
        // Fill out[0..n) with the pixel value px, 16 bytes per store.
        // NOTE(review): a final partial chunk would overwrite up to 15
        // bytes past n; callers presumably pass n as a multiple of 16 or
        // accept the overwrite - confirm at call sites.
        dup             v0.16b,  w1
1:
        subs            w2,  w2,  #16
        st1             {v0.16b}, [x0], #16
        b.gt            1b
        ret
endfunc
| |
// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const top,
//                               const int width, const int height,
//                               const int dx, const int max_base_x);
function ipred_z1_fill1_8bpc_neon, export=1
        // Z1 (top-only directional) prediction. For each row, xpos
        // advances by dx; base = xpos >> 6 indexes top[], frac =
        // xpos & 0x3e interpolates between top[base] and top[base+1]:
        //   dst = (top[base]*(64-frac) + top[base+1]*frac + 32) >> 6
        // Rows with base >= max_base_x are filled with top[max_base_x].
        clz             w9,  w3
        adr             x8,  L(ipred_z1_fill1_tbl)
        sub             w9,  w9,  #25           // table index derived from clz(width)
        ldrh            w9,  [x8, w9, uxtw #1]
        add             x10, x2,  w6, uxtw      // top[max_base_x]
        sub             x8,  x8,  w9, uxtw
        ld1r            {v31.16b}, [x10]        // padding
        mov             w7,  w5                 // xpos = dx
        mov             w15, #64
        br              x8
40:
        AARCH64_VALID_JUMP_TARGET
4:
        // Two rows of width 4 per iteration.
        lsr             w8,  w7,  #6            // base
        and             w9,  w7,  #0x3e         // frac
        add             w7,  w7,  w5            // xpos += dx
        cmp             w8,  w6                 // base >= max_base_x
        lsr             w10, w7,  #6            // base
        and             w11, w7,  #0x3e         // frac
        b.ge            49f
        ldr             d0,  [x2, w8, uxtw]     // top[base]
        ldr             d2,  [x2, w10, uxtw]
        dup             v4.4h,  w9              // frac
        dup             v5.4h,  w11
        ext             v1.8b,  v0.8b,  v0.8b,  #1 // top[base+1]
        ext             v3.8b,  v2.8b,  v2.8b,  #1
        usubl           v6.8h,  v1.8b,  v0.8b   // top[base+1]-top[base]
        usubl           v7.8h,  v3.8b,  v2.8b
        ushll           v16.8h, v0.8b,  #6      // top[base]*64
        ushll           v17.8h, v2.8b,  #6
        mla             v16.4h, v6.4h,  v4.4h   // + (top[base+1]-top[base])*frac
        mla             v17.4h, v7.4h,  v5.4h
        rshrn           v16.8b, v16.8h, #6
        rshrn           v17.8b, v17.8h, #6
        st1             {v16.s}[0], [x0], x1
        add             w7,  w7,  w5            // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.s}[0], [x0], x1
        b.gt            4b
        ret

49:
        // base has run past max_base_x: fill the rest with padding.
        st1             {v31.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.s}[0], [x0], x1
        b.gt            49b
        ret

80:
        AARCH64_VALID_JUMP_TARGET
8:
        // Two rows of width 8 per iteration.
        lsr             w8,  w7,  #6            // base
        and             w9,  w7,  #0x3e         // frac
        add             w7,  w7,  w5            // xpos += dx
        cmp             w8,  w6                 // base >= max_base_x
        lsr             w10, w7,  #6            // base
        and             w11, w7,  #0x3e         // frac
        b.ge            89f
        ldr             q0,  [x2, w8, uxtw]     // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.8b,  w9              // frac
        dup             v5.8b,  w11
        sub             w9,  w15, w9            // 64 - frac
        sub             w11, w15, w11
        dup             v6.8b,  w9              // 64 - frac
        dup             v7.8b,  w11
        ext             v1.16b, v0.16b, v0.16b, #1 // top[base+1]
        ext             v3.16b, v2.16b, v2.16b, #1
        umull           v16.8h, v0.8b,  v6.8b   // top[base]*(64-frac)
        umlal           v16.8h, v1.8b,  v4.8b   // + top[base+1]*frac
        umull           v17.8h, v2.8b,  v7.8b
        umlal           v17.8h, v3.8b,  v5.8b
        rshrn           v16.8b, v16.8h, #6
        rshrn           v17.8b, v17.8h, #6
        st1             {v16.8b}, [x0], x1
        add             w7,  w7,  w5            // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8b}, [x0], x1
        b.gt            8b
        ret

89:
        // base has run past max_base_x: fill the rest with padding.
        st1             {v31.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8b}, [x0], x1
        b.gt            89b
        ret

160:
320:
640:
        AARCH64_VALID_JUMP_TARGET

        mov             w12, w3                 // w12 = saved width

        add             x13, x0,  x1            // x13 = second output row
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3, uxtw      // row advance minus the width written inline
1:
        lsr             w8,  w7,  #6            // base
        and             w9,  w7,  #0x3e         // frac
        add             w7,  w7,  w5            // xpos += dx
        cmp             w8,  w6                 // base >= max_base_x
        lsr             w10, w7,  #6            // base
        and             w11, w7,  #0x3e         // frac
        b.ge            169f
        add             x8,  x2,  w8,  uxtw
        add             x10, x2,  w10, uxtw
        dup             v4.16b, w9              // frac
        dup             v5.16b, w11
        ld1             {v0.16b, v1.16b}, [x8], #32 // top[base]
        ld1             {v2.16b, v3.16b}, [x10], #32
        sub             w9,  w15, w9            // 64 - frac
        sub             w11, w15, w11
        dup             v6.16b, w9              // 64 - frac
        dup             v7.16b, w11
        add             w7,  w7,  w5            // xpos += dx
2:
        // Interpolate 16 columns at a time for the two rows.
        ext             v16.16b, v0.16b, v1.16b, #1 // top[base+1]
        ext             v17.16b, v2.16b, v3.16b, #1
        subs            w3,  w3,  #16
        umull           v18.8h, v0.8b,   v6.8b  // top[base]*(64-frac)
        umlal           v18.8h, v16.8b,  v4.8b  // + top[base+1]*frac
        umull2          v19.8h, v0.16b,  v6.16b
        umlal2          v19.8h, v16.16b, v4.16b
        umull           v20.8h, v2.8b,   v7.8b
        umlal           v20.8h, v17.8b,  v5.8b
        umull2          v21.8h, v2.16b,  v7.16b
        umlal2          v21.8h, v17.16b, v5.16b
        rshrn           v16.8b,  v18.8h, #6
        rshrn2          v16.16b, v19.8h, #6
        rshrn           v17.8b,  v20.8h, #6
        rshrn2          v17.16b, v21.8h, #6
        st1             {v16.16b}, [x0],  #16
        st1             {v17.16b}, [x13], #16
        b.le            3f
        mov             v0.16b,  v1.16b         // shift the window; reload the next 16
        ld1             {v1.16b}, [x8],  #16    // top[base]
        mov             v2.16b,  v3.16b
        ld1             {v3.16b}, [x10], #16
        b               2b

3:
        subs            w4,  w4,  #2
        b.le            9f
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12                // restore width
        b               1b
9:
        ret

169:
        // base has run past max_base_x: fill the rest with padding.
        st1             {v31.16b}, [x0],  #16
        subs            w3,  w3,  #16
        st1             {v31.16b}, [x13], #16
        b.gt            169b
        subs            w4,  w4,  #2
        b.le            9b
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12
        b               169b

L(ipred_z1_fill1_tbl):
        .hword L(ipred_z1_fill1_tbl) - 640b
        .hword L(ipred_z1_fill1_tbl) - 320b
        .hword L(ipred_z1_fill1_tbl) - 160b
        .hword L(ipred_z1_fill1_tbl) - 80b
        .hword L(ipred_z1_fill1_tbl) - 40b
endfunc
| |
function ipred_z1_fill2_8bpc_neon, export=1
        // Z1 fill for the upsampled-edge case: the top buffer holds
        // 2x-upsampled pixels, so even/odd lanes are de-interleaved with
        // uzp1/uzp2 to get top[base] and top[base+1]. Same interpolation
        // as ipred_z1_fill1; only widths 4 and 8 occur here.
        cmp             w3,  #8
        add             x10, x2,  w6, uxtw      // top[max_base_x]
        ld1r            {v31.16b}, [x10]        // padding
        mov             w7,  w5                 // xpos = dx
        mov             w15, #64
        b.eq            8f

4:      // w == 4
        lsr             w8,  w7,  #6            // base
        and             w9,  w7,  #0x3e         // frac
        add             w7,  w7,  w5            // xpos += dx
        cmp             w8,  w6                 // base >= max_base_x
        lsr             w10, w7,  #6            // base
        and             w11, w7,  #0x3e         // frac
        b.ge            49f
        ldr             d0,  [x2, w8, uxtw]     // top[base]
        ldr             d2,  [x2, w10, uxtw]
        dup             v4.4h,  w9              // frac
        dup             v5.4h,  w11
        uzp2            v1.8b,  v0.8b,  v0.8b   // top[base+1]
        uzp1            v0.8b,  v0.8b,  v0.8b   // top[base]
        uzp2            v3.8b,  v2.8b,  v2.8b
        uzp1            v2.8b,  v2.8b,  v2.8b
        usubl           v6.8h,  v1.8b,  v0.8b   // top[base+1]-top[base]
        usubl           v7.8h,  v3.8b,  v2.8b
        ushll           v16.8h, v0.8b,  #6      // top[base]*64
        ushll           v17.8h, v2.8b,  #6
        mla             v16.4h, v6.4h,  v4.4h   // + (top[base+1]-top[base])*frac
        mla             v17.4h, v7.4h,  v5.4h
        rshrn           v16.8b, v16.8h, #6
        rshrn           v17.8b, v17.8h, #6
        st1             {v16.s}[0], [x0], x1
        add             w7,  w7,  w5            // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.s}[0], [x0], x1
        b.gt            4b
        ret

49:
        // base has run past max_base_x: fill the rest with padding.
        st1             {v31.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.s}[0], [x0], x1
        b.gt            49b
        ret

8:      // w == 8
        lsr             w8,  w7,  #6            // base
        and             w9,  w7,  #0x3e         // frac
        add             w7,  w7,  w5            // xpos += dx
        cmp             w8,  w6                 // base >= max_base_x
        lsr             w10, w7,  #6            // base
        and             w11, w7,  #0x3e         // frac
        b.ge            89f
        ldr             q0,  [x2, w8, uxtw]     // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.8b,  w9              // frac
        dup             v5.8b,  w11
        sub             w9,  w15, w9            // 64 - frac
        sub             w11, w15, w11
        dup             v6.8b,  w9              // 64 - frac
        dup             v7.8b,  w11
        uzp2            v1.16b, v0.16b, v0.16b  // top[base+1]
        uzp1            v0.16b, v0.16b, v0.16b  // top[base]
        uzp2            v3.16b, v2.16b, v2.16b
        uzp1            v2.16b, v2.16b, v2.16b
        umull           v16.8h, v1.8b,  v4.8b   // top[base+1]*frac
        umlal           v16.8h, v0.8b,  v6.8b   // + top[base]*(64-frac)
        umull           v17.8h, v3.8b,  v5.8b
        umlal           v17.8h, v2.8b,  v7.8b
        rshrn           v16.8b, v16.8h, #6
        rshrn           v17.8b, v17.8h, #6
        st1             {v16.8b}, [x0], x1
        add             w7,  w7,  w5            // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8b}, [x0], x1
        b.gt            8b
        ret

89:
        // base has run past max_base_x: fill the rest with padding.
        st1             {v31.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8b}, [x0], x1
        b.gt            89b
        ret
endfunc
| |
// void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src,
//                              const int n);
function ipred_reverse_8bpc_neon, export=1
        // Reverse n bytes: dst[i] = src[-1 - i], 16 bytes per iteration.
        // rev64 reverses within each 8-byte half, so the halves are
        // stored swapped (d[1] first) to complete the full 16-byte flip.
        sub             x1,  x1,  #16           // start at src[-16]
        add             x3,  x0,  #8            // second store pointer (upper half)
        mov             x4,  #16
1:
        ld1             {v0.16b}, [x1]
        subs            w2,  w2,  #16
        rev64           v0.16b,  v0.16b
        sub             x1,  x1,  #16
        st1             {v0.d}[1], [x0], x4
        st1             {v0.d}[0], [x3], x4
        b.gt            1b
        ret
endfunc
| |
const increments
        // Lane indices 0..7 as u16; multiplied by dy in the z3 fill
        // functions to form per-lane ypos values.
        .short 0, 1, 2, 3, 4, 5, 6, 7
endconst
| |
// void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const left,
//                               const int width, const int height,
//                               const int dy, const int max_base_y);
function ipred_z3_fill1_8bpc_neon, export=1
        // Z3 (left-only directional) prediction. Per output column x,
        // ypos = (x+1)*dy; base = ypos >> 6 indexes left[], frac =
        // ypos & 0x3e interpolates left[base] / left[base+1]:
        //   dst = (left[base]*(64-frac) + left[base+1]*frac + 32) >> 6
        // When max_base_y <= 64, left[] fits in at most four q registers
        // and per-lane lookups are done with tbx (out-of-range indices
        // keep the padding value preloaded into the destination).
        // Larger max_base_y falls back to a z1-style columnwise path.
        cmp             w6,  #64
        clz             w9,  w3
        adr             x8,  L(ipred_z3_fill1_tbl)
        sub             w9,  w9,  #25           // table index derived from clz(width)
        ldrh            w9,  [x8, w9, uxtw #1]
        add             x10, x2,  w6, uxtw      // left[max_base_y]
        sub             x8,  x8,  w9, uxtw
        movrel          x11, increments
        ld1r            {v31.16b}, [x10]        // padding
        ld1             {v30.8h}, [x11]         // increments
        mov             w7,  w5
        b.gt            L(ipred_z3_fill1_large_h16) // max_base_y > 64
        br              x8

40:
        AARCH64_VALID_JUMP_TARGET
        dup             v29.4h,  w5             // dy

        mul             v30.4h,  v30.4h,  v29.4h // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32
        ld1             {v0.16b, v1.16b}, [x2]  // left[]
        add             v30.4h,  v29.4h,  v30.4h // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        xtn             v24.8b,  v30.8h         // (uint8_t)ypos
        uqshrn          v26.8b,  v30.8h,  #6    // base
        and             v24.8b,  v24.8b,  v23.8b // frac

        mov             v4.8b,   v31.8b         // preload padding for tbx
        uqadd           v27.8b,  v26.8b,  v20.8b // base + 1
        uqadd           v28.8b,  v26.8b,  v21.8b // base + 2
        sub             v25.8b,  v22.8b,  v24.8b // 64 - frac

        tbx             v4.8b,  {v0.16b, v1.16b}, v26.8b // left[base]

        trn1            v27.2s,  v27.2s,  v28.2s // base + 1, base + 2
        trn1            v24.2s,  v24.2s,  v24.2s // frac
        trn1            v25.2s,  v25.2s,  v25.2s // 64 - frac
1:
        // Two rows of width 4 per iteration.
        mov             v5.8b,   v31.8b
        tbx             v5.8b,  {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2]

        trn1            v4.2s,   v4.2s,   v5.2s // left[base], left[base+1]

        umull           v16.8h,  v4.8b,   v25.8b // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b // + left[base+1]*frac
        rshrn           v16.8b,  v16.8h,  #6
        st1             {v16.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v16.s}[1], [x0], x1
        b.le            9f

        ext             v4.8b,   v5.8b,   v5.8b, #4 // reuse left[base+2] as next left[base]
        uqadd           v27.8b,  v27.8b,  v21.8b // base += 2
        b               1b

9:
        ret

80:
        AARCH64_VALID_JUMP_TARGET
        dup             v29.8h,  w5             // dy

        mul             v30.8h,  v30.8h,  v29.8h // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48
        ld1             {v0.16b, v1.16b, v2.16b}, [x2] // left[]
        add             v30.8h,  v29.8h,  v30.8h // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        xtn             v24.8b,  v30.8h         // (uint8_t)ypos
        uqshrn          v26.8b,  v30.8h,  #6    // base
        and             v24.8b,  v24.8b,  v23.8b // frac

        mov             v4.8b,   v31.8b         // preload padding for tbx
        uqadd           v27.8b,  v26.8b,  v20.8b // base + 1
        uqadd           v28.8b,  v26.8b,  v21.8b // base + 2
        sub             v25.8b,  v22.8b,  v24.8b // 64 - frac

        tbx             v4.8b,  {v0.16b, v1.16b, v2.16b}, v26.8b // left[base]
1:
        // Two rows of width 8 per iteration.
        mov             v5.8b,   v31.8b
        mov             v6.8b,   v31.8b
        tbx             v5.8b,  {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1]
        tbx             v6.8b,  {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2]

        umull           v16.8h,  v4.8b,   v25.8b // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b // + left[base+1]*frac
        umull           v17.8h,  v5.8b,   v25.8b
        umlal           v17.8h,  v6.8b,   v24.8b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v17.8b}, [x0], x1
        b.le            9f

        mov             v4.8b,   v6.8b          // reuse left[base+2] as next left[base]
        uqadd           v27.8b,  v27.8b,  v21.8b // base += 2
        uqadd           v28.8b,  v28.8b,  v21.8b // base += 2
        b               1b

9:
        ret

160:
        AARCH64_VALID_JUMP_TARGET
        dup             v28.8h,  w5             // dy

        shl             v29.8h,  v28.8h,  #3    // 8*dy
        mul             v30.8h,  v30.8h,  v28.8h // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        // This is only executed if we've checked that max_base_y <= 64.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
        add             v28.8h,  v28.8h,  v30.8h // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        add             v29.8h,  v28.8h,  v29.8h // ypos + 8*dy

        xtn             v24.8b,  v28.8h         // (uint8_t)ypos
        xtn2            v24.16b, v29.8h
        uqshrn          v26.8b,  v28.8h,  #6    // base
        uqshrn2         v26.16b, v29.8h,  #6
        and             v24.16b, v24.16b, v23.16b // frac

        mov             v4.16b,  v31.16b        // preload padding for tbx
        uqadd           v27.16b, v26.16b, v20.16b // base + 1
        uqadd           v28.16b, v26.16b, v21.16b // base + 2
        sub             v25.16b, v22.16b, v24.16b // 64 - frac

        tbx             v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base]
1:
        // Two rows of width 16 per iteration.
        mov             v5.16b,  v31.16b
        mov             v6.16b,  v31.16b
        tbx             v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1]
        tbx             v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2]

        umull           v16.8h,  v4.8b,   v25.8b // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b // + left[base+1]*frac
        umull2          v17.8h,  v4.16b,  v25.16b
        umlal2          v17.8h,  v5.16b,  v24.16b
        umull           v18.8h,  v5.8b,   v25.8b
        umlal           v18.8h,  v6.8b,   v24.8b
        umull2          v19.8h,  v5.16b,  v25.16b
        umlal2          v19.8h,  v6.16b,  v24.16b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn2          v16.16b, v17.8h,  #6
        rshrn           v17.8b,  v18.8h,  #6
        rshrn2          v17.16b, v19.8h,  #6
        st1             {v16.16b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v17.16b}, [x0], x1
        b.le            9f

        mov             v4.16b,  v6.16b         // reuse left[base+2] as next left[base]
        uqadd           v27.16b, v27.16b, v21.16b // base += 2
        uqadd           v28.16b, v28.16b, v21.16b // base += 2
        b               1b

9:
        ret
320:
640:
        AARCH64_VALID_JUMP_TARGET
        dup             v28.8h,  w5             // dy
        mov             w12, w3                 // w12 = saved width

        add             x13, x0,  x1            // x13 = second output row

        shl             v29.8h,  v28.8h,  #3    // 8*dy
        mul             v30.8h,  v30.8h,  v28.8h // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3, uxtw      // row advance minus the width written inline
        add             v30.8h,  v28.8h,  v30.8h // ypos

        // This is only executed if we've checked that max_base_y <= 64.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

1:
        mov             v26.16b, v30.16b        // reset ypos

2:
        // Two rows, 16 columns at a time; per-lane lookup via tbx.
        add             v27.8h,  v26.8h,  v29.8h // ypos + 8*dy
        uqshrn          v16.8b,  v26.8h,  #6    // base
        uqshrn2         v16.16b, v27.8h,  #6
        xtn             v24.8b,  v26.8h         // (uint8_t)ypos
        xtn2            v24.16b, v27.8h
        umov            w14, v16.b[0]           // smallest base in this batch
        and             v24.16b, v24.16b, v23.16b // frac

        uqadd           v17.16b, v16.16b, v20.16b // base + 1
        cmp             w14, w6                 // base >= max_base_y
        uqadd           v18.16b, v16.16b, v21.16b // base + 2
        sub             v25.16b, v22.16b, v24.16b // 64 - frac

        b.ge            4f                      // all remaining columns are padding

        mov             v4.16b,  v31.16b
        mov             v5.16b,  v31.16b
        mov             v6.16b,  v31.16b
        tbx             v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base]
        tbx             v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1]
        tbx             v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2]

        subs            w3,  w3,  #16
        umull           v16.8h,  v4.8b,   v25.8b // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b // + left[base+1]*frac
        umull2          v17.8h,  v4.16b,  v25.16b
        umlal2          v17.8h,  v5.16b,  v24.16b
        umull           v18.8h,  v5.8b,   v25.8b
        umlal           v18.8h,  v6.8b,   v24.8b
        umull2          v19.8h,  v5.16b,  v25.16b
        umlal2          v19.8h,  v6.16b,  v24.16b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn2          v16.16b, v17.8h,  #6
        rshrn           v17.8b,  v18.8h,  #6
        rshrn2          v17.16b, v19.8h,  #6
        st1             {v16.16b}, [x0],  #16
        st1             {v17.16b}, [x13], #16
        b.le            3f
        add             v26.8h,  v27.8h,  v29.8h // ypos += 16*dy
        b               2b

3:
        subs            w4,  w4,  #2
        b.le            9f
        movi            v16.8h,  #128
        add             x0,  x0,  x1
        add             x13, x13, x1
        add             v30.8h,  v30.8h,  v16.8h // ypos = dy + y*(1<<6)*2
        mov             w3,  w12                // restore width
        b               1b

4:
        // Fill the remaining columns of this row pair with padding.
        subs            w3,  w3,  #16
        st1             {v31.16b}, [x0],  #16
        st1             {v31.16b}, [x13], #16
        b.gt            4b
        b               3b

9:
        ret

L(ipred_z3_fill1_large_h16):
        // Fallback case for max_base_y > 64; similar to the z1
        // implementation. This does the filtering vertically, filling out
        // a 2x pixel column at a time.
        mov             w15, #64
        add             x13, x0,  x1
        lsl             x1,  x1,  #1

        mov             w12, w4                 // w12 = saved height
1:
        lsr             w8,  w7,  #6            // base
        and             w9,  w7,  #0x3e         // frac
        add             w7,  w7,  w5            // ypos += dy
        cmp             w8,  w6                 // base >= max_base_y
        lsr             w10, w7,  #6            // base
        and             w11, w7,  #0x3e         // frac
        b.ge            ipred_z3_fill_padding_neon
        add             x8,  x2,  w8,  uxtw
        add             x10, x2,  w10, uxtw
        dup             v4.16b, w9              // frac
        dup             v5.16b, w11
        ld1             {v0.16b, v1.16b}, [x8], #32 // left[base]
        ld1             {v2.16b, v3.16b}, [x10], #32
        sub             w9,  w15, w9            // 64 - frac
        sub             w11, w15, w11
        dup             v6.16b, w9              // 64 - frac
        dup             v7.16b, w11
        add             w7,  w7,  w5            // ypos += dy
2:
        // Interpolate 16 pixels down each of the two columns, then
        // transpose-store them as 2-wide rows.
        ext             v16.16b, v0.16b, v1.16b, #1 // left[base+1]
        ext             v17.16b, v2.16b, v3.16b, #1
        subs            w4,  w4,  #16
        umull           v18.8h, v16.8b,  v4.8b  // left[base+1]*frac
        umlal           v18.8h, v0.8b,   v6.8b  // + left[base]*(64-frac)
        umull2          v19.8h, v16.16b, v4.16b
        umlal2          v19.8h, v0.16b,  v6.16b
        umull           v20.8h, v17.8b,  v5.8b
        umlal           v20.8h, v2.8b,   v7.8b
        umull2          v21.8h, v17.16b, v5.16b
        umlal2          v21.8h, v2.16b,  v7.16b
        rshrn           v16.8b,  v18.8h, #6
        rshrn2          v16.16b, v19.8h, #6
        rshrn           v17.8b,  v20.8h, #6
        rshrn2          v17.16b, v21.8h, #6
        zip1            v18.16b, v16.16b, v17.16b // interleave the two columns
        zip2            v19.16b, v16.16b, v17.16b
        st1             {v18.h}[0], [x0],  x1
        st1             {v18.h}[1], [x13], x1
        st1             {v18.h}[2], [x0],  x1
        st1             {v18.h}[3], [x13], x1
        st1             {v18.h}[4], [x0],  x1
        st1             {v18.h}[5], [x13], x1
        st1             {v18.h}[6], [x0],  x1
        st1             {v18.h}[7], [x13], x1
        st1             {v19.h}[0], [x0],  x1
        st1             {v19.h}[1], [x13], x1
        st1             {v19.h}[2], [x0],  x1
        st1             {v19.h}[3], [x13], x1
        st1             {v19.h}[4], [x0],  x1
        st1             {v19.h}[5], [x13], x1
        st1             {v19.h}[6], [x0],  x1
        st1             {v19.h}[7], [x13], x1
        b.le            3f
        mov             v0.16b,  v1.16b         // shift the window; reload the next 16
        ld1             {v1.16b}, [x8],  #16    // left[base]
        mov             v2.16b,  v3.16b
        ld1             {v3.16b}, [x10], #16
        b               2b

3:
        subs            w3,  w3,  #2            // width -= 2 (columns done)
        b.le            9f
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0       // ptr -= h * stride
        msub            x13, x1,  x12, x13
        lsl             x1,  x1,  #1
        add             x0,  x0,  #2            // advance two columns
        add             x13, x13, #2
        mov             w4,  w12                // restore height
        b               1b
9:
        ret

L(ipred_z3_fill1_tbl):
        .hword L(ipred_z3_fill1_tbl) - 640b
        .hword L(ipred_z3_fill1_tbl) - 320b
        .hword L(ipred_z3_fill1_tbl) - 160b
        .hword L(ipred_z3_fill1_tbl) - 80b
        .hword L(ipred_z3_fill1_tbl) - 40b
endfunc
| |
// Fill the remaining WxH region with the padding pixel broadcast in
// v31.16b. Internal helper (export=0), entered (also via branch from the
// z3 fill paths) once base >= max_base_y.
//
// In:  x0/x13 = dst pointers for two interleaved rows, x1 = 2*stride,
//      w3 = remaining width, w4 = remaining height, v31 = padding pixel.
// NOTE(review): x13 and the doubled stride are set up by the callers;
// w5/w9/w12, x8/x9 are used as scratch here.
//
// Fix: the 8-byte loop and the 16/32/64-byte loop previously branched
// back to "4b" (the 4-byte loop) instead of their own labels, re-entering
// a narrower store loop mid-rectangle and filling the wrong width. They
// now branch to 8b and 16b respectively, matching the pattern of the
// 2- and 4-byte loops.
function ipred_z3_fill_padding_neon, export=0
        cmp             w3,  #16
        adr             x8,  L(ipred_z3_fill_padding_tbl)
        b.gt            L(ipred_z3_fill_padding_wide)
        // w3 = remaining width, w4 = constant height
        mov             w12, w4

1:
        // Fill a WxH rectangle with padding. W can be any number;
        // this fills the exact width by filling in the largest
        // power of two in the remaining width, and repeating.
        clz             w9,  w3
        sub             w9,  w9,  #25
        ldrh            w9,  [x8, w9, uxtw #1]
        sub             x9,  x8,  w9, uxtw
        br              x9

2:
        // 2-byte-wide strip, 4 rows per iteration.
        st1             {v31.h}[0], [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.h}[0], [x13], x1
        st1             {v31.h}[0], [x0],  x1
        st1             {v31.h}[0], [x13], x1
        b.gt            2b
        subs            w3,  w3,  #2
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0       // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #2
        add             x13, x13, #2
        mov             w4,  w12
        b               1b

4:
        // 4-byte-wide strip, 4 rows per iteration.
        st1             {v31.s}[0], [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.s}[0], [x13], x1
        st1             {v31.s}[0], [x0],  x1
        st1             {v31.s}[0], [x13], x1
        b.gt            4b
        subs            w3,  w3,  #4
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0       // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #4
        add             x13, x13, #4
        mov             w4,  w12
        b               1b

8:
        // 8-byte-wide strip, 4 rows per iteration.
        st1             {v31.8b}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.8b}, [x13], x1
        st1             {v31.8b}, [x0],  x1
        st1             {v31.8b}, [x13], x1
        b.gt            8b                      // was 4b: looped into the 4-byte strip
        subs            w3,  w3,  #8
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0       // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #8
        add             x13, x13, #8
        mov             w4,  w12
        b               1b

16:
32:
64:
        // 16-byte-wide strip, 4 rows per iteration (w >= 16 re-enters
        // through the outer loop until the width is consumed).
        st1             {v31.16b}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.16b}, [x13], x1
        st1             {v31.16b}, [x0],  x1
        st1             {v31.16b}, [x13], x1
        b.gt            16b                     // was 4b: looped into the 4-byte strip
        subs            w3,  w3,  #16
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0       // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #16
        add             x13, x13, #16
        mov             w4,  w12
        b               1b

9:
        ret

L(ipred_z3_fill_padding_tbl):
        .hword L(ipred_z3_fill_padding_tbl) - 64b
        .hword L(ipred_z3_fill_padding_tbl) - 32b
        .hword L(ipred_z3_fill_padding_tbl) - 16b
        .hword L(ipred_z3_fill_padding_tbl) - 8b
        .hword L(ipred_z3_fill_padding_tbl) - 4b
        .hword L(ipred_z3_fill_padding_tbl) - 2b

L(ipred_z3_fill_padding_wide):
        // Fill a WxH rectangle with padding, with W > 16.
        lsr             x1,  x1,  #1            // back to single stride
        mov             w12, w3
        sub             x1,  x1,  w3, uxtw      // row advance minus bytes written
1:
        ands            w5,  w3,  #15
        b.eq            2f
        // If the width isn't aligned to 16, first do one 16 byte write
        // and align the start pointer.
        sub             w3,  w3,  w5
        st1             {v31.16b}, [x0]
        add             x0,  x0,  w5, uxtw
2:
        // Fill the rest of the line with aligned 16 byte writes.
        subs            w3,  w3,  #16
        st1             {v31.16b}, [x0], #16
        b.gt            2b
        subs            w4,  w4,  #1
        add             x0,  x0,  x1
        b.le            9f
        mov             w3,  w12
        b               1b
9:
        ret
endfunc
| |
// Fill helper for z3 prediction, for the case where every second output
// row is filled (w == 4 or w == 8).
// x0 = dst, x1 = stride, x2 = left[], w3 = width, w4 = height,
// w5 = dy, w6 = max_base_y.
function ipred_z3_fill2_8bpc_neon, export=1
        cmp             w3,  #8
        add             x10, x2,  w6,  uxtw       // left[max_base_y]
        movrel          x11, increments
        ld1r            {v31.16b}, [x10]          // padding
        ld1             {v30.8h}, [x11]           // increments
        b.eq            80f

40:     // w == 4
        dup             v29.4h,  w5               // dy

        mul             v30.4h,  v30.4h,  v29.4h  // {0,1,2,3}*dy
        movi            v23.16b, #0x3e

        // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
        // so max_base_y <= 32.
        ld1             {v0.16b, v1.16b}, [x2]    // left[]
        add             v30.4h,  v29.4h,  v30.4h  // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
        uqshrn          v26.8b,  v30.8h,  #6      // base
        and             v24.8b,  v24.8b,  v23.8b  // frac

        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
        uqadd           v29.8b,  v27.8b,  v21.8b  // base + 3

        trn1            v24.2s,  v24.2s,  v24.2s  // frac
        trn1            v26.2s,  v26.2s,  v28.2s  // base + 0, base + 2
        trn1            v27.2s,  v27.2s,  v29.2s  // base + 1, base + 3
        trn1            v25.2s,  v25.2s,  v25.2s  // 64 - frac

        movi            v21.16b, #4
1:
        mov             v4.8b,   v31.8b
        mov             v5.8b,   v31.8b
        // tbx leaves the padding value in place for indices >= 32
        // (base saturates via uqadd, so out-of-range stays out of range).
        tbx             v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2]
        tbx             v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3]

        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        rshrn           v16.8b,  v16.8h,  #6
        st1             {v16.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v16.s}[1], [x0], x1
        b.le            9f

        uqadd           v26.8b,  v26.8b,  v21.8b  // base += 4
        uqadd           v27.8b,  v27.8b,  v21.8b  // base += 4
        b               1b

9:
        ret

80:     // w == 8
        dup             v29.8h,  w5               // dy

        mul             v30.8h,  v30.8h,  v29.8h  // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
        // so max_base_y <= 32.
        ld1             {v0.16b, v1.16b}, [x2]    // left[]
        add             v30.8h,  v29.8h,  v30.8h  // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
        uqshrn          v26.8b,  v30.8h,  #6      // base
        and             v24.8b,  v24.8b,  v23.8b  // frac

        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
        uqadd           v29.8b,  v27.8b,  v21.8b  // base + 3

        trn1            v24.2d,  v24.2d,  v24.2d  // frac
        trn1            v26.2d,  v26.2d,  v28.2d  // base + 0, base + 2
        trn1            v27.2d,  v27.2d,  v29.2d  // base + 1, base + 3
        trn1            v25.2d,  v25.2d,  v25.2d  // 64 - frac

        movi            v21.16b, #4
1:
        mov             v4.16b,  v31.16b
        mov             v5.16b,  v31.16b
        // Only v0/v1 are loaded (max_base_y <= 32); a wider table would
        // read uninitialized registers instead of keeping the padding.
        tbx             v4.16b, {v0.16b, v1.16b}, v26.16b // left[base], left[base+2]
        tbx             v5.16b, {v0.16b, v1.16b}, v27.16b // left[base+1], left[base+3]

        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        umull2          v17.8h,  v4.16b,  v25.16b
        umlal2          v17.8h,  v5.16b,  v24.16b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v17.8b}, [x0], x1
        b.le            9f

        uqadd           v26.16b, v26.16b, v21.16b // base += 4
        uqadd           v27.16b, v27.16b, v21.16b // base += 4
        b               1b

9:
        ret
endfunc
| |
| |
| // void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride, |
| // const pixel *const topleft, |
| // const int width, const int height, const int filt_idx, |
| // const int max_width, const int max_height); |
function ipred_filter_8bpc_neon, export=1
        // Load the 7 filter tap vectors for filt_idx and sign-extend them
        // to 16 bit; v16..v22 = filter(0)..filter(6).
        and             w5,  w5,  #511
        movrel          x6,  X(filter_intra_taps)
        lsl             w5,  w5,  #6               // each filter set is 64 bytes
        add             x6,  x6,  w5, uxtw
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        clz             w9,  w3
        adr             x5,  L(ipred_filter_tbl)
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
        sub             w9,  w9,  #26              // table index: w=32 -> 0, ..., w=4 -> 3
        ldrh            w9,  [x5, w9, uxtw #1]
        sxtl            v16.8h,  v16.8b
        sxtl            v17.8h,  v17.8b
        sub             x5,  x5,  w9, uxtw
        sxtl            v18.8h,  v18.8b
        sxtl            v19.8h,  v19.8b
        add             x6,  x0,  x1               // x0/x6 = even/odd row pointers
        lsl             x1,  x1,  #1               // x1 = 2*stride
        sxtl            v20.8h,  v20.8b
        sxtl            v21.8h,  v21.8b
        sxtl            v22.8h,  v22.8b
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ldur            s0,  [x2, #1]              // top (0-3)
        sub             x2,  x2,  #2
        mov             x7,  #-2                   // walk down the left edge, 2 rows at a time
        uxtl            v0.8h,   v0.8b             // top (0-3)
4:
        // Two 4x2 output rows per iteration; the bottom row of each 4x2
        // block becomes the "top" for the next iteration.
        ld1             {v1.s}[0], [x2], x7        // left (0-1) + topleft (2)
        mul             v2.8h,   v17.8h,  v0.h[0]  // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1]  // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2]  // p3(top[2]) * filter(3)
        uxtl            v1.8h,   v1.8b             // left (0-1) + topleft (2)
        mla             v2.8h,   v20.8h,  v0.h[3]  // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2]  // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1]  // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0]  // p6(left[1]) * filter(6)
        sqrshrun        v2.8b,   v2.8h,   #4
        subs            w4,  w4,  #2
        st1             {v2.s}[0], [x0], x1
        uxtl            v0.8h,   v2.8b
        st1             {v2.s}[1], [x6], x1
        ext             v0.16b,  v0.16b,  v0.16b, #8 // move top from [4-7] to [0-3]
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ldur            d0,  [x2, #1]              // top (0-7)
        sub             x2,  x2,  #2
        mov             x7,  #-2                   // walk down the left edge, 2 rows at a time
        uxtl            v0.8h,   v0.8b             // top (0-7)
8:
        // Two 4x2 blocks per iteration; the second block (v3) uses the
        // just-computed first block (v2) as its left neighbour.
        ld1             {v1.s}[0], [x2], x7        // left (0-1) + topleft (2)
        mul             v2.8h,   v17.8h,  v0.h[0]  // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1]  // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2]  // p3(top[2]) * filter(3)
        uxtl            v1.8h,   v1.8b             // left (0-1) + topleft (2)
        mla             v2.8h,   v20.8h,  v0.h[3]  // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2]  // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1]  // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0]  // p6(left[1]) * filter(6)
        mul             v3.8h,   v17.8h,  v0.h[4]  // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v0.h[5]  // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v0.h[6]  // p3(top[2]) * filter(3)
        sqrshrun        v2.8b,   v2.8h,   #4
        uxtl            v1.8h,   v2.8b             // first block, in 16 bit
        mla             v3.8h,   v20.8h,  v0.h[7]  // p4(top[3]) * filter(4)
        mla             v3.8h,   v16.8h,  v0.h[3]  // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v1.h[3]  // p5(left[0]) * filter(5)
        mla             v3.8h,   v22.8h,  v1.h[7]  // p6(left[1]) * filter(6)
        sqrshrun        v3.8b,   v3.8h,   #4
        subs            w4,  w4,  #2
        st2             {v2.s, v3.s}[0], [x0], x1  // interleave the two 4x2 blocks
        zip2            v0.2s,   v2.2s,   v3.2s    // bottom rows become the next top
        st2             {v2.s, v3.s}[1], [x6], x1
        uxtl            v0.8h,   v0.8b
        b.gt            8b
        ret
160:
320:
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x2,  #1               // x8 = top row pointer
        sub             x2,  x2,  #2
        mov             x7,  #-2                   // walk down the left edge, 2 rows at a time
        sub             x1,  x1,  w3, uxtw         // stride minus width, for row advance
        mov             w9,  w3                    // save the constant width

1:
        ld1             {v0.s}[0], [x2], x7        // left (0-1) + topleft (2)
        uxtl            v0.8h,   v0.8b             // left (0-1) + topleft (2)
2:
        // Four 4x2 blocks per iteration, chained left-to-right: each
        // block's output feeds the next block's left/topleft inputs.
        ld1             {v2.16b}, [x8], #16        // top(0-15)
        mul             v3.8h,   v16.8h,  v0.h[2]  // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v0.h[1]  // p5(left[0]) * filter(5)
        uxtl            v1.8h,   v2.8b             // top(0-7)
        uxtl2           v2.8h,   v2.16b            // top(8-15)
        mla             v3.8h,   v22.8h,  v0.h[0]  // p6(left[1]) * filter(6)
        mla             v3.8h,   v17.8h,  v1.h[0]  // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v1.h[1]  // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v1.h[2]  // p3(top[2]) * filter(3)
        mla             v3.8h,   v20.8h,  v1.h[3]  // p4(top[3]) * filter(4)

        mul             v4.8h,   v17.8h,  v1.h[4]  // p1(top[0]) * filter(1)
        mla             v4.8h,   v18.8h,  v1.h[5]  // p2(top[1]) * filter(2)
        mla             v4.8h,   v19.8h,  v1.h[6]  // p3(top[2]) * filter(3)
        sqrshrun        v3.8b,   v3.8h,   #4
        uxtl            v0.8h,   v3.8b             // first block, in 16 bit
        mla             v4.8h,   v20.8h,  v1.h[7]  // p4(top[3]) * filter(4)
        mla             v4.8h,   v16.8h,  v1.h[3]  // p0(topleft) * filter(0)
        mla             v4.8h,   v21.8h,  v0.h[3]  // p5(left[0]) * filter(5)
        mla             v4.8h,   v22.8h,  v0.h[7]  // p6(left[1]) * filter(6)

        mul             v5.8h,   v17.8h,  v2.h[0]  // p1(top[0]) * filter(1)
        mla             v5.8h,   v18.8h,  v2.h[1]  // p2(top[1]) * filter(2)
        mla             v5.8h,   v19.8h,  v2.h[2]  // p3(top[2]) * filter(3)
        sqrshrun        v4.8b,   v4.8h,   #4
        uxtl            v0.8h,   v4.8b             // second block, in 16 bit
        mla             v5.8h,   v20.8h,  v2.h[3]  // p4(top[3]) * filter(4)
        mla             v5.8h,   v16.8h,  v1.h[7]  // p0(topleft) * filter(0)
        mla             v5.8h,   v21.8h,  v0.h[3]  // p5(left[0]) * filter(5)
        mla             v5.8h,   v22.8h,  v0.h[7]  // p6(left[1]) * filter(6)

        mul             v6.8h,   v17.8h,  v2.h[4]  // p1(top[0]) * filter(1)
        mla             v6.8h,   v18.8h,  v2.h[5]  // p2(top[1]) * filter(2)
        mla             v6.8h,   v19.8h,  v2.h[6]  // p3(top[2]) * filter(3)
        sqrshrun        v5.8b,   v5.8h,   #4
        uxtl            v0.8h,   v5.8b             // third block, in 16 bit
        mla             v6.8h,   v20.8h,  v2.h[7]  // p4(top[3]) * filter(4)
        mla             v6.8h,   v16.8h,  v2.h[3]  // p0(topleft) * filter(0)
        mla             v6.8h,   v21.8h,  v0.h[3]  // p5(left[0]) * filter(5)
        mla             v6.8h,   v22.8h,  v0.h[7]  // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        sqrshrun        v6.8b,   v6.8h,   #4

        st4             {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16
        st4             {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16
        b.le            8f
        // Carry left/topleft across to the next 16-pixel chunk.
        ins             v0.h[2], v2.h[7]
        ins             v0.b[0], v6.b[7]
        ins             v0.b[2], v6.b[3]
        b               2b
8:
        subs            w4,  w4,  #2
        b.le            9f
        // The row of output just written becomes the top for the next rows.
        sub             x8,  x6,  w9, uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b               1b
9:
        ret

L(ipred_filter_tbl):
        .hword L(ipred_filter_tbl) - 320b
        .hword L(ipred_filter_tbl) - 160b
        .hword L(ipred_filter_tbl) -  80b
        .hword L(ipred_filter_tbl) -  40b
endfunc
| |
| // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, |
| // const uint16_t *const pal, const uint8_t *idx, |
| // const int w, const int h); |
function pal_pred_8bpc_neon, export=1
        // Narrow the 8-entry 16-bit palette into v0.8b and expand the
        // per-pixel indices through a table lookup, 4 (or 2) rows at a time.
        ld1             {v0.8h},  [x2]
        clz             w9,  w4
        adr             x6,  L(pal_pred_tbl)
        sub             w9,  w9,  #25              // table index: w=64 -> 0, ..., w=4 -> 4
        ldrh            w9,  [x6, w9, uxtw #1]
        xtn             v0.8b,   v0.8h             // palette entries fit in 8 bit for 8bpc
        sub             x6,  x6,  w9, uxtw
        add             x2,  x0,  x1               // x0/x2 = even/odd row pointers
        lsl             x1,  x1,  #1               // x1 = 2*stride
        br              x6
4:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b}, [x3], #16        // 4 rows of 4 indices
        subs            w5,  w5,  #4
        tbl             v1.16b, {v0.16b}, v1.16b   // map indices to palette colors
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[1], [x2], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[3], [x2], x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b, v2.16b}, [x3], #32 // 4 rows of 8 indices
        subs            w5,  w5,  #4
        tbl             v1.16b, {v0.16b}, v1.16b
        st1             {v1.d}[0], [x0], x1
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.d}[1], [x2], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x2], x1
        b.gt            8b
        ret
16:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64 // 4 rows of 16
        subs            w5,  w5,  #4
        tbl             v1.16b, {v0.16b}, v1.16b
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.16b}, [x0], x1
        tbl             v3.16b, {v0.16b}, v3.16b
        st1             {v2.16b}, [x2], x1
        tbl             v4.16b, {v0.16b}, v4.16b
        st1             {v3.16b}, [x0], x1
        st1             {v4.16b}, [x2], x1
        b.gt            16b
        ret
32:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 // 4 rows of 32
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
        subs            w5,  w5,  #4
        tbl             v16.16b, {v0.16b}, v16.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        tbl             v20.16b, {v0.16b}, v20.16b
        st1             {v16.16b, v17.16b}, [x0], x1
        tbl             v21.16b, {v0.16b}, v21.16b
        st1             {v18.16b, v19.16b}, [x2], x1
        tbl             v22.16b, {v0.16b}, v22.16b
        st1             {v20.16b, v21.16b}, [x0], x1
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v22.16b, v23.16b}, [x2], x1
        b.gt            32b
        ret
64:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 // 2 rows of 64
        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
        subs            w5,  w5,  #2
        tbl             v16.16b, {v0.16b}, v16.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        tbl             v20.16b, {v0.16b}, v20.16b
        tbl             v21.16b, {v0.16b}, v21.16b
        tbl             v22.16b, {v0.16b}, v22.16b
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1
        b.gt            64b
        ret

L(pal_pred_tbl):
        .hword L(pal_pred_tbl) - 64b
        .hword L(pal_pred_tbl) - 32b
        .hword L(pal_pred_tbl) - 16b
        .hword L(pal_pred_tbl) -  8b
        .hword L(pal_pred_tbl) -  4b
endfunc
| |
| // void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, |
| // const pixel *const topleft, |
| // const int width, const int height, |
| // const int16_t *ac, const int alpha); |
function ipred_cfl_128_8bpc_neon, export=1
        // CfL with a fixed dc of 128 (no neighbours available). The splat
        // loops below (dst = iclip_pixel(dc + ((ac*alpha + 32) >> 6)))
        // are shared with the other cfl entry points via v0 = dc, v1 = alpha.
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_128_tbl)
        sub             w9,  w9,  #26              // table index: w=32 -> 0, ..., w=4 -> 3
        ldrh            w9,  [x7, w9, uxtw #1]
        movi            v0.8h,   #128              // dc
        dup             v1.8h,   w6                // alpha
        sub             x7,  x7,  w9, uxtw
        add             x6,  x0,  x1               // x0/x6 = even/odd row pointers
        lsl             x1,  x1,  #1               // x1 = 2*stride
        br              x7
L(ipred_cfl_splat_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x5], #32  // 4 rows of 4 ac coefs
        mul             v2.8h,   v2.8h,   v1.8h    // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        cmlt            v4.8h,   v2.8h,   #0       // sign
        cmlt            v5.8h,   v3.8h,   #0
        // Adding the sign makes the rounding shift round towards zero.
        add             v2.8h,   v2.8h,   v4.8h    // diff + sign
        add             v3.8h,   v3.8h,   v5.8h
        srshr           v2.8h,   v2.8h,   #6       // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h    // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h             // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        st1             {v2.s}[0], [x0], x1
        st1             {v2.s}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v3.s}[0], [x0], x1
        st1             {v3.s}[1], [x6], x1
        b.gt            L(ipred_cfl_splat_w4)
        ret
L(ipred_cfl_splat_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64 // 4 rows of 8 ac coefs
        mul             v2.8h,   v2.8h,   v1.8h    // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        mul             v4.8h,   v4.8h,   v1.8h
        mul             v5.8h,   v5.8h,   v1.8h
        cmlt            v16.8h,  v2.8h,   #0       // sign
        cmlt            v17.8h,  v3.8h,   #0
        cmlt            v18.8h,  v4.8h,   #0
        cmlt            v19.8h,  v5.8h,   #0
        add             v2.8h,   v2.8h,   v16.8h   // diff + sign
        add             v3.8h,   v3.8h,   v17.8h
        add             v4.8h,   v4.8h,   v18.8h
        add             v5.8h,   v5.8h,   v19.8h
        srshr           v2.8h,   v2.8h,   #6       // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        srshr           v4.8h,   v4.8h,   #6
        srshr           v5.8h,   v5.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h    // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        add             v4.8h,   v4.8h,   v0.8h
        add             v5.8h,   v5.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h             // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun          v5.8b,   v5.8h
        st1             {v2.8b}, [x0], x1
        st1             {v3.8b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v4.8b}, [x0], x1
        st1             {v5.8b}, [x6], x1
        b.gt            L(ipred_cfl_splat_w8)
        ret
L(ipred_cfl_splat_w16):
        AARCH64_VALID_JUMP_TARGET
        // Two rows at a time; x5/x7 walk the even/odd ac rows in parallel.
        add             x7,  x5,  w3, uxtw #1
        sub             x1,  x1,  w3, uxtw
        mov             w9,  w3                    // save the constant width
1:
        ld1             {v2.8h, v3.8h}, [x5], #32
        ld1             {v4.8h, v5.8h}, [x7], #32
        mul             v2.8h,   v2.8h,   v1.8h    // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        mul             v4.8h,   v4.8h,   v1.8h
        mul             v5.8h,   v5.8h,   v1.8h
        cmlt            v16.8h,  v2.8h,   #0       // sign
        cmlt            v17.8h,  v3.8h,   #0
        cmlt            v18.8h,  v4.8h,   #0
        cmlt            v19.8h,  v5.8h,   #0
        add             v2.8h,   v2.8h,   v16.8h   // diff + sign
        add             v3.8h,   v3.8h,   v17.8h
        add             v4.8h,   v4.8h,   v18.8h
        add             v5.8h,   v5.8h,   v19.8h
        srshr           v2.8h,   v2.8h,   #6       // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        srshr           v4.8h,   v4.8h,   #6
        srshr           v5.8h,   v5.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h    // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        add             v4.8h,   v4.8h,   v0.8h
        add             v5.8h,   v5.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h             // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun          v5.8b,   v5.8h
        subs            w3,  w3,  #16
        st1             {v2.8b, v3.8b}, [x0], #16
        st1             {v4.8b, v5.8b}, [x6], #16
        b.gt            1b
        subs            w4,  w4,  #2
        add             x5,  x5,  w9, uxtw #1      // skip the row x7 already consumed
        add             x7,  x7,  w9, uxtw #1
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                    // reload the constant width
        b.gt            1b
        ret

L(ipred_cfl_128_tbl):
L(ipred_cfl_splat_tbl):
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
endfunc
| |
| // void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, |
| // const pixel *const topleft, |
| // const int width, const int height, |
| // const int16_t *ac, const int alpha); |
function ipred_cfl_top_8bpc_neon, export=1
        // CfL with dc = rounded average of the top edge only; then
        // tail-calls into the shared cfl splat loops.
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_top_tbl)
        sub             w9,  w9,  #26              // table index: w=32 -> 0, ..., w=4 -> 3
        ldrh            w9,  [x7, w9, uxtw #1]
        dup             v1.8h,   w6                // alpha
        add             x2,  x2,  #1               // skip the topleft pixel
        sub             x7,  x7,  w9, uxtw
        add             x6,  x0,  x1               // x0/x6 = even/odd row pointers
        lsl             x1,  x1,  #1               // x1 = 2*stride
        br              x7
4:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s}, [x2]              // 4 top pixels, duplicated
        uaddlv          h0,  v0.8b                 // 2 * sum(top)
        urshr           v0.4h,   v0.4h,   #3       // round(2*sum / 8) = dc
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [x2]
        uaddlv          h0,  v0.8b                 // sum(top)
        urshr           v0.4h,   v0.4h,   #3       // round(sum / 8) = dc
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0,  v0.16b                // sum(top)
        urshr           v0.4h,   v0.4h,   #4       // round(sum / 16) = dc
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2,  v2.16b
        uaddlv          h3,  v3.16b
        add             v2.4h,   v2.4h,   v3.4h    // sum(top)
        urshr           v2.4h,   v2.4h,   #5       // round(sum / 32) = dc
        dup             v0.8h,   v2.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_top_tbl):
        .hword L(ipred_cfl_top_tbl) - 32b
        .hword L(ipred_cfl_top_tbl) - 16b
        .hword L(ipred_cfl_top_tbl) -  8b
        .hword L(ipred_cfl_top_tbl) -  4b
endfunc
| |
| // void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, |
| // const pixel *const topleft, |
| // const int width, const int height, |
| // const int16_t *ac, const int alpha); |
function ipred_cfl_left_8bpc_neon, export=1
        // CfL with dc = rounded average of the left edge only. The dc is
        // computed by height (via x7), then jumps into the shared splat
        // loop selected by width (via x9).
        sub             x2,  x2,  w4, uxtw         // start of the left edge
        clz             w9,  w3
        clz             w8,  w4
        adr             x10, L(ipred_cfl_splat_tbl)
        adr             x7,  L(ipred_cfl_left_tbl)
        sub             w9,  w9,  #26              // width index
        sub             w8,  w8,  #26              // height index
        ldrh            w9,  [x10, w9, uxtw #1]
        ldrh            w8,  [x7,  w8, uxtw #1]
        dup             v1.8h,   w6                // alpha
        sub             x9,  x10, w9, uxtw         // x9 = splat loop for this width
        sub             x7,  x7,  w8, uxtw         // x7 = dc computation for this height
        add             x6,  x0,  x1               // x0/x6 = even/odd row pointers
        lsl             x1,  x1,  #1               // x1 = 2*stride
        br              x7

L(ipred_cfl_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s}, [x2]              // 4 left pixels, duplicated
        uaddlv          h0,  v0.8b                 // 2 * sum(left)
        urshr           v0.4h,   v0.4h,   #3       // round(2*sum / 8) = dc
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [x2]
        uaddlv          h0,  v0.8b                 // sum(left)
        urshr           v0.4h,   v0.4h,   #3       // round(sum / 8) = dc
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0,  v0.16b                // sum(left)
        urshr           v0.4h,   v0.4h,   #4       // round(sum / 16) = dc
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2,  v2.16b
        uaddlv          h3,  v3.16b
        add             v2.4h,   v2.4h,   v3.4h    // sum(left)
        urshr           v2.4h,   v2.4h,   #5       // round(sum / 32) = dc
        dup             v0.8h,   v2.h[0]
        br              x9

L(ipred_cfl_left_tbl):
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc
| |
| // void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride, |
| // const pixel *const topleft, |
| // const int width, const int height, |
| // const int16_t *ac, const int alpha); |
function ipred_cfl_8bpc_neon, export=1
        // CfL with dc = rounded average of the top and left edges.
        // dc = (sum + (w+h)/2) / (w+h). When w+h is a power of two this is
        // a single shift (ushl by -ctz); otherwise the shift leaves a
        // residual factor of 3 or 5, handled by a sqdmulh with a fixed-point
        // reciprocal (1/3 ~ 0x5556/2^16, 1/5 ~ 0x3334/2^16).
        sub             x2,  x2,  w4, uxtw         // start of the left edge
        add             w8,  w3,  w4               // width + height
        dup             v1.8h,   w6                // alpha
        clz             w9,  w3
        clz             w6,  w4
        dup             v16.8h,  w8                // width + height
        adr             x7,  L(ipred_cfl_tbl)
        rbit            w8,  w8                    // rbit(width + height)
        sub             w9,  w9,  #22              // 26 leading bits, minus table offset 4
        sub             w6,  w6,  #26
        clz             w8,  w8                    // ctz(width + height)
        ldrh            w9,  [x7, w9, uxtw #1]     // width entry (second half of the table)
        ldrh            w6,  [x7, w6, uxtw #1]     // height entry (first half of the table)
        neg             w8,  w8                    // -ctz(width + height)
        sub             x9,  x7,  w9, uxtw         // x9 = width part (finish dc + splat)
        sub             x7,  x7,  w6, uxtw         // x7 = height part (sum the left edge)
        ushr            v16.8h,  v16.8h,  #1       // (width + height) >> 1 = rounding bias
        dup             v17.8h,  w8                // -ctz(width + height)
        add             x6,  x0,  x1               // x0/x6 = even/odd row pointers
        lsl             x1,  x1,  #1               // x1 = 2*stride
        br              x7

L(ipred_cfl_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.s}[0], [x2], #4        // 4 left pixels
        ins             v0.s[1],  wzr
        add             x2,  x2,  #1               // skip the topleft pixel
        uaddlv          h0,  v0.8b                 // sum(left)
        br              x9
L(ipred_cfl_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.s}[0], [x2]            // 4 top pixels
        ins             v2.s[1],  wzr
        add             v0.4h,   v0.4h,   v16.4h   // sum(left) + bias
        uaddlv          h2,  v2.8b                 // sum(top)
        cmp             w4,  #4
        add             v0.4h,   v0.4h,   v2.4h    // total sum + bias
        ushl            v0.4h,   v0.4h,   v17.4h   // >> ctz(w+h)
        b.eq            1f                         // w+h = 8: shift was exact
        // h = 8/16: residual divide by 3 (h=8) or 5 (h=16). Both halves of
        // w16 hold a reciprocal; lsr by 2*h (mod 32) picks the right one.
        mov             w16, #(0x3334/2)
        movk            w16, #(0x5556/2), lsl #16
        add             w17, w4,  w4               // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]           // broadcast dc
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [x2], #8          // 8 left pixels
        uaddlv          h0,  v0.8b                 // sum(left)
        add             x2,  x2,  #1               // skip the topleft pixel
        br              x9
L(ipred_cfl_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8b}, [x2]              // 8 top pixels
        add             v0.4h,   v0.4h,   v16.4h   // sum(left) + bias
        uaddlv          h2,  v2.8b                 // sum(top)
        cmp             w4,  #8
        add             v0.4h,   v0.4h,   v2.4h    // total sum + bias
        ushl            v0.4h,   v0.4h,   v17.4h   // >> ctz(w+h)
        b.eq            1f                         // w+h = 16: shift was exact
        // h = 4/16/32: residual divide by 5 only when h=32, else by 3.
        cmp             w4,  #32
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]           // broadcast dc
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2], #16        // 16 left pixels
        uaddlv          h0,  v0.16b                // sum(left)
        add             x2,  x2,  #1               // skip the topleft pixel
        br              x9
L(ipred_cfl_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b}, [x2]             // 16 top pixels
        add             v0.4h,   v0.4h,   v16.4h   // sum(left) + bias
        uaddlv          h2,  v2.16b                // sum(top)
        cmp             w4,  #16
        add             v0.4h,   v0.4h,   v2.4h    // total sum + bias
        ushl            v0.4h,   v0.4h,   v17.4h   // >> ctz(w+h)
        b.eq            1f                         // w+h = 32: shift was exact
        // h = 4/8/32: residual divide by 5 only when h=4, else by 3.
        cmp             w4,  #4
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]           // broadcast dc
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2], #32 // 32 left pixels
        uaddlv          h2,  v2.16b
        uaddlv          h3,  v3.16b
        add             x2,  x2,  #1               // skip the topleft pixel
        add             v0.4h,   v2.4h,   v3.4h    // sum(left)
        br              x9
L(ipred_cfl_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]     // 32 top pixels
        add             v0.4h,   v0.4h,   v16.4h   // sum(left) + bias
        uaddlv          h2,  v2.16b
        uaddlv          h3,  v3.16b
        cmp             w4,  #32
        add             v0.4h,   v0.4h,   v2.4h    // total sum + bias
        add             v0.4h,   v0.4h,   v3.4h
        ushl            v0.4h,   v0.4h,   v17.4h   // >> ctz(w+h)
        b.eq            1f                         // w+h = 64: shift was exact
        // h = 8/16: residual divide by 5 (h=8) or 3 (h=16); reciprocal
        // halves swapped relative to w4 to match.
        mov             w16, #(0x5556/2)
        movk            w16, #(0x3334/2), lsl #16
        add             w17, w4,  w4               // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]           // broadcast dc
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_tbl):
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
endfunc
| |
| // void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, |
| // const ptrdiff_t stride, const int w_pad, |
| // const int h_pad, const int cw, const int ch); |
function ipred_cfl_ac_420_8bpc_neon, export=1
        // 4:2:0 CfL ac: 2x2-average the luma, scale by 2 (<< 1), replicate
        // edge samples into the padded region, then subtract the rounded
        // mean so the output sums to zero. v16-v19 accumulate row sums.
        clz             w8,  w5
        lsl             w4,  w4,  #2               // h_pad in output rows
        adr             x7,  L(ipred_cfl_ac_420_tbl)
        sub             w8,  w8,  #27              // table index: w=16 -> 0, 8 -> 1, 4 -> 2
        ldrh            w8,  [x7, w8, uxtw #1]
        movi            v16.8h,  #0
        movi            v17.8h,  #0
        movi            v18.8h,  #0
        movi            v19.8h,  #0
        sub             x7,  x7,  w8, uxtw
        sub             w8,  w6,  w4               // height - h_pad
        rbit            w9,  w5                    // rbit(width)
        rbit            w10, w6                    // rbit(height)
        clz             w9,  w9                    // ctz(width)
        clz             w10, w10                   // ctz(height)
        add             w9,  w9,  w10              // log2sz
        add             x10, x1,  x2               // second luma row pointer
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1               // x2 = 2*luma stride
        neg             v31.4s,  v31.4s            // -log2sz
        br              x7

L(ipred_cfl_ac_420_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input
        // v0 = luma rows 0+2, v1 = rows 1+3; pairwise-add horizontally,
        // then add vertically -> two output rows of 4 per iteration.
        ld1             {v0.8b},  [x1],  x2
        ld1             {v1.8b},  [x10], x2
        ld1             {v0.d}[1], [x1],  x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b
        uaddlp          v1.8h,   v1.16b
        add             v0.8h,   v0.8h,   v1.8h
        shl             v0.8h,   v0.8h,   #1       // ac scale for 4:2:0
        subs            w8,  w8,  #2
        st1             {v0.8h}, [x0], #16
        add             v16.8h,  v16.8h,  v0.8h
        b.gt            1b
        // Duplicate the last output row into v0 and v1 for bottom padding.
        trn2            v1.2d,   v0.2d,   v0.2d
        trn2            v0.2d,   v0.2d,   v0.2d
L(ipred_cfl_ac_420_w4_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        b.gt            2b
3:
        // Aggregate the sums
        add             v0.8h,   v16.8h,  v17.8h
        uaddlv          s0,  v0.8h                 // sum
        sub             x0,  x0,  w6,  uxtw #3     // rewind: 8 bytes per output row
        urshl           v4.2s,   v0.2s,   v31.2s   // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h,   v4.h[0]
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w6,  w6,  #4
        sub             v0.8h,   v0.8h,   v4.8h
        sub             v1.8h,   v1.8h,   v4.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w8):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.16b}, [x1],  x2
        ld1             {v1.16b}, [x10], x2
        ld1             {v2.16b}, [x1],  x2
        uaddlp          v0.8h,   v0.16b
        ld1             {v3.16b}, [x10], x2
        uaddlp          v1.8h,   v1.16b
        uaddlp          v2.8h,   v2.16b
        uaddlp          v3.8h,   v3.16b
        add             v0.8h,   v0.8h,   v1.8h
        add             v2.8h,   v2.8h,   v3.8h
        shl             v0.8h,   v0.8h,   #1       // ac scale for 4:2:0
        shl             v1.8h,   v2.8h,   #1
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        b.gt            1b
        mov             v0.16b,  v1.16b            // last row, for bottom padding
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_420_w8_wpad):
1:      // Copy and subsample input, padding 4
        ld1             {v0.8b},  [x1],  x2
        ld1             {v1.8b},  [x10], x2
        ld1             {v0.d}[1], [x1],  x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b
        uaddlp          v1.8h,   v1.16b
        add             v0.8h,   v0.8h,   v1.8h
        shl             v0.8h,   v0.8h,   #1       // ac scale for 4:2:0
        // Replicate the rightmost sample of each row into the padding.
        dup             v1.4h,   v0.h[3]
        dup             v3.4h,   v0.h[7]
        trn2            v2.2d,   v0.2d,   v0.2d
        subs            w8,  w8,  #2
        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
        add             v16.4h,  v16.4h,  v0.4h
        add             v17.4h,  v17.4h,  v1.4h
        add             v18.4h,  v18.4h,  v2.4h
        add             v19.4h,  v19.4h,  v3.4h
        b.gt            1b
        trn1            v0.2d,   v2.2d,   v3.2d    // last (padded) row, duplicated
        trn1            v1.2d,   v2.2d,   v3.2d

L(ipred_cfl_ac_420_w8_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v18.8h,  v18.8h,  v0.8h
        add             v19.8h,  v19.8h,  v1.8h
        b.gt            2b
3:

L(ipred_cfl_ac_420_w8_calc_subtract_dc):
        // Aggregate the sums
        add             v0.8h,   v16.8h,  v17.8h
        add             v2.8h,   v18.8h,  v19.8h
        uaddlp          v0.4s,   v0.8h             // widen before the final add
        uaddlp          v2.4s,   v2.8h
        add             v0.4s,   v0.4s,   v2.4s
        addv            s0,  v0.4s                 // sum
        sub             x0,  x0,  w6,  uxtw #4     // rewind: 16 bytes per output row
        urshl           v4.2s,   v0.2s,   v31.2s   // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h,   v4.h[0]
L(ipred_cfl_ac_420_w8_subtract_dc):
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs            w6,  w6,  #4
        sub             v0.8h,   v0.8h,   v4.8h
        sub             v1.8h,   v1.8h,   v4.8h
        sub             v2.8h,   v2.8h,   v4.8h
        sub             v3.8h,   v3.8h,   v4.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w16):
        AARCH64_VALID_JUMP_TARGET
        // Dispatch on w_pad (0..3 -> pad 0/4/8/12 output columns).
        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
        ldrh            w3,  [x7, w3, uxtw #1]
        sub             x7,  x7,  w3, uxtw
        br              x7

L(ipred_cfl_ac_420_w16_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, without padding
        ld1             {v0.16b, v1.16b}, [x1],  x2
        ld1             {v2.16b, v3.16b}, [x10], x2
        uaddlp          v0.8h,   v0.16b
        ld1             {v4.16b, v5.16b}, [x1],  x2
        uaddlp          v1.8h,   v1.16b
        ld1             {v6.16b, v7.16b}, [x10], x2
        uaddlp          v2.8h,   v2.16b
        uaddlp          v3.8h,   v3.16b
        uaddlp          v4.8h,   v4.16b
        uaddlp          v5.8h,   v5.16b
        uaddlp          v6.8h,   v6.16b
        uaddlp          v7.8h,   v7.16b
        add             v0.8h,   v0.8h,   v2.8h
        add             v1.8h,   v1.8h,   v3.8h
        add             v4.8h,   v4.8h,   v6.8h
        add             v5.8h,   v5.8h,   v7.8h
        shl             v0.8h,   v0.8h,   #1       // ac scale for 4:2:0
        shl             v1.8h,   v1.8h,   #1
        shl             v2.8h,   v4.8h,   #1
        shl             v3.8h,   v5.8h,   #1
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b            // last row, for bottom padding
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad1):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 4
        ldr             d1,  [x1, #16]             // 24 luma bytes -> 12 chroma + 4 pad
        ld1             {v0.16b}, [x1],  x2
        ldr             d3,  [x10, #16]
        ld1             {v2.16b}, [x10], x2
        uaddlp          v1.4h,   v1.8b
        ldr             d5,  [x1, #16]
        uaddlp          v0.8h,   v0.16b
        ld1             {v4.16b}, [x1],  x2
        uaddlp          v3.4h,   v3.8b
        ldr             d7,  [x10, #16]
        uaddlp          v2.8h,   v2.16b
        ld1             {v6.16b}, [x10], x2
        uaddlp          v5.4h,   v5.8b
        uaddlp          v4.8h,   v4.16b
        uaddlp          v7.4h,   v7.8b
        uaddlp          v6.8h,   v6.16b
        add             v1.4h,   v1.4h,   v3.4h
        add             v0.8h,   v0.8h,   v2.8h
        add             v5.4h,   v5.4h,   v7.4h
        add             v4.8h,   v4.8h,   v6.8h
        shl             v1.4h,   v1.4h,   #1       // ac scale for 4:2:0
        shl             v0.8h,   v0.8h,   #1
        shl             v3.4h,   v5.4h,   #1
        shl             v2.8h,   v4.8h,   #1
        // Replicate the rightmost sample into the 4 padded columns.
        dup             v4.4h,   v1.h[3]
        dup             v5.4h,   v3.h[3]
        trn1            v1.2d,   v1.2d,   v4.2d
        trn1            v3.2d,   v3.2d,   v5.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b            // last row, for bottom padding
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 8
        ld1             {v0.16b}, [x1],  x2
        ld1             {v2.16b}, [x10], x2
        ld1             {v4.16b}, [x1],  x2
        uaddlp          v0.8h,   v0.16b
        ld1             {v6.16b}, [x10], x2
        uaddlp          v2.8h,   v2.16b
        uaddlp          v4.8h,   v4.16b
        uaddlp          v6.8h,   v6.16b
        add             v0.8h,   v0.8h,   v2.8h
        add             v4.8h,   v4.8h,   v6.8h
        shl             v0.8h,   v0.8h,   #1       // ac scale for 4:2:0
        shl             v2.8h,   v4.8h,   #1
        // Replicate the rightmost sample into the 8 padded columns.
        dup             v1.8h,   v0.h[7]
        dup             v3.8h,   v2.h[7]
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b            // last row, for bottom padding
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad3):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 12
        ld1             {v0.8b},  [x1],  x2
        ld1             {v2.8b},  [x10], x2
        ld1             {v4.8b},  [x1],  x2
        uaddlp          v0.4h,   v0.8b
        ld1             {v6.8b},  [x10], x2
        uaddlp          v2.4h,   v2.8b
        uaddlp          v4.4h,   v4.8b
        uaddlp          v6.4h,   v6.8b
        add             v0.4h,   v0.4h,   v2.4h
        add             v4.4h,   v4.4h,   v6.4h
        shl             v0.4h,   v0.4h,   #1       // ac scale for 4:2:0
        shl             v2.4h,   v4.4h,   #1
        // Replicate the rightmost sample into the 12 padded columns.
        dup             v1.8h,   v0.h[3]
        dup             v3.8h,   v2.h[3]
        trn1            v0.2d,   v0.2d,   v1.2d
        trn1            v2.2d,   v2.2d,   v3.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b            // last row, for bottom padding
        mov             v1.16b,  v3.16b

L(ipred_cfl_ac_420_w16_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            2b
3:

        // Double the height and reuse the w8 summing/subtracting
        lsl             w6,  w6,  #1
        b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)

L(ipred_cfl_ac_420_tbl):
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
        .hword 0

L(ipred_cfl_ac_420_w16_tbl):
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
endfunc
| |
| // void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, |
| // const ptrdiff_t stride, const int w_pad, |
| // const int h_pad, const int cw, const int ch); |
function ipred_cfl_ac_422_8bpc_neon, export=1
        // Arguments (AAPCS64):
        //   x0 = ac (int16_t *output), x1 = ypx (luma pixels), x2 = stride,
        //   w3 = w_pad, w4 = h_pad, w5 = cw (chroma width), w6 = ch (chroma height)
        // 4:2:2 subsamples horizontally only: each output is the sum of a
        // horizontal pixel pair, scaled by 4 (shl #2) so all cfl_ac variants
        // share the same fixed-point scale (and the 420 tail code).
        clz             w8,  w5                  // clz(width): 27/28/29 for w16/w8/w4
        lsl             w4,  w4,  #2             // h_pad *= 4 (apparently given in 4-row units — consistent with "height - h_pad" below)
        adr             x7,  L(ipred_cfl_ac_422_tbl)
        sub             w8,  w8,  #27            // table index: 0=w16, 1=w8, 2=w4
        ldrh            w8,  [x7, w8, uxtw #1]
        movi            v16.8h,  #0              // v16-v19: running sums of the output
        movi            v17.8h,  #0              // AC values; the shared 420 tail uses
        movi            v18.8h,  #0              // them to compute the mean that is
        movi            v19.8h,  #0              // subtracted from the buffer
        sub             x7,  x7,  w8, uxtw       // resolve offset-table entry to address
        sub             w8,  w6,  w4             // height - h_pad (rows actually read)
        rbit            w9,  w5                  // rbit(width)
        rbit            w10, w6                  // rbit(height)
        clz             w9,  w9                  // ctz(width)
        clz             w10, w10                 // ctz(height)
        add             w9,  w9,  w10            // log2sz (log2 of total sample count)
        add             x10, x1,  x2             // x10 = pointer to second input row
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1             // both row pointers advance 2 rows/load
        neg             v31.4s,  v31.4s          // -log2sz (right shift for averaging)
        br              x7

L(ipred_cfl_ac_422_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input
        // Two 4-pixel rows are packed per 64+64-bit register; uaddlp sums
        // horizontal pairs, shl #2 applies the x4 scale.
        ld1             {v0.8b},  [x1],  x2
        ld1             {v0.d}[1], [x10], x2
        ld1             {v1.8b},  [x1],  x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h,  v0.16b
        uaddlp          v1.8h,  v1.16b
        shl             v0.8h,  v0.8h,  #2
        shl             v1.8h,  v1.8h,  #2
        subs            w8,  w8,  #4             // 4 rows consumed per iteration
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b
        // Broadcast the last written row into both halves of v0/v1 so the
        // shared 420 vertical-padding tail can replicate it downwards.
        trn2            v0.2d,  v1.2d,  v1.2d
        trn2            v1.2d,  v1.2d,  v1.2d
        b               L(ipred_cfl_ac_420_w4_hpad)

L(ipred_cfl_ac_422_w8):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
1:      // Copy and subsample input, without padding
        // One 16-pixel row per register -> one 8-entry output row each.
        ld1             {v0.16b}, [x1],  x2
        ld1             {v1.16b}, [x10], x2
        ld1             {v2.16b}, [x1],  x2
        uaddlp          v0.8h,  v0.16b
        ld1             {v3.16b}, [x10], x2
        uaddlp          v1.8h,  v1.16b
        uaddlp          v2.8h,  v2.16b
        uaddlp          v3.8h,  v3.16b
        shl             v0.8h,  v0.8h,  #2
        shl             v1.8h,  v1.8h,  #2
        shl             v2.8h,  v2.8h,  #2
        shl             v3.8h,  v3.8h,  #2
        subs            w8,  w8,  #4             // 4 rows per iteration
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        // Keep the last row (v3) in v0/v1 for the shared hpad tail.
        mov             v0.16b, v3.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w8_wpad):
        // Reached only by the direct cbnz above, never via br, hence no
        // AARCH64_VALID_JUMP_TARGET marker.
1:      // Copy and subsample input, padding 4
        // Only 8 input pixels per row -> 4 real outputs; the 4th output
        // (h[3]/h[7]) is replicated to fill the padded right half.
        ld1             {v0.8b},  [x1],  x2
        ld1             {v0.d}[1], [x10], x2
        ld1             {v2.8b},  [x1],  x2
        ld1             {v2.d}[1], [x10], x2
        uaddlp          v0.8h,  v0.16b
        uaddlp          v2.8h,  v2.16b
        shl             v0.8h,  v0.8h,  #2
        shl             v2.8h,  v2.8h,  #2
        dup             v4.4h,  v0.h[3]          // last valid value of row 0
        dup             v5.8h,  v0.h[7]          // last valid value of row 1
        dup             v6.4h,  v2.h[3]          // last valid value of row 2
        dup             v7.8h,  v2.h[7]          // last valid value of row 3
        trn2            v1.2d,  v0.2d,  v5.2d    // v1 = row1 | pad
        trn1            v0.2d,  v0.2d,  v4.2d    // v0 = row0 | pad
        trn2            v3.2d,  v2.2d,  v7.2d    // v3 = row3 | pad
        trn1            v2.2d,  v2.2d,  v6.2d    // v2 = row2 | pad
        subs            w8,  w8,  #4             // 4 rows per iteration
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v3.16b           // replicate last row for hpad
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w16):
        AARCH64_VALID_JUMP_TARGET
        // Second-level dispatch on w_pad (0-3) for width 16.
        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
        ldrh            w3,  [x7, w3, uxtw #1]
        sub             x7,  x7,  w3, uxtw
        br              x7

L(ipred_cfl_ac_422_w16_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, without padding
        // 32 input pixels per row -> 16 outputs (two q registers per row).
        ld1             {v0.16b, v1.16b}, [x1],  x2
        ld1             {v2.16b, v3.16b}, [x10], x2
        uaddlp          v0.8h,  v0.16b
        uaddlp          v1.8h,  v1.16b
        uaddlp          v2.8h,  v2.16b
        uaddlp          v3.8h,  v3.16b
        shl             v0.8h,  v0.8h,  #2
        shl             v1.8h,  v1.8h,  #2
        shl             v2.8h,  v2.8h,  #2
        shl             v3.8h,  v3.8h,  #2
        subs            w8,  w8,  #2             // 2 rows per iteration
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b           // last row (v2/v3) -> v0/v1 for hpad
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad1):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 4
        // 24 input pixels per row: 16 via ld1 + 8 via ldr; the 12th output
        // (h[3] of the high half) fills the remaining 4 padded entries.
        ldr             d1,  [x1, #16]
        ld1             {v0.16b}, [x1],  x2
        ldr             d3,  [x10, #16]
        ld1             {v2.16b}, [x10], x2
        uaddlp          v1.4h,  v1.8b
        uaddlp          v0.8h,  v0.16b
        uaddlp          v3.4h,  v3.8b
        uaddlp          v2.8h,  v2.16b
        shl             v1.4h,  v1.4h,  #2
        shl             v0.8h,  v0.8h,  #2
        shl             v3.4h,  v3.4h,  #2
        shl             v2.8h,  v2.8h,  #2
        dup             v4.4h,  v1.h[3]          // last valid value, row 0
        dup             v5.4h,  v3.h[3]          // last valid value, row 1
        trn1            v1.2d,  v1.2d,  v4.2d
        trn1            v3.2d,  v3.2d,  v5.2d
        subs            w8,  w8,  #2             // 2 rows per iteration
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 8
        // 16 input pixels per row -> 8 real outputs; the whole high q
        // register is filled with the last value (h[7]).
        ld1             {v0.16b}, [x1],  x2
        ld1             {v2.16b}, [x10], x2
        uaddlp          v0.8h,  v0.16b
        uaddlp          v2.8h,  v2.16b
        shl             v0.8h,  v0.8h,  #2
        shl             v2.8h,  v2.8h,  #2
        dup             v1.8h,  v0.h[7]          // pad half of row 0
        dup             v3.8h,  v2.h[7]          // pad half of row 1
        subs            w8,  w8,  #2             // 2 rows per iteration
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad3):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 12
        // 8 input pixels per row -> 4 real outputs; h[3] fills the other 12.
        ld1             {v0.8b},  [x1],  x2
        ld1             {v2.8b},  [x10], x2
        uaddlp          v0.4h,  v0.8b
        uaddlp          v2.4h,  v2.8b
        shl             v0.4h,  v0.4h,  #2
        shl             v2.4h,  v2.4h,  #2
        dup             v1.8h,  v0.h[3]          // pad value, row 0
        dup             v3.8h,  v2.h[3]          // pad value, row 1
        trn1            v0.2d,  v0.2d,  v1.2d    // row0: 4 real + 4 pad
        trn1            v2.2d,  v2.2d,  v3.2d    // row1: 4 real + 4 pad
        subs            w8,  w8,  #2             // 2 rows per iteration
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v2.16b
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

        // Offset tables for the computed branches above: each .hword is the
        // distance from the table label back to the handler; the dispatch
        // code subtracts it from the table address.
L(ipred_cfl_ac_422_tbl):
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
        .hword 0

L(ipred_cfl_ac_422_w16_tbl):
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
endfunc
| |
| // void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, |
| // const ptrdiff_t stride, const int w_pad, |
| // const int h_pad, const int cw, const int ch); |
function ipred_cfl_ac_444_8bpc_neon, export=1
        // Arguments (AAPCS64):
        //   x0 = ac (int16_t *output), x1 = ypx (luma pixels), x2 = stride,
        //   w3 = w_pad, w4 = h_pad, w5 = cw (chroma width), w6 = ch (chroma height)
        // 4:4:4 performs no subsampling: each pixel is widened and scaled
        // by 8 (ushll #3) so all cfl_ac variants share the same fixed-point
        // scale and can reuse the 420 tail code for padding/averaging.
        clz             w8,  w5                  // clz(width): 26..29 for w32..w4
        lsl             w4,  w4,  #2             // h_pad *= 4 (apparently given in 4-row units — consistent with "height - h_pad" below)
        adr             x7,  L(ipred_cfl_ac_444_tbl)
        sub             w8,  w8,  #26            // table index: 0=w32, 1=w16, 2=w8, 3=w4
        ldrh            w8,  [x7, w8, uxtw #1]
        movi            v16.8h,  #0              // v16-v19: running sums of the output
        movi            v17.8h,  #0              // AC values, consumed by the shared
        movi            v18.8h,  #0              // subtract-dc tail to compute the
        movi            v19.8h,  #0              // mean that is subtracted
        sub             x7,  x7,  w8, uxtw       // resolve offset-table entry to address
        sub             w8,  w6,  w4             // height - h_pad (rows actually read)
        rbit            w9,  w5                  // rbit(width)
        rbit            w10, w6                  // rbit(height)
        clz             w9,  w9                  // ctz(width)
        clz             w10, w10                 // ctz(height)
        add             w9,  w9,  w10            // log2sz (log2 of total sample count)
        add             x10, x1,  x2             // x10 = pointer to second input row
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1             // both row pointers advance 2 rows/load
        neg             v31.4s,  v31.4s          // -log2sz (right shift for averaging)
        br              x7

L(ipred_cfl_ac_444_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input
        // Two 4-pixel rows packed per register; widen u8->u16 and scale x8.
        ld1             {v0.s}[0], [x1],  x2
        ld1             {v0.s}[1], [x10], x2
        ld1             {v1.s}[0], [x1],  x2
        ld1             {v1.s}[1], [x10], x2
        ushll           v0.8h,  v0.8b,  #3
        ushll           v1.8h,  v1.8b,  #3
        subs            w8,  w8,  #4             // 4 rows consumed per iteration
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b
        // Broadcast the last written row into both halves of v0/v1 so the
        // shared 420 vertical-padding tail can replicate it downwards.
        trn2            v0.2d,  v1.2d,  v1.2d
        trn2            v1.2d,  v1.2d,  v1.2d
        b               L(ipred_cfl_ac_420_w4_hpad)

L(ipred_cfl_ac_444_w8):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input
        ld1             {v0.8b},  [x1],  x2
        ld1             {v1.8b},  [x10], x2
        ld1             {v2.8b},  [x1],  x2
        ushll           v0.8h,  v0.8b,  #3
        ld1             {v3.8b},  [x10], x2
        ushll           v1.8h,  v1.8b,  #3
        ushll           v2.8h,  v2.8b,  #3
        ushll           v3.8h,  v3.8b,  #3
        subs            w8,  w8,  #4             // 4 rows per iteration
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        b.gt            1b
        mov             v0.16b, v3.16b           // last row -> v0/v1 for hpad
        mov             v1.16b, v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_444_w16):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
1:      // Copy and expand input, without padding
        // 16 pixels per row -> two q registers of output per row.
        ld1             {v0.16b}, [x1],  x2
        ld1             {v2.16b}, [x10], x2
        ld1             {v4.16b}, [x1],  x2
        ushll2          v1.8h,  v0.16b, #3
        ushll           v0.8h,  v0.8b,  #3
        ld1             {v6.16b}, [x10], x2
        ushll2          v3.8h,  v2.16b, #3
        ushll           v2.8h,  v2.8b,  #3
        ushll2          v5.8h,  v4.16b, #3
        ushll           v4.8h,  v4.8b,  #3
        ushll2          v7.8h,  v6.16b, #3
        ushll           v6.8h,  v6.8b,  #3
        subs            w8,  w8,  #4             // 4 rows per iteration
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        b.gt            1b
        // Replicate the last row (v6/v7) into both row slots (v0..v3) that
        // the shared w16 hpad tail stores per iteration.
        mov             v0.16b, v6.16b
        mov             v1.16b, v7.16b
        mov             v2.16b, v6.16b
        mov             v3.16b, v7.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_444_w16_wpad):
        // Reached only by the direct cbnz above, never via br, hence no
        // AARCH64_VALID_JUMP_TARGET marker.
1:      // Copy and expand input, padding 8
        // 8 real pixels per row; h[7] fills the padded high register.
        ld1             {v0.8b},  [x1],  x2
        ld1             {v2.8b},  [x10], x2
        ld1             {v4.8b},  [x1],  x2
        ld1             {v6.8b},  [x10], x2
        ushll           v0.8h,  v0.8b,  #3
        ushll           v2.8h,  v2.8b,  #3
        ushll           v4.8h,  v4.8b,  #3
        ushll           v6.8h,  v6.8b,  #3
        dup             v1.8h,  v0.h[7]          // pad half of row 0
        dup             v3.8h,  v2.h[7]          // pad half of row 1
        dup             v5.8h,  v4.h[7]          // pad half of row 2
        dup             v7.8h,  v6.h[7]          // pad half of row 3
        subs            w8,  w8,  #4             // 4 rows per iteration
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        b.gt            1b
        mov             v0.16b, v6.16b           // replicate last row for hpad
        mov             v1.16b, v7.16b
        mov             v2.16b, v6.16b
        mov             v3.16b, v7.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_444_w32):
        AARCH64_VALID_JUMP_TARGET
        // Second-level dispatch on w_pad for width 32; entries correspond to
        // w_pad = 0/2/4/6, so the byte offset w3 already equals index*2.
        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
        ldrh            w3,  [x7, w3, uxtw]      // (w3>>1) << 1
        sub             x7,  x7,  w3, uxtw
        br              x7

L(ipred_cfl_ac_444_w32_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, without padding
        // 32 pixels per row -> four q registers of output per row.
        ld1             {v2.16b, v3.16b}, [x1],  x2
        ld1             {v6.16b, v7.16b}, [x10], x2
        ushll           v0.8h,  v2.8b,  #3
        ushll2          v1.8h,  v2.16b, #3
        ushll           v2.8h,  v3.8b,  #3
        ushll2          v3.8h,  v3.16b, #3
        ushll           v4.8h,  v6.8b,  #3
        ushll2          v5.8h,  v6.16b, #3
        ushll           v6.8h,  v7.8b,  #3
        ushll2          v7.8h,  v7.16b, #3
        subs            w8,  w8,  #2             // 2 rows per iteration
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad)

L(ipred_cfl_ac_444_w32_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 8
        // 24 real pixels per row (16 via ld1 + 8 via ldr); h[7] of the third
        // register fills the fourth.
        ldr             d2,  [x1, #16]
        ld1             {v1.16b}, [x1],  x2
        ldr             d6,  [x10, #16]
        ld1             {v5.16b}, [x10], x2
        ushll           v2.8h,  v2.8b,  #3
        ushll           v0.8h,  v1.8b,  #3
        ushll2          v1.8h,  v1.16b, #3
        ushll           v6.8h,  v6.8b,  #3
        ushll           v4.8h,  v5.8b,  #3
        ushll2          v5.8h,  v5.16b, #3
        dup             v3.8h,  v2.h[7]          // pad quarter of row 0
        dup             v7.8h,  v6.h[7]          // pad quarter of row 1
        subs            w8,  w8,  #2             // 2 rows per iteration
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad)

L(ipred_cfl_ac_444_w32_wpad4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 16
        // 16 real pixels per row; h[7] of the second register fills the rest.
        ld1             {v1.16b}, [x1],  x2
        ld1             {v5.16b}, [x10], x2
        ushll           v0.8h,  v1.8b,  #3
        ushll2          v1.8h,  v1.16b, #3
        ushll           v4.8h,  v5.8b,  #3
        ushll2          v5.8h,  v5.16b, #3
        dup             v2.8h,  v1.h[7]          // pad halves of row 0
        dup             v3.8h,  v1.h[7]
        dup             v6.8h,  v5.h[7]          // pad halves of row 1
        dup             v7.8h,  v5.h[7]
        subs            w8,  w8,  #2             // 2 rows per iteration
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad)

L(ipred_cfl_ac_444_w32_wpad6):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 24
        // 8 real pixels per row; h[7] fills the remaining three registers.
        ld1             {v0.8b},  [x1],  x2
        ld1             {v4.8b},  [x10], x2
        ushll           v0.8h,  v0.8b,  #3
        ushll           v4.8h,  v4.8b,  #3
        dup             v1.8h,  v0.h[7]          // pad three quarters of row 0
        dup             v2.8h,  v0.h[7]
        dup             v3.8h,  v0.h[7]
        dup             v5.8h,  v4.h[7]          // pad three quarters of row 1
        dup             v6.8h,  v4.h[7]
        dup             v7.8h,  v4.h[7]
        subs            w8,  w8,  #2             // 2 rows per iteration
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h, v16.8h, v0.8h
        add             v17.8h, v17.8h, v1.8h
        add             v18.8h, v18.8h, v2.8h
        add             v19.8h, v19.8h, v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        b.gt            1b
        // Falls through to the hpad tail; v4-v7 still hold the last row.

L(ipred_cfl_ac_444_w32_hpad):
        cbz             w4,  3f
2:      // Vertical padding (h_pad > 0): replicate the last row (v4-v7)
        // downward, two rows per iteration, keeping the sums updated.
        subs            w4,  w4,  #2
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h, v16.8h, v4.8h
        add             v17.8h, v17.8h, v5.8h
        add             v18.8h, v18.8h, v6.8h
        add             v19.8h, v19.8h, v7.8h
        b.gt            2b
3:

        // Quadruple the height and reuse the w8 subtracting
        lsl             w6,  w6,  #2
        // Aggregate the sums, with wider intermediates earlier than in
        // ipred_cfl_ac_420_w8_calc_subtract_dc.
        // (Widen to 32 bits first via uaddlp since w32 sums can exceed
        // 16-bit range before reduction.)
        uaddlp          v0.4s,  v16.8h
        uaddlp          v1.4s,  v17.8h
        uaddlp          v2.4s,  v18.8h
        uaddlp          v3.4s,  v19.8h
        add             v0.4s,  v0.4s,  v1.4s
        add             v2.4s,  v2.4s,  v3.4s
        add             v0.4s,  v0.4s,  v2.4s
        addv            s0,  v0.4s               // sum
        sub             x0,  x0,  w6, uxtw #4    // rewind x0 to the start of the ac buffer
        urshl           v4.2s,  v0.2s,  v31.2s   // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h,  v4.h[0]          // broadcast the rounded mean
        b               L(ipred_cfl_ac_420_w8_subtract_dc)

        // Offset tables for the computed branches above: each .hword is the
        // distance from the table label back to the handler; the dispatch
        // code subtracts it from the table address.
L(ipred_cfl_ac_444_tbl):
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)

L(ipred_cfl_ac_444_w32_tbl):
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
endfunc