x86: Add high bitdepth cdef_filter SSSE3 asm
diff --git a/src/x86/cdef16_sse.asm b/src/x86/cdef16_sse.asm
index d1d46ea..1da520c 100644
--- a/src/x86/cdef16_sse.asm
+++ b/src/x86/cdef16_sse.asm
@@ -1,3 +1,5 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
; Copyright (c) 2017-2021, The rav1e contributors
; Copyright (c) 2021, Nathan Egge
; All rights reserved.
@@ -28,10 +30,33 @@
SECTION_RODATA
+%macro DUP8 1-* ; emit each argument 8x as words (one 16-byte XMM row per arg)
+ %rep %0
+ times 8 dw %1
+ %rotate 1
+ %endrep
+%endmacro
+
+pri_taps: DUP8 4, 2, 3, 3 ; primary tap pairs {4,2} / {3,3}, one 16-byte row per tap
+dir_table: db 1 * 32 + 0, 2 * 32 + 0 ; per-direction byte offsets into the padded
+ db 1 * 32 + 0, 2 * 32 - 2 ; px buffer: row * 32 + column * 2
+ db -1 * 32 + 2, -2 * 32 + 4
+ db 0 * 32 + 2, -1 * 32 + 4
+ db 0 * 32 + 2, 0 * 32 + 4
+ db 0 * 32 + 2, 1 * 32 + 4
+ db 1 * 32 + 2, 2 * 32 + 4
+ db 1 * 32 + 0, 2 * 32 + 2
+ db 1 * 32 + 0, 2 * 32 + 0 ; rows 0-3 repeated so that the dir+2 (pri) and
+ db 1 * 32 + 0, 2 * 32 - 2 ; dir+4 (sec) lookups stay inside the table
+ db -1 * 32 + 2, -2 * 32 + 4
+ db 0 * 32 + 2, -1 * 32 + 4
+
dir_shift: times 4 dw 0x4000
times 4 dw 0x1000
pw_128: times 4 dw 128
+pw_2048: times 8 dw 2048 ; pmulhrsw factor: rounding arithmetic >> 4
+pw_m16384: times 8 dw -16384 ; edge-padding sentinel (also reused as a pshufb splat mask)
cextern cdef_dir_8bpc_ssse3.main
cextern cdef_dir_8bpc_sse4.main
@@ -47,6 +72,891 @@
%endrep
%endmacro
+%if ARCH_X86_32
+DECLARE_REG_TMP 5, 3 ; t0 = dir_table base, t1 = row pointer scratch
+%elif WIN64
+DECLARE_REG_TMP 7, 4
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%macro CDEF_FILTER 2 ; w, h: entry-point body; square sizes also emit the shared .pri/.sec/.pri_sec routines
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, tmp, pridmp, pri, sec, dir
+ mova m8, [base+pw_2048] ; constant for the final rounded >>4
+%else
+ DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir
+ %define m8 [base+pw_2048]
+ %define m9 [rsp+16*1+gprsize] ; stack slots standing in for xmm9/xmm10
+ %define m10 [rsp+16*2+gprsize]
+%endif
+ movifnidn prid, r4m
+ movifnidn secd, r5m
+ test prid, prid
+ jz .sec_only
+ movd m6, r4m ; pri strength, splatted below
+%if ARCH_X86_32
+ mov [rsp+24], pridmpd ; pridmp aliases the stride register here; save it
+%endif
+ bsr pridmpd, prid ; ulog2(pri), on the unscaled strength
+ lea tmpd, [priq*4]
+ cmp dword r9m, 0x3ff ; if (bpc == 10)
+ cmove prid, tmpd ; pri <<= 2
+ mov tmpd, r7m ; damping
+ mov dird, r6m
+ and prid, 16 ; bit 4 selects the {4,2} vs {3,3} primary tap pair
+ pshufb m6, m7 ; splat (low byte of the strength, zero-extended to words)
+ lea dirq, [base+dir_table+dirq*2]
+ lea priq, [base+pri_taps+priq*2]
+ test secd, secd
+ jz .pri_only
+ mova [rsp], m6 ; stash splatted pri threshold; m6 is reused for sec
+ movd m6, secd
+ bsr secd, secd ; ulog2(sec)
+ sub pridmpd, tmpd ; ulog2(pri) - damping
+ sub tmpd, secd ; sec_shift = damping - ulog2(sec)
+ pshufb m6, m7
+ xor secd, secd
+ neg pridmpd ; pri_shift = damping - ulog2(pri)
+ cmovs pridmpd, secd ; ... clamped to >= 0
+%if ARCH_X86_32
+ mov [pri_shift+4], secd ; zero the high dwords of the 64-bit shift counts
+ mov [sec_shift+4], secd
+%endif
+ mov [pri_shift+0], pridmpq
+ mov [sec_shift+0], tmpq
+ lea tmpq, [px] ; tmp walks the padded pixel buffer
+%if WIN64
+ movaps r4m, m9 ; xmm9/xmm10 are callee-saved on Win64; spill to arg space
+ movaps r6m, m10
+%elif ARCH_X86_32
+ mov pridmpd, [rsp+24] ; restore the stride register
+%endif
+%rep %1*%2/8 ; one call per 8 pixels; 4x8 reuses the 4x4 routines
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
+%endrep
+%if WIN64
+ movaps m9, r4m
+ movaps m10, r6m
+%endif
+ jmp .end
+.pri_only: ; sec == 0: primary taps only, no clipping needed
+ sub tmpd, pridmpd ; pri_shift = damping - ulog2(pri)
+ cmovs tmpd, secd ; secd is known zero here
+%if ARCH_X86_32
+ mov pridmpd, [rsp+24]
+ mov [pri_shift+4], secd
+%endif
+ mov [pri_shift+0], tmpq
+ lea tmpq, [px]
+%rep %1*%2/8
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
+%endrep
+.end:
+ RET
+.sec_only: ; pri == 0: secondary taps only
+ mov tmpd, r7m ; damping
+ movd m6, r5m
+ bsr secd, secd
+ mov dird, r6m
+ pshufb m6, m7 ; splat sec threshold
+ sub tmpd, secd ; sec_shift = damping - ulog2(sec)
+ lea dirq, [base+dir_table+dirq*2]
+%if ARCH_X86_32
+ mov [sec_shift+4], prid ; prid is known zero here
+%endif
+ mov [sec_shift+0], tmpq
+ lea tmpq, [px]
+%rep %1*%2/8
+ call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
+%endrep
+ jmp .end
+%if %1 == %2 ; the subroutines are shared; emit them once, in the square variants
+DEFINE_ARGS dst, stride, tmp, off, pri, _, dir
+ALIGN function_align
+.pri: ; primary taps only: filters 8 pixels (2 rows of 4, or 1 row of 8)
+ movsx offq, byte [dirq+4] ; off_k0
+%if %1 == 4
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ movq m2, [tmpq+offq+32*0] ; k0p0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq ; mirror the offset for the opposite tap
+ movq m3, [tmpq+offq+32*0] ; k0p1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ mova m1, [dstq]
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+5] ; off_k1
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m0, m6, m5 ; imax(0, pri - (adiff >> shift))
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m0, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0p0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1p0
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1p1
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ psignw m2, m3 ; constrain(diff_k0p1)
+ pabsw m3, m4 ; adiff_k1p0
+ paddw m0, m2 ; constrain(diff_k0)
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m0, [priq+16*0] ; pri_tap_k0
+ pmullw m7, [priq+16*1] ; pri_tap_k1
+ paddw m0, m7 ; sum
+ psraw m2, m0, 15 ; sum < 0 ? -1 : 0
+ paddw m0, m2 ; sum - (sum < 0): bias rounding toward zero
+ pmulhrsw m0, m8 ; (8 + sum) >> 4
+ paddw m0, m1 ; px + adjustment
+%if %1 == 4
+ add tmpq, 32*2 ; advance two buffer rows
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+%else
+ add tmpq, 32
+ mova [dstq], m0
+ add dstq, strideq
+%endif
+ ret
+ALIGN function_align
+.sec: ; secondary taps only: 4 taps, k0 weighted 2x, k1 weighted 1x
+ movsx offq, byte [dirq+8] ; off1_k0
+%if %1 == 4
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ movq m2, [tmpq+offq+32*0] ; k0s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ mova m1, [dstq]
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+0] ; off2_k0
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m6, m5 ; imax(0, sec - (adiff >> shift))
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k0s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k0s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+9] ; off1_k1
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pabsw m3, m4 ; adiff_k0s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+%if %1 == 4
+ movq m2, [tmpq+offq+32*0] ; k1s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k1s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+1] ; off2_k1
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k0s3)
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0 (weight 2)
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ paddw m0, m7
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ paddw m0, m7
+ psignw m4, m5 ; constrain(diff_k1s3)
+ paddw m0, m4 ; sum
+ psraw m2, m0, 15 ; sum < 0 ? -1 : 0
+ paddw m0, m2 ; round toward zero
+ pmulhrsw m0, m8 ; (8 + sum) >> 4
+ paddw m0, m1
+%if %1 == 4
+ add tmpq, 32*2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+%else
+ add tmpq, 32
+ mova [dstq], m0
+ add dstq, strideq
+%endif
+ ret
+ALIGN function_align
+.pri_sec: ; combined primary+secondary, result clipped to [min, max] of the taps
+ movsx offq, byte [dirq+8] ; off2_k0
+%if %1 == 4
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ movq m2, [tmpq+offq+32*0] ; k0s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ mova m1, [dstq]
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+0] ; off3_k0
+ pabsw m4, m2 ; |k0s0|
+%if ARCH_X86_64
+ pabsw m10, m3 ; m10 = running min over |values|: -16384 padding
+ pmaxsw m9, m2, m3 ; becomes +16384; m9 = running max over raw values,
+ pminsw m10, m4 ; where -16384 padding never wins either
+%else
+ pabsw m7, m3
+ pmaxsw m5, m2, m3
+ pminsw m4, m7
+ mova m9, m5 ; running max
+ mova m10, m4 ; running min (of absolute values)
+%endif
+ psubw m2, m1 ; diff_k0s0
+ psubw m3, m1 ; diff_k0s1
+ pabsw m4, m2 ; adiff_k0s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m0, m6, m5
+ pabsw m5, m3 ; adiff_k0s1
+ pminsw m0, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m0, m2 ; constrain(diff_k0s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k0s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k0s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+9] ; off2_k1
+ pabsw m7, m4 ; |k0s2|
+ psignw m2, m3 ; constrain(diff_k0s1)
+ pabsw m3, m5 ; |k0s3|
+%if ARCH_X86_64
+ pmaxsw m9, m4
+ pminsw m10, m7
+ pmaxsw m9, m5
+ pminsw m10, m3
+%else
+ pminsw m7, m10
+ pminsw m7, m3
+ pmaxsw m3, m9, m4
+ pmaxsw m3, m5
+ mova m10, m7
+ mova m9, m3
+%endif
+ psubw m4, m1 ; diff_k0s2
+ psubw m5, m1 ; diff_k0s3
+ paddw m0, m2
+ pabsw m3, m4 ; adiff_k0s2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k0s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k0s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+%if %1 == 4
+ movq m2, [tmpq+offq+32*0] ; k1s0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k1s1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+1] ; off3_k1
+ paddw m0, m7
+ pabsw m7, m2 ; |k1s0|
+ psignw m4, m5 ; constrain(diff_k0s3)
+ pabsw m5, m3 ; |k1s1|
+%if ARCH_X86_64
+ pmaxsw m9, m2
+ pminsw m10, m7
+ pmaxsw m9, m3
+ pminsw m10, m5
+%else
+ pminsw m7, m10
+ pminsw m7, m5
+ pmaxsw m5, m9, m2
+ pmaxsw m5, m3
+ mova m10, m7
+ mova m9, m5
+%endif
+ paddw m0, m4 ; constrain(diff_k0)
+ psubw m2, m1 ; diff_k1s0
+ psubw m3, m1 ; diff_k1s1
+ paddw m0, m0 ; sec_tap_k0 (weight 2)
+ pabsw m4, m2 ; adiff_k1s0
+ psrlw m5, m4, [sec_shift+gprsize]
+ psubusw m7, m6, m5
+ pabsw m5, m3 ; adiff_k1s1
+ pminsw m7, m4
+ psrlw m4, m5, [sec_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k1s0)
+ psubusw m2, m6, m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1s2
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1s3
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+4] ; off1_k0
+ paddw m0, m7
+ pabsw m7, m4 ; |k1s2|
+ psignw m2, m3 ; constrain(diff_k1s1)
+ pabsw m3, m5 ; |k1s3|
+%if ARCH_X86_64
+ pmaxsw m9, m4
+ pminsw m10, m7
+ pmaxsw m9, m5
+ pminsw m10, m3
+%else
+ pminsw m7, m10
+ pminsw m7, m3
+ pmaxsw m3, m9, m4
+ pmaxsw m3, m5
+ mova m10, m7
+ mova m9, m3
+%endif
+ psubw m4, m1 ; diff_k1s2
+ psubw m5, m1 ; diff_k1s3
+ pabsw m3, m4 ; adiff_k1s2
+ paddw m0, m2
+ psrlw m2, m3, [sec_shift+gprsize]
+ psubusw m7, m6, m2
+ pabsw m2, m5 ; adiff_k1s3
+ pminsw m7, m3
+ psrlw m3, m2, [sec_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1s2)
+ psubusw m4, m6, m3
+ pminsw m4, m2
+ paddw m0, m7
+%if %1 == 4
+ movq m2, [tmpq+offq+32*0] ; k0p0
+ movhps m2, [tmpq+offq+32*1]
+ neg offq
+ movq m3, [tmpq+offq+32*0] ; k0p1
+ movhps m3, [tmpq+offq+32*1]
+%else
+ movu m2, [tmpq+offq]
+ neg offq
+ movu m3, [tmpq+offq]
+%endif
+ movsx offq, byte [dirq+5] ; off1_k1
+ pabsw m7, m2 ; |k0p0|
+ psignw m4, m5 ; constrain(diff_k1s3)
+ pabsw m5, m3 ; |k0p1|
+%if ARCH_X86_64
+ pmaxsw m9, m2
+ pminsw m10, m7
+ pmaxsw m9, m3
+ pminsw m10, m5
+%else
+ pminsw m7, m10
+ pminsw m7, m5
+ pmaxsw m5, m9, m2
+ pmaxsw m5, m3
+ mova m10, m7
+ mova m9, m5
+%endif
+ psubw m2, m1 ; diff_k0p0
+ psubw m3, m1 ; diff_k0p1
+ paddw m0, m4
+ pabsw m4, m2 ; adiff_k0p0
+ psrlw m5, m4, [pri_shift+gprsize]
+ psubusw m7, [rsp+gprsize], m5 ; splatted pri threshold was stashed at [rsp]
+ pabsw m5, m3 ; adiff_k0p1
+ pminsw m7, m4
+ psrlw m4, m5, [pri_shift+gprsize]
+ psignw m7, m2 ; constrain(diff_k0p0)
+ psubusw m2, [rsp+gprsize], m4
+ pminsw m2, m5
+%if %1 == 4
+ movq m4, [tmpq+offq+32*0] ; k1p0
+ movhps m4, [tmpq+offq+32*1]
+ neg offq
+ movq m5, [tmpq+offq+32*0] ; k1p1
+ movhps m5, [tmpq+offq+32*1]
+%else
+ movu m4, [tmpq+offq]
+ neg offq
+ movu m5, [tmpq+offq]
+%endif
+ psignw m2, m3 ; constrain(diff_k0p1)
+ pabsw m3, m4 ; |k1p0|
+ paddw m7, m2 ; constrain(diff_k0)
+ pabsw m2, m5 ; |k1p1|
+%if ARCH_X86_64
+ pmaxsw m9, m4
+ pminsw m10, m3
+ pmaxsw m9, m5
+ pminsw m10, m2
+%else
+ pminsw m3, m10
+ pminsw m3, m2
+ pmaxsw m2, m9, m4
+ pmaxsw m2, m5
+ mova m10, m3
+ mova m9, m2
+%endif
+ psubw m4, m1 ; diff_k1p0
+ psubw m5, m1 ; diff_k1p1
+ pabsw m3, m4 ; adiff_k1p0
+ pmullw m7, [priq+16*0] ; pri_tap_k0
+ paddw m0, m7
+ psrlw m2, m3, [pri_shift+gprsize]
+ psubusw m7, [rsp+16*0+gprsize], m2
+ pabsw m2, m5 ; adiff_k1p1
+ pminsw m7, m3
+ psrlw m3, m2, [pri_shift+gprsize]
+ psignw m7, m4 ; constrain(diff_k1p0)
+ psubusw m4, [rsp+16*0+gprsize], m3
+ pminsw m4, m2
+ psignw m4, m5 ; constrain(diff_k1p1)
+ paddw m7, m4 ; constrain(diff_k1)
+ pmullw m7, [priq+16*1] ; pri_tap_k1
+ paddw m0, m7 ; sum
+ psraw m2, m0, 15 ; sum < 0 ? -1 : 0
+ paddw m0, m2 ; round toward zero
+ pmulhrsw m0, m8 ; (8 + sum) >> 4
+ paddw m0, m1
+%if ARCH_X86_64
+ pmaxsw m9, m1 ; include the unfiltered pixel in the clamp range
+ pminsw m0, m9 ; clip to the max of the sampled taps
+%else
+ pmaxsw m2, m9, m1
+ pminsw m0, m2
+%endif
+ pminsw m1, m10
+ pmaxsw m0, m1 ; clip to the min of the sampled taps
+%if %1 == 4
+ add tmpq, 32*2
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+%else
+ add tmpq, 32
+ mova [dstq], m0
+ add dstq, strideq
+%endif
+ ret
+%endif
+%endmacro
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal cdef_filter_4x4_16bpc, 4, 8, 9, 32*10, dst, stride, left, top, pri, sec, edge
+ %define px rsp+32*4 ; padded pixel buffer: 32-byte rows, 2-pixel borders
+%else
+cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left
+ %define px rsp+32*5
+%endif
+ %define base t0-dir_table
+ %define pri_shift px-16*6 ; 64-bit psrlw counts, stored just below the buffer
+ %define sec_shift px-16*5
+ mov edged, r8m
+ LEA t0, dir_table
+ movu m0, [dstq+strideq*0] ; copy the 4x4 block into the padded buffer
+ movu m1, [dstq+strideq*1]
+ lea t1, [dstq+strideq*2]
+ movu m2, [t1 +strideq*0]
+ movu m3, [t1 +strideq*1]
+ movddup m7, [base+pw_m16384] ; edge sentinel (also used as pshufb splat mask)
+ mova [px+32*0+0], m0
+ mova [px+32*1+0], m1
+ mova [px+32*2+0], m2
+ mova [px+32*3+0], m3
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movifnidn topq, topmp
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd m0, [topq+strideq*0-4] ; 2-pixel top-left corner
+ movd m1, [topq+strideq*1-4]
+ movd [px-32*2-4], m0
+ movd [px-32*1-4], m1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+0], m7 ; fill missing rows with the sentinel
+ mova [px-32*1+0], m7
+.top_no_left:
+ movd [px-32*2-4], m7
+ movd [px-32*1-4], m7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ lea r3, [dstq+strideq*4]
+ movu m0, [r3+strideq*0]
+ movu m1, [r3+strideq*1]
+ mova [px+32*4+0], m0
+ mova [px+32*5+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd m0, [r3+strideq*0-4]
+ movd m1, [r3+strideq*1-4]
+ movd [px+32*4-4], m0
+ movd [px+32*5-4], m1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*4+0], m7
+ mova [px+32*5+0], m7
+.bottom_no_left:
+ movd [px+32*4-4], m7
+ movd [px+32*5-4], m7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movifnidn leftq, r2mp
+ movd m0, [leftq+4*0] ; left edge: 2 pixels (4 bytes) per row
+ movd m1, [leftq+4*1]
+ movd m2, [leftq+4*2]
+ movd m3, [leftq+4*3]
+ movd [px+32*0-4], m0
+ movd [px+32*1-4], m1
+ movd [px+32*2-4], m2
+ movd [px+32*3-4], m3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5 ; right border, rows -2..5
+.padding_done:
+ CDEF_FILTER 4, 4
+
+%if ARCH_X86_64
+cglobal cdef_filter_4x8_16bpc, 4, 8, 9, 32*14, dst, stride, left, top, pri, sec, edge
+%else
+cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
+%endif
+ mov edged, r8m ; px/base/pri_shift %defines carry over from the 4x4 variant
+ LEA t0, dir_table
+ movu m0, [dstq+strideq*0] ; copy the 4x8 block into the padded buffer
+ movu m1, [dstq+strideq*1]
+ lea t1, [dstq+strideq*2]
+ movu m2, [t1 +strideq*0]
+ movu m3, [t1 +strideq*1]
+ lea t1, [t1 +strideq*2]
+ movu m4, [t1 +strideq*0]
+ movu m5, [t1 +strideq*1]
+ lea t1, [t1 +strideq*2]
+ movu m6, [t1 +strideq*0]
+ movu m7, [t1 +strideq*1]
+ mova [px+32*0+0], m0
+ mova [px+32*1+0], m1
+ mova [px+32*2+0], m2
+ mova [px+32*3+0], m3
+ mova [px+32*4+0], m4
+ mova [px+32*5+0], m5
+ mova [px+32*6+0], m6
+ mova [px+32*7+0], m7
+ movddup m7, [base+pw_m16384] ; edge sentinel
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movifnidn topq, topmp
+ movu m0, [topq+strideq*0]
+ movu m1, [topq+strideq*1]
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd m0, [topq+strideq*0-4]
+ movd m1, [topq+strideq*1-4]
+ movd [px-32*2-4], m0
+ movd [px-32*1-4], m1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+0], m7 ; fill missing rows with the sentinel
+ mova [px-32*1+0], m7
+.top_no_left:
+ movd [px-32*2-4], m7
+ movd [px-32*1-4], m7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ lea r3, [dstq+strideq*8]
+ movu m0, [r3+strideq*0]
+ movu m1, [r3+strideq*1]
+ mova [px+32*8+0], m0
+ mova [px+32*9+0], m1
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd m0, [r3+strideq*0-4]
+ movd m1, [r3+strideq*1-4]
+ movd [px+32*8-4], m0
+ movd [px+32*9-4], m1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*8+0], m7
+ mova [px+32*9+0], m7
+.bottom_no_left:
+ movd [px+32*8-4], m7
+ movd [px+32*9-4], m7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movifnidn leftq, r2mp
+ movd m0, [leftq+4*0] ; left edge: 2 pixels per row, 8 rows
+ movd m1, [leftq+4*1]
+ movd m2, [leftq+4*2]
+ movd m3, [leftq+4*3]
+ movd [px+32*0-4], m0
+ movd [px+32*1-4], m1
+ movd [px+32*2-4], m2
+ movd [px+32*3-4], m3
+ movd m0, [leftq+4*4]
+ movd m1, [leftq+4*5]
+ movd m2, [leftq+4*6]
+ movd m3, [leftq+4*7]
+ movd [px+32*4-4], m0
+ movd [px+32*5-4], m1
+ movd [px+32*6-4], m2
+ movd [px+32*7-4], m3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ; rows -2..9
+.padding_done:
+ CDEF_FILTER 4, 8
+
+%if ARCH_X86_64
+cglobal cdef_filter_8x8_16bpc, 4, 8, 9, 32*14, dst, stride, left, top, pri, sec, edge
+%else
+cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
+%endif
+ mov edged, r8m ; px/base/pri_shift %defines carry over from the 4x4 variant
+ LEA t0, dir_table
+ mova m0, [dstq+strideq*0+ 0] ; 8 pixels per row, plus...
+ movd m1, [dstq+strideq*0+16] ; ...2 right-neighbor pixels
+ mova m2, [dstq+strideq*1+ 0]
+ movd m3, [dstq+strideq*1+16]
+ lea t1, [dstq+strideq*2]
+ mova m4, [t1 +strideq*0+ 0]
+ movd m5, [t1 +strideq*0+16]
+ mova m6, [t1 +strideq*1+ 0]
+ movd m7, [t1 +strideq*1+16]
+ lea t1, [t1 +strideq*2]
+ mova [px+32*0+ 0], m0
+ movd [px+32*0+16], m1
+ mova [px+32*1+ 0], m2
+ movd [px+32*1+16], m3
+ mova [px+32*2+ 0], m4
+ movd [px+32*2+16], m5
+ mova [px+32*3+ 0], m6
+ movd [px+32*3+16], m7
+ mova m0, [t1 +strideq*0+ 0]
+ movd m1, [t1 +strideq*0+16]
+ mova m2, [t1 +strideq*1+ 0]
+ movd m3, [t1 +strideq*1+16]
+ lea t1, [t1 +strideq*2]
+ mova m4, [t1 +strideq*0+ 0]
+ movd m5, [t1 +strideq*0+16]
+ mova m6, [t1 +strideq*1+ 0]
+ movd m7, [t1 +strideq*1+16]
+ mova [px+32*4+ 0], m0
+ movd [px+32*4+16], m1
+ mova [px+32*5+ 0], m2
+ movd [px+32*5+16], m3
+ mova [px+32*6+ 0], m4
+ movd [px+32*6+16], m5
+ mova [px+32*7+ 0], m6
+ movd [px+32*7+16], m7
+ movddup m7, [base+pw_m16384] ; edge sentinel
+ test edgeb, 4 ; HAVE_TOP
+ jz .no_top
+ movifnidn topq, topmp
+ mova m0, [topq+strideq*0+ 0]
+ mova m1, [topq+strideq*0+16]
+ mova m2, [topq+strideq*1+ 0]
+ mova m3, [topq+strideq*1+16]
+ mova [px-32*2+ 0], m0
+ movd [px-32*2+16], m1 ; only 2 of the loaded right-side pixels are kept
+ mova [px-32*1+ 0], m2
+ movd [px-32*1+16], m3
+ test edgeb, 1 ; HAVE_LEFT
+ jz .top_no_left
+ movd m0, [topq+strideq*0-4]
+ movd m1, [topq+strideq*1-4]
+ movd [px-32*2-4], m0
+ movd [px-32*1-4], m1
+ jmp .top_done
+.no_top:
+ mova [px-32*2+ 0], m7 ; fill missing rows with the sentinel
+ movd [px-32*2+16], m7
+ mova [px-32*1+ 0], m7
+ movd [px-32*1+16], m7
+.top_no_left:
+ movd [px-32*2- 4], m7
+ movd [px-32*1- 4], m7
+.top_done:
+ test edgeb, 8 ; HAVE_BOTTOM
+ jz .no_bottom
+ lea r3, [dstq+strideq*8]
+ mova m0, [r3+strideq*0+ 0]
+ movd m1, [r3+strideq*0+16]
+ mova m2, [r3+strideq*1+ 0]
+ movd m3, [r3+strideq*1+16]
+ mova [px+32*8+ 0], m0
+ movd [px+32*8+16], m1
+ mova [px+32*9+ 0], m2
+ movd [px+32*9+16], m3
+ test edgeb, 1 ; HAVE_LEFT
+ jz .bottom_no_left
+ movd m0, [r3+strideq*0-4]
+ movd m1, [r3+strideq*1-4]
+ movd [px+32*8- 4], m0
+ movd [px+32*9- 4], m1
+ jmp .bottom_done
+.no_bottom:
+ mova [px+32*8+ 0], m7
+ movd [px+32*8+16], m7
+ mova [px+32*9+ 0], m7
+ movd [px+32*9+16], m7
+.bottom_no_left:
+ movd [px+32*8- 4], m7
+ movd [px+32*9- 4], m7
+.bottom_done:
+ test edgeb, 1 ; HAVE_LEFT
+ jz .no_left
+ movifnidn leftq, r2mp
+ movd m0, [leftq+4*0] ; left edge: 2 pixels per row, 8 rows
+ movd m1, [leftq+4*1]
+ movd m2, [leftq+4*2]
+ movd m3, [leftq+4*3]
+ movd [px+32*0- 4], m0
+ movd [px+32*1- 4], m1
+ movd [px+32*2- 4], m2
+ movd [px+32*3- 4], m3
+ movd m0, [leftq+4*4]
+ movd m1, [leftq+4*5]
+ movd m2, [leftq+4*6]
+ movd m3, [leftq+4*7]
+ movd [px+32*4- 4], m0
+ movd [px+32*5- 4], m1
+ movd [px+32*6- 4], m2
+ movd [px+32*7- 4], m3
+ jmp .left_done
+.no_left:
+ REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+ test edgeb, 2 ; HAVE_RIGHT
+ jnz .padding_done
+ REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ; rows -2..9
+.padding_done:
+ CDEF_FILTER 8, 8
+
%macro CDEF_DIR 0
%if ARCH_X86_64
cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax
diff --git a/src/x86/cdef_init_tmpl.c b/src/x86/cdef_init_tmpl.c
index 42d6cff..06a0d14 100644
--- a/src/x86/cdef_init_tmpl.c
+++ b/src/x86/cdef_init_tmpl.c
@@ -46,9 +46,9 @@
COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
+#if BITDEPTH == 8
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
-#if BITDEPTH == 8
c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
@@ -57,11 +57,9 @@
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
c->dir = BF(dav1d_cdef_dir, ssse3);
-#if BITDEPTH == 8
c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
-#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;