x86: Add high bitdepth cdef_filter SSSE3 asm
diff --git a/src/x86/cdef16_sse.asm b/src/x86/cdef16_sse.asm
index d1d46ea..1da520c 100644
--- a/src/x86/cdef16_sse.asm
+++ b/src/x86/cdef16_sse.asm
@@ -1,3 +1,5 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
 ; Copyright (c) 2017-2021, The rav1e contributors
 ; Copyright (c) 2021, Nathan Egge
 ; All rights reserved.
@@ -28,10 +30,33 @@
 
 SECTION_RODATA
 
+%macro DUP8 1-*
+    %rep %0
+        times 8 dw %1
+        %rotate 1
+    %endrep
+%endmacro
+
+pri_taps:  DUP8 4, 2, 3, 3
+dir_table: db  1 * 32 + 0,  2 * 32 + 0
+           db  1 * 32 + 0,  2 * 32 - 2
+           db -1 * 32 + 2, -2 * 32 + 4
+           db  0 * 32 + 2, -1 * 32 + 4
+           db  0 * 32 + 2,  0 * 32 + 4
+           db  0 * 32 + 2,  1 * 32 + 4
+           db  1 * 32 + 2,  2 * 32 + 4
+           db  1 * 32 + 0,  2 * 32 + 2
+           db  1 * 32 + 0,  2 * 32 + 0
+           db  1 * 32 + 0,  2 * 32 - 2
+           db -1 * 32 + 2, -2 * 32 + 4
+           db  0 * 32 + 2, -1 * 32 + 4
+
 dir_shift: times 4 dw 0x4000
            times 4 dw 0x1000
 
 pw_128:    times 4 dw 128
+pw_2048:   times 8 dw 2048
+pw_m16384: times 8 dw -16384
 
 cextern cdef_dir_8bpc_ssse3.main
 cextern cdef_dir_8bpc_sse4.main
@@ -47,6 +72,891 @@
 %endrep
 %endmacro
 
+%if ARCH_X86_32
+DECLARE_REG_TMP 5, 3
+%elif WIN64
+DECLARE_REG_TMP 7, 4
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%macro CDEF_FILTER 2 ; w, h
+%if ARCH_X86_64
+    DEFINE_ARGS dst, stride, tmp, pridmp, pri, sec, dir
+    mova            m8, [base+pw_2048]
+%else
+    DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir
+    %define         m8  [base+pw_2048]
+    %define         m9  [rsp+16*1+gprsize]
+    %define        m10  [rsp+16*2+gprsize]
+%endif
+    movifnidn     prid, r4m
+    movifnidn     secd, r5m
+    test          prid, prid
+    jz .sec_only
+    movd            m6, r4m
+%if ARCH_X86_32
+    mov       [rsp+24], pridmpd
+%endif
+    bsr        pridmpd, prid
+    lea           tmpd, [priq*4]
+    cmp      dword r9m, 0x3ff ; if (bpc == 10)
+    cmove         prid, tmpd  ;     pri <<= 2
+    mov           tmpd, r7m   ; damping
+    mov           dird, r6m
+    and           prid, 16
+    pshufb          m6, m7    ; splat
+    lea           dirq, [base+dir_table+dirq*2]
+    lea           priq, [base+pri_taps+priq*2]
+    test          secd, secd
+    jz .pri_only
+    mova         [rsp], m6
+    movd            m6, secd
+    bsr           secd, secd
+    sub        pridmpd, tmpd
+    sub           tmpd, secd
+    pshufb          m6, m7
+    xor           secd, secd
+    neg        pridmpd
+    cmovs      pridmpd, secd
+%if ARCH_X86_32
+    mov  [pri_shift+4], secd
+    mov  [sec_shift+4], secd
+%endif
+    mov  [pri_shift+0], pridmpq
+    mov  [sec_shift+0], tmpq
+    lea           tmpq, [px]
+%if WIN64
+    movaps         r4m, m9
+    movaps         r6m, m10
+%elif ARCH_X86_32
+    mov        pridmpd, [rsp+24]
+%endif
+%rep %1*%2/8
+    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec
+%endrep
+%if WIN64
+    movaps          m9, r4m
+    movaps         m10, r6m
+%endif
+    jmp .end
+.pri_only:
+    sub           tmpd, pridmpd
+    cmovs         tmpd, secd
+%if ARCH_X86_32
+    mov        pridmpd, [rsp+24]
+    mov  [pri_shift+4], secd
+%endif
+    mov  [pri_shift+0], tmpq
+    lea           tmpq, [px]
+%rep %1*%2/8
+    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri
+%endrep
+.end:
+    RET
+.sec_only:
+    mov           tmpd, r7m ; damping
+    movd            m6, r5m
+    bsr           secd, secd
+    mov           dird, r6m
+    pshufb          m6, m7
+    sub           tmpd, secd
+    lea           dirq, [base+dir_table+dirq*2]
+%if ARCH_X86_32
+    mov  [sec_shift+4], prid
+%endif
+    mov  [sec_shift+0], tmpq
+    lea           tmpq, [px]
+%rep %1*%2/8
+    call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec
+%endrep
+    jmp .end
+%if %1 == %2
+DEFINE_ARGS dst, stride, tmp, off, pri, _, dir
+ALIGN function_align
+.pri:
+    movsx         offq, byte [dirq+4]    ; off_k0
+%if %1 == 4
+    movq            m1, [dstq+strideq*0]
+    movhps          m1, [dstq+strideq*1]
+    movq            m2, [tmpq+offq+32*0] ; k0p0
+    movhps          m2, [tmpq+offq+32*1]
+    neg           offq
+    movq            m3, [tmpq+offq+32*0] ; k0p1
+    movhps          m3, [tmpq+offq+32*1]
+%else
+    mova            m1, [dstq]
+    movu            m2, [tmpq+offq]
+    neg           offq
+    movu            m3, [tmpq+offq]
+%endif
+    movsx         offq, byte [dirq+5]    ; off_k1
+    psubw           m2, m1               ; diff_k0p0
+    psubw           m3, m1               ; diff_k0p1
+    pabsw           m4, m2               ; adiff_k0p0
+    psrlw           m5, m4, [pri_shift+gprsize]
+    psubusw         m0, m6, m5
+    pabsw           m5, m3               ; adiff_k0p1
+    pminsw          m0, m4
+    psrlw           m4, m5, [pri_shift+gprsize]
+    psignw          m0, m2               ; constrain(diff_k0p0)
+    psubusw         m2, m6, m4
+    pminsw          m2, m5
+%if %1 == 4
+    movq            m4, [tmpq+offq+32*0] ; k1p0
+    movhps          m4, [tmpq+offq+32*1]
+    neg           offq
+    movq            m5, [tmpq+offq+32*0] ; k1p1
+    movhps          m5, [tmpq+offq+32*1]
+%else
+    movu            m4, [tmpq+offq]
+    neg           offq
+    movu            m5, [tmpq+offq]
+%endif
+    psubw           m4, m1               ; diff_k1p0
+    psubw           m5, m1               ; diff_k1p1
+    psignw          m2, m3               ; constrain(diff_k0p1)
+    pabsw           m3, m4               ; adiff_k1p0
+    paddw           m0, m2               ; constrain(diff_k0)
+    psrlw           m2, m3, [pri_shift+gprsize]
+    psubusw         m7, m6, m2
+    pabsw           m2, m5               ; adiff_k1p1
+    pminsw          m7, m3
+    psrlw           m3, m2, [pri_shift+gprsize]
+    psignw          m7, m4               ; constrain(diff_k1p0)
+    psubusw         m4, m6, m3
+    pminsw          m4, m2
+    psignw          m4, m5               ; constrain(diff_k1p1)
+    paddw           m7, m4               ; constrain(diff_k1)
+    pmullw          m0, [priq+16*0]      ; pri_tap_k0
+    pmullw          m7, [priq+16*1]      ; pri_tap_k1
+    paddw           m0, m7               ; sum
+    psraw           m2, m0, 15
+    paddw           m0, m2
+    pmulhrsw        m0, m8
+    paddw           m0, m1
+%if %1 == 4
+    add           tmpq, 32*2
+    movq   [dstq+strideq*0], m0
+    movhps [dstq+strideq*1], m0
+    lea           dstq, [dstq+strideq*2]
+%else
+    add           tmpq, 32
+    mova        [dstq], m0
+    add           dstq, strideq
+%endif
+    ret
+ALIGN function_align
+.sec:
+    movsx         offq, byte [dirq+8]    ; off1_k0
+%if %1 == 4
+    movq            m1, [dstq+strideq*0]
+    movhps          m1, [dstq+strideq*1]
+    movq            m2, [tmpq+offq+32*0] ; k0s0
+    movhps          m2, [tmpq+offq+32*1]
+    neg           offq
+    movq            m3, [tmpq+offq+32*0] ; k0s1
+    movhps          m3, [tmpq+offq+32*1]
+%else
+    mova            m1, [dstq]
+    movu            m2, [tmpq+offq]
+    neg           offq
+    movu            m3, [tmpq+offq]
+%endif
+    movsx         offq, byte [dirq+0]    ; off2_k0
+    psubw           m2, m1               ; diff_k0s0
+    psubw           m3, m1               ; diff_k0s1
+    pabsw           m4, m2               ; adiff_k0s0
+    psrlw           m5, m4, [sec_shift+gprsize]
+    psubusw         m0, m6, m5
+    pabsw           m5, m3               ; adiff_k0s1
+    pminsw          m0, m4
+    psrlw           m4, m5, [sec_shift+gprsize]
+    psignw          m0, m2               ; constrain(diff_k0s0)
+    psubusw         m2, m6, m4
+    pminsw          m2, m5
+%if %1 == 4
+    movq            m4, [tmpq+offq+32*0] ; k0s2
+    movhps          m4, [tmpq+offq+32*1]
+    neg           offq
+    movq            m5, [tmpq+offq+32*0] ; k0s3
+    movhps          m5, [tmpq+offq+32*1]
+%else
+    movu            m4, [tmpq+offq]
+    neg           offq
+    movu            m5, [tmpq+offq]
+%endif
+    movsx         offq, byte [dirq+9]    ; off1_k1
+    psubw           m4, m1               ; diff_k0s2
+    psubw           m5, m1               ; diff_k0s3
+    psignw          m2, m3               ; constrain(diff_k0s1)
+    pabsw           m3, m4               ; adiff_k0s2
+    paddw           m0, m2
+    psrlw           m2, m3, [sec_shift+gprsize]
+    psubusw         m7, m6, m2
+    pabsw           m2, m5               ; adiff_k0s3
+    pminsw          m7, m3
+    psrlw           m3, m2, [sec_shift+gprsize]
+    psignw          m7, m4               ; constrain(diff_k0s2)
+    psubusw         m4, m6, m3
+    pminsw          m4, m2
+%if %1 == 4
+    movq            m2, [tmpq+offq+32*0] ; k1s0
+    movhps          m2, [tmpq+offq+32*1]
+    neg           offq
+    movq            m3, [tmpq+offq+32*0] ; k1s1
+    movhps          m3, [tmpq+offq+32*1]
+%else
+    movu            m2, [tmpq+offq]
+    neg           offq
+    movu            m3, [tmpq+offq]
+%endif
+    movsx         offq, byte [dirq+1]    ; off2_k1
+    paddw           m0, m7
+    psignw          m4, m5               ; constrain(diff_k0s3)
+    paddw           m0, m4               ; constrain(diff_k0)
+    psubw           m2, m1               ; diff_k1s0
+    psubw           m3, m1               ; diff_k1s1
+    paddw           m0, m0               ; sec_tap_k0
+    pabsw           m4, m2               ; adiff_k1s0
+    psrlw           m5, m4, [sec_shift+gprsize]
+    psubusw         m7, m6, m5
+    pabsw           m5, m3               ; adiff_k1s1
+    pminsw          m7, m4
+    psrlw           m4, m5, [sec_shift+gprsize]
+    psignw          m7, m2               ; constrain(diff_k1s0)
+    psubusw         m2, m6, m4
+    pminsw          m2, m5
+%if %1 == 4
+    movq            m4, [tmpq+offq+32*0] ; k1s2
+    movhps          m4, [tmpq+offq+32*1]
+    neg           offq
+    movq            m5, [tmpq+offq+32*0] ; k1s3
+    movhps          m5, [tmpq+offq+32*1]
+%else
+    movu            m4, [tmpq+offq]
+    neg           offq
+    movu            m5, [tmpq+offq]
+%endif
+    paddw           m0, m7
+    psubw           m4, m1               ; diff_k1s2
+    psubw           m5, m1               ; diff_k1s3
+    psignw          m2, m3               ; constrain(diff_k1s1)
+    pabsw           m3, m4               ; adiff_k1s2
+    paddw           m0, m2
+    psrlw           m2, m3, [sec_shift+gprsize]
+    psubusw         m7, m6, m2
+    pabsw           m2, m5               ; adiff_k1s3
+    pminsw          m7, m3
+    psrlw           m3, m2, [sec_shift+gprsize]
+    psignw          m7, m4               ; constrain(diff_k1s2)
+    psubusw         m4, m6, m3
+    pminsw          m4, m2
+    paddw           m0, m7
+    psignw          m4, m5               ; constrain(diff_k1s3)
+    paddw           m0, m4               ; sum
+    psraw           m2, m0, 15
+    paddw           m0, m2
+    pmulhrsw        m0, m8
+    paddw           m0, m1
+%if %1 == 4
+    add           tmpq, 32*2
+    movq   [dstq+strideq*0], m0
+    movhps [dstq+strideq*1], m0
+    lea           dstq, [dstq+strideq*2]
+%else
+    add           tmpq, 32
+    mova        [dstq], m0
+    add           dstq, strideq
+%endif
+    ret
+ALIGN function_align
+.pri_sec:
+    movsx         offq, byte [dirq+8]    ; off2_k0
+%if %1 == 4
+    movq            m1, [dstq+strideq*0]
+    movhps          m1, [dstq+strideq*1]
+    movq            m2, [tmpq+offq+32*0] ; k0s0
+    movhps          m2, [tmpq+offq+32*1]
+    neg           offq
+    movq            m3, [tmpq+offq+32*0] ; k0s1
+    movhps          m3, [tmpq+offq+32*1]
+%else
+    mova            m1, [dstq]
+    movu            m2, [tmpq+offq]
+    neg           offq
+    movu            m3, [tmpq+offq]
+%endif
+    movsx         offq, byte [dirq+0]    ; off3_k0
+    pabsw           m4, m2
+%if ARCH_X86_64
+    pabsw          m10, m3
+    pmaxsw          m9, m2, m3
+    pminsw         m10, m4
+%else
+    pabsw           m7, m3
+    pmaxsw          m5, m2, m3
+    pminsw          m4, m7
+    mova            m9, m5
+    mova           m10, m4
+%endif
+    psubw           m2, m1               ; diff_k0s0
+    psubw           m3, m1               ; diff_k0s1
+    pabsw           m4, m2               ; adiff_k0s0
+    psrlw           m5, m4, [sec_shift+gprsize]
+    psubusw         m0, m6, m5
+    pabsw           m5, m3               ; adiff_k0s1
+    pminsw          m0, m4
+    psrlw           m4, m5, [sec_shift+gprsize]
+    psignw          m0, m2               ; constrain(diff_k0s0)
+    psubusw         m2, m6, m4
+    pminsw          m2, m5
+%if %1 == 4
+    movq            m4, [tmpq+offq+32*0] ; k0s2
+    movhps          m4, [tmpq+offq+32*1]
+    neg           offq
+    movq            m5, [tmpq+offq+32*0] ; k0s3
+    movhps          m5, [tmpq+offq+32*1]
+%else
+    movu            m4, [tmpq+offq]
+    neg           offq
+    movu            m5, [tmpq+offq]
+%endif
+    movsx         offq, byte [dirq+9]    ; off2_k1
+    pabsw           m7, m4
+    psignw          m2, m3               ; constrain(diff_k0s1)
+    pabsw           m3, m5
+%if ARCH_X86_64
+    pmaxsw          m9, m4
+    pminsw         m10, m7
+    pmaxsw          m9, m5
+    pminsw         m10, m3
+%else
+    pminsw          m7, m10
+    pminsw          m7, m3
+    pmaxsw          m3, m9, m4
+    pmaxsw          m3, m5
+    mova           m10, m7
+    mova            m9, m3
+%endif
+    psubw           m4, m1               ; diff_k0s2
+    psubw           m5, m1               ; diff_k0s3
+    paddw           m0, m2
+    pabsw           m3, m4               ; adiff_k0s2
+    psrlw           m2, m3, [sec_shift+gprsize]
+    psubusw         m7, m6, m2
+    pabsw           m2, m5               ; adiff_k0s3
+    pminsw          m7, m3
+    psrlw           m3, m2, [sec_shift+gprsize]
+    psignw          m7, m4               ; constrain(diff_k0s2)
+    psubusw         m4, m6, m3
+    pminsw          m4, m2
+%if %1 == 4
+    movq            m2, [tmpq+offq+32*0] ; k1s0
+    movhps          m2, [tmpq+offq+32*1]
+    neg           offq
+    movq            m3, [tmpq+offq+32*0] ; k1s1
+    movhps          m3, [tmpq+offq+32*1]
+%else
+    movu            m2, [tmpq+offq]
+    neg           offq
+    movu            m3, [tmpq+offq]
+%endif
+    movsx         offq, byte [dirq+1]    ; off3_k1
+    paddw           m0, m7
+    pabsw           m7, m2
+    psignw          m4, m5               ; constrain(diff_k0s3)
+    pabsw           m5, m3
+%if ARCH_X86_64
+    pmaxsw          m9, m2
+    pminsw         m10, m7
+    pmaxsw          m9, m3
+    pminsw         m10, m5
+%else
+    pminsw          m7, m10
+    pminsw          m7, m5
+    pmaxsw          m5, m9, m2
+    pmaxsw          m5, m3
+    mova           m10, m7
+    mova            m9, m5
+%endif
+    paddw           m0, m4               ; constrain(diff_k0)
+    psubw           m2, m1               ; diff_k1s0
+    psubw           m3, m1               ; diff_k1s1
+    paddw           m0, m0               ; sec_tap_k0
+    pabsw           m4, m2               ; adiff_k1s0
+    psrlw           m5, m4, [sec_shift+gprsize]
+    psubusw         m7, m6, m5
+    pabsw           m5, m3               ; adiff_k1s1
+    pminsw          m7, m4
+    psrlw           m4, m5, [sec_shift+gprsize]
+    psignw          m7, m2               ; constrain(diff_k1s0)
+    psubusw         m2, m6, m4
+    pminsw          m2, m5
+%if %1 == 4
+    movq            m4, [tmpq+offq+32*0] ; k1s2
+    movhps          m4, [tmpq+offq+32*1]
+    neg           offq
+    movq            m5, [tmpq+offq+32*0] ; k1s3
+    movhps          m5, [tmpq+offq+32*1]
+%else
+    movu            m4, [tmpq+offq]
+    neg           offq
+    movu            m5, [tmpq+offq]
+%endif
+    movsx         offq, byte [dirq+4]    ; off1_k0
+    paddw           m0, m7
+    pabsw           m7, m4
+    psignw          m2, m3               ; constrain(diff_k1s1)
+    pabsw           m3, m5
+%if ARCH_X86_64
+    pmaxsw          m9, m4
+    pminsw         m10, m7
+    pmaxsw          m9, m5
+    pminsw         m10, m3
+%else
+    pminsw          m7, m10
+    pminsw          m7, m3
+    pmaxsw          m3, m9, m4
+    pmaxsw          m3, m5
+    mova           m10, m7
+    mova            m9, m3
+%endif
+    psubw           m4, m1               ; diff_k1s2
+    psubw           m5, m1               ; diff_k1s3
+    pabsw           m3, m4               ; adiff_k1s2
+    paddw           m0, m2
+    psrlw           m2, m3, [sec_shift+gprsize]
+    psubusw         m7, m6, m2
+    pabsw           m2, m5               ; adiff_k1s3
+    pminsw          m7, m3
+    psrlw           m3, m2, [sec_shift+gprsize]
+    psignw          m7, m4               ; constrain(diff_k1s2)
+    psubusw         m4, m6, m3
+    pminsw          m4, m2
+    paddw           m0, m7
+%if %1 == 4
+    movq            m2, [tmpq+offq+32*0] ; k0p0
+    movhps          m2, [tmpq+offq+32*1]
+    neg           offq
+    movq            m3, [tmpq+offq+32*0] ; k0p1
+    movhps          m3, [tmpq+offq+32*1]
+%else
+    movu            m2, [tmpq+offq]
+    neg           offq
+    movu            m3, [tmpq+offq]
+%endif
+    movsx         offq, byte [dirq+5]    ; off1_k1
+    pabsw           m7, m2
+    psignw          m4, m5               ; constrain(diff_k1s3)
+    pabsw           m5, m3
+%if ARCH_X86_64
+    pmaxsw          m9, m2
+    pminsw         m10, m7
+    pmaxsw          m9, m3
+    pminsw         m10, m5
+%else
+    pminsw          m7, m10
+    pminsw          m7, m5
+    pmaxsw          m5, m9, m2
+    pmaxsw          m5, m3
+    mova           m10, m7
+    mova            m9, m5
+%endif
+    psubw           m2, m1               ; diff_k0p0
+    psubw           m3, m1               ; diff_k0p1
+    paddw           m0, m4
+    pabsw           m4, m2               ; adiff_k0p0
+    psrlw           m5, m4, [pri_shift+gprsize]
+    psubusw         m7, [rsp+gprsize], m5
+    pabsw           m5, m3               ; adiff_k0p1
+    pminsw          m7, m4
+    psrlw           m4, m5, [pri_shift+gprsize]
+    psignw          m7, m2               ; constrain(diff_k0p0)
+    psubusw         m2, [rsp+gprsize], m4
+    pminsw          m2, m5
+%if %1 == 4
+    movq            m4, [tmpq+offq+32*0] ; k1p0
+    movhps          m4, [tmpq+offq+32*1]
+    neg           offq
+    movq            m5, [tmpq+offq+32*0] ; k1p1
+    movhps          m5, [tmpq+offq+32*1]
+%else
+    movu            m4, [tmpq+offq]
+    neg           offq
+    movu            m5, [tmpq+offq]
+%endif
+    psignw          m2, m3               ; constrain(diff_k0p1)
+    pabsw           m3, m4
+    paddw           m7, m2               ; constrain(diff_k0)
+    pabsw           m2, m5
+%if ARCH_X86_64
+    pmaxsw          m9, m4
+    pminsw         m10, m3
+    pmaxsw          m9, m5
+    pminsw         m10, m2
+%else
+    pminsw          m3, m10
+    pminsw          m3, m2
+    pmaxsw          m2, m9, m4
+    pmaxsw          m2, m5
+    mova           m10, m3
+    mova            m9, m2
+%endif
+    psubw           m4, m1               ; diff_k1p0
+    psubw           m5, m1               ; diff_k1p1
+    pabsw           m3, m4               ; adiff_k1p0
+    pmullw          m7, [priq+16*0]      ; pri_tap_k0
+    paddw           m0, m7
+    psrlw           m2, m3, [pri_shift+gprsize]
+    psubusw         m7, [rsp+16*0+gprsize], m2
+    pabsw           m2, m5               ; adiff_k1p1
+    pminsw          m7, m3
+    psrlw           m3, m2, [pri_shift+gprsize]
+    psignw          m7, m4               ; constrain(diff_k1p0)
+    psubusw         m4, [rsp+16*0+gprsize], m3
+    pminsw          m4, m2
+    psignw          m4, m5               ; constrain(diff_k1p1)
+    paddw           m7, m4               ; constrain(diff_k1)
+    pmullw          m7, [priq+16*1]      ; pri_tap_k1
+    paddw           m0, m7               ; sum
+    psraw           m2, m0, 15
+    paddw           m0, m2
+    pmulhrsw        m0, m8
+    paddw           m0, m1
+%if ARCH_X86_64
+    pmaxsw          m9, m1
+    pminsw          m0, m9
+%else
+    pmaxsw          m2, m9, m1
+    pminsw          m0, m2
+%endif
+    pminsw          m1, m10
+    pmaxsw          m0, m1
+%if %1 == 4
+    add           tmpq, 32*2
+    movq   [dstq+strideq*0], m0
+    movhps [dstq+strideq*1], m0
+    lea           dstq, [dstq+strideq*2]
+%else
+    add           tmpq, 32
+    mova        [dstq], m0
+    add           dstq, strideq
+%endif
+    ret
+%endif
+%endmacro
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal cdef_filter_4x4_16bpc, 4, 8, 9, 32*10, dst, stride, left, top, pri, sec, edge
+    %define         px  rsp+32*4
+%else
+cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left
+    %define         px  rsp+32*5
+%endif
+    %define       base  t0-dir_table
+    %define  pri_shift  px-16*6
+    %define  sec_shift  px-16*5
+    mov          edged, r8m
+    LEA             t0, dir_table
+    movu            m0, [dstq+strideq*0]
+    movu            m1, [dstq+strideq*1]
+    lea             t1, [dstq+strideq*2]
+    movu            m2, [t1  +strideq*0]
+    movu            m3, [t1  +strideq*1]
+    movddup         m7, [base+pw_m16384]
+    mova   [px+32*0+0], m0
+    mova   [px+32*1+0], m1
+    mova   [px+32*2+0], m2
+    mova   [px+32*3+0], m3
+    test         edgeb, 4 ; HAVE_TOP
+    jz .no_top
+    movifnidn     topq, topmp
+    movu            m0, [topq+strideq*0]
+    movu            m1, [topq+strideq*1]
+    mova   [px-32*2+0], m0
+    mova   [px-32*1+0], m1
+    test         edgeb, 1 ; HAVE_LEFT
+    jz .top_no_left
+    movd            m0, [topq+strideq*0-4]
+    movd            m1, [topq+strideq*1-4]
+    movd   [px-32*2-4], m0
+    movd   [px-32*1-4], m1
+    jmp .top_done
+.no_top:
+    mova   [px-32*2+0], m7
+    mova   [px-32*1+0], m7
+.top_no_left:
+    movd   [px-32*2-4], m7
+    movd   [px-32*1-4], m7
+.top_done:
+    test         edgeb, 8 ; HAVE_BOTTOM
+    jz .no_bottom
+    lea             r3, [dstq+strideq*4]
+    movu            m0, [r3+strideq*0]
+    movu            m1, [r3+strideq*1]
+    mova   [px+32*4+0], m0
+    mova   [px+32*5+0], m1
+    test         edgeb, 1 ; HAVE_LEFT
+    jz .bottom_no_left
+    movd            m0, [r3+strideq*0-4]
+    movd            m1, [r3+strideq*1-4]
+    movd   [px+32*4-4], m0
+    movd   [px+32*5-4], m1
+    jmp .bottom_done
+.no_bottom:
+    mova   [px+32*4+0], m7
+    mova   [px+32*5+0], m7
+.bottom_no_left:
+    movd   [px+32*4-4], m7
+    movd   [px+32*5-4], m7
+.bottom_done:
+    test         edgeb, 1 ; HAVE_LEFT
+    jz .no_left
+    movifnidn    leftq, r2mp
+    movd            m0, [leftq+4*0]
+    movd            m1, [leftq+4*1]
+    movd            m2, [leftq+4*2]
+    movd            m3, [leftq+4*3]
+    movd   [px+32*0-4], m0
+    movd   [px+32*1-4], m1
+    movd   [px+32*2-4], m2
+    movd   [px+32*3-4], m3
+    jmp .left_done
+.no_left:
+    REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3
+.left_done:
+    test         edgeb, 2 ; HAVE_RIGHT
+    jnz .padding_done
+    REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5
+.padding_done:
+    CDEF_FILTER      4, 4
+
+%if ARCH_X86_64
+cglobal cdef_filter_4x8_16bpc, 4, 8, 9, 32*14, dst, stride, left, top, pri, sec, edge
+%else
+cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
+%endif
+    mov          edged, r8m
+    LEA             t0, dir_table
+    movu            m0, [dstq+strideq*0]
+    movu            m1, [dstq+strideq*1]
+    lea             t1, [dstq+strideq*2]
+    movu            m2, [t1  +strideq*0]
+    movu            m3, [t1  +strideq*1]
+    lea             t1, [t1  +strideq*2]
+    movu            m4, [t1  +strideq*0]
+    movu            m5, [t1  +strideq*1]
+    lea             t1, [t1  +strideq*2]
+    movu            m6, [t1  +strideq*0]
+    movu            m7, [t1  +strideq*1]
+    mova   [px+32*0+0], m0
+    mova   [px+32*1+0], m1
+    mova   [px+32*2+0], m2
+    mova   [px+32*3+0], m3
+    mova   [px+32*4+0], m4
+    mova   [px+32*5+0], m5
+    mova   [px+32*6+0], m6
+    mova   [px+32*7+0], m7
+    movddup         m7, [base+pw_m16384]
+    test         edgeb, 4 ; HAVE_TOP
+    jz .no_top
+    movifnidn     topq, topmp
+    movu            m0, [topq+strideq*0]
+    movu            m1, [topq+strideq*1]
+    mova   [px-32*2+0], m0
+    mova   [px-32*1+0], m1
+    test         edgeb, 1 ; HAVE_LEFT
+    jz .top_no_left
+    movd            m0, [topq+strideq*0-4]
+    movd            m1, [topq+strideq*1-4]
+    movd   [px-32*2-4], m0
+    movd   [px-32*1-4], m1
+    jmp .top_done
+.no_top:
+    mova   [px-32*2+0], m7
+    mova   [px-32*1+0], m7
+.top_no_left:
+    movd   [px-32*2-4], m7
+    movd   [px-32*1-4], m7
+.top_done:
+    test         edgeb, 8 ; HAVE_BOTTOM
+    jz .no_bottom
+    lea             r3, [dstq+strideq*8]
+    movu            m0, [r3+strideq*0]
+    movu            m1, [r3+strideq*1]
+    mova   [px+32*8+0], m0
+    mova   [px+32*9+0], m1
+    test         edgeb, 1 ; HAVE_LEFT
+    jz .bottom_no_left
+    movd            m0, [r3+strideq*0-4]
+    movd            m1, [r3+strideq*1-4]
+    movd   [px+32*8-4], m0
+    movd   [px+32*9-4], m1
+    jmp .bottom_done
+.no_bottom:
+    mova   [px+32*8+0], m7
+    mova   [px+32*9+0], m7
+.bottom_no_left:
+    movd   [px+32*8-4], m7
+    movd   [px+32*9-4], m7
+.bottom_done:
+    test         edgeb, 1 ; HAVE_LEFT
+    jz .no_left
+    movifnidn    leftq, r2mp
+    movd            m0, [leftq+4*0]
+    movd            m1, [leftq+4*1]
+    movd            m2, [leftq+4*2]
+    movd            m3, [leftq+4*3]
+    movd   [px+32*0-4], m0
+    movd   [px+32*1-4], m1
+    movd   [px+32*2-4], m2
+    movd   [px+32*3-4], m3
+    movd            m0, [leftq+4*4]
+    movd            m1, [leftq+4*5]
+    movd            m2, [leftq+4*6]
+    movd            m3, [leftq+4*7]
+    movd   [px+32*4-4], m0
+    movd   [px+32*5-4], m1
+    movd   [px+32*6-4], m2
+    movd   [px+32*7-4], m3
+    jmp .left_done
+.no_left:
+    REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+    test         edgeb, 2 ; HAVE_RIGHT
+    jnz .padding_done
+    REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+    CDEF_FILTER      4, 8
+
+%if ARCH_X86_64
+cglobal cdef_filter_8x8_16bpc, 4, 8, 9, 32*14, dst, stride, left, top, pri, sec, edge
+%else
+cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left
+%endif
+    mov          edged, r8m
+    LEA             t0, dir_table
+    mova            m0, [dstq+strideq*0+ 0]
+    movd            m1, [dstq+strideq*0+16]
+    mova            m2, [dstq+strideq*1+ 0]
+    movd            m3, [dstq+strideq*1+16]
+    lea             t1, [dstq+strideq*2]
+    mova            m4, [t1  +strideq*0+ 0]
+    movd            m5, [t1  +strideq*0+16]
+    mova            m6, [t1  +strideq*1+ 0]
+    movd            m7, [t1  +strideq*1+16]
+    lea             t1, [t1  +strideq*2]
+    mova  [px+32*0+ 0], m0
+    movd  [px+32*0+16], m1
+    mova  [px+32*1+ 0], m2
+    movd  [px+32*1+16], m3
+    mova  [px+32*2+ 0], m4
+    movd  [px+32*2+16], m5
+    mova  [px+32*3+ 0], m6
+    movd  [px+32*3+16], m7
+    mova            m0, [t1  +strideq*0+ 0]
+    movd            m1, [t1  +strideq*0+16]
+    mova            m2, [t1  +strideq*1+ 0]
+    movd            m3, [t1  +strideq*1+16]
+    lea             t1, [t1  +strideq*2]
+    mova            m4, [t1  +strideq*0+ 0]
+    movd            m5, [t1  +strideq*0+16]
+    mova            m6, [t1  +strideq*1+ 0]
+    movd            m7, [t1  +strideq*1+16]
+    mova  [px+32*4+ 0], m0
+    movd  [px+32*4+16], m1
+    mova  [px+32*5+ 0], m2
+    movd  [px+32*5+16], m3
+    mova  [px+32*6+ 0], m4
+    movd  [px+32*6+16], m5
+    mova  [px+32*7+ 0], m6
+    movd  [px+32*7+16], m7
+    movddup         m7, [base+pw_m16384]
+    test         edgeb, 4 ; HAVE_TOP
+    jz .no_top
+    movifnidn     topq, topmp
+    mova            m0, [topq+strideq*0+ 0]
+    mova            m1, [topq+strideq*0+16]
+    mova            m2, [topq+strideq*1+ 0]
+    mova            m3, [topq+strideq*1+16]
+    mova  [px-32*2+ 0], m0
+    movd  [px-32*2+16], m1
+    mova  [px-32*1+ 0], m2
+    movd  [px-32*1+16], m3
+    test         edgeb, 1 ; HAVE_LEFT
+    jz .top_no_left
+    movd            m0, [topq+strideq*0-4]
+    movd            m1, [topq+strideq*1-4]
+    movd   [px-32*2-4], m0
+    movd   [px-32*1-4], m1
+    jmp .top_done
+.no_top:
+    mova  [px-32*2+ 0], m7
+    movd  [px-32*2+16], m7
+    mova  [px-32*1+ 0], m7
+    movd  [px-32*1+16], m7
+.top_no_left:
+    movd  [px-32*2- 4], m7
+    movd  [px-32*1- 4], m7
+.top_done:
+    test         edgeb, 8 ; HAVE_BOTTOM
+    jz .no_bottom
+    lea             r3, [dstq+strideq*8]
+    mova            m0, [r3+strideq*0+ 0]
+    movd            m1, [r3+strideq*0+16]
+    mova            m2, [r3+strideq*1+ 0]
+    movd            m3, [r3+strideq*1+16]
+    mova  [px+32*8+ 0], m0
+    movd  [px+32*8+16], m1
+    mova  [px+32*9+ 0], m2
+    movd  [px+32*9+16], m3
+    test         edgeb, 1 ; HAVE_LEFT
+    jz .bottom_no_left
+    movd            m0, [r3+strideq*0-4]
+    movd            m1, [r3+strideq*1-4]
+    movd  [px+32*8- 4], m0
+    movd  [px+32*9- 4], m1
+    jmp .bottom_done
+.no_bottom:
+    mova  [px+32*8+ 0], m7
+    movd  [px+32*8+16], m7
+    mova  [px+32*9+ 0], m7
+    movd  [px+32*9+16], m7
+.bottom_no_left:
+    movd  [px+32*8- 4], m7
+    movd  [px+32*9- 4], m7
+.bottom_done:
+    test         edgeb, 1 ; HAVE_LEFT
+    jz .no_left
+    movifnidn    leftq, r2mp
+    movd            m0, [leftq+4*0]
+    movd            m1, [leftq+4*1]
+    movd            m2, [leftq+4*2]
+    movd            m3, [leftq+4*3]
+    movd  [px+32*0- 4], m0
+    movd  [px+32*1- 4], m1
+    movd  [px+32*2- 4], m2
+    movd  [px+32*3- 4], m3
+    movd            m0, [leftq+4*4]
+    movd            m1, [leftq+4*5]
+    movd            m2, [leftq+4*6]
+    movd            m3, [leftq+4*7]
+    movd  [px+32*4- 4], m0
+    movd  [px+32*5- 4], m1
+    movd  [px+32*6- 4], m2
+    movd  [px+32*7- 4], m3
+    jmp .left_done
+.no_left:
+    REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+.left_done:
+    test         edgeb, 2 ; HAVE_RIGHT
+    jnz .padding_done
+    REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+.padding_done:
+    CDEF_FILTER      8, 8
+
 %macro CDEF_DIR 0
 %if ARCH_X86_64
 cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax
diff --git a/src/x86/cdef_init_tmpl.c b/src/x86/cdef_init_tmpl.c
index 42d6cff..06a0d14 100644
--- a/src/x86/cdef_init_tmpl.c
+++ b/src/x86/cdef_init_tmpl.c
@@ -46,9 +46,9 @@
 COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
 
+#if BITDEPTH == 8
     if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
 
-#if BITDEPTH == 8
     c->fb[0] = BF(dav1d_cdef_filter_8x8, sse2);
     c->fb[1] = BF(dav1d_cdef_filter_4x8, sse2);
     c->fb[2] = BF(dav1d_cdef_filter_4x4, sse2);
@@ -57,11 +57,9 @@
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
 
     c->dir = BF(dav1d_cdef_dir, ssse3);
-#if BITDEPTH == 8
     c->fb[0] = BF(dav1d_cdef_filter_8x8, ssse3);
     c->fb[1] = BF(dav1d_cdef_filter_4x8, ssse3);
     c->fb[2] = BF(dav1d_cdef_filter_4x4, ssse3);
-#endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_SSE41)) return;