x86/itx: 32x16 inverse dct transforms hbd/sse4
diff --git a/src/x86/itx16_sse.asm b/src/x86/itx16_sse.asm
index 399ade9..61b4578 100644
--- a/src/x86/itx16_sse.asm
+++ b/src/x86/itx16_sse.asm
@@ -135,6 +135,8 @@
 
 tbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343
 
+tbl_32x16_2d: dw 0, 10, 36, 78
+
 tbl_Nx32_odd_offset: db 2*16, 2*23
                      db 2*20, 2*19
                      db 2*18, 2*21
@@ -3925,6 +3927,8 @@
     mova       [cq+1*64+r5], m1
     mova       [cq+2*64+r5], m2
     mova       [cq+3*64+r5], m3
+    pxor                 m0, m0
+    REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15
     sub                 r5d, 16
     jge .loop_pass1
 
@@ -4127,25 +4131,18 @@
     call m(idct_8x8_internal_16bpc).write_8x8
 %if ARCH_X86_64
     add                  r7, 16
+%define mzero m9
 %else
     add dword [rsp+2*gprsize+16*16], 16
-%endif
-    add                  cq, 64*4
-    dec                 r4d
-    jg .loop_pass2
-.zero:
-%if ARCH_X86_32
 %define mzero m7
     pxor                 m7, m7
-%else
-%define mzero m9
 %endif
-    REPX {mova [cq+x*16-64*8], mzero}, \
-                                  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
-                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, \
-                        32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, \
-                        48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
+    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+    add                  cq, 64*4
+    REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
 %undef mzero
+    dec                 r4d
+    jg .loop_pass2
 %if WIN64
     mov                  r7, [rsp+16*16+gprsize]
 %endif
@@ -4342,13 +4339,22 @@
     call m(idct_8x8_internal_16bpc).write_8x8
 %if ARCH_X86_64
     add                  r7, 16
+%define mzero m9
 %else
     add dword [rsp+2*gprsize+16*16], 16
+%define mzero m7
+    pxor                 m7, m7
 %endif
+    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
     add                  cq, 64*4
+    REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
+%undef mzero
     dec                 r4d
     jg .loop_pass2
-    jmp m(idct_16x16_internal_16bpc).zero
+%if WIN64
+    mov                  r7, [rsp+16*16+gprsize]
+%endif
+    RET
 
 INV_TXFM_16X16_FN flipadst, dct
 INV_TXFM_16X16_FN flipadst, adst
@@ -4535,8 +4541,6 @@
     add                 r5d, 4
     jmp .pass2_loop
 .end:
-    REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
-                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 %if WIN64
     mov                  r7, [rsp+16*16+gprsize]
 %endif
@@ -5283,10 +5287,11 @@
     add                 rsp, 32*16
     mova                 m0, [rsp+16* 3]
     call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
+%assign stack_size (stack_size-41*16)
 %if STACK_ALIGNMENT >= 16
 %assign stack_size_padded (stack_size_padded-41*16)
+%assign stack_offset (stack_offset-41*16)
 %else
-%assign stack_size (stack_size-41*16)
 %xdefine rstkm [rsp + stack_size]
 %endif
     RET
@@ -5988,6 +5993,7 @@
     mov                 r3d, 8
     add                 r5d, 10240
     sar                 r5d, 14
+.dconly2:
     imul                r5d, 2896
     add                 r5d, 34816
     movd                 m0, r5d
@@ -6011,3 +6017,337 @@
     dec                 r3d
     jg .dconly_loop
     RET
+
+cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
+                                         dst, stride, c, eob
+    LEA                  r6, base
+    test               eobd, eobd
+    jz .dconly
+
+    ; remove entirely-zero iterations
+%undef cmp
+    mov                 r5d, 8
+.zero_loop:
+    sub                 r5d, 2
+    cmp                eobw, word [o2(tbl_32x16_2d)+r5]
+    jl .zero_loop
+
+    ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+    mova                m11, [o(pd_2048)]
+    mova                m12, [o(clip_min)]
+    mova                m13, [o(clip_max)]
+    mova                m14, [o(pd_2896)]
+    pmulld               m0, m14, [cq+64* 1+r5*8]
+    pmulld               m1, m14, [cq+64* 7+r5*8]
+    pmulld               m2, m14, [cq+64* 9+r5*8]
+    pmulld               m3, m14, [cq+64*15+r5*8]
+    pmulld               m4, m14, [cq+64*17+r5*8]
+    pmulld               m5, m14, [cq+64*23+r5*8]
+    pmulld               m6, m14, [cq+64*25+r5*8]
+    pmulld               m7, m14, [cq+64*31+r5*8]
+    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+%else
+    mova                 m7, [o(pd_2896)]
+    pmulld               m0, m7, [cq+64* 1+r5*8]
+    pmulld               m1, m7, [cq+64* 7+r5*8]
+    pmulld               m2, m7, [cq+64* 9+r5*8]
+    pmulld               m3, m7, [cq+64*15+r5*8]
+    pmulld               m4, m7, [cq+64*17+r5*8]
+    pmulld               m5, m7, [cq+64*23+r5*8]
+    pmulld               m6, m7, [cq+64*25+r5*8]
+    pmulld               m7, [cq+64*31+r5*8]
+    mova              [rsp], m7
+    mova                 m7, [o(pd_2048)]
+    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    paddd                m7, [rsp]
+%endif
+    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+    mov                  r3, rsp
+    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+%if ARCH_X86_64
+    pmulld               m0, m14, [cq+64* 3+r5*8]
+    pmulld               m1, m14, [cq+64* 5+r5*8]
+    pmulld               m2, m14, [cq+64*11+r5*8]
+    pmulld               m3, m14, [cq+64*13+r5*8]
+    pmulld               m4, m14, [cq+64*19+r5*8]
+    pmulld               m5, m14, [cq+64*21+r5*8]
+    pmulld               m6, m14, [cq+64*27+r5*8]
+    pmulld               m7, m14, [cq+64*29+r5*8]
+    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+%else
+    mova                 m7, [o(pd_2896)]
+    pmulld               m0, m7, [cq+64* 3+r5*8]
+    pmulld               m1, m7, [cq+64* 5+r5*8]
+    pmulld               m2, m7, [cq+64*11+r5*8]
+    pmulld               m3, m7, [cq+64*13+r5*8]
+    pmulld               m4, m7, [cq+64*19+r5*8]
+    pmulld               m5, m7, [cq+64*21+r5*8]
+    pmulld               m6, m7, [cq+64*27+r5*8]
+    pmulld               m7, [cq+64*29+r5*8]
+    mova         [rsp+16*8], m7
+    mova                 m7, [o(pd_2048)]
+    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    paddd                m7, [rsp+16*8]
+%endif
+    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+    add                  r3, 16*(16+4*ARCH_X86_32)
+%if ARCH_X86_64
+    pmulld               m0, m14, [cq+64* 2+r5*8]
+    pmulld               m1, m14, [cq+64* 6+r5*8]
+    pmulld               m2, m14, [cq+64*10+r5*8]
+    pmulld               m3, m14, [cq+64*14+r5*8]
+    pmulld               m4, m14, [cq+64*18+r5*8]
+    pmulld               m5, m14, [cq+64*22+r5*8]
+    pmulld               m6, m14, [cq+64*26+r5*8]
+    pmulld               m7, m14, [cq+64*30+r5*8]
+    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+%else
+    mova                 m7, [o(pd_2896)]
+    pmulld               m0, m7, [cq+64* 2+r5*8]
+    pmulld               m1, m7, [cq+64* 6+r5*8]
+    pmulld               m2, m7, [cq+64*10+r5*8]
+    pmulld               m3, m7, [cq+64*14+r5*8]
+    pmulld               m4, m7, [cq+64*18+r5*8]
+    pmulld               m5, m7, [cq+64*22+r5*8]
+    pmulld               m6, m7, [cq+64*26+r5*8]
+    pmulld               m7, [cq+64*30+r5*8]
+    mova        [rsp+16*16], m7
+    mova                 m7, [o(pd_2048)]
+    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    paddd                m7, [rsp+16*16]
+%endif
+    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(idct_16x4_internal_16bpc).main_oddhalf
+%if ARCH_X86_64
+    pmulld               m0, m14, [cq+64* 0+r5*8]
+    pmulld               m1, m14, [cq+64* 4+r5*8]
+    pmulld               m2, m14, [cq+64* 8+r5*8]
+    pmulld               m3, m14, [cq+64*12+r5*8]
+    pmulld               m4, m14, [cq+64*16+r5*8]
+    pmulld               m5, m14, [cq+64*20+r5*8]
+    pmulld               m6, m14, [cq+64*24+r5*8]
+    pmulld               m7, m14, [cq+64*28+r5*8]
+    REPX     {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+%else
+    mova                 m7, [o(pd_2896)]
+    pmulld               m0, m7, [cq+64* 0+r5*8]
+    pmulld               m1, m7, [cq+64* 4+r5*8]
+    pmulld               m2, m7, [cq+64* 8+r5*8]
+    pmulld               m3, m7, [cq+64*12+r5*8]
+    pmulld               m4, m7, [cq+64*16+r5*8]
+    pmulld               m5, m7, [cq+64*20+r5*8]
+    pmulld               m6, m7, [cq+64*24+r5*8]
+    pmulld               m7, [cq+64*28+r5*8]
+    mova        [rsp+16*16], m7
+    mova                 m7, [o(pd_2048)]
+    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    paddd                m7, [rsp+16*16]
+%endif
+    REPX     {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(idct_8x4_internal_16bpc).main_pass1
+    call m(idct_8x4_internal_16bpc).round
+    sub                  r3, 16*(16+4*ARCH_X86_32)
+    call .round_dct32
+%if ARCH_X86_64
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova    [cq+64* 8+r5*8], m8
+    mova    [cq+64* 9+r5*8], m9
+    mova    [cq+64*10+r5*8], m10
+    mova    [cq+64*11+r5*8], m11
+    mova                 m8, [r3+16* 9] ;  8  9
+    mova                m10, [r3+16*11] ; 10 11
+    mova                m12, [r3+16*13] ; 12 13
+    mova                m14, [r3+16*15] ; 14 15
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova    [cq+64* 4+r5*8], m8
+    mova    [cq+64* 5+r5*8], m9
+    mova    [cq+64* 6+r5*8], m10
+    mova    [cq+64* 7+r5*8], m11
+    mova                 m8, [r3+16* 8] ; 24 25
+    mova                m10, [r3+16*10] ; 26 27
+    mova                m12, [r3+16*12] ; 28 29
+    mova                m14, [r3+16*14] ; 30 31
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova    [cq+64*12+r5*8], m8
+    mova    [cq+64*13+r5*8], m9
+    mova    [cq+64*14+r5*8], m10
+    mova    [cq+64*15+r5*8], m11
+%else
+    sub                  r3, 8*16
+    mova                 m0, [r3+ 8*16]
+    mova                 m2, [r3+10*16]
+    mova                 m4, [r3+12*16]
+    mova                 m6, [r3+14*16]
+    packssdw             m0, [r3+ 9*16]
+    packssdw             m2, [r3+11*16]
+    packssdw             m4, [r3+13*16]
+    packssdw             m6, [r3+15*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova    [cq+64* 4+r5*8], m0
+    mova    [cq+64* 5+r5*8], m1
+    mova    [cq+64* 6+r5*8], m2
+    mova    [cq+64* 7+r5*8], m3
+    mova                 m0, [r3+16*16]
+    mova                 m2, [r3+18*16]
+    mova                 m4, [r3+20*16]
+    mova                 m6, [r3+22*16]
+    packssdw             m0, [r3+17*16]
+    packssdw             m2, [r3+19*16]
+    packssdw             m4, [r3+21*16]
+    packssdw             m6, [r3+23*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova    [cq+64* 8+r5*8], m0
+    mova    [cq+64* 9+r5*8], m1
+    mova    [cq+64*10+r5*8], m2
+    mova    [cq+64*11+r5*8], m3
+    mova                 m0, [r3+31*16]
+    mova                 m2, [r3+29*16]
+    mova                 m4, [r3+27*16]
+    mova                 m6, [r3+25*16]
+    packssdw             m0, [r3+30*16]
+    packssdw             m2, [r3+28*16]
+    packssdw             m4, [r3+26*16]
+    packssdw             m6, [r3+24*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova    [cq+64*12+r5*8], m0
+    mova    [cq+64*13+r5*8], m1
+    mova    [cq+64*14+r5*8], m2
+    mova    [cq+64*15+r5*8], m3
+    mova                 m0, [r3+ 0*16]
+    mova                 m2, [r3+ 2*16]
+    mova                 m4, [r3+ 4*16]
+    mova                 m6, [r3+ 6*16]
+    packssdw             m0, [r3+ 1*16]
+    packssdw             m2, [r3+ 3*16]
+    packssdw             m4, [r3+ 5*16]
+    packssdw             m6, [r3+ 7*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+    mova    [cq+64* 0+r5*8], m0
+    mova    [cq+64* 1+r5*8], m1
+    mova    [cq+64* 2+r5*8], m2
+    mova    [cq+64* 3+r5*8], m3
+    pxor                 m0, m0
+    REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
+                                    24, 25, 26, 27, 28, 29, 30, 31
+    sub                 r5d, 2
+    jge .loop_pass1
+
+    ; pass=2, we need to call this otherwise the stack pointer has
+    ; the wrong offset in the 8-bit code
+    call .pass2
+    RET
+
+.pass2:
+%if ARCH_X86_64
+    mova                 m8, [o(pw_2048)]
+    pxor                 m9, m9
+    mova                m10, [o(pixel_10bpc_max)]
+%if WIN64
+    mov [rsp+16*16+gprsize], r7
+%endif
+    mov                  r7, dstq
+%else
+    mov [rsp+2*gprsize+16*16], dstq
+%endif
+    lea                  r3, [strideq*3]
+    mov                 r4d, 4
+    jmp m(idct_16x16_internal_16bpc).loop_pass2
+
+.round_dct32:
+%if ARCH_X86_64
+    psrld               m11, 11 ; pd_1
+    IDCT32_END            0, 15, 8, 9, 10, 1    ; 0 15 16 31
+    mova         [r3+ 0*16], m6
+    mova         [r3+23*16], m7
+    IDCT32_END            1, 14, 6, 7, 10, 1    ; 1 14 17 30
+    packssdw             m0, m1       ;  0  1
+    packssdw            m14, m15      ; 14 15
+    packssdw             m8, m6       ; 16 17
+    packssdw             m7, m9       ; 30 31
+    mova         [r3+16*15], m14
+    mova         [r3+16*14], m7
+    IDCT32_END            2, 15, 10, 7, 6, 1    ; 2 13 18 29
+    IDCT32_END            3, 14,  1, 9, 6, 1    ; 3 12 19 28
+    packssdw             m2, m3       ;  2  3
+    packssdw            m14, m15      ; 12 13
+    packssdw            m10, m1       ; 18 19
+    packssdw             m9, m7       ; 28 29
+    mova         [r3+16*13], m14
+    mova         [r3+16*12], m9
+    IDCT32_END            4, 15, 1, 7, 6, 1     ; 4 11 20 27
+    IDCT32_END            5, 14, 3, 9, 6, 1     ; 5 10 21 26
+    packssdw             m4, m5       ;  4  5
+    packssdw            m14, m15      ; 10 11
+    packssdw             m1, m3       ; 20 21
+    packssdw             m9, m7       ; 26 27
+    mova         [r3+16*11], m14
+    mova         [r3+16*10], m9
+    mova                 m6, [r3+ 0*16]
+    mova                 m7, [r3+23*16]
+    IDCT32_END            6, 15, 14, 5,  3, 1   ; 6 9 22 25
+    IDCT32_END            7, 11,  3, 9, 13, 1   ; 7 8 23 24
+    packssdw             m6, m7       ;  6  7
+    packssdw            m11, m15      ;  8  9
+    packssdw            m14, m3       ; 22 23
+    packssdw             m9, m5       ; 24 25
+    mova          [r3+16*9], m11
+    mova          [r3+16*8], m9
+    mova                m12, m1
+    ret
+%else
+    mova         [r3+16*16], m0
+    mova         [r3+17*16], m1
+    mova         [r3+18*16], m2
+    mova         [r3+19*16], m3
+    mova         [r3+20*16], m4
+    mova         [r3+21*16], m5
+    mova         [r3+22*16], m6
+    mova         [r3+23*16], m7
+    pcmpeqd              m1, m1     ; -1
+    mova                 m2, [o(clip_min)]
+    mova                 m3, [o(clip_max)]
+
+    mov                  r4, 15*16
+.loop_dct32_end:
+    mova                 m0, [r3+16*16]
+    mova                 m6, [r3+16*24]
+    psubd                m5, m0, m6 ; idct16 out15 - n
+    paddd                m0, m6     ; idct16 out0  + n
+    pmaxsd               m0, m2
+    pmaxsd               m5, m2
+    pminsd               m0, m3
+    pminsd               m5, m3
+    psubd                m0, m1
+    psubd                m5, m1
+    mova                 m7, [r3]
+    mova                 m4, [r3+r4]
+    psubd                m6, m0, m4 ; out31 - n
+    paddd                m0, m4     ; out0  + n
+    paddd                m4, m5, m7 ; out15 - n
+    psubd                m5, m7     ; out16 + n
+    REPX       {psrad x, 1}, m0, m5, m4, m6
+    mova               [r3], m0
+    mova            [r3+r4], m4
+    mova         [r3+16*16], m5
+    mova         [r3+24*16], m6
+    add                  r3, 16
+    sub                  r4, 32
+    jg .loop_dct32_end
+    ret
+%endif
+
+.dconly:
+    imul                r5d, [cq], 2896
+    mov                [cq], eobd ; 0
+    mov                 r3d, 16
+    add                 r5d, 2048
+    sar                 r5d, 12
+    imul                r5d, 2896
+    add                 r5d, 6144
+    sar                 r5d, 13
+    jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
diff --git a/src/x86/itx_init_tmpl.c b/src/x86/itx_init_tmpl.c
index ce4b582..b1d31b0 100644
--- a/src/x86/itx_init_tmpl.c
+++ b/src/x86/itx_init_tmpl.c
@@ -168,7 +168,7 @@
         assign_itx2_fn (R,  8, 32, sse4);
         assign_itx2_fn (R, 32,  8, sse4);
         assign_itx2_fn (R, 16, 32, sse4);
-        assign_itx_fn(R, 32, 16, identity_identity, IDTX, sse4);
+        assign_itx2_fn (R, 32, 16, sse4);
         assign_itx_fn(, 32, 32, identity_identity, IDTX, sse4);
     }
 #endif