x86: add AVX2 versions for filmgrain.fguv_32x32xn[422/444]

fguv_32x32xn_8bpc_420_csfl0_c: 14568.2
fguv_32x32xn_8bpc_420_csfl0_avx2: 940.2
fguv_32x32xn_8bpc_420_csfl1_c: 10682.0
fguv_32x32xn_8bpc_420_csfl1_avx2: 783.3
fguv_32x32xn_8bpc_422_csfl0_c: 16370.5
fguv_32x32xn_8bpc_422_csfl0_avx2: 1557.3
fguv_32x32xn_8bpc_422_csfl1_c: 11333.8
fguv_32x32xn_8bpc_422_csfl1_avx2: 902.1
fguv_32x32xn_8bpc_444_csfl0_c: 12950.1
fguv_32x32xn_8bpc_444_csfl0_avx2: 822.9
fguv_32x32xn_8bpc_444_csfl1_c: 8806.7
fguv_32x32xn_8bpc_444_csfl1_avx2: 708.2
diff --git a/src/x86/film_grain.asm b/src/x86/film_grain.asm
index 1e29136..94ee123 100644
--- a/src/x86/film_grain.asm
+++ b/src/x86/film_grain.asm
@@ -28,6 +28,8 @@
 %if ARCH_X86_64
 
 SECTION_RODATA 32
+pb_8x_27_17_8x_17_27: times 8 db 27, 17 ; v-overlap blend weights, 1st row: top*27 + cur*17 (used when ss_ver == 0)
+                      times 8 db 17, 27 ; v-overlap blend weights, 2nd row: top*17 + cur*27 (used when ss_ver == 0)
 pw_1024: times 16 dw 1024
 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
@@ -1457,8 +1459,9 @@
 .end_hv:
     RET
 
-cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
-                                      grain_lut, h, sby, luma, lstride, uv_pl, is_id
+%macro FGUV_FN 3 ; name (420/422/444), ss_hor, ss_ver (1 = chroma subsampled in that direction)
+cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+                                     grain_lut, h, sby, luma, lstride, uv_pl, is_id
     pcmpeqw         m10, m10
     psrld           m10, 24
     mov             r7d, [fg_dataq+FGData.scaling_shift]
@@ -1474,7 +1477,7 @@
     cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
     jne .csfl
 
-%macro FGUV_32x32xN_LOOP 1 ; not-csfl
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
     DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
 
 %if %1
@@ -1485,7 +1488,11 @@
     vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r7*4]
 %else
     vpbroadcastd    m14, [pw_1024]
+%if %2
     vpbroadcastd    m15, [pb_23_22]
+%else
+    vpbroadcastd   xm15, [pb_27_17_17_27]
+%endif
 %endif
 
     mov        overlapd, [fg_dataq+FGData.overlap_flag]
@@ -1507,7 +1514,7 @@
     mov           lumaq, r9mp
     lea             r12, [srcq+wq]
     lea             r13, [dstq+wq]
-    lea             r14, [lumaq+wq*2]
+    lea             r14, [lumaq+wq*(1+%2)]
     mov           r11mp, r12
     mov           r12mp, r13
     mov        lstrideq, r10mp
@@ -1528,8 +1535,8 @@
     rorx          offyd, seed, 8
     shr           offxd, 12
     and           offyd, 0xf
-    imul          offyd, 82
-    lea           offyq, [offyq+offxq+498]  ; offy*stride+offx
+    imul          offyd, 164>>%3
+    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
 
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 h, offxy, see, overlap, unused1, unused2, lstride
@@ -1538,21 +1545,29 @@
     mov      grain_lutq, grain_lutmp
 %%loop_y:
     ; src
+%if %2
     mova            xm4, [lumaq+lstrideq*0+ 0]
     mova            xm6, [lumaq+lstrideq*0+16]
     mova            xm0, [srcq]
     vpbroadcastd     m7, [pb_1]
-    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
-    vinserti128      m6, [lumaq+lstrideq*2+16], 1
+    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
+    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
     vinserti128      m0, [srcq+strideq], 1
     pxor             m2, m2
     pmaddubsw        m4, m7
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    pxor             m2, m2
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -1564,6 +1579,9 @@
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     punpckhwd        m5, m4, m2
@@ -1592,8 +1610,12 @@
     punpcklbw        m0, m2                 ; m0-1: src as word
 
     ; grain = grain_lut[offy+y][offx+x]
+%if %2
     movu            xm3, [grain_lutq+offxyq+ 0]
     vinserti128      m3, [grain_lutq+offxyq+82], 1
+%else
+    movu             m3, [grain_lutq+offxyq]
+%endif
     pcmpgtb          m7, m2, m3
     punpcklbw        m2, m3, m7
     punpckhbw        m3, m7
@@ -1612,21 +1634,31 @@
     pminsw           m0, m12
     pminsw           m1, m12
     packuswb         m0, m1
+%if %2
     mova         [dstq], xm0
     vextracti128 [dstq+strideq], m0, 1
+%else
+    mova         [dstq], m0
+%endif
 
+%if %2
     lea            srcq, [srcq+strideq*2]
     lea            dstq, [dstq+strideq*2]
-    lea           lumaq, [lumaq+lstrideq*4]
-    add      grain_lutq, 82*2
-    sub              hb, 2
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+    add            srcq, strideq
+    add            dstq, strideq
+    add           lumaq, lstrideq
+%endif
+    add      grain_lutq, 82<<%2
+    sub              hb, 1+%2
     jg %%loop_y
 
-    add              wq, 16
+    add              wq, 32>>%2
     jge %%end
     mov            srcq, r11mp
     mov            dstq, r12mp
-    lea           lumaq, [r14+wq*2]
+    lea           lumaq, [r14+wq*(1+%2)]
     add            srcq, wq
     add            dstq, wq
     test       overlapd, overlapd
@@ -1648,13 +1680,13 @@
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 offx, offy, see, left_offxy, unused1, unused2, lstride
 
-    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
+    lea     left_offxyd, [offyd+(32>>%2)]         ; previous column's offy*stride+offx
     mov           offxd, seed
     rorx          offyd, seed, 8
     shr           offxd, 12
     and           offyd, 0xf
-    imul          offyd, 82
-    lea           offyq, [offyq+offxq+498]  ; offy*stride+offx
+    imul          offyd, 164>>%3
+    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
 
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 h, offxy, see, left_offxy, unused1, unused2, lstride
@@ -1663,21 +1695,29 @@
     mov      grain_lutq, grain_lutmp
 %%loop_y_h_overlap:
     ; src
+%if %2
     mova            xm4, [lumaq+lstrideq*0+ 0]
     mova            xm6, [lumaq+lstrideq*0+16]
     mova            xm0, [srcq]
     vpbroadcastd     m7, [pb_1]
-    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
-    vinserti128      m6, [lumaq+lstrideq*2+16], 1
+    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
+    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
     vinserti128      m0, [srcq+strideq], 1
     pxor             m2, m2
     pmaddubsw        m4, m7
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+    pxor             m2, m2
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -1689,6 +1729,9 @@
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     punpckhwd        m5, m4, m2
@@ -1717,6 +1760,7 @@
     punpcklbw        m0, m2                 ; m0-1: src as word
 
     ; grain = grain_lut[offy+y][offx+x]
+%if %2
 %if %1
     vpbroadcastd     m6, [pb_23_22] ; FIXME
 %endif
@@ -1736,6 +1780,25 @@
     pcmpeqw          m6, m6 ; FIXME
     psrldq           m6, 15 ; FIXME
     vpblendvb        m3, m3, m4, m6
+%else
+%if %1
+    vpbroadcastd    xm6, [pb_27_17_17_27]
+%endif
+    movu             m3, [grain_lutq+offxyq]
+    movd            xm4, [grain_lutq+left_offxyq]
+    punpcklbw       xm4, xm3
+%if %1
+    pmaddubsw       xm4, xm6, xm4
+    pmulhrsw        xm4, [pw_1024]
+%else
+    pmaddubsw       xm4, xm15, xm4
+    pmulhrsw        xm4, xm14
+%endif
+    packsswb        xm4, xm4               ; blended left-edge pixels back to bytes
+    pcmpeqw         xm6, xm6               ; all-ones ...
+    psrldq          xm6, 14                ; ... shifted so only the low 2 bytes stay set
+    vpblendvb        m3, m3, m4, m6        ; replace the first 2 grain bytes with the blended ones
+%endif
     pcmpgtb          m7, m2, m3
     punpcklbw        m2, m3, m7
     punpckhbw        m3, m7
@@ -1754,21 +1817,31 @@
     pminsw           m0, m12
     pminsw           m1, m12
     packuswb         m0, m1
+%if %2
     mova         [dstq], xm0
     vextracti128 [dstq+strideq], m0, 1
+%else
+    mova         [dstq], m0
+%endif
 
+%if %2
     lea            srcq, [srcq+strideq*2]
     lea            dstq, [dstq+strideq*2]
-    lea           lumaq, [lumaq+lstrideq*4]
-    add      grain_lutq, 82*2
-    sub              hb, 2
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+    add            srcq, strideq
+    add            dstq, strideq
+    add           lumaq, lstrideq
+%endif
+    add      grain_lutq, 82*(1+%2)
+    sub              hb, 1+%2
     jg %%loop_y_h_overlap
 
-    add              wq, 16
+    add              wq, 32>>%2
     jge %%end
     mov            srcq, r11mp
     mov            dstq, r12mp
-    lea           lumaq, [r14+wq*2]
+    lea           lumaq, [r14+wq*(1+%2)]
     add            srcq, wq
     add            dstq, wq
 
@@ -1801,7 +1874,7 @@
     mov           lumaq, r9mp
     lea             r12, [srcq+wq]
     lea             r13, [dstq+wq]
-    lea             r14, [lumaq+wq*2]
+    lea             r14, [lumaq+wq*(1+%2)]
     mov           r11mp, r12
     mov           r12mp, r13
     mov        lstrideq, r10mp
@@ -1828,9 +1901,9 @@
     rorx          offxd, seed, 12
     and           offyd, 0xf000f
     and           offxd, 0xf000f
-    imul          offyd, 82
+    imul          offyd, 164>>%3
     ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
-    lea           offyq, [offyq+offxq+0x10001*498+16*82]
+    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
 
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 h, offxy, see, overlap, top_offxy, unused, lstride
@@ -1840,23 +1913,34 @@
 
     mov              hd, hm
     mov      grain_lutq, grain_lutmp
+%if %2 == 0
+    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27]
+%endif
 %%loop_y_v_overlap:
     ; src
+%if %2
     mova            xm4, [lumaq+lstrideq*0+ 0]
     mova            xm6, [lumaq+lstrideq*0+16]
     mova            xm0, [srcq]
     vpbroadcastd     m7, [pb_1]
-    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
-    vinserti128      m6, [lumaq+lstrideq*2+16], 1
+    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
+    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
     vinserti128      m0, [srcq+strideq], 1
     pxor             m2, m2
     pmaddubsw        m4, m7
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+    pxor             m2, m2
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -1868,6 +1952,9 @@
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     punpckhwd        m5, m4, m2
@@ -1891,11 +1978,42 @@
     packusdw         m8, m4
     packusdw         m5, m6
 
+%if %2
     ; unpack chroma_source
     punpckhbw        m1, m0, m2
     punpcklbw        m0, m2                 ; m0-1: src as word
+%endif
 
     ; grain = grain_lut[offy+y][offx+x]
+%if %3 == 0
+%if %2
+    mova             m6, [pb_8x_27_17_8x_17_27]
+    movu            xm3, [grain_lutq+offxyq]
+    movu            xm4, [grain_lutq+top_offxyq]
+    vinserti128      m3, [grain_lutq+offxyq+82], 1
+    vinserti128      m4, [grain_lutq+top_offxyq+82], 1
+%else
+    movu             m3, [grain_lutq+offxyq]
+    movu             m4, [grain_lutq+top_offxyq]
+%endif
+    punpckhbw        m9, m4, m3
+    punpcklbw        m4, m3
+%if %2
+    pmaddubsw        m9, m6, m9
+    pmaddubsw        m4, m6, m4
+%else
+    pmaddubsw        m9, m1, m9
+    pmaddubsw        m4, m1, m4
+%endif
+%if %1
+    pmulhrsw         m9, [pw_1024]
+    pmulhrsw         m4, [pw_1024]
+%else
+    pmulhrsw         m9, m14
+    pmulhrsw         m4, m14
+%endif
+    packsswb         m3, m4, m9
+%else
 %if %1
     vpbroadcastd     m6, [pb_23_22]
 %endif
@@ -1915,6 +2033,7 @@
     vpermq           m4, m4, q3120
     ; only interpolate first line, insert second line unmodified
     vinserti128      m3, m4, [grain_lutq+offxyq+82], 1
+%endif
     pcmpgtb          m7, m2, m3
     punpcklbw        m2, m3, m7
     punpckhbw        m3, m7
@@ -1926,6 +2045,7 @@
     pmulhrsw         m3, m11
 
     ; dst = clip_pixel(src, noise)
+%if %2
     paddw            m0, m2
     paddw            m1, m3
     pmaxsw           m0, m13
@@ -1935,21 +2055,46 @@
     packuswb         m0, m1
     mova         [dstq], xm0
     vextracti128 [dstq+strideq], m0, 1
+%else
+    pxor             m6, m6
+    punpckhbw        m9, m0, m6
+    punpcklbw        m0, m6                 ; m0/m9: src as word (m1 holds the v-overlap weights)
 
-    sub              hb, 2
+    paddw            m0, m2
+    paddw            m9, m3
+    pmaxsw           m0, m13
+    pmaxsw           m9, m13
+    pminsw           m0, m12
+    pminsw           m9, m12
+    packuswb         m0, m9
+    mova         [dstq], m0
+%endif
+
+    sub              hb, 1+%2
     jl %%end_y_v_overlap
+%if %2
     lea            srcq, [srcq+strideq*2]
     lea            dstq, [dstq+strideq*2]
-    lea           lumaq, [lumaq+lstrideq*4]
-    add      grain_lutq, 82*2
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+    add            srcq, strideq
+    add            dstq, strideq
+    add           lumaq, lstrideq
+%endif
+    add      grain_lutq, 82<<%2
+%if %2 == 0
+    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27+16]
+    btc              hd, 16
+    jnc %%loop_y_v_overlap
+%endif
     jmp %%loop_y
 
 %%end_y_v_overlap:
-    add              wq, 16
+    add              wq, 32>>%2
     jge %%end_hv
     mov            srcq, r11mp
     mov            dstq, r12mp
-    lea           lumaq, [r14+wq*2]
+    lea           lumaq, [r14+wq*(1+%2)]
     add            srcq, wq
     add            dstq, wq
 
@@ -1974,15 +2119,15 @@
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
 
-    lea  topleft_offxyq, [top_offxyq+16]
-    lea     left_offxyq, [offyq+16]
+    lea  topleft_offxyq, [top_offxyq+(32>>%2)]
+    lea     left_offxyq, [offyq+(32>>%2)]
     rorx          offyd, seed, 8
     rorx          offxd, seed, 12
     and           offyd, 0xf000f
     and           offxd, 0xf000f
-    imul          offyd, 82
+    imul          offyd, 164>>%3
     ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
-    lea           offyq, [offyq+offxq+0x10001*498+16*82]
+    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
 
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
@@ -1992,23 +2137,34 @@
 
     mov              hd, hm
     mov      grain_lutq, grain_lutmp
+%if %2 == 0
+    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27]
+%endif
 %%loop_y_hv_overlap:
     ; src
+%if %2
     mova            xm4, [lumaq+lstrideq*0+ 0]
     mova            xm6, [lumaq+lstrideq*0+16]
     mova            xm0, [srcq]
     vpbroadcastd     m7, [pb_1]
-    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
-    vinserti128      m6, [lumaq+lstrideq*2+16], 1
+    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
+    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
     vinserti128      m0, [srcq+strideq], 1
     pxor             m2, m2
     pmaddubsw        m4, m7
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+    pxor             m2, m2
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -2020,6 +2176,9 @@
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     punpckhwd        m5, m4, m2
@@ -2043,44 +2202,94 @@
     packusdw         m8, m4
     packusdw         m5, m6
 
+%if %2
     ; unpack chroma source
     punpckhbw        m1, m0, m2
     punpcklbw        m0, m2                 ; m0-1: src as word
+%endif
 
     ; grain = grain_lut[offy+y][offx+x]
 %if %1
+%if %2
     vpbroadcastd     m9, [pb_23_22]
+%else
+    vpbroadcastd    xm9, [pb_27_17_17_27]
 %endif
+%endif
+
+%if %2
     movu            xm3, [grain_lutq+offxyq]
+%if %3
     movq            xm6, [grain_lutq+top_offxyq]
+%else
+    movu            xm6, [grain_lutq+top_offxyq]
+%endif
     vinserti128      m3, [grain_lutq+offxyq+82], 1
+%if %3
     vinserti128      m6, [grain_lutq+top_offxyq+8], 1
+%else
+    vinserti128      m6, [grain_lutq+top_offxyq+82], 1
+%endif
+%else
+    movu             m3, [grain_lutq+offxyq]
+    movu             m6, [grain_lutq+top_offxyq]
+%endif
     movd            xm4, [grain_lutq+left_offxyq]
     movd            xm7, [grain_lutq+topleft_offxyq]
+%if %2
     vinserti128      m4, [grain_lutq+left_offxyq+82], 1
+%if %3 == 0
+    vinserti128      m7, [grain_lutq+topleft_offxyq+82], 1
+%endif
+%endif
+
     ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+%if %2
     punpcklbw        m4, m3
+%if %3
     punpcklbw       xm7, xm6
+%else
+    punpcklbw        m7, m6
+%endif
+    punpcklwd        m4, m7
 %if %1
     pmaddubsw        m4, m9, m4
-    pmaddubsw       xm7, xm9, xm7
     pmulhrsw         m4, [pw_1024]
-    pmulhrsw        xm7, [pw_1024]
 %else
     pmaddubsw        m4, m15, m4
-    pmaddubsw       xm7, xm15, xm7
     pmulhrsw         m4, m14
-    pmulhrsw        xm7, xm14
 %endif
     packsswb         m4, m4
-    packsswb        xm7, xm7
     pcmpeqw          m9, m9                 ; this is kind of ugly
     psrldq           m9, 15
     vpblendvb        m3, m3, m4, m9
-    shufpd           m9, m9, m9, 1110b
-    vpblendvb        m6, m6, m7, m9
-    vpermq           m9, m3, q3120
+    psrldq           m4, 1
+%if %3
+    shufpd           m9, m9, m9, 1110b      ; clear upper lane
+%endif
+    vpblendvb        m6, m6, m4, m9
+%else
+    punpcklbw       xm4, xm3
+    punpcklbw       xm7, xm6
+    punpckldq       xm4, xm7
+%if %1
+    pmaddubsw       xm4, xm9, xm4
+    pmulhrsw        xm4, [pw_1024]
+%else
+    pmaddubsw       xm4, xm15, xm4
+    pmulhrsw        xm4, xm14
+%endif
+    packsswb        xm4, xm4
+    pcmpeqw         xm9, xm9                 ; all-ones, turned into a byte mask below (kind of ugly)
+    psrldq          xm9, 14                  ; only the low 2 bytes stay set
+    vpblendvb        m3, m3, m4, m9          ; cur: replace the first 2 bytes with the h-blended ones
+    psrldq          xm4, 2                   ; move the h-blended top bytes down to the low 2 bytes
+    vpblendvb        m6, m6, m4, m9          ; top: replace the first 2 bytes with the h-blended ones
+%endif
+
     ; followed by v interpolation (top | cur -> cur)
+%if %3
+    vpermq           m9, m3, q3120
     punpcklbw        m6, m9
 %if %1
     vpbroadcastd     m9, [pb_23_22]
@@ -2093,6 +2302,26 @@
     packsswb         m6, m6
     vpermq           m6, m6, q3120
     vpblendd         m3, m3, m6, 00001111b
+%else
+    punpckhbw        m9, m6, m3
+    punpcklbw        m6, m3
+%if %2
+    mova             m3, [pb_8x_27_17_8x_17_27]
+    pmaddubsw        m9, m3, m9
+    pmaddubsw        m6, m3, m6
+%else
+    pmaddubsw        m9, m1, m9
+    pmaddubsw        m6, m1, m6
+%endif
+%if %1
+    pmulhrsw         m9, [pw_1024]
+    pmulhrsw         m6, [pw_1024]
+%else
+    pmulhrsw         m9, m14
+    pmulhrsw         m6, m14
+%endif
+    packsswb         m3, m6, m9
+%endif
     pcmpgtb          m7, m2, m3
     punpcklbw        m2, m3, m7
     punpckhbw        m3, m7
@@ -2104,6 +2333,7 @@
     pmulhrsw         m3, m11
 
     ; dst = clip_pixel(src, noise)
+%if %2
     paddw            m0, m2
     paddw            m1, m3
     pmaxsw           m0, m13
@@ -2113,20 +2343,47 @@
     packuswb         m0, m1
     mova         [dstq], xm0
     vextracti128 [dstq+strideq], m0, 1
+%else
+    pxor             m6, m6
+    punpckhbw        m9, m0, m6
+    punpcklbw        m0, m6                 ; m0/m9: src as word (m1 holds the v-overlap weights)
+    paddw            m0, m2
+    paddw            m9, m3
+    pmaxsw           m0, m13
+    pmaxsw           m9, m13
+    pminsw           m0, m12
+    pminsw           m9, m12
+    packuswb         m0, m9
+    mova         [dstq], m0
+%endif
 
+%if %2
     lea            srcq, [srcq+strideq*2]
     lea            dstq, [dstq+strideq*2]
-    lea           lumaq, [lumaq+lstrideq*4]
-    add      grain_lutq, 82*2
-    sub              hb, 2
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+    add            srcq, strideq
+    add            dstq, strideq
+    add           lumaq, lstrideq
+%endif
+    add      grain_lutq, 82<<%2
+    sub              hb, 1+%2
+%if %2
     jg %%loop_y_h_overlap
+%else
+    je %%end_y_hv_overlap
+    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27+16]
+    btc              hd, 16
+    jnc %%loop_y_hv_overlap
+    jmp %%loop_y_h_overlap
+%endif
 
 %%end_y_hv_overlap:
-    add              wq, 16
+    add              wq, 32>>%2
     jge %%end_hv
     mov            srcq, r11mp
     mov            dstq, r12mp
-    lea           lumaq, [r14+wq*2]
+    lea           lumaq, [r14+wq*(1+%2)]
     add            srcq, wq
     add            dstq, wq
     jmp %%loop_x_hv_overlap
@@ -2135,8 +2392,13 @@
     RET
 %endmacro
 
-    FGUV_32x32xN_LOOP 1
+    %%FGUV_32x32xN_LOOP 1, %2, %3
 .csfl:
-    FGUV_32x32xN_LOOP 0
+    %%FGUV_32x32xN_LOOP 0, %2, %3
+%endmacro
+
+FGUV_FN 420, 1, 1 ; ss_hor=1, ss_ver=1
+FGUV_FN 422, 1, 0 ; ss_hor=1, ss_ver=0
+FGUV_FN 444, 0, 0 ; ss_hor=0, ss_ver=0
 
 %endif ; ARCH_X86_64
diff --git a/src/x86/film_grain_init_tmpl.c b/src/x86/film_grain_init_tmpl.c
index 55de12a..d0de86d 100644
--- a/src/x86/film_grain_init_tmpl.c
+++ b/src/x86/film_grain_init_tmpl.c
@@ -41,6 +41,8 @@
 decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_avx2);
 decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
 decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_avx2);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_avx2);
 
 COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
@@ -65,5 +67,7 @@
     c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_avx2;
     c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
     c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_avx2;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_avx2;
 #endif
 }