loongarch: Improve the performance of mc_8bpc.mc functions

Relative speedup over C code:

mc_8tap_regular_w2_0_8bpc_c:                      5.3 ( 1.00x)
mc_8tap_regular_w2_0_8bpc_lsx:                    0.8 ( 6.62x)
mc_8tap_regular_w2_h_8bpc_c:                     11.0 ( 1.00x)
mc_8tap_regular_w2_h_8bpc_lsx:                    2.5 ( 4.40x)
mc_8tap_regular_w2_hv_8bpc_c:                    24.4 ( 1.00x)
mc_8tap_regular_w2_hv_8bpc_lsx:                   9.1 ( 2.70x)
mc_8tap_regular_w2_v_8bpc_c:                     12.9 ( 1.00x)
mc_8tap_regular_w2_v_8bpc_lsx:                    3.2 ( 4.08x)
mc_8tap_regular_w4_0_8bpc_c:                      4.8 ( 1.00x)
mc_8tap_regular_w4_0_8bpc_lsx:                    0.8 ( 5.97x)
mc_8tap_regular_w4_h_8bpc_c:                     20.0 ( 1.00x)
mc_8tap_regular_w4_h_8bpc_lsx:                    3.9 ( 5.06x)
mc_8tap_regular_w4_hv_8bpc_c:                    44.3 ( 1.00x)
mc_8tap_regular_w4_hv_8bpc_lsx:                  15.0 ( 2.96x)
mc_8tap_regular_w4_v_8bpc_c:                     23.5 ( 1.00x)
mc_8tap_regular_w4_v_8bpc_lsx:                    4.2 ( 5.54x)
mc_8tap_regular_w8_0_8bpc_c:                      4.8 ( 1.00x)
mc_8tap_regular_w8_0_8bpc_lsx:                    0.8 ( 6.03x)
mc_8tap_regular_w8_h_8bpc_c:                     37.5 ( 1.00x)
mc_8tap_regular_w8_h_8bpc_lsx:                    7.6 ( 4.96x)
mc_8tap_regular_w8_hv_8bpc_c:                    84.0 ( 1.00x)
mc_8tap_regular_w8_hv_8bpc_lsx:                  23.9 ( 3.51x)
mc_8tap_regular_w8_v_8bpc_c:                     44.8 ( 1.00x)
mc_8tap_regular_w8_v_8bpc_lsx:                    7.2 ( 6.23x)
mc_8tap_regular_w16_0_8bpc_c:                     5.8 ( 1.00x)
mc_8tap_regular_w16_0_8bpc_lsx:                   1.1 ( 5.12x)
mc_8tap_regular_w16_h_8bpc_c:                   103.8 ( 1.00x)
mc_8tap_regular_w16_h_8bpc_lsx:                  21.6 ( 4.80x)
mc_8tap_regular_w16_hv_8bpc_c:                  220.2 ( 1.00x)
mc_8tap_regular_w16_hv_8bpc_lsx:                 65.1 ( 3.38x)
mc_8tap_regular_w16_v_8bpc_c:                   124.8 ( 1.00x)
mc_8tap_regular_w16_v_8bpc_lsx:                  19.9 ( 6.28x)
mc_8tap_regular_w32_0_8bpc_c:                     8.9 ( 1.00x)
mc_8tap_regular_w32_0_8bpc_lsx:                   2.9 ( 3.06x)
mc_8tap_regular_w32_h_8bpc_c:                   323.6 ( 1.00x)
mc_8tap_regular_w32_h_8bpc_lsx:                  69.1 ( 4.68x)
mc_8tap_regular_w32_hv_8bpc_c:                  649.5 ( 1.00x)
mc_8tap_regular_w32_hv_8bpc_lsx:                197.7 ( 3.29x)
mc_8tap_regular_w32_v_8bpc_c:                   390.5 ( 1.00x)
mc_8tap_regular_w32_v_8bpc_lsx:                  61.9 ( 6.31x)
mc_8tap_regular_w64_0_8bpc_c:                    13.3 ( 1.00x)
mc_8tap_regular_w64_0_8bpc_lsx:                   9.7 ( 1.37x)
mc_8tap_regular_w64_h_8bpc_c:                  1145.3 ( 1.00x)
mc_8tap_regular_w64_h_8bpc_lsx:                 248.2 ( 4.61x)
mc_8tap_regular_w64_hv_8bpc_c:                 2204.4 ( 1.00x)
mc_8tap_regular_w64_hv_8bpc_lsx:                682.1 ( 3.23x)
mc_8tap_regular_w64_v_8bpc_c:                  1384.9 ( 1.00x)
mc_8tap_regular_w64_v_8bpc_lsx:                 218.9 ( 6.33x)
mc_8tap_regular_w128_0_8bpc_c:                   33.6 ( 1.00x)
mc_8tap_regular_w128_0_8bpc_lsx:                 27.7 ( 1.21x)
mc_8tap_regular_w128_h_8bpc_c:                 3228.1 ( 1.00x)
mc_8tap_regular_w128_h_8bpc_lsx:                701.7 ( 4.60x)
mc_8tap_regular_w128_hv_8bpc_c:                6108.2 ( 1.00x)
mc_8tap_regular_w128_hv_8bpc_lsx:              1905.3 ( 3.21x)
mc_8tap_regular_w128_v_8bpc_c:                 3906.8 ( 1.00x)
mc_8tap_regular_w128_v_8bpc_lsx:                617.4 ( 6.33x)
diff --git a/src/loongarch/mc.S b/src/loongarch/mc.S
index 0d335b5..9e0dbff 100644
--- a/src/loongarch/mc.S
+++ b/src/loongarch/mc.S
@@ -2624,3 +2624,1029 @@
 
 #undef bpc_sh
 #undef bpcw_sh
+
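+// Pairwise horizontal add of the signed halfwords in \in0 (h->w, then
+// w->d): each 64-bit lane ends up holding the sum of its four halfwords.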
+.macro  vhaddw.d.h  in0
+    vhaddw.w.h  \in0,  \in0,  \in0
+    vhaddw.d.w  \in0,  \in0,  \in0
+.endm
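+// Same reduction one level up (w->d, then d->q): the full register ends
+// up holding the sum of its four 32-bit words.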
+.macro  vhaddw.q.w  in0
+    vhaddw.d.w  \in0,  \in0,  \in0
+    vhaddw.q.d  \in0,  \in0,  \in0
+.endm
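+// Horizontal 8-tap filter of an 8-pixel row held in \in0: form the eight
+// sliding 8-byte windows, dot-product each with the filter in vr8, reduce,
+// pack the eight sums into halfwords and add the rounding bias in vr9.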
+.macro PUT_H_8W in0
+    vbsrl.v          vr2,    \in0,  1
+    vbsrl.v          vr3,    \in0,  2
+    vbsrl.v          vr4,    \in0,  3
+    vbsrl.v          vr5,    \in0,  4
+    vbsrl.v          vr6,    \in0,  5
+    vbsrl.v          vr7,    \in0,  6
+    vbsrl.v          vr10,   \in0,  7
+    vilvl.d          vr2,    vr2,   \in0
+    vilvl.d          vr3,    vr4,   vr3
+    vilvl.d          vr4,    vr6,   vr5
+    vilvl.d          vr5,    vr10,  vr7
+    vdp2.h.bu.b      \in0,   vr2,   vr8
+    vdp2.h.bu.b      vr2,    vr3,   vr8
+    vdp2.h.bu.b      vr3,    vr4,   vr8
+    vdp2.h.bu.b      vr4,    vr5,   vr8
+    vhaddw.d.h       \in0
+    vhaddw.d.h       vr2
+    vhaddw.d.h       vr3
+    vhaddw.d.h       vr4
+    vpickev.w        \in0,   vr2,   \in0
+    vpickev.w        vr2,    vr4,   vr3
+    vpickev.h        \in0,   vr2,   \in0
+    vadd.h           \in0,   \in0,  vr9
+.endm
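+// Horizontal 8-tap filter of 4 pixels from \in0; the four sums are left
+// as 32-bit words in \in0 (no rounding or packing yet).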
+.macro FILTER_8TAP_4W in0
+    vbsrl.v          vr10,   \in0,  1
+    vbsrl.v          vr11,   \in0,  2
+    vbsrl.v          vr12,   \in0,  3
+    vilvl.d          vr10,   vr10,  \in0
+    vilvl.d          vr11,   vr12,  vr11
+    vdp2.h.bu.b      vr7,    vr10,  vr8
+    vdp2.h.bu.b      vr10,   vr11,  vr8
+    vhaddw.d.h       vr7
+    vhaddw.d.h       vr10
+    vpickev.w        \in0,   vr10,  vr7
+.endm
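+// Horizontal 8-tap filter of 8 pixels from \in0, packed to halfwords and
+// rounded right-shifted by 2 (the intermediate precision for the hv path).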
+.macro FILTER_8TAP_8W in0
+    vbsrl.v         vr10,    \in0,  1
+    vbsrl.v         vr11,    \in0,  2
+    vbsrl.v         vr12,    \in0,  3
+    vbsrl.v         vr13,    \in0,  4
+    vbsrl.v         vr14,    \in0,  5
+    vbsrl.v         vr15,    \in0,  6
+    vbsrl.v         vr16,    \in0,  7
+    vilvl.d         vr10,    vr10,  \in0
+    vilvl.d         vr11,    vr12,  vr11
+    vilvl.d         vr12,    vr14,  vr13
+    vilvl.d         vr13,    vr16,  vr15
+    vdp2.h.bu.b     vr14,    vr10,  vr8
+    vdp2.h.bu.b     vr15,    vr11,  vr8
+    vdp2.h.bu.b     vr16,    vr12,  vr8
+    vdp2.h.bu.b     vr17,    vr13,  vr8
+    vhaddw.d.h      vr14
+    vhaddw.d.h      vr15
+    vhaddw.d.h      vr16
+    vhaddw.d.h      vr17
+    vpickev.w       vr13,    vr15,  vr14
+    vpickev.w       vr14,    vr17,  vr16
+    vpickev.h       \in0,    vr14,  vr13 //x0 ... x7
+    vsrari.h        \in0,    \in0,  2
+.endm
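+// Vertical 8-tap pass over the eight column buffers vr0..vr7 with the
+// vertical filter in vr9: reduce, round/narrow with saturation (>>10),
+// store 8 output pixels at a0 and advance a0 by the dst stride.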
+.macro FILTER_8TAP_8W_CLIP_STORE
+    vdp2.w.h        vr12,    vr0,   vr9
+    vdp2.w.h        vr13,    vr1,   vr9
+    vdp2.w.h        vr14,    vr2,   vr9
+    vdp2.w.h        vr15,    vr3,   vr9
+    vdp2.w.h        vr16,    vr4,   vr9
+    vdp2.w.h        vr17,    vr5,   vr9
+    vdp2.w.h        vr18,    vr6,   vr9
+    vdp2.w.h        vr19,    vr7,   vr9
+    vhaddw.q.w      vr12
+    vhaddw.q.w      vr13
+    vhaddw.q.w      vr14
+    vhaddw.q.w      vr15
+    vhaddw.q.w      vr16
+    vhaddw.q.w      vr17
+    vhaddw.q.w      vr18
+    vhaddw.q.w      vr19
+    vpackev.w       vr12,    vr13,  vr12
+    vpackev.w       vr13,    vr15,  vr14
+    vpackev.d       vr12,    vr13,  vr12
+    vpackev.w       vr14,    vr17,  vr16
+    vpackev.w       vr15,    vr19,  vr18
+    vpackev.d       vr13,    vr15,  vr14
+    vssrarni.hu.w   vr13,    vr12,  10
+    vssrani.bu.h    vr13,    vr13,  0
+    vstelm.d        vr13,    a0,    0,   0
+    add.d           a0,      a0,    a1
+.endm
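+// Insert halfwords 0..7 of \in0 (the newest filtered row) into element 7
+// of the column buffers vr0..vr7.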
+.macro VEXTRINS_Hx8 in0
+    vextrins.h      vr0,     \in0,  0x70
+    vextrins.h      vr1,     \in0,  0x71
+    vextrins.h      vr2,     \in0,  0x72
+    vextrins.h      vr3,     \in0,  0x73
+    vextrins.h      vr4,     \in0,  0x74
+    vextrins.h      vr5,     \in0,  0x75
+    vextrins.h      vr6,     \in0,  0x76
+    vextrins.h      vr7,     \in0,  0x77
+.endm
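+// Shift each column buffer right by 2 bytes, dropping its oldest halfword
+// to slide the vertical window down one row.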
+.macro VBSRL_Vx8
+    vbsrl.v         vr0,     vr0,   2
+    vbsrl.v         vr1,     vr1,   2
+    vbsrl.v         vr2,     vr2,   2
+    vbsrl.v         vr3,     vr3,   2
+    vbsrl.v         vr4,     vr4,   2
+    vbsrl.v         vr5,     vr5,   2
+    vbsrl.v         vr6,     vr6,   2
+    vbsrl.v         vr7,     vr7,   2
+.endm
+
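+// put_8tap kernel: a0=dst, a1=dst_stride, a2=src, a3=src_stride, a4=w,
+// a5=h, a6=mx, a7=my; the filter-type word is stored at sp by the caller.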
+.macro PUT_8TAP_8BPC_LSX lable
+    li.w             t0,     4
+    la.local         t6,     dav1d_mc_subpel_filters
+    slli.d           t2,     a3,    1  //src_stride*2
+    add.d            t3,     t2,    a3 //src_stride*3
+    slli.d           t4,     t2,    1  //src_stride*4
+
+    bnez             a6,     .l_\lable\()put_h //mx
+    bnez             a7,     .l_\lable\()put_v //my
+
+    clz.w            t1,     a4
+    li.w             t5,     24
+    sub.w            t1,     t1,    t5
+    la.local         t5,     .l_\lable\()put_hv0_jtable
+    alsl.d           t1,     t1,    t5,   3
+    ld.d             t6,     t1,    0
+    add.d            t5,     t5,    t6
+    jirl             $r0,    t5,    0
+
+    .align   3
+.l_\lable\()put_hv0_jtable:
+    .dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable
+    .dword .l_\lable\()put_hv0_64w  - .l_\lable\()put_hv0_jtable
+    .dword .l_\lable\()put_hv0_32w  - .l_\lable\()put_hv0_jtable
+    .dword .l_\lable\()put_hv0_16w  - .l_\lable\()put_hv0_jtable
+    .dword .l_\lable\()put_hv0_8w   - .l_\lable\()put_hv0_jtable
+    .dword .l_\lable\()put_hv0_4w   - .l_\lable\()put_hv0_jtable
+    .dword .l_\lable\()put_hv0_2w   - .l_\lable\()put_hv0_jtable
+
+.l_\lable\()put_hv0_2w:
+    vldrepl.h        vr0,    a2,    0
+    add.d            a2,     a2,    a3
+    vldrepl.h        vr1,    a2,    0
+    vstelm.h         vr0,    a0,    0,     0
+    add.d            a0,     a0,    a1
+    vstelm.h         vr1,    a0,    0,     0
+    add.d            a2,     a2,    a3
+    add.d            a0,     a0,    a1
+    addi.w           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_hv0_2w
+    b                .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_4w:
+    fld.s            f0,     a2,    0
+    fldx.s           f1,     a2,    a3
+    fst.s            f0,     a0,    0
+    fstx.s           f1,     a0,    a1
+    alsl.d           a2,     a3,    a2,    1
+    alsl.d           a0,     a1,    a0,    1
+    addi.w           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_hv0_4w
+    b                .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_8w:
+    fld.d            f0,     a2,    0
+    fldx.d           f1,     a2,    a3
+    fst.d            f0,     a0,    0
+    fstx.d           f1,     a0,    a1
+    alsl.d           a2,     a3,    a2,    1
+    alsl.d           a0,     a1,    a0,    1
+    addi.w           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_hv0_8w
+    b                .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_16w:
+    vld              vr0,    a2,    0
+    vldx             vr1,    a2,    a3
+    vst              vr0,    a0,    0
+    vstx             vr1,    a0,    a1
+    alsl.d           a2,     a3,    a2,    1
+    alsl.d           a0,     a1,    a0,    1
+    addi.w           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_hv0_16w
+    b                .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_32w:
+    vld              vr0,    a2,    0
+    vld              vr1,    a2,    16
+    add.d            a2,     a2,    a3
+    vld              vr2,    a2,    0
+    vld              vr3,    a2,    16
+    vst              vr0,    a0,    0
+    vst              vr1,    a0,    16
+    add.d            a0,     a0,    a1
+    vst              vr2,    a0,    0
+    vst              vr3,    a0,    16
+    add.d            a2,     a2,    a3
+    add.d            a0,     a0,    a1
+    addi.w           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_hv0_32w
+    b                .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_64w:
+    vld              vr0,    a2,    0
+    vld              vr1,    a2,    16
+    vld              vr2,    a2,    32
+    vld              vr3,    a2,    48
+    add.d            a2,     a2,    a3
+    vld              vr4,    a2,    0
+    vld              vr5,    a2,    16
+    vld              vr6,    a2,    32
+    vld              vr7,    a2,    48
+    add.d            a2,     a2,    a3
+    vst              vr0,    a0,    0
+    vst              vr1,    a0,    16
+    vst              vr2,    a0,    32
+    vst              vr3,    a0,    48
+    add.d            a0,     a0,    a1
+    vst              vr4,    a0,    0
+    vst              vr5,    a0,    16
+    vst              vr6,    a0,    32
+    vst              vr7,    a0,    48
+    add.d            a0,     a0,    a1
+    addi.w           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_hv0_64w
+    b                .l_\lable\()end_put_8tap
+.l_\lable\()put_hv0_128w:
+    vld              vr0,    a2,    0
+    vld              vr1,    a2,    16
+    vld              vr2,    a2,    32
+    vld              vr3,    a2,    48
+    vld              vr4,    a2,    64
+    vld              vr5,    a2,    80
+    vld              vr6,    a2,    96
+    vld              vr7,    a2,    112
+    add.d            a2,     a2,    a3
+    vld              vr8,    a2,    0
+    vld              vr9,    a2,    16
+    vld              vr10,   a2,    32
+    vld              vr11,   a2,    48
+    vld              vr12,   a2,    64
+    vld              vr13,   a2,    80
+    vld              vr14,   a2,    96
+    vld              vr15,   a2,    112
+    add.d            a2,     a2,    a3
+    vst              vr0,    a0,    0
+    vst              vr1,    a0,    16
+    vst              vr2,    a0,    32
+    vst              vr3,    a0,    48
+    vst              vr4,    a0,    64
+    vst              vr5,    a0,    80
+    vst              vr6,    a0,    96
+    vst              vr7,    a0,    112
+    add.d            a0,     a0,    a1
+    vst              vr8,    a0,    0
+    vst              vr9,    a0,    16
+    vst              vr10,   a0,    32
+    vst              vr11,   a0,    48
+    vst              vr12,   a0,    64
+    vst              vr13,   a0,    80
+    vst              vr14,   a0,    96
+    vst              vr15,   a0,    112
+    add.d            a0,     a0,    a1
+    addi.w           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_hv0_128w
+    b                .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h:
+    bnez             a7,     .l_\lable\()put_hv // if (fh && fv)
+    ld.d             t5,     sp,    0  //filter_type
+    andi             t1,     t5,    3
+    blt              t0,     a4,    .l_\lable\()put_h_idx_fh
+    andi             t1,     t5,    1
+    addi.w           t1,     t1,    3
+
+.l_\lable\()put_h_idx_fh:
+    addi.w           t5,     zero,  120
+    mul.w            t1,     t1,    t5
+    addi.w           t5,     a6,    -1
+    slli.w           t5,     t5,    3
+    add.w            t1,     t1,    t5
+    add.d            t1,     t6,    t1 //fh's offset
+    vldrepl.d        vr8,    t1,    0
+    addi.d           a2,     a2,    -3
+    li.w             t1,     34
+    vreplgr2vr.h     vr9,    t1
+
+    clz.w            t1,     a4
+    li.w             t5,     24
+    sub.w            t1,     t1,    t5
+    la.local         t5,     .l_\lable\()put_h_jtable
+    alsl.d           t1,     t1,    t5,   3
+    ld.d             t6,     t1,    0
+    add.d            t5,     t5,    t6
+    jirl             $r0,    t5,    0
+
+    .align   3
+.l_\lable\()put_h_jtable:
+    .dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable
+    .dword .l_\lable\()put_h_64w  - .l_\lable\()put_h_jtable
+    .dword .l_\lable\()put_h_32w  - .l_\lable\()put_h_jtable
+    .dword .l_\lable\()put_h_16w  - .l_\lable\()put_h_jtable
+    .dword .l_\lable\()put_h_8w   - .l_\lable\()put_h_jtable
+    .dword .l_\lable\()put_h_4w   - .l_\lable\()put_h_jtable
+    .dword .l_\lable\()put_h_2w   - .l_\lable\()put_h_jtable
+
+.l_\lable\()put_h_2w:
+    vld              vr0,    a2,    0
+    vldx             vr1,    a2,    a3
+    add.d            a2,     a2,    t2
+
+    vbsrl.v          vr2,    vr0,   1
+    vilvl.d          vr0,    vr2,   vr0
+    vdp2.h.bu.b      vr2,    vr0,   vr8
+    vhaddw.w.h       vr0,    vr2,   vr2
+    vhaddw.d.w       vr0,    vr0,   vr0
+    vbsrl.v          vr2,    vr1,   1
+    vilvl.d          vr1,    vr2,   vr1
+    vdp2.h.bu.b      vr2,    vr1,   vr8
+    vhaddw.w.h       vr1,    vr2,   vr2
+    vhaddw.d.w       vr1,    vr1,   vr1
+    vpickev.w        vr0,    vr1,   vr0
+    vpickev.h        vr0,    vr0,   vr0
+    vadd.h           vr0,    vr0,   vr9
+    vssrani.bu.h     vr0,    vr0,   6
+
+    vstelm.h         vr0,    a0,    0,   0
+    add.d            a0,     a0,    a1
+    vstelm.h         vr0,    a0,    0,   1
+    add.d            a0,     a0,    a1
+    addi.w           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_h_2w
+    b                .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h_4w:
+    vld              vr0,    a2,    0
+    vldx             vr1,    a2,    a3
+    add.d            a2,     a2,    t2
+
+    vbsrl.v          vr2,    vr0,   1
+    vbsrl.v          vr3,    vr0,   2
+    vbsrl.v          vr4,    vr0,   3
+    vilvl.d          vr0,    vr2,   vr0 //x0 x1
+    vilvl.d          vr2,    vr4,   vr3 //x2 x3
+    vdp2.h.bu.b      vr3,    vr0,   vr8
+    vdp2.h.bu.b      vr4,    vr2,   vr8
+    vhaddw.w.h       vr0,    vr3,   vr3
+    vhaddw.d.w       vr0,    vr0,   vr0
+    vhaddw.w.h       vr2,    vr4,   vr4
+    vhaddw.d.w       vr2,    vr2,   vr2
+    vpickev.w        vr5,    vr2,   vr0
+    vbsrl.v          vr2,    vr1,   1
+    vbsrl.v          vr3,    vr1,   2
+    vbsrl.v          vr4,    vr1,   3
+    vilvl.d          vr0,    vr2,   vr1 //x0 x1
+    vilvl.d          vr2,    vr4,   vr3 //x2 x3
+    vdp2.h.bu.b      vr3,    vr0,   vr8
+    vdp2.h.bu.b      vr4,    vr2,   vr8
+    vhaddw.w.h       vr0,    vr3,   vr3
+    vhaddw.d.w       vr0,    vr0,   vr0
+    vhaddw.w.h       vr2,    vr4,   vr4
+    vhaddw.d.w       vr2,    vr2,   vr2
+    vpickev.w        vr6,    vr2,   vr0
+    vpickev.h        vr0,    vr6,   vr5
+    vadd.h           vr0,    vr0,   vr9
+    vssrani.bu.h     vr0,    vr0,   6
+
+    vstelm.w         vr0,    a0,    0,    0
+    add.d            a0,     a0,    a1
+    vstelm.w         vr0,    a0,    0,    1
+    add.d            a0,     a0,    a1
+    addi.d           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_h_4w
+    b                .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h_8w:
+    vld              vr0,    a2,    0
+    vldx             vr1,    a2,    a3
+    add.d            a2,     a2,    t2
+    PUT_H_8W         vr0
+    PUT_H_8W         vr1
+    vssrani.bu.h     vr1,    vr0,   6
+    vstelm.d         vr1,    a0,    0,    0
+    add.d            a0,     a0,    a1
+    vstelm.d         vr1,    a0,    0,    1
+    add.d            a0,     a0,    a1
+    addi.w           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_h_8w
+    b                .l_\lable\()end_put_8tap
+
+.l_\lable\()put_h_16w:
+.l_\lable\()put_h_32w:
+.l_\lable\()put_h_64w:
+.l_\lable\()put_h_128w:
+    addi.d           t0,     a2,    0 //src
+    addi.w           t5,     a5,    0 //h
+    addi.d           t8,     a0,    0 //dst
+.l_\lable\()put_h_16w_loop:
+    vld              vr0,    a2,    0
+    vldx             vr1,    a2,    a3
+    add.d            a2,     a2,    t2
+    PUT_H_8W         vr0
+    PUT_H_8W         vr1
+    vssrani.bu.h     vr1,    vr0,   6
+    vstelm.d         vr1,    a0,    0,   0
+    add.d            a0,     a0,    a1
+    vstelm.d         vr1,    a0,    0,   1
+    add.d            a0,     a0,    a1
+    addi.d           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_h_16w_loop
+    addi.d           a2,     t0,    8
+    addi.d           t0,     t0,    8
+    addi.d           a0,     t8,    8
+    addi.d           t8,     t8,    8
+    addi.w           a5,     t5,    0
+    addi.w           a4,     a4,    -8
+    bnez             a4,     .l_\lable\()put_h_16w_loop
+    b                .l_\lable\()end_put_8tap
+
+.l_\lable\()put_v:
+    ld.d             t1,     sp,    0  //filter_type
+    srli.w           t1,     t1,    2
+    blt              t0,     a5,    .l_\lable\()put_v_idx_fv
+    andi             t1,     t1,    1
+    addi.w           t1,     t1,    3
+
+.l_\lable\()put_v_idx_fv:
+    addi.w           t5,     zero,  120
+    mul.w            t1,     t1,    t5
+    addi.w           t5,     a7,    -1
+    slli.w           t5,     t5,    3
+    add.w            t1,     t1,    t5
+    add.d            t1,     t6,    t1 //fv's offset
+    vldrepl.d        vr8,    t1,    0
+    sub.d            a2,     a2,    t3
+
+    clz.w            t1,     a4
+    li.w             t5,     24
+    sub.w            t1,     t1,    t5
+    la.local         t5,     .l_\lable\()put_v_jtable
+    alsl.d           t1,     t1,    t5,   3
+    ld.d             t6,     t1,    0
+    add.d            t5,     t5,    t6
+    jirl             $r0,    t5,    0
+
+    .align   3
+.l_\lable\()put_v_jtable:
+    .dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable
+    .dword .l_\lable\()put_v_64w  - .l_\lable\()put_v_jtable
+    .dword .l_\lable\()put_v_32w  - .l_\lable\()put_v_jtable
+    .dword .l_\lable\()put_v_16w  - .l_\lable\()put_v_jtable
+    .dword .l_\lable\()put_v_8w   - .l_\lable\()put_v_jtable
+    .dword .l_\lable\()put_v_4w   - .l_\lable\()put_v_jtable
+    .dword .l_\lable\()put_v_2w   - .l_\lable\()put_v_jtable
+
+.l_\lable\()put_v_2w:
+    fld.s            f0,     a2,    0
+    fldx.s           f1,     a2,    a3
+    fldx.s           f2,     a2,    t2
+    add.d            a2,     a2,    t3
+    fld.s            f3,     a2,    0
+    fldx.s           f4,     a2,    a3
+    fldx.s           f5,     a2,    t2
+    fldx.s           f6,     a2,    t3
+    add.d            a2,     a2,    t4
+    vilvl.b          vr0,    vr1,   vr0
+    vilvl.b          vr1,    vr3,   vr2
+    vilvl.b          vr2,    vr5,   vr4
+    vilvl.b          vr3,    vr7,   vr6
+    vilvl.h          vr0,    vr1,   vr0
+    vilvl.h          vr1,    vr3,   vr2
+    vilvl.w          vr0,    vr1,   vr0
+
+.l_\lable\()put_v_2w_loop:
+    fld.s            f7,     a2,    0  //h0
+    fldx.s           f10,    a2,    a3 //h1
+    add.d            a2,     a2,    t2
+
+    vextrins.b       vr0,    vr7,   0x70
+    vextrins.b       vr0,    vr7,   0xf1
+    vbsrl.v          vr1,    vr0,   1
+    vextrins.b       vr1,    vr10,  0x70
+    vextrins.b       vr1,    vr10,  0xf1
+    vdp2.h.bu.b      vr10,   vr0,   vr8
+    vdp2.h.bu.b      vr11,   vr1,   vr8
+    vbsrl.v          vr0,    vr1,   1
+    vhaddw.d.h       vr10
+    vhaddw.d.h       vr11
+    vpickev.w        vr10,   vr11,  vr10
+    vssrarni.hu.w    vr10,   vr10,  6
+    vssrani.bu.h     vr10,   vr10,  0
+
+    vstelm.h         vr10,   a0,    0,   0
+    add.d            a0,     a0,    a1
+    vstelm.h         vr10,   a0,    0,   1
+    add.d            a0,     a0,    a1
+    addi.w           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_v_2w_loop
+    b                .l_\lable\()end_put_8tap
+
+.l_\lable\()put_v_4w:
+    fld.s            f0,     a2,    0
+    fldx.s           f1,     a2,    a3
+    fldx.s           f2,     a2,    t2
+    add.d            a2,     a2,    t3
+    fld.s            f3,     a2,    0
+    fldx.s           f4,     a2,    a3
+    fldx.s           f5,     a2,    t2
+    fldx.s           f6,     a2,    t3
+    add.d            a2,     a2,    t4
+
+    vilvl.b          vr0,    vr1,   vr0
+    vilvl.b          vr1,    vr3,   vr2
+    vilvl.b          vr2,    vr5,   vr4
+    vilvl.b          vr3,    vr7,   vr6
+    vilvl.h          vr0,    vr1,   vr0
+    vilvl.h          vr1,    vr3,   vr2
+    vilvl.w          vr2,    vr1,   vr0
+    vilvh.w          vr3,    vr1,   vr0
+
+.l_\lable\()put_v_4w_loop:
+    fld.s            f7,     a2,    0
+    fldx.s           f10,    a2,    a3
+    add.d            a2,     a2,    t2
+
+    vextrins.b       vr2,    vr7,   0x70
+    vextrins.b       vr2,    vr7,   0xf1 //x0x1(h0)
+    vbsrl.v          vr4,    vr2,   1
+    vextrins.b       vr4,    vr10,  0x70
+    vextrins.b       vr4,    vr10,  0xf1 //x0x1(h1)
+    vdp2.h.bu.b      vr11,   vr2,   vr8
+    vdp2.h.bu.b      vr12,   vr4,   vr8
+    vbsrl.v          vr2,    vr4,   1
+
+    vextrins.b       vr3,    vr7,   0x72
+    vextrins.b       vr3,    vr7,   0xf3 //x2x3(h0)
+    vbsrl.v          vr4,    vr3,   1
+    vextrins.b       vr4,    vr10,  0x72
+    vextrins.b       vr4,    vr10,  0xf3 //x2x3(h1)
+    vdp2.h.bu.b      vr13,   vr3,   vr8
+    vdp2.h.bu.b      vr14,   vr4,   vr8
+    vbsrl.v          vr3,    vr4,   1
+
+    vhaddw.d.h       vr11
+    vhaddw.d.h       vr12
+    vhaddw.d.h       vr13
+    vhaddw.d.h       vr14
+
+    vpickev.w        vr11,   vr13,  vr11
+    vpickev.w        vr12,   vr14,  vr12
+    vpickev.h        vr11,   vr12,  vr11
+    vssrarni.bu.h    vr11,   vr11,  6
+    vstelm.w         vr11,   a0,    0,   0
+    add.d            a0,     a0,    a1
+    vstelm.w         vr11,   a0,    0,   1
+    add.d            a0,     a0,    a1
+    addi.w           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_v_4w_loop
+    b                .l_\lable\()end_put_8tap
+
+.l_\lable\()put_v_8w:
+.l_\lable\()put_v_16w:
+.l_\lable\()put_v_32w:
+.l_\lable\()put_v_64w:
+.l_\lable\()put_v_128w:
+    addi.d           t0,     a2,    0 //src
+    addi.d           t5,     a5,    0 //h
+    addi.d           t8,     a0,    0 //dst
+.l_\lable\()put_v_8w_loop0:
+    fld.d            f0,     a2,    0
+    fldx.d           f1,     a2,    a3
+    fldx.d           f2,     a2,    t2
+    add.d            a2,     a2,    t3
+    fld.d            f3,     a2,    0
+    fldx.d           f4,     a2,    a3
+    fldx.d           f5,     a2,    t2
+    fldx.d           f6,     a2,    t3
+    add.d            a2,     a2,    t4
+
+    vilvl.b          vr0,    vr1,   vr0
+    vilvl.b          vr1,    vr3,   vr2
+    vilvl.b          vr2,    vr5,   vr4
+    vilvl.b          vr3,    vr7,   vr6
+    vilvl.h          vr4,    vr1,   vr0
+    vilvh.h          vr5,    vr1,   vr0
+    vilvl.h          vr6,    vr3,   vr2
+    vilvh.h          vr7,    vr3,   vr2
+    vilvl.w          vr0,    vr6,   vr4 // x0x1
+    vilvh.w          vr1,    vr6,   vr4 // x2x3
+    vilvl.w          vr2,    vr7,   vr5 // x4x5
+    vilvh.w          vr3,    vr7,   vr5 // x6x7
+.l_\lable\()put_v_8w_loop:
+    fld.d            f7,     a2,    0
+    fldx.d           f10,    a2,    a3
+    add.d            a2,     a2,    t2
+    //h0
+    vextrins.b       vr0,    vr7,   0x70
+    vextrins.b       vr0,    vr7,   0xf1
+    vextrins.b       vr1,    vr7,   0x72
+    vextrins.b       vr1,    vr7,   0xf3
+    vextrins.b       vr2,    vr7,   0x74
+    vextrins.b       vr2,    vr7,   0xf5
+    vextrins.b       vr3,    vr7,   0x76
+    vextrins.b       vr3,    vr7,   0xf7
+    vdp2.h.bu.b      vr11,   vr0,   vr8
+    vdp2.h.bu.b      vr12,   vr1,   vr8
+    vdp2.h.bu.b      vr13,   vr2,   vr8
+    vdp2.h.bu.b      vr14,   vr3,   vr8
+    vhaddw.d.h       vr11
+    vhaddw.d.h       vr12
+    vhaddw.d.h       vr13
+    vhaddw.d.h       vr14
+    vpickev.w        vr11,   vr12,  vr11
+    vpickev.w        vr12,   vr14,  vr13
+    vpickev.h        vr11,   vr12,  vr11
+    vssrarni.bu.h    vr11,   vr11,  6
+    fst.d            f11,    a0,    0
+    add.d            a0,     a0,    a1
+    //h1
+    vbsrl.v          vr0,    vr0,   1
+    vbsrl.v          vr1,    vr1,   1
+    vbsrl.v          vr2,    vr2,   1
+    vbsrl.v          vr3,    vr3,   1
+    vextrins.b       vr0,    vr10,  0x70
+    vextrins.b       vr0,    vr10,  0xf1
+    vextrins.b       vr1,    vr10,  0x72
+    vextrins.b       vr1,    vr10,  0xf3
+    vextrins.b       vr2,    vr10,  0x74
+    vextrins.b       vr2,    vr10,  0xf5
+    vextrins.b       vr3,    vr10,  0x76
+    vextrins.b       vr3,    vr10,  0xf7
+    vdp2.h.bu.b      vr11,   vr0,   vr8
+    vdp2.h.bu.b      vr12,   vr1,   vr8
+    vdp2.h.bu.b      vr13,   vr2,   vr8
+    vdp2.h.bu.b      vr14,   vr3,   vr8
+    vhaddw.d.h       vr11
+    vhaddw.d.h       vr12
+    vhaddw.d.h       vr13
+    vhaddw.d.h       vr14
+    vpickev.w        vr11,   vr12,  vr11
+    vpickev.w        vr12,   vr14,  vr13
+    vpickev.h        vr11,   vr12,  vr11
+    vssrarni.bu.h    vr11,   vr11,  6
+    fst.d            f11,    a0,    0
+    add.d            a0,     a0,    a1
+    vbsrl.v          vr0,    vr0,   1
+    vbsrl.v          vr1,    vr1,   1
+    vbsrl.v          vr2,    vr2,   1
+    vbsrl.v          vr3,    vr3,   1
+    addi.w           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_v_8w_loop
+    addi.d           a2,     t0,    8
+    addi.d           t0,     t0,    8
+    addi.d           a0,     t8,    8
+    addi.d           t8,     t8,    8
+    addi.d           a5,     t5,    0
+    addi.w           a4,     a4,    -8
+    bnez             a4,     .l_\lable\()put_v_8w_loop0
+    b                .l_\lable\()end_put_8tap
+
+.l_\lable\()put_hv:
+    ld.d             t5,     sp,    0  //filter_type
+    andi             t1,     t5,    3
+    blt              t0,     a4,    .l_\lable\()put_hv_idx_fh
+    andi             t1,     t5,    1
+    addi.w           t1,     t1,    3
+.l_\lable\()put_hv_idx_fh:
+    addi.w           t5,     zero,  120
+    mul.w            t1,     t1,    t5
+    addi.w           t5,     a6,    -1
+    slli.w           t5,     t5,    3
+    add.w            t1,     t1,    t5
+    add.d            t1,     t6,    t1 //fh's offset
+    vldrepl.d        vr8,    t1,    0
+    ld.d             t1,     sp,    0  //filter_type
+    srli.w           t1,     t1,    2
+    blt              t0,     a5,    .l_\lable\()put_hv_idx_fv
+    andi             t1,     t1,    1
+    addi.w           t1,     t1,    3
+.l_\lable\()put_hv_idx_fv:
+    addi.w           t5,     zero,  120
+    mul.w            t1,     t1,    t5
+    addi.w           t5,     a7,    -1
+    slli.w           t5,     t5,    3
+    add.w            t1,     t1,    t5
+    add.d            t1,     t6,    t1 //fv's offset
+    vldrepl.d        vr9,    t1,    0
+    vexth.h.b        vr9,    vr9
+
+    sub.d            a2,     a2,    t3
+    addi.d           a2,     a2,    -3
+
+    clz.w            t1,     a4
+    li.w             t5,     24
+    sub.w            t1,     t1,    t5
+    la.local         t5,     .l_\lable\()put_hv_jtable
+    alsl.d           t1,     t1,    t5,   3
+    ld.d             t6,     t1,    0
+    add.d            t5,     t5,    t6
+    jirl             $r0,    t5,    0
+
+    .align   3
+.l_\lable\()put_hv_jtable:
+    .dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable
+    .dword .l_\lable\()put_hv_64w  - .l_\lable\()put_hv_jtable
+    .dword .l_\lable\()put_hv_32w  - .l_\lable\()put_hv_jtable
+    .dword .l_\lable\()put_hv_16w  - .l_\lable\()put_hv_jtable
+    .dword .l_\lable\()put_hv_8w   - .l_\lable\()put_hv_jtable
+    .dword .l_\lable\()put_hv_4w   - .l_\lable\()put_hv_jtable
+    .dword .l_\lable\()put_hv_2w   - .l_\lable\()put_hv_jtable
+
+.l_\lable\()put_hv_2w:
+    vld              vr0,    a2,    0
+    vldx             vr1,    a2,    a3
+    vldx             vr2,    a2,    t2
+    add.d            a2,     a2,    t3
+    vld              vr3,    a2,    0
+    vldx             vr4,    a2,    a3
+    vldx             vr5,    a2,    t2
+    vldx             vr6,    a2,    t3
+    add.d            a2,     a2,    t4
+
+    vbsrl.v          vr10,   vr0,   1
+    vbsrl.v          vr11,   vr1,   1
+    vbsrl.v          vr12,   vr2,   1
+    vbsrl.v          vr13,   vr3,   1
+    vbsrl.v          vr14,   vr4,   1
+    vbsrl.v          vr15,   vr5,   1
+    vbsrl.v          vr16,   vr6,   1
+    vilvl.d          vr0,    vr10,  vr0
+    vilvl.d          vr1,    vr11,  vr1
+    vilvl.d          vr2,    vr12,  vr2
+    vilvl.d          vr3,    vr13,  vr3
+    vilvl.d          vr4,    vr14,  vr4
+    vilvl.d          vr5,    vr15,  vr5
+    vilvl.d          vr6,    vr16,  vr6
+    vdp2.h.bu.b      vr10,   vr0,   vr8
+    vdp2.h.bu.b      vr11,   vr1,   vr8
+    vdp2.h.bu.b      vr12,   vr2,   vr8
+    vdp2.h.bu.b      vr13,   vr3,   vr8
+    vdp2.h.bu.b      vr14,   vr4,   vr8
+    vdp2.h.bu.b      vr15,   vr5,   vr8
+    vdp2.h.bu.b      vr16,   vr6,   vr8
+    vhaddw.d.h       vr10
+    vhaddw.d.h       vr11
+    vhaddw.d.h       vr12
+    vhaddw.d.h       vr13
+    vhaddw.d.h       vr14
+    vhaddw.d.h       vr15
+    vhaddw.d.h       vr16
+
+    vpackev.w        vr10,   vr11,  vr10
+    vpackev.w        vr12,   vr13,  vr12
+    vpackod.d        vr11,   vr12,  vr10
+    vpackev.d        vr10,   vr12,  vr10
+
+    vpackev.w        vr12,   vr15,  vr14
+    vpackev.w        vr16,   vr17,  vr16
+    vpackod.d        vr13,   vr16,  vr12
+    vpackev.d        vr12,   vr16,  vr12
+
+    vpickev.h        vr10,   vr12,  vr10 //0 1 2  3  4  5  6  * (h0)
+    vpickev.h        vr11,   vr13,  vr11 //8 9 10 11 12 13 14 * (h1)
+    vsrari.h         vr10,   vr10,  2
+    vsrari.h         vr11,   vr11,  2
+.l_\lable\()put_hv_2w_loop:
+    vld              vr7,    a2,    0
+    vldx             vr12,   a2,    a3
+    add.d            a2,     a2,    t2
+
+    vbsrl.v          vr1,    vr7,   1
+    vbsrl.v          vr2,    vr12,  1
+    vilvl.d          vr0,    vr1,   vr7
+    vilvl.d          vr1,    vr2,   vr12
+    vdp2.h.bu.b      vr2,    vr0,   vr8
+    vdp2.h.bu.b      vr3,    vr1,   vr8
+    vhaddw.d.h       vr2
+    vhaddw.d.h       vr3
+    vpickev.w        vr2,    vr3,   vr2
+    vpickev.h        vr2,    vr2,   vr2
+    vsrari.h         vr2,    vr2,   2
+    vextrins.h       vr10,   vr2,   0x70 //0 1 2 3 4 5 6 7
+    vextrins.h       vr11,   vr2,   0x71
+    vbsrl.v          vr12,   vr10,  2
+    vbsrl.v          vr13,   vr11,  2
+    vextrins.h       vr12,   vr2,   0x72 //1 2 3 4 5 6 7 8
+    vextrins.h       vr13,   vr2,   0x73
+    vdp2.w.h         vr0,    vr10,  vr9
+    vdp2.w.h         vr1,    vr11,  vr9
+    vdp2.w.h         vr2,    vr12,  vr9
+    vdp2.w.h         vr3,    vr13,  vr9
+    vhaddw.q.w       vr0
+    vhaddw.q.w       vr1
+    vhaddw.q.w       vr2
+    vhaddw.q.w       vr3
+    vpackev.w        vr0,    vr1,   vr0
+    vpackev.w        vr1,    vr3,   vr2
+    vpackev.d        vr0,    vr1,   vr0
+    vssrarni.hu.w    vr0,    vr0,   10
+    vssrani.bu.h     vr0,    vr0,   0
+    vbsrl.v          vr10,   vr12,  2
+    vbsrl.v          vr11,   vr13,  2
+    vstelm.h         vr0,    a0,    0,   0
+    add.d            a0,     a0,    a1
+    vstelm.h         vr0,    a0,    0,   1
+    add.d            a0,     a0,    a1
+    addi.d           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_hv_2w_loop
+    b                .l_\lable\()end_put_8tap
+
+.l_\lable\()put_hv_4w:
+    vld              vr0,    a2,    0
+    vldx             vr1,    a2,    a3
+    vldx             vr2,    a2,    t2
+    add.d            a2,     a2,    t3
+    vld              vr3,    a2,    0
+    vldx             vr4,    a2,    a3
+    vldx             vr5,    a2,    t2
+    vldx             vr6,    a2,    t3
+    add.d            a2,     a2,    t4
+    FILTER_8TAP_4W   vr0 //x0 x1 x2 x3
+    FILTER_8TAP_4W   vr1
+    FILTER_8TAP_4W   vr2
+    FILTER_8TAP_4W   vr3
+    FILTER_8TAP_4W   vr4
+    FILTER_8TAP_4W   vr5
+    FILTER_8TAP_4W   vr6
+    vpackev.h        vr0,    vr1,   vr0
+    vpackev.h        vr1,    vr3,   vr2
+    vpackev.h        vr2,    vr5,   vr4
+    vpackev.h        vr3,    vr7,   vr6
+    vilvl.w          vr4,    vr1,   vr0
+    vilvh.w          vr5,    vr1,   vr0
+    vilvl.w          vr6,    vr3,   vr2
+    vilvh.w          vr7,    vr3,   vr2
+    vilvl.d          vr0,    vr6,   vr4 //0 1 2 3 4 5 6 *
+    vilvh.d          vr1,    vr6,   vr4
+    vilvl.d          vr2,    vr7,   vr5
+    vilvh.d          vr3,    vr7,   vr5
+    vsrari.h         vr0,    vr0,   2
+    vsrari.h         vr1,    vr1,   2
+    vsrari.h         vr2,    vr2,   2
+    vsrari.h         vr3,    vr3,   2
+.l_\lable\()put_hv_4w_loop:
+    vld              vr4,    a2,    0
+    vldx             vr5,    a2,    a3
+    add.d            a2,     a2,    t2
+    FILTER_8TAP_4W   vr4
+    FILTER_8TAP_4W   vr5
+    vpickev.h        vr4,    vr5,   vr4
+    vsrari.h         vr4,    vr4,   2
+    vextrins.h       vr0,    vr4,   0x70
+    vextrins.h       vr1,    vr4,   0x71
+    vextrins.h       vr2,    vr4,   0x72
+    vextrins.h       vr3,    vr4,   0x73
+    vbsrl.v          vr5,    vr0,   2
+    vbsrl.v          vr6,    vr1,   2
+    vbsrl.v          vr7,    vr2,   2
+    vbsrl.v          vr10,   vr3,   2
+    vextrins.h       vr5,    vr4,   0x74
+    vextrins.h       vr6,    vr4,   0x75
+    vextrins.h       vr7,    vr4,   0x76
+    vextrins.h       vr10,   vr4,   0x77
+    vdp2.w.h         vr11,   vr0,   vr9
+    vdp2.w.h         vr12,   vr1,   vr9
+    vdp2.w.h         vr13,   vr2,   vr9
+    vdp2.w.h         vr14,   vr3,   vr9
+    vhaddw.q.w       vr11
+    vhaddw.q.w       vr12
+    vhaddw.q.w       vr13
+    vhaddw.q.w       vr14
+    vpackev.w        vr0,    vr12,  vr11
+    vpackev.w        vr1,    vr14,  vr13
+    vpackev.d        vr0,    vr1,   vr0
+    vdp2.w.h         vr11,   vr5,   vr9
+    vdp2.w.h         vr12,   vr6,   vr9
+    vdp2.w.h         vr13,   vr7,   vr9
+    vdp2.w.h         vr14,   vr10,  vr9
+    vhaddw.q.w       vr11
+    vhaddw.q.w       vr12
+    vhaddw.q.w       vr13
+    vhaddw.q.w       vr14
+    vpackev.w        vr1,    vr12,  vr11
+    vpackev.w        vr2,    vr14,  vr13
+    vpackev.d        vr1,    vr2,   vr1
+    vssrarni.hu.w    vr1,    vr0,   10
+    vssrani.bu.h     vr1,    vr1,   0
+    vstelm.w         vr1,    a0,    0,    0
+    add.d            a0,     a0,    a1
+    vstelm.w         vr1,    a0,    0,    1
+    add.d            a0,     a0,    a1
+    vbsrl.v          vr0,    vr5,   2
+    vbsrl.v          vr1,    vr6,   2
+    vbsrl.v          vr2,    vr7,   2
+    vbsrl.v          vr3,    vr10,  2
+    addi.w           a5,     a5,    -2
+    bnez             a5,     .l_\lable\()put_hv_4w_loop
+    b                .l_\lable\()end_put_8tap
+
+.l_\lable\()put_hv_8w:
+.l_\lable\()put_hv_16w:
+.l_\lable\()put_hv_32w:
+.l_\lable\()put_hv_64w:
+.l_\lable\()put_hv_128w:
+    addi.d          t0,      a2,    0 //src
+    addi.d          t5,      a5,    0 //h
+    addi.d          t8,      a0,    0 //dst
+.l_\lable\()put_hv_8w_loop0:
+    vld             vr0,     a2,    0
+    vldx            vr1,     a2,    a3
+    vldx            vr2,     a2,    t2
+    add.d           a2,      a2,    t3
+    vld             vr3,     a2,    0
+    vldx            vr4,     a2,    a3
+    vldx            vr5,     a2,    t2
+    vldx            vr6,     a2,    t3
+    add.d           a2,      a2,    t4
+    FILTER_8TAP_8W  vr0
+    FILTER_8TAP_8W  vr1
+    FILTER_8TAP_8W  vr2
+    FILTER_8TAP_8W  vr3
+    FILTER_8TAP_8W  vr4
+    FILTER_8TAP_8W  vr5
+    FILTER_8TAP_8W  vr6
+    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\
+                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\
+                       vr10,vr11,vr12,vr13,vr14,vr15,vr16,vr17
+.l_\lable\()put_hv_8w_loop:
+    vld             vr20,    a2,    0
+    vldx            vr21,    a2,    a3
+    add.d           a2,      a2,    t2
+    FILTER_8TAP_8W  vr20
+    FILTER_8TAP_8W  vr21
+    VEXTRINS_Hx8    vr20
+    FILTER_8TAP_8W_CLIP_STORE
+    VBSRL_Vx8
+    VEXTRINS_Hx8    vr21
+    FILTER_8TAP_8W_CLIP_STORE
+    VBSRL_Vx8
+    addi.w          a5,      a5,    -2
+    bnez            a5,      .l_\lable\()put_hv_8w_loop
+    addi.d          a2,      t0,    8
+    addi.d          t0,      t0,    8
+    addi.d          a0,      t8,    8
+    addi.d          t8,      t8,    8
+    addi.d          a5,      t5,    0
+    addi.w          a4,      a4,    -8
+    bnez            a4,      .l_\lable\()put_hv_8w_loop0
+.l_\lable\()end_put_8tap:
+.endm
+
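+// The value stored at sp selects the filter pair: horizontal type in
+// bits [1:0], vertical type in bits [3:2], as decoded above.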
+function put_8tap_regular_8bpc_lsx
+    addi.d   sp, sp,  -16
+    st.d   zero, sp,  0
+    PUT_8TAP_8BPC_LSX 0
+    addi.d   sp, sp,  16
+endfunc
+
+function put_8tap_smooth_regular_8bpc_lsx
+    addi.d   sp, sp,  -16
+    li.w     t0, 1
+    st.d     t0, sp,  0
+    PUT_8TAP_8BPC_LSX 1
+    addi.d   sp, sp,  16
+endfunc
+
+function put_8tap_sharp_regular_8bpc_lsx
+    addi.d   sp, sp,  -16
+    li.w     t0, 2
+    st.d     t0, sp,  0
+    PUT_8TAP_8BPC_LSX 2
+    addi.d   sp, sp,  16
+endfunc
+
+function put_8tap_regular_smooth_8bpc_lsx
+    addi.d   sp, sp,  -16
+    li.w     t0, 4
+    st.d     t0, sp,  0
+    PUT_8TAP_8BPC_LSX 4
+    addi.d   sp, sp,  16
+endfunc
+
+function put_8tap_smooth_8bpc_lsx
+    addi.d   sp, sp,  -16
+    li.w     t0, 5
+    st.d     t0, sp,  0
+    PUT_8TAP_8BPC_LSX 5
+    addi.d   sp, sp,  16
+endfunc
+
+function put_8tap_sharp_smooth_8bpc_lsx
+    addi.d   sp, sp,  -16
+    li.w     t0, 6
+    st.d     t0, sp,  0
+    PUT_8TAP_8BPC_LSX 6
+    addi.d   sp, sp,  16
+endfunc
+
+function put_8tap_regular_sharp_8bpc_lsx
+    addi.d   sp, sp,  -16
+    li.w     t0, 8
+    st.d     t0, sp,  0
+    PUT_8TAP_8BPC_LSX 8
+    addi.d   sp, sp,  16
+endfunc
+
+function put_8tap_smooth_sharp_8bpc_lsx
+    addi.d   sp, sp,  -16
+    li.w     t0, 9
+    st.d     t0, sp,  0
+    PUT_8TAP_8BPC_LSX 9
+    addi.d   sp, sp,  16
+endfunc
+
+function put_8tap_sharp_8bpc_lsx
+    addi.d   sp, sp,  -16
+    li.w     t0, 10
+    st.d     t0, sp,  0
+    PUT_8TAP_8BPC_LSX 10
+    addi.d   sp, sp,  16
+endfunc
diff --git a/src/loongarch/mc.h b/src/loongarch/mc.h
index 56168e5..d5ac00f 100644
--- a/src/loongarch/mc.h
+++ b/src/loongarch/mc.h
@@ -32,6 +32,11 @@
 #include "src/mc.h"
 #include "src/cpu.h"
 
+#define init_mc_fn(type, name, suffix) \
+    c->mc[type] = BF(dav1d_put_##name, suffix)
+#define init_mct_fn(type, name, suffix) \
+    c->mct[type] = BF(dav1d_prep_##name, suffix)
+
 decl_avg_fn(BF(dav1d_avg, lsx));
 decl_w_avg_fn(BF(dav1d_w_avg, lsx));
 decl_mask_fn(BF(dav1d_mask, lsx));
@@ -39,6 +44,16 @@
 decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx));
 decl_w_mask_fn(BF(dav1d_w_mask_420, lsx));
 
+decl_mc_fn(BF(dav1d_put_8tap_regular,          lsx));
+decl_mc_fn(BF(dav1d_put_8tap_regular_smooth,   lsx));
+decl_mc_fn(BF(dav1d_put_8tap_regular_sharp,    lsx));
+decl_mc_fn(BF(dav1d_put_8tap_smooth,           lsx));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_regular,   lsx));
+decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp,     lsx));
+decl_mc_fn(BF(dav1d_put_8tap_sharp,            lsx));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_regular,    lsx));
+decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth,     lsx));
+
 decl_avg_fn(BF(dav1d_avg, lasx));
 decl_w_avg_fn(BF(dav1d_w_avg, lasx));
 decl_mask_fn(BF(dav1d_mask, lasx));
@@ -59,6 +74,16 @@
     c->warp8x8t = BF(dav1d_warp_affine_8x8t, lsx);
     c->w_mask[2] = BF(dav1d_w_mask_420, lsx);
 
+    init_mc_fn(FILTER_2D_8TAP_REGULAR,         8tap_regular,        lsx);
+    init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH,  8tap_regular_smooth, lsx);
+    init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP,   8tap_regular_sharp,  lsx);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR,  8tap_smooth_regular, lsx);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH,          8tap_smooth,         lsx);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP,    8tap_smooth_sharp,   lsx);
+    init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR,   8tap_sharp_regular,  lsx);
+    init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH,    8tap_sharp_smooth,   lsx);
+    init_mc_fn(FILTER_2D_8TAP_SHARP,           8tap_sharp,          lsx);
+
     if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return;
 
     c->avg = BF(dav1d_avg, lasx);