Upgrade libdav1d to 1.4.1

This project was upgraded with external_updater.
Usage: tools/external_updater/updater.sh update external/libdav1d
For more info, check https://cs.android.com/android/platform/superproject/+/main:tools/external_updater/README.md

Bug: 330952417
Test: atest CtsMediaV2TestCases -- --module-arg \
      CtsMediaV2TestCases:instrumentation-arg:codec-prefix:=c2.android.av1

Change-Id: I140e0975b742e45ec1a03d3b1c2644c9eb684f6d
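
Note: the diff below also adds aarch64 cross files and QEMU-based CI jobs.
As an illustrative sketch only (not part of the upstream change), the new
test-debian-aarch64-qemu job can be approximated locally, assuming an
aarch64-linux-gnu cross toolchain and qemu-user are installed:

    # Sketch: mirror the new aarch64 QEMU test job (testdata/logging options omitted).
    export QEMU_LD_PREFIX=/usr/aarch64-linux-gnu/
    meson setup build --buildtype release -Dtrim_dsp=false \
        --cross-file package/crossfiles/aarch64-linux.meson
    ninja -C build
    cd build && meson test -v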
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 702f284..a3cf425 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -4,7 +4,7 @@
     - test
 
 .debian-amd64-common:
-    image: registry.videolan.org/dav1d-debian-unstable:20240113214804
+    image: registry.videolan.org/dav1d-debian-unstable:20240226203953
     stage: build
     tags:
         - docker
@@ -220,7 +220,7 @@
         - cd build
         - time meson test -v --suite checkasm
 
-build-debian-clang14:
+build-debian-clang:
     extends: .debian-amd64-common
     variables:
         CC: clang
@@ -355,6 +355,18 @@
         - ninja -C build
         - cd build && meson test -v
 
+build-debian-aarch64-clang-17:
+    extends: .debian-amd64-common
+    variables:
+        QEMU_LD_PREFIX: /usr/aarch64-linux-gnu/
+    script:
+        - meson setup build --buildtype release
+                            -Dtrim_dsp=false
+                            --werror
+                            --cross-file package/crossfiles/aarch64-linux-clang.meson
+        - ninja -C build
+        - cd build && meson test -v
+
 build-macos:
     stage: build
     tags:
@@ -427,9 +439,12 @@
         - meson setup build --buildtype release
                             -Dtrim_dsp=false
                             --werror
-                            --cross-file package/crossfiles/riscv64-linux.meson
+                            --cross-file package/crossfiles/${CROSSFILE}.meson
         - ninja -C build
         - cd build && meson test -v
+    parallel:
+      matrix:
+        - CROSSFILE: [riscv64-linux, riscv64-linux-clang]
 
 build-debian-loongarch64:
     extends: .debian-amd64-common
@@ -734,6 +749,33 @@
                       "rv64,v=true,vext_spec=v1.0,vlen=512,elen=64",
                       "rv64,v=true,vext_spec=v1.0,vlen=1024,elen=64" ]
 
+test-debian-aarch64-qemu:
+    extends:
+        - .debian-amd64-common
+        - .test-common
+    needs: ["build-debian-aarch64"]
+    script:
+        - meson setup build --buildtype release
+                            -Dtestdata_tests=true
+                            -Dlogging=false
+                            -Dtrim_dsp=false
+                            --cross-file package/crossfiles/aarch64-linux.meson
+        - ninja -C build
+        - cd build && time meson test -v --timeout-multiplier 2
+    variables:
+        QEMU_LD_PREFIX: /usr/aarch64-linux-gnu/
+    parallel:
+      matrix:
+        # sve-default-vector-length sets the max vector length in bytes;
+        # the default is 64, allowing up to 512 bit vectors. Testing 1024
+        # and 2048 bit vectors requires raising this limit. The sve<n>
+        # option sets the active vector length in bits.
+        - QEMU_CPU: [ "max,sve-default-vector-length=256,sve128=on",
+                      "max,sve-default-vector-length=256,sve256=on",
+                      "max,sve-default-vector-length=256,sve512=on",
+                      "max,sve-default-vector-length=256,sve1024=on",
+                      "max,sve-default-vector-length=256,sve2048=on" ]
+
 test-debian-armv7-clang-5:
     extends:
         - .debian-armv7-common
@@ -785,11 +827,11 @@
         - avx2
     script:
         - *test-argon-script
-        - ../tests/dav1d_argon.bash -t 2 -j 4 -c 0         || exit_code=$((exit_code + $?))
-        - ../tests/dav1d_argon.bash -t 2 -j 4 -c sse2 -g 0 || exit_code=$((exit_code + $?))
-        - ../tests/dav1d_argon.bash -t 2 -j 4 -c ssse3     || exit_code=$((exit_code + $?))
-        - ../tests/dav1d_argon.bash -t 2 -j 4 -c sse41     || exit_code=$((exit_code + $?))
-        - ../tests/dav1d_argon.bash -t 2 -j 4 -c avx2      || exit_code=$((exit_code + $?))
+        - ../tests/dav1d_argon.bash -t 1 -c 0         || exit_code=$((exit_code + $?))
+        - ../tests/dav1d_argon.bash -t 2 -c sse2 -g 0 || exit_code=$((exit_code + $?))
+        - ../tests/dav1d_argon.bash -t 3 -c ssse3     || exit_code=$((exit_code + $?))
+        - ../tests/dav1d_argon.bash -t 4 -c sse41     || exit_code=$((exit_code + $?))
+        - ../tests/dav1d_argon.bash -t 5 -c avx2      || exit_code=$((exit_code + $?))
         - if [ $exit_code -ne 0 ]; then exit $exit_code; fi
 
 test-debian32-argon:
@@ -804,9 +846,9 @@
                             --cross-file package/crossfiles/i686-linux32.meson
         - cd build && ninja
         - exit_code=0
-        - ../tests/dav1d_argon.bash -t 2 -j 4 -c sse2       || exit_code=$((exit_code + $?))
-        - ../tests/dav1d_argon.bash -t 2 -j 4 -c ssse3      || exit_code=$((exit_code + $?))
-        - ../tests/dav1d_argon.bash -t 2 -j 4 -c sse41 -g 0 || exit_code=$((exit_code + $?))
+        - ../tests/dav1d_argon.bash -t 2 -c sse2       || exit_code=$((exit_code + $?))
+        - ../tests/dav1d_argon.bash -t 2 -c ssse3      || exit_code=$((exit_code + $?))
+        - ../tests/dav1d_argon.bash -t 2 -c sse41 -g 0 || exit_code=$((exit_code + $?))
         - if [ $exit_code -ne 0 ]; then exit $exit_code; fi
 
 test-debian-argon-avx512:
@@ -819,7 +861,7 @@
         - amd64-avx512
     script:
         - *test-argon-script
-        - ../tests/dav1d_argon.bash -t 2 -j 1 -c avx512icl || exit_code=$((exit_code + $?))
+        - ../tests/dav1d_argon.bash -t 2 -c avx512icl || exit_code=$((exit_code + $?))
         - if [ $exit_code -ne 0 ]; then exit $exit_code; fi
 
 .test-debian-arm-argon:
@@ -831,9 +873,9 @@
                             -Dtrim_dsp=false
         - cd build && ninja
         - exit_code=0
-        - ../tests/dav1d_argon.bash -t 2 -j 4 -c 0         || exit_code=$((exit_code + $?))
-        - ../tests/dav1d_argon.bash -t 2 -j 4 -c neon      || exit_code=$((exit_code + $?))
-        - ../tests/dav1d_argon.bash -t 2 -j 4 -c neon -g 0 || exit_code=$((exit_code + $?))
+        - ../tests/dav1d_argon.bash -t 2 -c 0         || exit_code=$((exit_code + $?))
+        - ../tests/dav1d_argon.bash -t 2 -c neon      || exit_code=$((exit_code + $?))
+        - ../tests/dav1d_argon.bash -t 2 -c neon -g 0 || exit_code=$((exit_code + $?))
         - if [ $exit_code -ne 0 ]; then exit $exit_code; fi
 
 test-debian-armv7-argon:
diff --git a/METADATA b/METADATA
index 8298a7f..fbf04db 100644
--- a/METADATA
+++ b/METADATA
@@ -8,13 +8,13 @@
   license_type: NOTICE
   last_upgrade_date {
     year: 2024
-    month: 2
-    day: 14
+    month: 3
+    day: 23
   }
   homepage: "https://code.videolan.org/videolan/dav1d/"
   identifier {
     type: "Git"
     value: "https://code.videolan.org/videolan/dav1d.git"
-    version: "1.4.0"
+    version: "1.4.1"
   }
 }
diff --git a/NEWS b/NEWS
index f74af58..88b1eea 100644
--- a/NEWS
+++ b/NEWS
@@ -1,5 +1,17 @@
+Changes for 1.4.1 'Road Runner':
+--------------------------------
+
+1.4.1 is a small release of dav1d, improving notably ARM and RISC-V speed
+
+- Optimizations for 6tap filters for NEON (ARM)
+- More RISC-V optimizations for itx (4x8, 8x4, 4x16, 16x4, 8x16, 16x8)
+- Reduction of binary size on ARM64, ARM32 and RISC-V
+- Fix out-of-bounds read in 8bpc SSE2/SSSE3 wiener_filter
+- Msac optimizations
+
+
 Changes for 1.4.0 'Road Runner':
-------------------------------------------------------
+--------------------------------
 
 1.4.0 is a medium release of dav1d, focusing on new architecture support and optimizations
 
@@ -9,7 +21,7 @@
 - New architecture supported: RISC-V
 - RISC-V optimizations for itx
 - Misc improvements in threading and in reducing binary size
-- Fix potential integer overflow with extremely large frame sizes
+- Fix potential integer overflow with extremely large frame sizes (CVE-2024-1580)
 
 
 Changes for 1.3.0 'Tundra Peregrine Falcon (Calidus)':
@@ -26,7 +38,7 @@
 
 
 Changes for 1.2.1 'Arctic Peregrine Falcon':
--------------------------------------------
+--------------------------------------------
 
 1.2.1 is a small release of dav1d, adding more SIMD and fixes
 
@@ -42,7 +54,7 @@
 
 
 Changes for 1.2.0 'Arctic Peregrine Falcon':
--------------------------------------------
+--------------------------------------------
 
 1.2.0 is a small release of dav1d, adding more SIMD and fixes
 
@@ -55,7 +67,7 @@
 
 
 Changes for 1.1.0 'Arctic Peregrine Falcon':
--------------------------------------------
+--------------------------------------------
 
 1.1.0 is an important release of dav1d, fixing numerous bugs, and adding SIMD
 
diff --git a/THANKS.md b/THANKS.md
index 4fc8d27..b7aa200 100644
--- a/THANKS.md
+++ b/THANKS.md
@@ -16,19 +16,20 @@
 
 And all the dav1d Authors (git shortlog -sn), including:
 
-Martin Storsjö, Henrik Gramner, Ronald S. Bultje, Janne Grunau, James Almer,
-Victorien Le Couviour--Tuffet, Matthias Dressel, Marvin Scholz,
-Jean-Baptiste Kempf, Luc Trudeau, Hugo Beauzée-Luyssen, Konstantin Pavlov,
-Niklas Haas, David Michael Barr, Steve Lhomme, Nathan E. Egge, Wan-Teh Chang,
-Kyle Siefring, B Krishnan Iyer, Francois Cartegnie, Liwei Wang, Luca Barbato,
-David Conrad, Derek Buitenhuis, Jan Beich, Michael Bradshaw, Raphaël Zumer,
-Xuefeng Jiang, Christophe Gisquet, Justin Bull, Boyuan Xiao, Dale Curtis,
-Emmanuel Gil Peyrot, Raphael Zumer, Rupert Swarbrick, Thierry Foucu,
-Thomas Daede, Colin Lee, Jonathan Wright, Lynne, Michail Alvanos, Nico Weber,
-Salome Thirot, SmilingWolf, Tristan Laurent, Vittorio Giovara, Yannis Guyon,
-André Kempe, Anisse Astier, Anton Mitrofanov, Charlie Hayden, Dmitriy Sychov,
-Ewout ter Hoeven, Fred Barbier, Jean-Yves Avenard, Joe Drago, Mark Shuttleworth,
-Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli, Pablo Stebler, Rostislav
-Pehlivanov, Sebastian Dröge, Shiz, Steinar Midtskogen, Sylvain BERTRAND,
-Sylvestre Ledru, Timo Gurr, Tristan Matthews, Vibhoothi, Xavier Claessens,
-Xu Guangxin, kossh1 and skal.
+Henrik Gramner, Martin Storsjö, Ronald S. Bultje, Janne Grunau, James Almer,
+Victorien Le Couviour--Tuffet, Matthias Dressel, Nathan E. Egge,
+Jean-Baptiste Kempf, Marvin Scholz, Luc Trudeau, Niklas Haas,
+Hugo Beauzée-Luyssen, Konstantin Pavlov, David Michael Barr, Steve Lhomme,
+yuanhecai, Luca Barbato, Wan-Teh Chang, Kyle Siefring, B Krishnan Iyer,
+Francois Cartegnie, Liwei Wang, David Conrad, Derek Buitenhuis, Jan Beich,
+Michael Bradshaw, Raphaël Zumer, Xuefeng Jiang, Arpad Panyik, Christophe Gisquet,
+Justin Bull, Boyuan Xiao, Dale Curtis, Emmanuel Gil Peyrot, Raphael Zumer,
+Rupert Swarbrick, Thierry Foucu, Thomas Daede, jinbo, André Kempe, Colin Lee,
+Jonathan Wright, Lynne, Michail Alvanos, Nico Weber, Salome Thirot, SmilingWolf,
+Tristan Laurent, Tristan Matthews, Vittorio Giovara, Yannis Guyon,
+Andrey Semashev, Anisse Astier, Anton Mitrofanov, Charlie Hayden, Dmitriy Sychov,
+Ewout ter Hoeven, Fred Barbier, Hao Chen, Jean-Yves Avenard, Joe Drago,
+Mark Shuttleworth, Matthieu Bouron, Mehdi Sabwat, Nicolas Frattaroli,
+Pablo Stebler, Rostislav Pehlivanov, Sebastian Dröge, Shiz, Steinar Midtskogen,
+Sylvain BERTRAND, Sylvestre Ledru, Timo Gurr, Vibhoothi,
+Vignesh Venkatasubramanian, Xavier Claessens, Xu Guangxin, kossh1 and skal.
diff --git a/gcovr.cfg b/gcovr.cfg
index d09a0ec..e02ae33 100644
--- a/gcovr.cfg
+++ b/gcovr.cfg
@@ -1,4 +1,4 @@
 exclude = .*/tests/.*
 exclude = .*/tools/.*
 exclude = .*/include/common/dump.h
-gcov-ignore-parse-errors = yes
+gcov-ignore-parse-errors = negative_hits.warn
diff --git a/meson.build b/meson.build
index 6e49852..e371415 100644
--- a/meson.build
+++ b/meson.build
@@ -23,7 +23,7 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 project('dav1d', ['c'],
-    version: '1.4.0',
+    version: '1.4.1',
     default_options: ['c_std=c99',
                       'warning_level=2',
                       'buildtype=release',
@@ -309,6 +309,10 @@
     optional_arguments += '-fno-stack-check'
 endif
 
+if (host_machine.cpu_family() == 'aarch64' or host_machine.cpu_family().startswith('arm'))
+    optional_arguments += '-fno-align-functions'
+endif
+
 add_project_arguments(cc.get_supported_arguments(optional_arguments), language : 'c')
 add_project_link_arguments(cc.get_supported_link_arguments(optional_link_arguments), language : 'c')
 
@@ -365,6 +369,66 @@
     if cc.compiles(check_pic_code)
         cdata.set('PIC', '3')
     endif
+
+    if host_machine.cpu_family() == 'aarch64'
+        have_as_arch = cc.compiles('''__asm__ (".arch armv8-a");''')
+        cdata.set10('HAVE_AS_ARCH_DIRECTIVE', have_as_arch)
+        as_arch_str = ''
+        if have_as_arch
+            as_arch_level = 'armv8-a'
+            # Check what .arch levels are supported. In principle, we only
+            # want to detect up to armv8.2-a here (binutils requires that
+            # in order to enable i8mm). However, older Clang versions
+            # (before Clang 17, and Xcode versions up to and including 15.0)
+            # didn't support controlling dotprod/i8mm extensions via
+            # .arch_extension, therefore try to enable a high enough .arch
+            # level as well, to implicitly make them available via that.
+            foreach arch : ['armv8.2-a', 'armv8.4-a', 'armv8.6-a']
+                if cc.compiles('__asm__ (".arch ' + arch + '\\n");')
+                    as_arch_level = arch
+                endif
+            endforeach
+            # Clang versions before 17 also had a bug
+            # (https://github.com/llvm/llvm-project/issues/32220)
+            # causing a plain ".arch <level>" to not have any effect unless it
+            # had an extra "+<feature>" included - but it was activated on the
+            # next ".arch_extension" directive instead. Check if we can include
+            # "+crc" as dummy feature to make the .arch directive behave as
+            # expected and take effect right away.
+            if cc.compiles('__asm__ (".arch ' + as_arch_level + '+crc\\n");')
+                as_arch_level = as_arch_level + '+crc'
+            endif
+            cdata.set('AS_ARCH_LEVEL', as_arch_level)
+            as_arch_str = '".arch ' + as_arch_level + '\\n"'
+        endif
+        extensions = {
+            'dotprod': 'udot v0.4s, v0.16b, v0.16b',
+            'i8mm':    'usdot v0.4s, v0.16b, v0.16b',
+            'sve':     'whilelt p0.s, x0, x1',
+            'sve2':    'sqrdmulh z0.s, z0.s, z0.s',
+        }
+        foreach name, instr : extensions
+            # Test for support for the various extensions. First test if
+            # the assembler supports the .arch_extension directive for
+            # enabling/disabling the extension, then separately check whether
+            # the instructions themselves are supported. Even if .arch_extension
+            # isn't supported, we may be able to assemble the instructions
+            # if the .arch level includes support for them.
+            code = '__asm__ (' + as_arch_str
+            code += '".arch_extension ' + name + '\\n"'
+            code += ');'
+            supports_archext = cc.compiles(code)
+            cdata.set10('HAVE_AS_ARCHEXT_' + name.to_upper() + '_DIRECTIVE', supports_archext)
+            code = '__asm__ (' + as_arch_str
+            if supports_archext
+                code += '".arch_extension ' + name + '\\n"'
+            endif
+            code += '"' + instr + '\\n"'
+            code += ');'
+            supports_instr = cc.compiles(code, name: name.to_upper())
+            cdata.set10('HAVE_' + name.to_upper(), supports_instr)
+        endforeach
+    endif
 endif
 
 cdata.set10('ARCH_X86', host_machine.cpu_family().startswith('x86'))
@@ -477,6 +541,17 @@
         ])
 endif
 
+if is_asm_enabled and host_machine.cpu_family().startswith('riscv')
+    as_option_code = '''__asm__ (
+".option arch, +v\n"
+"vsetivli zero, 0, e8, m1, ta, ma"
+);
+'''
+    if not cc.compiles(as_option_code, name : 'RISC-V Vector')
+        error('Compiler doesn\'t support \'.option arch\' asm directive. Update to binutils>=2.38 or clang>=17 or use \'-Denable_asm=false\'.')
+    endif
+endif
+
 # Generate config.h
 config_h_target = configure_file(output: 'config.h', configuration: cdata)
 
diff --git a/package/crossfiles/aarch64-linux-clang.meson b/package/crossfiles/aarch64-linux-clang.meson
new file mode 100644
index 0000000..2d218c7
--- /dev/null
+++ b/package/crossfiles/aarch64-linux-clang.meson
@@ -0,0 +1,16 @@
+[binaries]
+c = 'clang'
+cpp = 'clang++'
+ar = 'aarch64-linux-gnu-ar'
+strip = 'aarch64-linux-gnu-strip'
+exe_wrapper = 'qemu-aarch64'
+
+[properties]
+c_args = '-target aarch64-linux-gnu'
+c_link_args = '-target aarch64-linux-gnu'
+
+[host_machine]
+system = 'linux'
+cpu_family = 'aarch64'
+cpu = 'aarch64'
+endian = 'little'
diff --git a/package/crossfiles/aarch64-linux.meson b/package/crossfiles/aarch64-linux.meson
new file mode 100644
index 0000000..7dae0fc
--- /dev/null
+++ b/package/crossfiles/aarch64-linux.meson
@@ -0,0 +1,12 @@
+[binaries]
+c = 'aarch64-linux-gnu-gcc'
+cpp = 'aarch64-linux-gnu-g++'
+ar = 'aarch64-linux-gnu-ar'
+strip = 'aarch64-linux-gnu-strip'
+exe_wrapper = 'qemu-aarch64'
+
+[host_machine]
+system = 'linux'
+cpu_family = 'aarch64'
+cpu = 'aarch64'
+endian = 'little'
diff --git a/package/crossfiles/riscv64-linux-clang.meson b/package/crossfiles/riscv64-linux-clang.meson
new file mode 100644
index 0000000..c16d74d
--- /dev/null
+++ b/package/crossfiles/riscv64-linux-clang.meson
@@ -0,0 +1,16 @@
+[binaries]
+c = 'clang'
+cpp = 'clang++'
+ar = 'riscv64-linux-gnu-ar'
+strip = 'riscv64-linux-gnu-strip'
+exe_wrapper = 'qemu-riscv64'
+
+[properties]
+c_args = '-target riscv64-linux-gnu'
+c_link_args = '-target riscv64-linux-gnu'
+
+[host_machine]
+system = 'linux'
+cpu_family = 'riscv64'
+cpu = 'riscv64'
+endian = 'little'
diff --git a/src/arm/32/itx.S b/src/arm/32/itx.S
index ceea025..9ba1df7 100644
--- a/src/arm/32/itx.S
+++ b/src/arm/32/itx.S
@@ -965,6 +965,8 @@
 
 .ifc \variant, identity_
         // The identity shl #1 and downshift srshr #1 cancel out
+
+        b               L(itx_8x8_epilog)
 .else
         blx             r4
 
@@ -976,8 +978,8 @@
         vrshr.s16       q13, q13, #1
         vrshr.s16       q14, q14, #1
         vrshr.s16       q15, q15, #1
-.endif
 
+L(itx_8x8_epilog):
         transpose_8x8h  q8,  q9,  q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
 
         blx             r5
@@ -985,11 +987,12 @@
         load_add_store_8x8 r0, r7
         vpop            {q4-q7}
         pop             {r4-r5,r7,pc}
+.endif
 endfunc
 .endm
 
-def_fn_8x8_base
 def_fn_8x8_base identity_
+def_fn_8x8_base
 
 .macro def_fn_8x8 txfm1, txfm2
 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
@@ -1444,14 +1447,16 @@
 .else
         identity_4x16_shift1 d0[0]
 .endif
+        b               L(horz_16x4_epilog)
 .else
         blx             r4
-.endif
-.if \shift > 0
 .irp i, q8, q9, q10, q11, q12, q13, q14, q15
         vrshr.s16       \i,  \i,  #\shift
 .endr
-.endif
+.if \shift == 1
+        b               L(horz_16x4_epilog)
+.else
+L(horz_16x4_epilog):
         transpose_4x4h  q8,  q9,  d16, d17, d18, d19
         transpose_4x4h  q10, q11, d20, d21, d22, d23
         transpose_4x4h  q12, q13, d24, d25, d26, d27
@@ -1462,13 +1467,15 @@
 .endr
 
         pop             {pc}
+.endif
+.endif
 endfunc
 .endm
 
-def_horz_16 scale=0, identity=0, shift=2
-def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
-def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
 def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity
+def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=0, shift=2
 
 function inv_txfm_add_vert_4x16_neon
         push            {lr}
@@ -1597,6 +1604,8 @@
 .endr
 
         identity_4x16_shift1 d0[0]
+
+        b               L(itx_16x4_epilog)
 .else
         vmov.i16        q2,  #0
         vmov.i16        q3,  #0
@@ -1615,30 +1624,25 @@
         vswp            d19, d22
         vswp            d18, d20
         vswp            d19, d21
-.irp i, q8, q9, q10, q11
+        vswp            d25, d28
+        vswp            d27, d30
+        vswp            d26, d28
+        vswp            d27, d29
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
         vrshr.s16       \i,  \i,  #1
 .endr
-.endif
+
+L(itx_16x4_epilog):
         transpose_4x8h  q8,  q9,  q10, q11
         blx             r5
         mov             r6,  r0
         load_add_store_8x4 r6, r7
 
-.ifc \variant, identity_
         vmov            q8,  q12
         vmov            q9,  q13
         vmov            q10, q14
         vmov            q11, q15
-.else
-        vswp            d25, d28
-        vswp            d27, d30
-        vswp            d26, d28
-        vswp            d27, d29
-        vrshr.s16       q8,  q12, #1
-        vrshr.s16       q9,  q13, #1
-        vrshr.s16       q10, q14, #1
-        vrshr.s16       q11, q15, #1
-.endif
+
         transpose_4x8h  q8,  q9,  q10, q11
         blx             r5
         add             r6,  r0,  #8
@@ -1646,6 +1650,7 @@
 
         vpop            {q4-q7}
         pop             {r4-r11,pc}
+.endif
 endfunc
 
 function inv_txfm_\variant\()add_4x16_neon
@@ -1696,12 +1701,14 @@
         movw            r12, #(5793-4096)*8
         vdup.16         d0,  r12
         identity_8x4_shift1 q8,  q9,  q10, q11, d0[0]
+
+        b               L(itx_4x16_epilog)
 .else
         blx             r4
 .irp i, q8, q9, q10, q11
         vrshr.s16       \i,  \i,  #1
 .endr
-.endif
+L(itx_4x16_epilog):
         transpose_4x8h  q8,  q9,  q10, q11
         vswp            d19, d21
         vswp            d18, d20
@@ -1714,11 +1721,12 @@
 
         vpop            {q4-q7}
         pop             {r4-r11,pc}
+.endif
 endfunc
 .endm
 
-def_fn_416_base
 def_fn_416_base identity_
+def_fn_416_base
 
 .macro def_fn_416 w, h, txfm1, txfm2, eob_half
 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1728,11 +1736,15 @@
         push            {r4-r11,lr}
         vpush           {q4-q7}
 .if \w == 4
+.ifnc \txfm1, identity
         movrel_local    r4,  inv_\txfm1\()_8h_x\w\()_neon
+.endif
         movrel_local    r5,  inv_\txfm2\()_4h_x\h\()_neon
         mov             r10, #\eob_half
 .else
+.ifnc \txfm1, identity
         movrel_local    r4,  inv_\txfm1\()_4h_x\w\()_neon
+.endif
         movrel_local    r5,  inv_\txfm2\()_8h_x\h\()_neon
 .endif
 .ifc \txfm1, identity
@@ -1765,8 +1777,7 @@
 def_fns_416 4, 16
 def_fns_416 16, 4
 
-.macro def_fn_816_base variant
-function inv_txfm_\variant\()add_16x8_neon
+function inv_txfm_add_16x8_neon
         sub_sp_align    256
 
 .irp i, 0, 4
@@ -1805,6 +1816,7 @@
         pop             {r4-r11,pc}
 endfunc
 
+.macro def_fn_816_base variant
 function inv_txfm_\variant\()add_8x16_neon
         sub_sp_align    256
 
@@ -1849,6 +1861,10 @@
 .endr
 2:
 
+.ifc \variant, identity_
+        b               L(itx_8x16_epilog)
+.else
+L(itx_8x16_epilog):
 .irp i, 0, 4
         add             r6,  r0,  #(\i)
         add             r7,  sp,  #(\i*2)
@@ -1859,11 +1875,18 @@
         add_sp_align    256
         vpop            {q4-q7}
         pop             {r4-r11,pc}
+.endif
 endfunc
 .endm
 
-def_fn_816_base
 def_fn_816_base identity_
+def_fn_816_base
+
+/* Define symbols used in .if statement */
+.equ dct, 1
+.equ identity, 2
+.equ adst, 3
+.equ flipadst, 4
 
 .macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4
 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1873,7 +1896,9 @@
         push            {r4-r11,lr}
         vpush           {q4-q7}
 .if \w == 8
+.ifnc \txfm1, identity
         movrel_local    r4,  inv_\txfm1\()_8h_x8_neon
+.endif
         movrel_local    r5,  inv_\txfm2\()_4h_x16_neon
 .else
 .ifc \txfm1, identity
@@ -1889,7 +1914,7 @@
 .else
         mov             r10, #\eob_4x4
 .endif
-.ifc \txfm1, identity
+.if \w == 8 && \txfm1 == identity
         b               inv_txfm_identity_add_\w\()x\h\()_neon
 .else
         b               inv_txfm_add_\w\()x\h\()_neon
diff --git a/src/arm/32/itx16.S b/src/arm/32/itx16.S
index aa6c272..7691272 100644
--- a/src/arm/32/itx16.S
+++ b/src/arm/32/itx16.S
@@ -547,11 +547,11 @@
         vmov.i16        q15, #0
         vld1.32         {q8,  q9},  [r2, :128]
         vst1.32         {q14, q15}, [r2, :128]!
-        vshr.s16        q8,  q8,  #2
+        vshr.s32        q8,  q8,  #2
         vld1.32         {q10, q11}, [r2, :128]
-        vshr.s16        q9,  q9,  #2
-        vshr.s16        q10, q10, #2
-        vshr.s16        q11, q11, #2
+        vshr.s32        q9,  q9,  #2
+        vshr.s32        q10, q10, #2
+        vshr.s32        q11, q11, #2
 
         iwht4
 
@@ -598,7 +598,9 @@
         vld1.16         {d3}, [r0, :64], r1
 
 L(itx_4x4_end):
-        vmvn.i16        q15, #0xfc00 // 0x3ff
+        // read bitdepth_max from the callers stack
+        ldr             r4,  [sp, #44]
+        vdup.i16        q15, r4
         sub             r0,  r0,  r1, lsl #2
         vqadd.s16       q8,  q8,  q0
         vqadd.s16       q9,  q9,  q1
@@ -1487,6 +1489,10 @@
         vqrshrn.s32     d21, q13, #\shift
         vqrshrn.s32     d22, q14, #\shift
         vqrshrn.s32     d23, q15, #\shift
+.if \scale
+        b               L(horz_16x2_epilog)
+.else
+L(horz_16x2_epilog):
         vuzp.16         q8,  q9
         vuzp.16         q10, q11
 
@@ -1495,11 +1501,12 @@
 .endr
 
         pop             {pc}
+.endif
 endfunc
 .endm
 
-def_horz_16 scale=0, shift=2
 def_horz_16 scale=1, shift=1, suffix=_scale
+def_horz_16 scale=0, shift=2
 
 function inv_txfm_add_vert_4x16_neon
         push            {lr}
diff --git a/src/arm/32/msac.S b/src/arm/32/msac.S
index b06e109..b16957f 100644
--- a/src/arm/32/msac.S
+++ b/src/arm/32/msac.S
@@ -279,60 +279,67 @@
         sub             r4,  r4,  r3           // rng = u - v
         clz             r5,  r4                // clz(rng)
         eor             r5,  r5,  #16          // d = clz(rng) ^ 16
-        mvn             r7,  r7                // ~dif
-        add             r7,  r7,  r3, lsl #16  // ~dif + (v << 16)
+        sub             r7,  r7,  r3, lsl #16  // dif - (v << 16)
 L(renorm2):
         lsl             r4,  r4,  r5           // rng << d
         subs            r6,  r6,  r5           // cnt -= d
-        lsl             r7,  r7,  r5           // (~dif + (v << 16)) << d
+        lsl             r7,  r7,  r5           // (dif - (v << 16)) << d
         str             r4,  [r0, #RNG]
-        mvn             r7,  r7                // ~dif
-        bhs             9f
+        bhs             4f
 
         // refill
         ldr             r3,  [r0, #BUF_POS]    // BUF_POS
         ldr             r4,  [r0, #BUF_END]    // BUF_END
         add             r5,  r3,  #4
-        cmp             r5,  r4
-        bgt             2f
+        subs            r5,  r5,  r4
+        bhi             6f
 
-        ldr             r3,  [r3]              // next_bits
-        add             r8,  r6,  #23          // shift_bits = cnt + 23
-        add             r6,  r6,  #16          // cnt += 16
-        rev             r3,  r3                // next_bits = bswap(next_bits)
-        sub             r5,  r5,  r8, lsr #3   // buf_pos -= shift_bits >> 3
-        and             r8,  r8,  #24          // shift_bits &= 24
-        lsr             r3,  r3,  r8           // next_bits >>= shift_bits
-        sub             r8,  r8,  r6           // shift_bits -= 16 + cnt
-        str             r5,  [r0, #BUF_POS]
-        lsl             r3,  r3,  r8           // next_bits <<= shift_bits
-        rsb             r6,  r8,  #16          // cnt = cnt + 32 - shift_bits
-        eor             r7,  r7,  r3           // dif ^= next_bits
-        b               9f
+        ldr             r8,  [r3]              // next_bits
+        rsb             r5,  r6,  #16
+        add             r4,  r6,  #16          // shift_bits = cnt + 16
+        mvn             r8,  r8
+        lsr             r5,  r5,  #3           // num_bytes_read
+        rev             r8,  r8                // next_bits = bswap(next_bits)
+        lsr             r8,  r8,  r4           // next_bits >>= shift_bits
 
-2:      // refill_eob
-        rsb             r5,  r6,  #8           // c = 8 - cnt
-3:
-        cmp             r3,  r4
-        bge             4f
-        ldrb            r8,  [r3], #1
-        lsl             r8,  r8,  r5
-        eor             r7,  r7,  r8
-        subs            r5,  r5,  #8
-        bge             3b
-
-4:      // refill_eob_end
+2:      // refill_end
+        add             r3,  r3,  r5
+        add             r6,  r6,  r5, lsl #3   // cnt += num_bits_read
         str             r3,  [r0, #BUF_POS]
-        rsb             r6,  r5,  #8           // cnt = 8 - c
 
-9:
+3:      // refill_end2
+        orr             r7,  r7,  r8           // dif |= next_bits
+
+4:      // end
         str             r6,  [r0, #CNT]
         str             r7,  [r0, #DIF]
-
         mov             r0,  lr
         add             sp,  sp,  #48
-
         pop             {r4-r10,pc}
+
+5:      // pad_with_ones
+        add             r8,  r6,  #-240
+        lsr             r8,  r8,  r8
+        b               3b
+
+6:      // refill_eob
+        cmp             r3,  r4
+        bhs             5b
+
+        ldr             r8,  [r4, #-4]
+        lsl             r5,  r5,  #3
+        lsr             r8,  r8,  r5
+        add             r5,  r6,  #16
+        mvn             r8,  r8
+        sub             r4,  r4,  r3           // num_bytes_left
+        rev             r8,  r8
+        lsr             r8,  r8,  r5
+        rsb             r5,  r6,  #16
+        lsr             r5,  r5,  #3
+        cmp             r5,  r4
+        it              hs
+        movhs           r5,  r4
+        b               2b
 endfunc
 
 function msac_decode_symbol_adapt8_neon, export=1
@@ -414,53 +421,38 @@
         sub             r4,  r4,  r3           // rng = u - v
         clz             r5,  r4                // clz(rng)
         eor             r5,  r5,  #16          // d = clz(rng) ^ 16
-        mvn             r7,  r7                // ~dif
-        add             r7,  r7,  r3, lsl #16  // ~dif + (v << 16)
+        sub             r7,  r7,  r3, lsl #16  // dif - (v << 16)
         lsl             r4,  r4,  r5           // rng << d
         subs            r6,  r6,  r5           // cnt -= d
-        lsl             r7,  r7,  r5           // (~dif + (v << 16)) << d
+        lsl             r7,  r7,  r5           // (dif - (v << 16)) << d
         str             r4,  [r0, #RNG]
         vdup.16         d1,  r4
-        mvn             r7,  r7                // ~dif
-        bhs             9f
+        bhs             5f
 
         // refill
         ldr             r3,  [r0, #BUF_POS]    // BUF_POS
         ldr             r4,  [r0, #BUF_END]    // BUF_END
         add             r5,  r3,  #4
-        cmp             r5,  r4
-        bgt             2f
+        subs            r5,  r5,  r4
+        bhi             7f
 
-        ldr             r3,  [r3]              // next_bits
-        add             r8,  r6,  #23          // shift_bits = cnt + 23
-        add             r6,  r6,  #16          // cnt += 16
-        rev             r3,  r3                // next_bits = bswap(next_bits)
-        sub             r5,  r5,  r8, lsr #3   // buf_pos -= shift_bits >> 3
-        and             r8,  r8,  #24          // shift_bits &= 24
-        lsr             r3,  r3,  r8           // next_bits >>= shift_bits
-        sub             r8,  r8,  r6           // shift_bits -= 16 + cnt
-        str             r5,  [r0, #BUF_POS]
-        lsl             r3,  r3,  r8           // next_bits <<= shift_bits
-        rsb             r6,  r8,  #16          // cnt = cnt + 32 - shift_bits
-        eor             r7,  r7,  r3           // dif ^= next_bits
-        b               9f
+        ldr             r8,  [r3]              // next_bits
+        rsb             r5,  r6,  #16
+        add             r4,  r6,  #16          // shift_bits = cnt + 16
+        mvn             r8,  r8
+        lsr             r5,  r5,  #3           // num_bytes_read
+        rev             r8,  r8                // next_bits = bswap(next_bits)
+        lsr             r8,  r8,  r4           // next_bits >>= shift_bits
 
-2:      // refill_eob
-        rsb             r5,  r6,  #8           // c = 40 - cnt
-3:
-        cmp             r3,  r4
-        bge             4f
-        ldrb            r8,  [r3], #1
-        lsl             r8,  r8,  r5
-        eor             r7,  r7,  r8
-        subs            r5,  r5,  #8
-        bge             3b
-
-4:      // refill_eob_end
+3:      // refill_end
+        add             r3,  r3,  r5
+        add             r6,  r6,  r5, lsl #3   // cnt += num_bits_read
         str             r3,  [r0, #BUF_POS]
-        rsb             r6,  r5,  #8           // cnt = 40 - c
 
-9:
+4:      // refill_end2
+        orr             r7,  r7,  r8           // dif |= next_bits
+
+5:      // end
         lsl             lr,  lr,  #1
         sub             lr,  lr,  #5
         lsr             r12, r7,  #16
@@ -473,6 +465,30 @@
         str             r7,  [r0, #DIF]
         lsr             r0,  r2,  #1
         pop             {r4-r10,pc}
+
+6:      // pad_with_ones
+        add             r8,  r6,  #-240
+        lsr             r8,  r8,  r8
+        b               4b
+
+7:      // refill_eob
+        cmp             r3,  r4
+        bhs             6b
+
+        ldr             r8,  [r4, #-4]
+        lsl             r5,  r5,  #3
+        lsr             r8,  r8,  r5
+        add             r5,  r6,  #16
+        mvn             r8,  r8
+        sub             r4,  r4,  r3           // num_bytes_left
+        rev             r8,  r8
+        lsr             r8,  r8,  r5
+        rsb             r5,  r6,  #16
+        lsr             r5,  r5,  #3
+        cmp             r5,  r4
+        it              hs
+        movhs           r5,  r4
+        b               3b
 endfunc
 
 function msac_decode_bool_equi_neon, export=1
@@ -493,7 +509,6 @@
         movhs           r7,  r8                // if (ret) dif = dif - vw;
 
         clz             r5,  r4                // clz(rng)
-        mvn             r7,  r7                // ~dif
         eor             r5,  r5,  #16          // d = clz(rng) ^ 16
         mov             lr,  r2
         b               L(renorm2)
@@ -519,7 +534,6 @@
         movhs           r7,  r8                // if (ret) dif = dif - vw;
 
         clz             r5,  r4                // clz(rng)
-        mvn             r7,  r7                // ~dif
         eor             r5,  r5,  #16          // d = clz(rng) ^ 16
         mov             lr,  r2
         b               L(renorm2)
@@ -549,7 +563,6 @@
 
         cmp             r10, #0
         clz             r5,  r4                // clz(rng)
-        mvn             r7,  r7                // ~dif
         eor             r5,  r5,  #16          // d = clz(rng) ^ 16
         mov             lr,  r2
 
diff --git a/src/arm/64/itx.S b/src/arm/64/itx.S
index 53490cd..7063cbd 100644
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -879,6 +879,8 @@
 
 .ifc \variant, identity_
         // The identity shl #1 and downshift srshr #1 cancel out
+
+        b               L(itx_8x8_epilog)
 .else
         blr             x4
 
@@ -890,19 +892,20 @@
         srshr           v21.8h,  v21.8h,  #1
         srshr           v22.8h,  v22.8h,  #1
         srshr           v23.8h,  v23.8h,  #1
-.endif
 
+L(itx_8x8_epilog):
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
 
         blr             x5
 
         load_add_store_8x8 x0, x7
         ret             x15
+.endif
 endfunc
 .endm
 
-def_fn_8x8_base
 def_fn_8x8_base identity_
+def_fn_8x8_base
 
 .macro def_fn_8x8 txfm1, txfm2
 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
@@ -1390,14 +1393,16 @@
 .endif
 .if \identity
         identity_8x16_shift2 v0.h[0]
+        b               L(horz_16x8_epilog)
 .else
         blr             x4
-.endif
-.if \shift > 0
 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
         srshr           \i,  \i,  #\shift
 .endr
-.endif
+.if \shift == 1
+        b               L(horz_16x8_epilog)
+.else
+L(horz_16x8_epilog):
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
         transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
 
@@ -1406,12 +1411,14 @@
 .endr
 
         ret             x14
+.endif
+.endif
 endfunc
 .endm
 
-def_horz_16 scale=0, identity=0, shift=2
 def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
 def_horz_16 scale=0, identity=1, shift=0, suffix=_identity
+def_horz_16 scale=0, identity=0, shift=2
 
 function inv_txfm_add_vert_8x16_neon
         mov             x14, x30
@@ -1512,6 +1519,8 @@
 .endr
 
         identity_8x16_shift1 v0.h[0]
+
+        b               L(itx_16x4_epilog)
 .else
 .irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
         ld1             {\i},    [x2]
@@ -1527,33 +1536,29 @@
 .irp i, v16.8h, v17.8h, v18.8h, v19.8h
         srshr           \i,  \i,  #1
 .endr
-.endif
+
+        ins             v24.d[1], v28.d[0]
+        ins             v25.d[1], v29.d[0]
+        ins             v26.d[1], v30.d[0]
+        ins             v27.d[1], v31.d[0]
+        srshr           v20.8h,  v24.8h,  #1
+        srshr           v21.8h,  v25.8h,  #1
+        srshr           v22.8h,  v26.8h,  #1
+        srshr           v23.8h,  v27.8h,  #1
+
+L(itx_16x4_epilog):
         transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
         blr             x5
         mov             x6,  x0
         load_add_store_8x4 x6, x7
 
-.ifc \variant, identity_
-        mov             v16.16b, v20.16b
-        mov             v17.16b, v21.16b
-        mov             v18.16b, v22.16b
-        mov             v19.16b, v23.16b
-.else
-        ins             v24.d[1], v28.d[0]
-        ins             v25.d[1], v29.d[0]
-        ins             v26.d[1], v30.d[0]
-        ins             v27.d[1], v31.d[0]
-        srshr           v16.8h,  v24.8h,  #1
-        srshr           v17.8h,  v25.8h,  #1
-        srshr           v18.8h,  v26.8h,  #1
-        srshr           v19.8h,  v27.8h,  #1
-.endif
-        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        transpose_4x8h_mov v20, v21, v22, v23, v2,  v3,  v4,  v5, v16, v17, v18, v19
         blr             x5
         add             x6,  x0,  #8
         load_add_store_8x4 x6, x7
 
         ret             x15
+.endif
 endfunc
 
 function inv_txfm_\variant\()add_4x16_neon
@@ -1605,12 +1610,14 @@
         mov             w16, #(5793-4096)*8
         dup             v0.4h,   w16
         identity_8x4_shift1 v16, v17, v18, v19, v0.h[0]
+
+        b               L(itx_4x16_epilog)
 .else
         blr             x4
 .irp i, v16.8h, v17.8h, v18.8h, v19.8h
         srshr           \i,  \i,  #1
 .endr
-.endif
+L(itx_4x16_epilog):
         transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
         ins             v20.d[0], v16.d[1]
         ins             v21.d[0], v17.d[1]
@@ -1622,11 +1629,12 @@
         load_add_store_4x16 x0, x6
 
         ret             x15
+.endif
 endfunc
 .endm
 
-def_fn_416_base
 def_fn_416_base identity_
+def_fn_416_base
 
 .macro def_fn_416 w, h, txfm1, txfm2, eob_half
 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
@@ -1634,11 +1642,15 @@
         idct_dc         \w,  \h,  1
 .endif
 .if \w == 4
+.ifnc \txfm1, identity
         adr             x4,  inv_\txfm1\()_8h_x\w\()_neon
+.endif
         adr             x5,  inv_\txfm2\()_4h_x\h\()_neon
         mov             w13, #\eob_half
 .else
+.ifnc \txfm1, identity
         adr             x4,  inv_\txfm1\()_4h_x\w\()_neon
+.endif
         adr             x5,  inv_\txfm2\()_8h_x\h\()_neon
 .endif
 .ifc \txfm1, identity
@@ -1690,13 +1702,16 @@
         mov             w16, #2*(5793-4096)*8
         dup             v0.4h,   w16
         identity_8x16_shift1 v0.h[0]
+
+        b               L(itx_16x8_epilog)
 .else
         blr             x4
 
-.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
         srshr           \i,  \i,  #1
 .endr
-.endif
+
+L(itx_16x8_epilog):
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
 
         blr             x5
@@ -1704,27 +1719,7 @@
         mov             x6,  x0
         load_add_store_8x8 x6, x7
 
-.ifc \variant, identity_
-        mov             v16.16b, v24.16b
-        mov             v17.16b, v25.16b
-        mov             v18.16b, v26.16b
-        mov             v19.16b, v27.16b
-        mov             v20.16b, v28.16b
-        mov             v21.16b, v29.16b
-        mov             v22.16b, v30.16b
-        mov             v23.16b, v31.16b
-.else
-        srshr           v16.8h,  v24.8h,  #1
-        srshr           v17.8h,  v25.8h,  #1
-        srshr           v18.8h,  v26.8h,  #1
-        srshr           v19.8h,  v27.8h,  #1
-        srshr           v20.8h,  v28.8h,  #1
-        srshr           v21.8h,  v29.8h,  #1
-        srshr           v22.8h,  v30.8h,  #1
-        srshr           v23.8h,  v31.8h,  #1
-.endif
-
-        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+        transpose_8x8h_mov v24, v25, v26, v27, v28, v29, v30, v31, v2, v3, v16, v17, v18, v19, v20, v21, v22, v23
 
         blr             x5
 
@@ -1732,6 +1727,7 @@
         load_add_store_8x8 x0, x7
 
         ret             x15
+.endif
 endfunc
 
 function inv_txfm_\variant\()add_8x16_neon
@@ -1790,14 +1786,16 @@
         scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
 .ifc \variant, identity_
         // The identity shl #1 and downshift srshr #1 cancel out
+
+        b               L(itx_8x16_epilog)
 .else
         blr             x4
 
 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
         srshr           \i,  \i,  #1
 .endr
-.endif
 
+L(itx_8x16_epilog):
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
 
         blr             x5
@@ -1805,18 +1803,21 @@
         load_add_store_8x16 x0, x6
 
         ret             x15
+.endif
 endfunc
 .endm
 
-def_fn_816_base
 def_fn_816_base identity_
+def_fn_816_base
 
 .macro def_fn_816 w, h, txfm1, txfm2, eob_half
 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         \w,  \h,  1
 .endif
+.ifnc \txfm1, identity
         adr             x4,  inv_\txfm1\()_8h_x\w\()_neon
+.endif
         adr             x5,  inv_\txfm2\()_8h_x\h\()_neon
 .if \w == 8
         mov             x13, #\eob_half
diff --git a/src/arm/64/itx16.S b/src/arm/64/itx16.S
index eee3a96..31ee9be 100644
--- a/src/arm/64/itx16.S
+++ b/src/arm/64/itx16.S
@@ -514,13 +514,17 @@
         b               L(itx_4x4_end)
 endfunc
 
+// HBD inv_txfm_add_4x4_neon deviates from the common pattern with registers
+// x0-x4  external parameters
+// x5     function pointer to first transform
+// x6     function pointer to second transform
 function inv_txfm_add_4x4_neon
         movi            v30.4s,  #0
         movi            v31.4s,  #0
         ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
         st1             {v30.4s, v31.4s}, [x2], #32
 
-        blr             x4
+        blr             x5
 
         st1             {v30.4s, v31.4s}, [x2], #32
         sqxtn           v16.4h,  v16.4s
@@ -529,7 +533,7 @@
         sqxtn           v19.4h,  v19.4s
         transpose_4x4h  v16, v17, v18, v19, v20, v21, v22, v23
 
-        blr             x5
+        blr             x6
 
         ld1             {v0.d}[0], [x0], x1
         ld1             {v0.d}[1], [x0], x1
@@ -541,7 +545,7 @@
         srshr           v18.8h,  v18.8h,  #4
 
 L(itx_4x4_end):
-        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+        dup             v31.8h,  w4
         sub             x0,  x0,  x1, lsl #2
         usqadd          v0.8h,   v16.8h
         usqadd          v1.8h,   v18.8h
@@ -579,8 +583,8 @@
         b               L(itx_4x4_end)
 1:
 .endif
-        adr             x4,  inv_\txfm1\()_4s_x4_neon
-        movrel          x5,  X(inv_\txfm2\()_4h_x4_neon)
+        adr             x5,  inv_\txfm1\()_4s_x4_neon
+        movrel          x6,  X(inv_\txfm2\()_4h_x4_neon)
         b               inv_txfm_add_4x4_neon
 endfunc
 .endm
@@ -1381,6 +1385,10 @@
         sqrshrn2        v21.8h,  v29.4s,  #\shift
         sqrshrn2        v22.8h,  v30.4s,  #\shift
         sqrshrn2        v23.8h,  v31.4s,  #\shift
+.if \scale
+        b               L(horz_16x4_epilog)
+.else
+L(horz_16x4_epilog):
         transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
         transpose_4x8h  v20, v21, v22, v23, v4,  v5,  v6,  v7
 
@@ -1389,11 +1397,12 @@
 .endr
 
         ret             x14
+.endif
 endfunc
 .endm
 
-def_horz_16 scale=0, shift=2
 def_horz_16 scale=1, shift=1, suffix=_scale
+def_horz_16 scale=0, shift=2
 
 function inv_txfm_add_vert_8x16_neon
         mov             x14, x30
diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S
index 9f7b4e7..3df0393 100644
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1154,7 +1154,7 @@
         uxtl            \r6\().8h, \r6\().8b
 .endif
 .endm
-.macro mul_mla_4 d, s0, s1, s2, s3, wd
+.macro mul_mla_4tap d, s0, s1, s2, s3, wd
         mul             \d\wd,  \s0\wd,  v0.h[0]
         mla             \d\wd,  \s1\wd,  v0.h[1]
         mla             \d\wd,  \s2\wd,  v0.h[2]
@@ -1163,7 +1163,51 @@
 // Interleaving the mul/mla chains actually hurts performance
 // significantly on Cortex A53, thus keeping mul/mla tightly
 // chained like this.
-.macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+.macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
+        mul             \d0\().4h, \s1\().4h, v0.h[1]
+        mla             \d0\().4h, \s2\().4h, v0.h[2]
+        mla             \d0\().4h, \s3\().4h, v0.h[3]
+        mla             \d0\().4h, \s4\().4h, v0.h[4]
+        mla             \d0\().4h, \s5\().4h, v0.h[5]
+        mla             \d0\().4h, \s6\().4h, v0.h[6]
+.endm
+.macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+        mul             \d0\().8h, \s1\().8h, v0.h[1]
+        mla             \d0\().8h, \s2\().8h, v0.h[2]
+        mla             \d0\().8h, \s3\().8h, v0.h[3]
+        mla             \d0\().8h, \s4\().8h, v0.h[4]
+        mla             \d0\().8h, \s5\().8h, v0.h[5]
+        mla             \d0\().8h, \s6\().8h, v0.h[6]
+.endm
+.macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+        mul             \d0\().8h, \s1\().8h, v0.h[1]
+        mla             \d0\().8h, \s2\().8h, v0.h[2]
+        mla             \d0\().8h, \s3\().8h, v0.h[3]
+        mla             \d0\().8h, \s4\().8h, v0.h[4]
+        mla             \d0\().8h, \s5\().8h, v0.h[5]
+        mla             \d0\().8h, \s6\().8h, v0.h[6]
+        mul             \d1\().8h, \s2\().8h, v0.h[1]
+        mla             \d1\().8h, \s3\().8h, v0.h[2]
+        mla             \d1\().8h, \s4\().8h, v0.h[3]
+        mla             \d1\().8h, \s5\().8h, v0.h[4]
+        mla             \d1\().8h, \s6\().8h, v0.h[5]
+        mla             \d1\().8h, \s7\().8h, v0.h[6]
+.endm
+.macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+        mul             \d0\().8h, \s1\().8h, v0.h[1]
+        mla             \d0\().8h, \s2\().8h, v0.h[2]
+        mla             \d0\().8h, \s3\().8h, v0.h[3]
+        mla             \d0\().8h, \s4\().8h, v0.h[4]
+        mla             \d0\().8h, \s5\().8h, v0.h[5]
+        mla             \d0\().8h, \s6\().8h, v0.h[6]
+        mul             \d1\().8h, \s3\().8h, v0.h[1]
+        mla             \d1\().8h, \s4\().8h, v0.h[2]
+        mla             \d1\().8h, \s5\().8h, v0.h[3]
+        mla             \d1\().8h, \s6\().8h, v0.h[4]
+        mla             \d1\().8h, \s7\().8h, v0.h[5]
+        mla             \d1\().8h, \s8\().8h, v0.h[6]
+.endm
+.macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
         mul             \d0\().4h, \s0\().4h, v0.h[0]
         mla             \d0\().4h, \s1\().4h, v0.h[1]
         mla             \d0\().4h, \s2\().4h, v0.h[2]
@@ -1173,7 +1217,7 @@
         mla             \d0\().4h, \s6\().4h, v0.h[6]
         mla             \d0\().4h, \s7\().4h, v0.h[7]
 .endm
-.macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
+.macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
         mul             \d0\().8h, \s0\().8h, v0.h[0]
         mla             \d0\().8h, \s1\().8h, v0.h[1]
         mla             \d0\().8h, \s2\().8h, v0.h[2]
@@ -1183,7 +1227,7 @@
         mla             \d0\().8h, \s6\().8h, v0.h[6]
         mla             \d0\().8h, \s7\().8h, v0.h[7]
 .endm
-.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+.macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
         mul             \d0\().8h, \s0\().8h, v0.h[0]
         mla             \d0\().8h, \s1\().8h, v0.h[1]
         mla             \d0\().8h, \s2\().8h, v0.h[2]
@@ -1201,7 +1245,7 @@
         mla             \d1\().8h, \s7\().8h, v0.h[6]
         mla             \d1\().8h, \s8\().8h, v0.h[7]
 .endm
-.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
+.macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
         mul             \d0\().8h, \s0\().8h, v0.h[0]
         mla             \d0\().8h, \s1\().8h, v0.h[1]
         mla             \d0\().8h, \s2\().8h, v0.h[2]
@@ -1315,11 +1359,11 @@
 .endif
 .endm
 
-.macro make_8tap_fn op, type, type_h, type_v
+.macro make_8tap_fn op, type, type_h, type_v, taps
 function \op\()_8tap_\type\()_8bpc_neon, export=1
         mov             x8,  \type_h
         mov             x9,  \type_v
-        b               \op\()_8tap_neon
+        b               \op\()_\taps\()_neon
 endfunc
 .endm
 
@@ -1328,18 +1372,8 @@
 #define SMOOTH  ((1*15<<7)|4*15)
 #define SHARP   ((2*15<<7)|3*15)
 
-.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
-make_8tap_fn \type, regular,        REGULAR, REGULAR
-make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
-make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
-make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
-make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
-make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
-make_8tap_fn \type, sharp,          SHARP,   SHARP
-make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
-make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
-
-function \type\()_8tap_neon
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps
+function \type\()_\taps\()_neon
         mov             w10,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
         mul             \mx,  \mx, w10
         mul             \my,  \my, w10
@@ -1354,12 +1388,12 @@
         tst             \mx, #(0x7f << 14)
         sub             w8,  w8,  #24
         movrel          x10, X(mc_subpel_filters), -8
-        b.ne            L(\type\()_8tap_h)
+        b.ne            L(\type\()_\taps\()_h)
         tst             \my, #(0x7f << 14)
-        b.ne            L(\type\()_8tap_v)
+        b.ne            L(\type\()_\taps\()_v)
         b               \type\()_neon
 
-L(\type\()_8tap_h):
+L(\type\()_\taps\()_h):
         cmp             \w,  #4
         ubfx            w9,  \mx, #7, #7
         and             \mx, \mx, #0x7f
@@ -1368,9 +1402,9 @@
 4:
         tst             \my,  #(0x7f << 14)
         add             \xmx, x10, \mx, uxtw #3
-        b.ne            L(\type\()_8tap_hv)
+        b.ne            L(\type\()_\taps\()_hv)
 
-        adr             x9,  L(\type\()_8tap_h_tbl)
+        adr             x9,  L(\type\()_\taps\()_h_tbl)
         ldrh            w8,  [x9, x8, lsl #1]
         sub             x9,  x9,  w8, uxtw
         br              x9
@@ -1471,6 +1505,18 @@
         uxtl            v20.8h,  v20.8b
         uxtl            v21.8h,  v21.8b
 
+.ifc \taps, 6tap
+        ext             v19.16b, v16.16b, v17.16b, #2
+        ext             v23.16b, v20.16b, v21.16b, #2
+        mul             v18.8h,  v19.8h,  v0.h[1]
+        mul             v22.8h,  v23.8h,  v0.h[1]
+.irpc i, 23456
+        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
+        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
+        mla             v18.8h,  v19.8h,  v0.h[\i]
+        mla             v22.8h,  v23.8h,  v0.h[\i]
+.endr
+.else   // 8tap
         mul             v18.8h,  v16.8h,  v0.h[0]
         mul             v22.8h,  v20.8h,  v0.h[0]
 .irpc i, 1234567
@@ -1479,6 +1525,7 @@
         mla             v18.8h,  v19.8h,  v0.h[\i]
         mla             v22.8h,  v23.8h,  v0.h[\i]
 .endr
+.endif
         subs            \h,  \h,  #2
         srshr           v18.8h,  v18.8h, #2
         srshr           v22.8h,  v22.8h, #2
@@ -1523,6 +1570,26 @@
         uxtl            v22.8h,  v22.8b
 
 16:
+.ifc \taps, 6tap
+        ext             v28.16b, v16.16b, v17.16b, #2
+        ext             v29.16b, v17.16b, v18.16b, #2
+        ext             v30.16b, v20.16b, v21.16b, #2
+        ext             v31.16b, v21.16b, v22.16b, #2
+        mul             v24.8h,  v28.8h,  v0.h[1]
+        mul             v25.8h,  v29.8h,  v0.h[1]
+        mul             v26.8h,  v30.8h,  v0.h[1]
+        mul             v27.8h,  v31.8h,  v0.h[1]
+.irpc i, 23456
+        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
+        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
+        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
+        ext             v31.16b, v21.16b, v22.16b, #(2*\i)
+        mla             v24.8h,  v28.8h,  v0.h[\i]
+        mla             v25.8h,  v29.8h,  v0.h[\i]
+        mla             v26.8h,  v30.8h,  v0.h[\i]
+        mla             v27.8h,  v31.8h,  v0.h[\i]
+.endr
+.else   // 8tap
         mul             v24.8h,  v16.8h,  v0.h[0]
         mul             v25.8h,  v17.8h,  v0.h[0]
         mul             v26.8h,  v20.8h,  v0.h[0]
@@ -1537,6 +1604,7 @@
         mla             v26.8h,  v30.8h,  v0.h[\i]
         mla             v27.8h,  v31.8h,  v0.h[\i]
 .endr
+.endif
         srshr           v24.8h,  v24.8h, #2
         srshr           v25.8h,  v25.8h, #2
         srshr           v26.8h,  v26.8h, #2
@@ -1575,18 +1643,18 @@
         b.gt            161b
         ret
 
-L(\type\()_8tap_h_tbl):
-        .hword L(\type\()_8tap_h_tbl) - 1280b
-        .hword L(\type\()_8tap_h_tbl) -  640b
-        .hword L(\type\()_8tap_h_tbl) -  320b
-        .hword L(\type\()_8tap_h_tbl) -  160b
-        .hword L(\type\()_8tap_h_tbl) -   80b
-        .hword L(\type\()_8tap_h_tbl) -   40b
-        .hword L(\type\()_8tap_h_tbl) -   20b
+L(\type\()_\taps\()_h_tbl):
+        .hword L(\type\()_\taps\()_h_tbl) - 1280b
+        .hword L(\type\()_\taps\()_h_tbl) -  640b
+        .hword L(\type\()_\taps\()_h_tbl) -  320b
+        .hword L(\type\()_\taps\()_h_tbl) -  160b
+        .hword L(\type\()_\taps\()_h_tbl) -   80b
+        .hword L(\type\()_\taps\()_h_tbl) -   40b
+        .hword L(\type\()_\taps\()_h_tbl) -   20b
         .hword 0
 
 
-L(\type\()_8tap_v):
+L(\type\()_\taps\()_v):
         cmp             \h,  #4
         ubfx            w9,  \my, #7, #7
         and             \my, \my, #0x7f
@@ -1595,7 +1663,7 @@
 4:
         add             \xmy, x10, \my, uxtw #3
 
-        adr             x9,  L(\type\()_8tap_v_tbl)
+        adr             x9,  L(\type\()_\taps\()_v_tbl)
         ldrh            w8,  [x9, x8, lsl #1]
         sub             x9,  x9,  w8, uxtw
         br              x9
@@ -1620,7 +1688,7 @@
         interleave_1_h  v1, v2, v3, v4, v5
         b.gt            24f
         uxtl_b          v1, v2, v3, v4
-        mul_mla_4       v6, v1, v2, v3, v4, .4h
+        mul_mla_4tap    v6, v1, v2, v3, v4, .4h
         sqrshrun_b      6,  v6
         st_h            \d_strd, v6, 2
         ret
@@ -1630,7 +1698,7 @@
         interleave_1_h  v5, v6, v7
         interleave_2_s  v1, v2, v3, v4, v5, v6
         uxtl_b          v1, v2, v3, v4
-        mul_mla_4       v6, v1, v2, v3, v4, .8h
+        mul_mla_4tap    v6, v1, v2, v3, v4, .8h
         sqrshrun_b      6,  v6
         st_h            \d_strd, v6, 4
         ret
@@ -1655,7 +1723,7 @@
         interleave_1_h  v7,  v16, v17, v18, v19
         interleave_2_s  v5,  v6,  v7,  v16, v17, v18
         uxtl_b          v5,  v6,  v7,  v16
-        mul_mla_8_0     v30, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
+        mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
         sqrshrun_b      6,   v30
         st_h            \d_strd, v30, 4
         b.le            0f
@@ -1673,7 +1741,7 @@
         load_h          \sr2, \src, \s_strd, v16, v17
         interleave_1_h  v7,  v16, v17
         uxtl_b          v5,  v6,  v7,  v16
-        mul_mla_8_0_4h  v30, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
+        mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
         sqrshrun_b      6,   v30
         st_h            \d_strd, v30, 2
 0:
@@ -1698,13 +1766,13 @@
         load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
         interleave_1_s  v1, v2, v3, v4, v5
         uxtl_b          v1, v2, v3, v4
-        mul_mla_4       v6, v1, v2, v3, v4, .8h
+        mul_mla_4tap    v6, v1, v2, v3, v4, .8h
         shift_store_4   \type, \d_strd, v6
         b.le            0f
         load_s          \sr2, \src, \s_strd, v6, v7
         interleave_1_s  v5, v6, v7
         uxtl_b          v5, v6
-        mul_mla_4       v7, v3, v4, v5, v6, .8h
+        mul_mla_4tap    v7, v3, v4, v5, v6, .8h
         shift_store_4   \type, \d_strd, v7
 0:
         ret
@@ -1729,28 +1797,28 @@
         load_s          \sr2, \src, \s_strd, v23, v24, v25, v26
         interleave_1_s  v22, v23, v24, v25, v26
         uxtl_b          v22, v23, v24, v25
-        mul_mla_8_2     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
+        mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
         shift_store_4   \type, \d_strd, v1, v2
         b.le            0f
         load_s          \sr2,  \src, \s_strd, v27, v16
         subs            \h,  \h,  #2
         interleave_1_s  v26, v27, v16
         uxtl_b          v26, v27
-        mul_mla_8_0     v1,  v20, v21, v22, v23, v24, v25, v26, v27
+        mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
         shift_store_4   \type, \d_strd, v1
         b.le            0f
         load_s          \sr2,  \src, \s_strd, v17, v18
         subs            \h,  \h,  #2
         interleave_1_s  v16, v17, v18
         uxtl_b          v16, v17
-        mul_mla_8_0     v2,  v22, v23, v24, v25, v26, v27, v16, v17
+        mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
         shift_store_4   \type, \d_strd, v2
         b.le            0f
         subs            \h,  \h,  #4
         load_s          \sr2, \src, \s_strd, v19, v20, v21, v22
         interleave_1_s  v18, v19, v20, v21, v22
         uxtl_b          v18, v19, v20, v21
-        mul_mla_8_2     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
+        mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
         shift_store_4   \type, \d_strd, v1, v2
         b.gt            48b
 0:
@@ -1773,14 +1841,14 @@
 
         load_8b         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
         uxtl_b          v1, v2, v3, v4, v5
-        mul_mla_4       v6, v1, v2, v3, v4, .8h
-        mul_mla_4       v7, v2, v3, v4, v5, .8h
+        mul_mla_4tap    v6, v1, v2, v3, v4, .8h
+        mul_mla_4tap    v7, v2, v3, v4, v5, .8h
         shift_store_8   \type, \d_strd, v6, v7
         b.le            0f
         load_8b         \sr2, \src, \s_strd, v6, v7
         uxtl_b          v6, v7
-        mul_mla_4       v1, v3, v4, v5, v6, .8h
-        mul_mla_4       v2, v4, v5, v6, v7, .8h
+        mul_mla_4tap    v1, v3, v4, v5, v6, .8h
+        mul_mla_4tap    v2, v4, v5, v6, v7, .8h
         shift_store_8   \type, \d_strd, v1, v2
 0:
         ret
@@ -1809,32 +1877,32 @@
         subs            \h,  \h,  #2
         load_8b         \sr2, \src, \s_strd, v23, v24
         uxtl_b          v23, v24
-        mul_mla_8_1     v1,  v2,  v16, v17, v18, v19, v20, v21, v22, v23, v24
+        mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
         shift_store_8   \type, \d_strd, v1, v2
         b.le            9f
         subs            \h,  \h,  #2
         load_8b         \sr2, \src, \s_strd, v25, v26
         uxtl_b          v25, v26
-        mul_mla_8_1     v3,  v4,  v18, v19, v20, v21, v22, v23, v24, v25, v26
+        mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
         shift_store_8   \type, \d_strd, v3, v4
         b.le            9f
         subs            \h,  \h,  #2
         load_8b         \sr2, \src, \s_strd, v27, v16
         uxtl_b          v27, v16
-        mul_mla_8_1     v1,  v2,  v20, v21, v22, v23, v24, v25, v26, v27, v16
+        mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
         shift_store_8   \type, \d_strd, v1, v2
         b.le            9f
         subs            \h,  \h,  #2
         load_8b         \sr2, \src, \s_strd, v17, v18
         uxtl_b          v17, v18
-        mul_mla_8_1     v3,  v4,  v22, v23, v24, v25, v26, v27, v16, v17, v18
+        mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
         shift_store_8   \type, \d_strd, v3, v4
         b.le            9f
         subs            \h,  \h,  #4
         load_8b         \sr2, \src, \s_strd, v19, v20, v21, v22
         uxtl_b          v19, v20, v21, v22
-        mul_mla_8_1     v1,  v2,  v24, v25, v26, v27, v16, v17, v18, v19, v20
-        mul_mla_8_1     v3,  v4,  v26, v27, v16, v17, v18, v19, v20, v21, v22
+        mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
+        mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
         shift_store_8   \type, \d_strd, v1, v2, v3, v4
         b.gt            88b
 9:
@@ -1882,10 +1950,10 @@
         uxtl2           v25.8h, v3.16b
         uxtl2           v26.8h, v4.16b
         uxtl2           v27.8h, v5.16b
-        mul_mla_4       v1,  v16, v17, v18, v19, .8h
-        mul_mla_4       v16, v17, v18, v19, v20, .8h
-        mul_mla_4       v2,  v23, v24, v25, v26, .8h
-        mul_mla_4       v17, v24, v25, v26, v27, .8h
+        mul_mla_4tap    v1,  v16, v17, v18, v19, .8h
+        mul_mla_4tap    v16, v17, v18, v19, v20, .8h
+        mul_mla_4tap    v2,  v23, v24, v25, v26, .8h
+        mul_mla_4tap    v17, v24, v25, v26, v27, .8h
         shift_store_16  \type, \d_strd, v1, v2, v16, v17
         b.le            0f
         load_16b        \sr2, \src, \s_strd, v6,  v7
@@ -1893,25 +1961,25 @@
         uxtl            v22.8h, v7.8b
         uxtl2           v28.8h, v6.16b
         uxtl2           v29.8h, v7.16b
-        mul_mla_4       v1,  v18, v19, v20, v21, .8h
-        mul_mla_4       v3,  v19, v20, v21, v22, .8h
-        mul_mla_4       v2,  v25, v26, v27, v28, .8h
-        mul_mla_4       v4,  v26, v27, v28, v29, .8h
+        mul_mla_4tap    v1,  v18, v19, v20, v21, .8h
+        mul_mla_4tap    v3,  v19, v20, v21, v22, .8h
+        mul_mla_4tap    v2,  v25, v26, v27, v28, .8h
+        mul_mla_4tap    v4,  v26, v27, v28, v29, .8h
         shift_store_16  \type, \d_strd, v1, v2, v3, v4
 0:
         ret
 
-L(\type\()_8tap_v_tbl):
-        .hword L(\type\()_8tap_v_tbl) - 1280b
-        .hword L(\type\()_8tap_v_tbl) -  640b
-        .hword L(\type\()_8tap_v_tbl) -  320b
-        .hword L(\type\()_8tap_v_tbl) -  160b
-        .hword L(\type\()_8tap_v_tbl) -   80b
-        .hword L(\type\()_8tap_v_tbl) -   40b
-        .hword L(\type\()_8tap_v_tbl) -   20b
+L(\type\()_\taps\()_v_tbl):
+        .hword L(\type\()_\taps\()_v_tbl) - 1280b
+        .hword L(\type\()_\taps\()_v_tbl) -  640b
+        .hword L(\type\()_\taps\()_v_tbl) -  320b
+        .hword L(\type\()_\taps\()_v_tbl) -  160b
+        .hword L(\type\()_\taps\()_v_tbl) -   80b
+        .hword L(\type\()_\taps\()_v_tbl) -   40b
+        .hword L(\type\()_\taps\()_v_tbl) -   20b
         .hword 0
 
-L(\type\()_8tap_hv):
+L(\type\()_\taps\()_hv):
         cmp             \h,  #4
         ubfx            w9,  \my, #7, #7
         and             \my, \my, #0x7f
@@ -1920,7 +1988,7 @@
 4:
         add             \xmy,  x10, \my, uxtw #3
 
-        adr             x9,  L(\type\()_8tap_hv_tbl)
+        adr             x9,  L(\type\()_\taps\()_hv_tbl)
         ldrh            w8,  [x9, x8, lsl #1]
         sub             x9,  x9,  w8, uxtw
         br              x9
@@ -1952,13 +2020,13 @@
         addp            v28.4h,  v28.4h,  v29.4h
         addp            v16.4h,  v28.4h,  v28.4h
         srshr           v16.4h,  v16.4h,  #2
-        bl              L(\type\()_8tap_filter_2)
+        bl              L(\type\()_\taps\()_filter_2)
 
         trn1            v16.2s, v16.2s, v28.2s
         mov             v17.8b, v28.8b
 
 2:
-        bl              L(\type\()_8tap_filter_2)
+        bl              L(\type\()_\taps\()_filter_2)
 
         ext             v18.8b, v17.8b, v28.8b, #4
         smull           v2.4s,  v16.4h, v1.h[0]
@@ -1997,19 +2065,27 @@
         addp            v16.4h,  v28.4h,  v28.4h
         srshr           v16.4h,  v16.4h,  #2
 
-        bl              L(\type\()_8tap_filter_2)
+        bl              L(\type\()_\taps\()_filter_2)
         trn1            v16.2s, v16.2s, v28.2s
         mov             v17.8b, v28.8b
-        bl              L(\type\()_8tap_filter_2)
+        bl              L(\type\()_\taps\()_filter_2)
         ext             v18.8b, v17.8b, v28.8b, #4
         mov             v19.8b, v28.8b
-        bl              L(\type\()_8tap_filter_2)
+        bl              L(\type\()_\taps\()_filter_2)
         ext             v20.8b, v19.8b, v28.8b, #4
         mov             v21.8b, v28.8b
 
 28:
-        bl              L(\type\()_8tap_filter_2)
+        bl              L(\type\()_\taps\()_filter_2)
         ext             v22.8b, v21.8b, v28.8b, #4
+.ifc \taps, 6tap
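+        // 6-tap vertical: taps 1-6 applied to v17-v22; tap 0 (v16) and
+        // tap 7 (v28) are skipped.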
+        smull           v2.4s,  v17.4h, v1.h[1]
+        smlal           v2.4s,  v18.4h, v1.h[2]
+        smlal           v2.4s,  v19.4h, v1.h[3]
+        smlal           v2.4s,  v20.4h, v1.h[4]
+        smlal           v2.4s,  v21.4h, v1.h[5]
+        smlal           v2.4s,  v22.4h, v1.h[6]
+.else   // 8tap
         smull           v2.4s,  v16.4h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
         smlal           v2.4s,  v18.4h, v1.h[2]
@@ -2018,6 +2094,7 @@
         smlal           v2.4s,  v21.4h, v1.h[5]
         smlal           v2.4s,  v22.4h, v1.h[6]
         smlal           v2.4s,  v28.4h, v1.h[7]
+.endif
 
         sqrshrn         v2.4h,  v2.4s,  #\shift_hv
         sqxtun          v2.8b,  v2.8h
@@ -2036,7 +2113,7 @@
 0:
         ret             x15
 
-L(\type\()_8tap_filter_2):
+L(\type\()_\taps\()_filter_2):
         ld1             {v28.8b},  [\sr2], \s_strd
         ld1             {v30.8b},  [\src], \s_strd
         uxtl            v28.8h,  v28.8b
@@ -2083,12 +2160,12 @@
         mla             v31.4h,  v30.4h,  v0.h[3]
         srshr           v16.4h,  v31.4h,  #2
 
-        bl              L(\type\()_8tap_filter_4)
+        bl              L(\type\()_\taps\()_filter_4)
         mov             v17.8b, v28.8b
         mov             v18.8b, v29.8b
 
 4:
-        bl              L(\type\()_8tap_filter_4)
+        bl              L(\type\()_\taps\()_filter_4)
         // Interleaving the mul/mla chains actually hurts performance
         // significantly on Cortex A53, thus keeping mul/mla tightly
         // chained like this.
@@ -2121,8 +2198,13 @@
 480:    // 4x8, 4x16, 4x32 hv
         ld1             {v1.8b},  [\xmy]
         sub             \src, \src, #1
+.ifc \taps, 6tap
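+        // 6-tap needs one less row of context above the block: back up
+        // 2 rows instead of 3.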
+        sub             \sr2, \src, \s_strd
+        sub             \src, \src, \s_strd, lsl #1
+.else
         sub             \sr2, \src, \s_strd, lsl #1
         sub             \src, \sr2, \s_strd
+.endif
         add             \ds2, \dst, \d_strd
         lsl             \s_strd, \s_strd, #1
         lsl             \d_strd, \d_strd, #1
@@ -2139,20 +2221,38 @@
         mla             v31.4h,  v28.4h,  v0.h[1]
         mla             v31.4h,  v29.4h,  v0.h[2]
         mla             v31.4h,  v30.4h,  v0.h[3]
+.ifc \taps, 6tap
+        srshr           v18.4h,  v31.4h,  #2
+.else
         srshr           v16.4h,  v31.4h,  #2
 
-        bl              L(\type\()_8tap_filter_4)
+        bl              L(\type\()_\taps\()_filter_4)
         mov             v17.8b, v28.8b
         mov             v18.8b, v29.8b
-        bl              L(\type\()_8tap_filter_4)
+.endif
+        bl              L(\type\()_\taps\()_filter_4)
         mov             v19.8b, v28.8b
         mov             v20.8b, v29.8b
-        bl              L(\type\()_8tap_filter_4)
+        bl              L(\type\()_\taps\()_filter_4)
         mov             v21.8b, v28.8b
         mov             v22.8b, v29.8b
 
 48:
-        bl              L(\type\()_8tap_filter_4)
+        bl              L(\type\()_\taps\()_filter_4)
+.ifc \taps, 6tap
+        smull           v2.4s,  v18.4h, v1.h[1]
+        smlal           v2.4s,  v19.4h, v1.h[2]
+        smlal           v2.4s,  v20.4h, v1.h[3]
+        smlal           v2.4s,  v21.4h, v1.h[4]
+        smlal           v2.4s,  v22.4h, v1.h[5]
+        smlal           v2.4s,  v28.4h, v1.h[6]
+        smull           v3.4s,  v19.4h, v1.h[1]
+        smlal           v3.4s,  v20.4h, v1.h[2]
+        smlal           v3.4s,  v21.4h, v1.h[3]
+        smlal           v3.4s,  v22.4h, v1.h[4]
+        smlal           v3.4s,  v28.4h, v1.h[5]
+        smlal           v3.4s,  v29.4h, v1.h[6]
+.else   // 8tap
         smull           v2.4s,  v16.4h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
         smlal           v2.4s,  v18.4h, v1.h[2]
@@ -2169,6 +2269,7 @@
         smlal           v3.4s,  v22.4h, v1.h[5]
         smlal           v3.4s,  v28.4h, v1.h[6]
         smlal           v3.4s,  v29.4h, v1.h[7]
+.endif
         sqrshrn         v2.4h,  v2.4s,  #\shift_hv
         sqrshrn         v3.4h,  v3.4s,  #\shift_hv
         subs            \h,  \h,  #2
@@ -2182,8 +2283,10 @@
         st1             {v3.4h}, [\ds2], \d_strd
 .endif
         b.le            0f
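+        // Only the 8-tap path keeps v16/v17 in its row history; the
+        // 6-tap chain starts at v18.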
+.ifc \taps, 8tap
         mov             v16.8b,  v18.8b
         mov             v17.8b,  v19.8b
+.endif
         mov             v18.8b,  v20.8b
         mov             v19.8b,  v21.8b
         mov             v20.8b,  v22.8b
@@ -2193,7 +2296,7 @@
 0:
         ret             x15
 
-L(\type\()_8tap_filter_4):
+L(\type\()_\taps\()_filter_4):
         ld1             {v26.8b}, [\sr2], \s_strd
         ld1             {v27.8b}, [\src], \s_strd
         uxtl            v26.8h,  v26.8b
@@ -2237,15 +2340,15 @@
         lsl             \d_strd, \d_strd, #1
         lsl             \s_strd, \s_strd, #1
 
-        bl              L(\type\()_8tap_filter_8_first)
-        bl              L(\type\()_8tap_filter_8)
+        bl              L(\type\()_\taps\()_filter_8_first)
+        bl              L(\type\()_\taps\()_filter_8)
         mov             v17.16b, v24.16b
         mov             v18.16b, v25.16b
 
 8:
         smull           v2.4s,  v16.4h, v1.h[0]
         smull2          v3.4s,  v16.8h, v1.h[0]
-        bl              L(\type\()_8tap_filter_8)
+        bl              L(\type\()_\taps\()_filter_8)
         smull           v4.4s,  v17.4h, v1.h[0]
         smull2          v5.4s,  v17.8h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
@@ -2303,7 +2406,9 @@
         ld1             {v0.8b},  [\xmx]
         ld1             {v1.8b},  [\xmy]
         sub             \src,  \src,  #3
+.ifc \taps, 8tap
         sub             \src,  \src,  \s_strd
+.endif
         sub             \src,  \src,  \s_strd, lsl #1
         sxtl            v0.8h,  v0.8b
         sxtl            v1.8h,  v1.8b
@@ -2316,21 +2421,52 @@
         lsl             \d_strd, \d_strd, #1
         lsl             \s_strd, \s_strd, #1
 
-        bl              L(\type\()_8tap_filter_8_first)
-        bl              L(\type\()_8tap_filter_8)
+        bl              L(\type\()_\taps\()_filter_8_first)
+.ifc \taps, 6tap
+        mov             v18.16b, v16.16b
+.else
+        bl              L(\type\()_\taps\()_filter_8)
         mov             v17.16b, v24.16b
         mov             v18.16b, v25.16b
-        bl              L(\type\()_8tap_filter_8)
+.endif
+        bl              L(\type\()_\taps\()_filter_8)
         mov             v19.16b, v24.16b
         mov             v20.16b, v25.16b
-        bl              L(\type\()_8tap_filter_8)
+        bl              L(\type\()_\taps\()_filter_8)
         mov             v21.16b, v24.16b
         mov             v22.16b, v25.16b
 
 88:
+.ifc \taps, 6tap
+        smull           v2.4s,  v18.4h, v1.h[1]
+        smull2          v3.4s,  v18.8h, v1.h[1]
+        bl              L(\type\()_\taps\()_filter_8)
+        smull           v4.4s,  v19.4h, v1.h[1]
+        smull2          v5.4s,  v19.8h, v1.h[1]
+        smlal           v2.4s,  v19.4h, v1.h[2]
+        smlal2          v3.4s,  v19.8h, v1.h[2]
+        smlal           v4.4s,  v20.4h, v1.h[2]
+        smlal2          v5.4s,  v20.8h, v1.h[2]
+        smlal           v2.4s,  v20.4h, v1.h[3]
+        smlal2          v3.4s,  v20.8h, v1.h[3]
+        smlal           v4.4s,  v21.4h, v1.h[3]
+        smlal2          v5.4s,  v21.8h, v1.h[3]
+        smlal           v2.4s,  v21.4h, v1.h[4]
+        smlal2          v3.4s,  v21.8h, v1.h[4]
+        smlal           v4.4s,  v22.4h, v1.h[4]
+        smlal2          v5.4s,  v22.8h, v1.h[4]
+        smlal           v2.4s,  v22.4h, v1.h[5]
+        smlal2          v3.4s,  v22.8h, v1.h[5]
+        smlal           v4.4s,  v24.4h, v1.h[5]
+        smlal2          v5.4s,  v24.8h, v1.h[5]
+        smlal           v2.4s,  v24.4h, v1.h[6]
+        smlal2          v3.4s,  v24.8h, v1.h[6]
+        smlal           v4.4s,  v25.4h, v1.h[6]
+        smlal2          v5.4s,  v25.8h, v1.h[6]
+.else   // 8tap
         smull           v2.4s,  v16.4h, v1.h[0]
         smull2          v3.4s,  v16.8h, v1.h[0]
-        bl              L(\type\()_8tap_filter_8)
+        bl              L(\type\()_\taps\()_filter_8)
         smull           v4.4s,  v17.4h, v1.h[0]
         smull2          v5.4s,  v17.8h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
@@ -2361,6 +2497,7 @@
         smlal2          v3.4s,  v24.8h, v1.h[7]
         smlal           v4.4s,  v25.4h, v1.h[7]
         smlal2          v5.4s,  v25.8h, v1.h[7]
+.endif
         sqrshrn         v2.4h,  v2.4s,  #\shift_hv
         sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
         sqrshrn         v4.4h,  v4.4s,  #\shift_hv
@@ -2376,8 +2513,10 @@
         st1             {v4.8h}, [\ds2], \d_strd
 .endif
         b.le            9f
+.ifc \taps, 8tap
         mov             v16.16b, v18.16b
         mov             v17.16b, v19.16b
+.endif
         mov             v18.16b, v20.16b
         mov             v19.16b, v21.16b
         mov             v20.16b, v22.16b
@@ -2399,14 +2538,32 @@
 .else
         add             \dst,  \dst,  #16
 .endif
+.ifc \taps, 6tap
+        add             \src,  \src,  \s_strd,  lsl #1
+.endif
         b               168b
 0:
         ret             x15
 
-L(\type\()_8tap_filter_8_first):
+L(\type\()_\taps\()_filter_8_first):
         ld1             {v28.8b, v29.8b},  [\src], \s_strd
         uxtl            v28.8h,  v28.8b
         uxtl            v29.8h,  v29.8b
+.ifc \taps, 6tap
+        ext             v24.16b, v28.16b, v29.16b, #(2*1)
+        ext             v25.16b, v28.16b, v29.16b, #(2*2)
+        ext             v26.16b, v28.16b, v29.16b, #(2*3)
+        ext             v27.16b, v28.16b, v29.16b, #(2*4)
+        mul             v16.8h,  v24.8h,  v0.h[1]
+        mla             v16.8h,  v25.8h,  v0.h[2]
+        mla             v16.8h,  v26.8h,  v0.h[3]
+        mla             v16.8h,  v27.8h,  v0.h[4]
+        ext             v24.16b, v28.16b, v29.16b, #(2*5)
+        ext             v25.16b, v28.16b, v29.16b, #(2*6)
+        ext             v26.16b, v28.16b, v29.16b, #(2*7)
+        mla             v16.8h,  v24.8h,  v0.h[5]
+        mla             v16.8h,  v25.8h,  v0.h[6]
+.else   // 8tap
         mul             v16.8h,  v28.8h,  v0.h[0]
         ext             v24.16b, v28.16b, v29.16b, #(2*1)
         ext             v25.16b, v28.16b, v29.16b, #(2*2)
@@ -2422,16 +2579,29 @@
         mla             v16.8h,  v24.8h,  v0.h[5]
         mla             v16.8h,  v25.8h,  v0.h[6]
         mla             v16.8h,  v26.8h,  v0.h[7]
+.endif
         srshr           v16.8h,  v16.8h,  #2
         ret
 
-L(\type\()_8tap_filter_8):
+L(\type\()_\taps\()_filter_8):
         ld1             {v28.8b, v29.8b},  [\sr2], \s_strd
         ld1             {v30.8b, v31.8b},  [\src], \s_strd
         uxtl            v28.8h,  v28.8b
         uxtl            v29.8h,  v29.8b
         uxtl            v30.8h,  v30.8b
         uxtl            v31.8h,  v31.8b
+.ifc \taps, 6tap
+        ext             v26.16b, v28.16b, v29.16b, #2
+        ext             v27.16b, v30.16b, v31.16b, #2
+        mul             v24.8h,  v26.8h,  v0.h[1]
+        mul             v25.8h,  v27.8h,  v0.h[1]
+.irpc i, 23456
+        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
+        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
+        mla             v24.8h,  v26.8h,  v0.h[\i]
+        mla             v25.8h,  v27.8h,  v0.h[\i]
+.endr
+.else   // 8tap
         mul             v24.8h,  v28.8h,  v0.h[0]
         mul             v25.8h,  v30.8h,  v0.h[0]
 .irpc i, 1234567
@@ -2440,22 +2610,25 @@
         mla             v24.8h,  v26.8h,  v0.h[\i]
         mla             v25.8h,  v27.8h,  v0.h[\i]
 .endr
+.endif
         srshr           v24.8h,  v24.8h, #2
         srshr           v25.8h,  v25.8h, #2
         ret
 
-L(\type\()_8tap_hv_tbl):
-        .hword L(\type\()_8tap_hv_tbl) - 1280b
-        .hword L(\type\()_8tap_hv_tbl) -  640b
-        .hword L(\type\()_8tap_hv_tbl) -  320b
-        .hword L(\type\()_8tap_hv_tbl) -  160b
-        .hword L(\type\()_8tap_hv_tbl) -   80b
-        .hword L(\type\()_8tap_hv_tbl) -   40b
-        .hword L(\type\()_8tap_hv_tbl) -   20b
+L(\type\()_\taps\()_hv_tbl):
+        .hword L(\type\()_\taps\()_hv_tbl) - 1280b
+        .hword L(\type\()_\taps\()_hv_tbl) -  640b
+        .hword L(\type\()_\taps\()_hv_tbl) -  320b
+        .hword L(\type\()_\taps\()_hv_tbl) -  160b
+        .hword L(\type\()_\taps\()_hv_tbl) -   80b
+        .hword L(\type\()_\taps\()_hv_tbl) -   40b
+        .hword L(\type\()_\taps\()_hv_tbl) -   20b
         .hword 0
 endfunc
+.endm
 
 
+.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
 function \type\()_bilin_8bpc_neon, export=1
         dup             v1.16b, \mx
         dup             v3.16b, \my
@@ -2987,8 +3160,34 @@
 endfunc
 .endm
 
-filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
-filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+make_8tap_fn    put,  regular_sharp,  REGULAR, SHARP,   8tap
+make_8tap_fn    put,  smooth_sharp,   SMOOTH,  SHARP,   8tap
+make_8tap_fn    put,  sharp,          SHARP,   SHARP,   8tap
+make_8tap_fn    put,  sharp_regular,  SHARP,   REGULAR, 8tap
+make_8tap_fn    put,  sharp_smooth,   SHARP,   SMOOTH,  8tap
+filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap
+
+make_8tap_fn    put,  regular,        REGULAR, REGULAR, 6tap
+make_8tap_fn    put,  regular_smooth, REGULAR, SMOOTH,  6tap
+make_8tap_fn    put,  smooth,         SMOOTH,  SMOOTH,  6tap
+make_8tap_fn    put,  smooth_regular, SMOOTH,  REGULAR, 6tap
+filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap
+filter_bilin_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10
+
+make_8tap_fn    prep, regular_sharp,  REGULAR, SHARP,   8tap
+make_8tap_fn    prep, smooth_sharp,   SMOOTH,  SHARP,   8tap
+make_8tap_fn    prep, sharp,          SHARP,   SHARP,   8tap
+make_8tap_fn    prep, sharp_regular,  SHARP,   REGULAR, 8tap
+make_8tap_fn    prep, sharp_smooth,   SHARP,   SMOOTH,  8tap
+filter_fn       prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6,  8tap
+
+make_8tap_fn    prep, regular,        REGULAR, REGULAR, 6tap
+make_8tap_fn    prep, regular_smooth, REGULAR, SMOOTH,  6tap
+make_8tap_fn    prep, smooth,         SMOOTH,  SMOOTH,  6tap
+make_8tap_fn    prep, smooth_regular, SMOOTH,  REGULAR, 6tap
+filter_fn       prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6,  6tap
+filter_bilin_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
+
 
 .macro load_filter_row dst, src, inc
         asr             w13, \src, #10
diff --git a/src/arm/64/mc16.S b/src/arm/64/mc16.S
index 1bfb12e..576fab1 100644
--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -1374,19 +1374,35 @@
         sub             \r3\wd,  \r3\wd,  \c\wd
 .endif
 .endm
-.macro smull_smlal_4 d, s0, s1, s2, s3
+.macro smull_smlal_4tap d, s0, s1, s2, s3
         smull           \d\().4s,  \s0\().4h,  v0.h[0]
         smlal           \d\().4s,  \s1\().4h,  v0.h[1]
         smlal           \d\().4s,  \s2\().4h,  v0.h[2]
         smlal           \d\().4s,  \s3\().4h,  v0.h[3]
 .endm
-.macro smull2_smlal2_4 d, s0, s1, s2, s3
+.macro smull2_smlal2_4tap d, s0, s1, s2, s3
         smull2          \d\().4s,  \s0\().8h,  v0.h[0]
         smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
         smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
         smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
 .endm
-.macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
+        smull           \d\().4s,  \s1\().4h,  v0.h[1]
+        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
+        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
+        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
+        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
+        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
+.endm
+.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
+        smull2          \d\().4s,  \s1\().8h,  v0.h[1]
+        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
+        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
+        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
+        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
+        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
+.endm
+.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
         smull           \d\().4s,  \s0\().4h,  v0.h[0]
         smlal           \d\().4s,  \s1\().4h,  v0.h[1]
         smlal           \d\().4s,  \s2\().4h,  v0.h[2]
@@ -1396,7 +1412,7 @@
         smlal           \d\().4s,  \s6\().4h,  v0.h[6]
         smlal           \d\().4s,  \s7\().4h,  v0.h[7]
 .endm
-.macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7
+.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
         smull2          \d\().4s,  \s0\().8h,  v0.h[0]
         smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
         smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
@@ -1499,11 +1515,11 @@
         st1             {\r0\().8h, \r1\().8h}, [\dst], \strd
 .endm
 
-.macro make_8tap_fn op, type, type_h, type_v
+.macro make_8tap_fn op, type, type_h, type_v, taps
 function \op\()_8tap_\type\()_16bpc_neon, export=1
         mov             w9,  \type_h
         mov             w10, \type_v
-        b               \op\()_8tap_neon
+        b               \op\()_\taps\()_neon
 endfunc
 .endm
 
@@ -1512,18 +1528,8 @@
 #define SMOOTH  ((1*15<<7)|4*15)
 #define SHARP   ((2*15<<7)|3*15)
 
-.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
-make_8tap_fn \type, regular,        REGULAR, REGULAR
-make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
-make_8tap_fn \type, regular_sharp,  REGULAR, SHARP
-make_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
-make_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
-make_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
-make_8tap_fn \type, sharp,          SHARP,   SHARP
-make_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
-make_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
-
-function \type\()_8tap_neon
+.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps
+function \type\()_\taps\()_neon
 .ifc \bdmax, w8
         ldr             w8,  [sp]
 .endif
@@ -1547,12 +1553,12 @@
         add             w13, w12, \bdmax       // 6 + intermediate_bits
         sub             w12, w12, \bdmax       // 6 - intermediate_bits
         movrel          x11, X(mc_subpel_filters), -8
-        b.ne            L(\type\()_8tap_h)
+        b.ne            L(\type\()_\taps\()_h)
         tst             \my, #(0x7f << 14)
-        b.ne            L(\type\()_8tap_v)
+        b.ne            L(\type\()_\taps\()_v)
         b               \type\()_neon
 
-L(\type\()_8tap_h):
+L(\type\()_\taps\()_h):
         cmp             \w,   #4
         ubfx            w10,  \mx, #7, #7
         and             \mx,  \mx, #0x7f
@@ -1561,9 +1567,9 @@
 4:
         tst             \my,  #(0x7f << 14)
         add             \xmx, x11, \mx, uxtw #3
-        b.ne            L(\type\()_8tap_hv)
+        b.ne            L(\type\()_\taps\()_hv)
 
-        adr             x10, L(\type\()_8tap_h_tbl)
+        adr             x10, L(\type\()_\taps\()_h_tbl)
         dup             v30.4s,  w12           // 6 - intermediate_bits
         ldrh            w9,  [x10, x9, lsl #1]
         neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
@@ -1682,6 +1688,22 @@
         mov             \mx, \w
 
 8:
+.ifc \taps, 6tap
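+        // 6-tap: widen and accumulate taps 1-6 only, starting one pixel in.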
+        ext             v24.16b, v16.16b, v17.16b, #2
+        ext             v25.16b, v20.16b, v21.16b, #2
+        smull           v18.4s,  v24.4h,  v0.h[1]
+        smull2          v19.4s,  v24.8h,  v0.h[1]
+        smull           v22.4s,  v25.4h,  v0.h[1]
+        smull2          v23.4s,  v25.8h,  v0.h[1]
+.irpc i, 23456
+        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
+        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
+        smlal           v18.4s,  v24.4h,  v0.h[\i]
+        smlal2          v19.4s,  v24.8h,  v0.h[\i]
+        smlal           v22.4s,  v25.4h,  v0.h[\i]
+        smlal2          v23.4s,  v25.8h,  v0.h[\i]
+.endr
+.else   // 8tap
         smull           v18.4s,  v16.4h,  v0.h[0]
         smull2          v19.4s,  v16.8h,  v0.h[0]
         smull           v22.4s,  v20.4h,  v0.h[0]
@@ -1694,6 +1716,7 @@
         smlal           v22.4s,  v25.4h,  v0.h[\i]
         smlal2          v23.4s,  v25.8h,  v0.h[\i]
 .endr
+.endif
         subs            \mx, \mx, #8
         srshl           v18.4s,  v18.4s,  v30.4s // -(6-intermediate_bits)
         srshl           v19.4s,  v19.4s,  v30.4s // -(6-intermediate_bits)
@@ -1734,18 +1757,18 @@
         b.gt            81b
         ret
 
-L(\type\()_8tap_h_tbl):
-        .hword L(\type\()_8tap_h_tbl) - 1280b
-        .hword L(\type\()_8tap_h_tbl) -  640b
-        .hword L(\type\()_8tap_h_tbl) -  320b
-        .hword L(\type\()_8tap_h_tbl) -  160b
-        .hword L(\type\()_8tap_h_tbl) -   80b
-        .hword L(\type\()_8tap_h_tbl) -   40b
-        .hword L(\type\()_8tap_h_tbl) -   20b
+L(\type\()_\taps\()_h_tbl):
+        .hword L(\type\()_\taps\()_h_tbl) - 1280b
+        .hword L(\type\()_\taps\()_h_tbl) -  640b
+        .hword L(\type\()_\taps\()_h_tbl) -  320b
+        .hword L(\type\()_\taps\()_h_tbl) -  160b
+        .hword L(\type\()_\taps\()_h_tbl) -   80b
+        .hword L(\type\()_\taps\()_h_tbl) -   40b
+        .hword L(\type\()_\taps\()_h_tbl) -   20b
         .hword 0
 
 
-L(\type\()_8tap_v):
+L(\type\()_\taps\()_v):
         cmp             \h,  #4
         ubfx            w10, \my, #7, #7
         and             \my, \my, #0x7f
@@ -1758,7 +1781,7 @@
         dup             v30.4s,  w12           // 6 - intermediate_bits
         movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
 .endif
-        adr             x10, L(\type\()_8tap_v_tbl)
+        adr             x10, L(\type\()_\taps\()_v_tbl)
         ldrh            w9,  [x10, x9, lsl #1]
 .ifc \type, prep
         neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
@@ -1785,7 +1808,7 @@
         load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
         interleave_1_s  v1,  v2,  v3,  v4,  v5
         b.gt            24f
-        smull_smlal_4   v6,  v1,  v2,  v3,  v4
+        smull_smlal_4tap v6, v1,  v2,  v3,  v4
         sqrshrun_h      6,   v6
         umin_h          v31, .8h, v6
         st_s            \d_strd, v6, 2
@@ -1794,8 +1817,8 @@
 24:     // 2x4 v
         load_s          \sr2, \src, \s_strd, v6, v7
         interleave_1_s  v5,  v6,  v7
-        smull_smlal_4   v16, v1,  v2,  v3,  v4
-        smull_smlal_4   v17, v3,  v4,  v5,  v6
+        smull_smlal_4tap v16, v1, v2, v3, v4
+        smull_smlal_4tap v17, v3, v4, v5, v6
         sqrshrun_h      6,   v16, v17
         umin_h          v31, .8h, v16
         st_s            \d_strd, v16, 4
@@ -1817,8 +1840,8 @@
         subs            \h,  \h,  #4
         load_s          \sr2, \src, \s_strd, v16, v17, v18, v19
         interleave_1_s  v7,  v16, v17, v18, v19
-        smull_smlal_8   v24, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
-        smull_smlal_8   v25, v3,  v4,  v5,  v6,  v7,  v16, v17, v18
+        smull_smlal_\taps v24, v1,  v2,  v3,  v4,  v5,  v6,  v7, v16
+        smull_smlal_\taps v25, v3,  v4,  v5,  v6,  v7, v16, v17, v18
         sqrshrun_h      6,   v24, v25
         umin_h          v31, .8h, v24
         st_s            \d_strd, v24, 4
@@ -1836,7 +1859,7 @@
 26:
         load_s          \sr2, \src, \s_strd, v16, v17
         interleave_1_s  v7,  v16, v17
-        smull_smlal_8   v24, v1,  v2,  v3,  v4,  v5,  v6,  v7,  v16
+        smull_smlal_\taps v24, v1, v2,  v3,  v4,  v5,  v6,  v7, v16
         sqrshrun_h      6,   v24
         umin_h          v31, .4h, v24
         st_s            \d_strd, v24, 2
@@ -1860,13 +1883,13 @@
         sxtl            v0.8h,   v0.8b
 
         load_4h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
-        smull_smlal_4   v6,  v1,  v2,  v3,  v4
-        smull_smlal_4   v7,  v2,  v3,  v4,  v5
+        smull_smlal_4tap v6,  v1,  v2,  v3,  v4
+        smull_smlal_4tap v7,  v2,  v3,  v4,  v5
         shift_store_4   \type, \d_strd, v6, v7
         b.le            0f
         load_4h         \sr2, \src, \s_strd, v6, v7
-        smull_smlal_4   v1,  v3,  v4,  v5,  v6
-        smull_smlal_4   v2,  v4,  v5,  v6,  v7
+        smull_smlal_4tap v1,  v3,  v4,  v5,  v6
+        smull_smlal_4tap v2,  v4,  v5,  v6,  v7
         shift_store_4   \type, \d_strd, v1, v2
 0:
         ret
@@ -1885,10 +1908,10 @@
 48:
         subs            \h,  \h,  #4
         load_4h         \sr2, \src, \s_strd, v23, v24, v25, v26
-        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
-        smull_smlal_8   v2,  v17, v18, v19, v20, v21, v22, v23, v24
-        smull_smlal_8   v3,  v18, v19, v20, v21, v22, v23, v24, v25
-        smull_smlal_8   v4,  v19, v20, v21, v22, v23, v24, v25, v26
+        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+        smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
+        smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25
+        smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
         shift_store_4   \type, \d_strd, v1, v2, v3, v4
         b.le            0f
         cmp             \h,  #2
@@ -1903,8 +1926,8 @@
         b               48b
 46:
         load_4h         \sr2, \src, \s_strd, v23, v24
-        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
-        smull_smlal_8   v2,  v17, v18, v19, v20, v21, v22, v23, v24
+        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
+        smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
         shift_store_4   \type, \d_strd, v1, v2
 0:
         ret
@@ -1925,17 +1948,17 @@
         sxtl            v0.8h,   v0.8b
 
         load_8h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
-        smull_smlal_4   v16, v1,  v2,  v3,  v4
-        smull2_smlal2_4 v17, v1,  v2,  v3,  v4
-        smull_smlal_4   v18, v2,  v3,  v4,  v5
-        smull2_smlal2_4 v19, v2,  v3,  v4,  v5
+        smull_smlal_4tap   v16, v1,  v2,  v3,  v4
+        smull2_smlal2_4tap v17, v1,  v2,  v3,  v4
+        smull_smlal_4tap   v18, v2,  v3,  v4,  v5
+        smull2_smlal2_4tap v19, v2,  v3,  v4,  v5
         shift_store_8   \type, \d_strd, v16, v17, v18, v19
         b.le            0f
         load_8h         \sr2, \src, \s_strd, v6, v7
-        smull_smlal_4   v16, v3,  v4,  v5,  v6
-        smull2_smlal2_4 v17, v3,  v4,  v5,  v6
-        smull_smlal_4   v18, v4,  v5,  v6,  v7
-        smull2_smlal2_4 v19, v4,  v5,  v6,  v7
+        smull_smlal_4tap   v16, v3,  v4,  v5,  v6
+        smull2_smlal2_4tap v17, v3,  v4,  v5,  v6
+        smull_smlal_4tap   v18, v4,  v5,  v6,  v7
+        smull2_smlal2_4tap v19, v4,  v5,  v6,  v7
         shift_store_8   \type, \d_strd, v16, v17, v18, v19
 0:
         ret
@@ -1962,18 +1985,18 @@
 88:
         subs            \h,  \h,  #2
         load_8h         \sr2, \src, \s_strd, v23, v24
-        smull_smlal_8   v1,  v16, v17, v18, v19, v20, v21, v22, v23
-        smull2_smlal2_8 v2,  v16, v17, v18, v19, v20, v21, v22, v23
-        smull_smlal_8   v3,  v17, v18, v19, v20, v21, v22, v23, v24
-        smull2_smlal2_8 v4,  v17, v18, v19, v20, v21, v22, v23, v24
+        smull_smlal_\taps   v1, v16, v17, v18, v19, v20, v21, v22, v23
+        smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23
+        smull_smlal_\taps   v3, v17, v18, v19, v20, v21, v22, v23, v24
+        smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24
         shift_store_8   \type, \d_strd, v1, v2, v3, v4
         b.le            9f
         subs            \h,  \h,  #2
         load_8h         \sr2, \src, \s_strd, v25, v26
-        smull_smlal_8   v1,  v18, v19, v20, v21, v22, v23, v24, v25
-        smull2_smlal2_8 v2,  v18, v19, v20, v21, v22, v23, v24, v25
-        smull_smlal_8   v3,  v19, v20, v21, v22, v23, v24, v25, v26
-        smull2_smlal2_8 v4,  v19, v20, v21, v22, v23, v24, v25, v26
+        smull_smlal_\taps   v1, v18, v19, v20, v21, v22, v23, v24, v25
+        smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25
+        smull_smlal_\taps   v3, v19, v20, v21, v22, v23, v24, v25, v26
+        smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
         shift_store_8   \type, \d_strd, v1, v2, v3, v4
         b.le            9f
         mov             v16.16b, v20.16b
@@ -2013,10 +2036,10 @@
 16:
         load_16h        \src, \src, \s_strd, v22, v23
         subs            \h,  \h,  #1
-        smull_smlal_4   v1,  v16, v18, v20, v22
-        smull2_smlal2_4 v2,  v16, v18, v20, v22
-        smull_smlal_4   v3,  v17, v19, v21, v23
-        smull2_smlal2_4 v4,  v17, v19, v21, v23
+        smull_smlal_4tap   v1, v16, v18, v20, v22
+        smull2_smlal2_4tap v2, v16, v18, v20, v22
+        smull_smlal_4tap   v3, v17, v19, v21, v23
+        smull2_smlal2_4tap v4, v17, v19, v21, v23
         shift_store_16  \type, \d_strd, x0, v1, v2, v3, v4
         b.le            0f
         mov             v16.16b, v18.16b
@@ -2029,17 +2052,17 @@
 0:
         ret
 
-L(\type\()_8tap_v_tbl):
-        .hword L(\type\()_8tap_v_tbl) - 1280b
-        .hword L(\type\()_8tap_v_tbl) -  640b
-        .hword L(\type\()_8tap_v_tbl) -  320b
-        .hword L(\type\()_8tap_v_tbl) -  160b
-        .hword L(\type\()_8tap_v_tbl) -   80b
-        .hword L(\type\()_8tap_v_tbl) -   40b
-        .hword L(\type\()_8tap_v_tbl) -   20b
+L(\type\()_\taps\()_v_tbl):
+        .hword L(\type\()_\taps\()_v_tbl) - 1280b
+        .hword L(\type\()_\taps\()_v_tbl) -  640b
+        .hword L(\type\()_\taps\()_v_tbl) -  320b
+        .hword L(\type\()_\taps\()_v_tbl) -  160b
+        .hword L(\type\()_\taps\()_v_tbl) -   80b
+        .hword L(\type\()_\taps\()_v_tbl) -   40b
+        .hword L(\type\()_\taps\()_v_tbl) -   20b
         .hword 0
 
-L(\type\()_8tap_hv):
+L(\type\()_\taps\()_hv):
         cmp             \h,  #4
         ubfx            w10, \my, #7, #7
         and             \my, \my, #0x7f
@@ -2048,7 +2071,7 @@
 4:
         add             \xmy, x11, \my, uxtw #3
 
-        adr             x10, L(\type\()_8tap_hv_tbl)
+        adr             x10, L(\type\()_\taps\()_hv_tbl)
         dup             v30.4s,  w12           // 6 - intermediate_bits
         ldrh            w9,  [x10, x9, lsl #1]
         neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
@@ -2089,7 +2112,7 @@
         addp            v27.4s,  v27.4s,  v28.4s
         addp            v16.4s,  v27.4s,  v27.4s
         srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
-        bl              L(\type\()_8tap_filter_2)
+        bl              L(\type\()_\taps\()_filter_2)
         // The intermediates from the horizontal pass fit in 16 bit without
         // any bias; we could just as well keep them as .4s, but narrowing
         // them to .4h gives a significant speedup on out of order cores
@@ -2100,7 +2123,7 @@
         mov             v17.8b,  v24.8b
 
 2:
-        bl              L(\type\()_8tap_filter_2)
+        bl              L(\type\()_\taps\()_filter_2)
 
         ext             v18.8b,  v17.8b,  v24.8b,  #4
         smull           v2.4s,   v16.4h,  v1.h[0]
@@ -2143,20 +2166,28 @@
         // them to .4h gives a significant speedup on out of order cores
         // (at the cost of a smaller slowdown on in-order cores such as A53).
 
-        bl              L(\type\()_8tap_filter_2)
+        bl              L(\type\()_\taps\()_filter_2)
         xtn             v16.4h,  v16.4s
         trn1            v16.2s,  v16.2s,  v24.2s
         mov             v17.8b,  v24.8b
-        bl              L(\type\()_8tap_filter_2)
+        bl              L(\type\()_\taps\()_filter_2)
         ext             v18.8b,  v17.8b,  v24.8b,  #4
         mov             v19.8b,  v24.8b
-        bl              L(\type\()_8tap_filter_2)
+        bl              L(\type\()_\taps\()_filter_2)
         ext             v20.8b,  v19.8b,  v24.8b,  #4
         mov             v21.8b,  v24.8b
 
 28:
-        bl              L(\type\()_8tap_filter_2)
+        bl              L(\type\()_\taps\()_filter_2)
         ext             v22.8b,  v21.8b,  v24.8b,  #4
+.ifc \taps, 6tap
+        smull           v3.4s,   v17.4h,  v1.h[1]
+        smlal           v3.4s,   v18.4h,  v1.h[2]
+        smlal           v3.4s,   v19.4h,  v1.h[3]
+        smlal           v3.4s,   v20.4h,  v1.h[4]
+        smlal           v3.4s,   v21.4h,  v1.h[5]
+        smlal           v3.4s,   v22.4h,  v1.h[6]
+.else   // 8tap
         smull           v3.4s,   v16.4h,  v1.h[0]
         smlal           v3.4s,   v17.4h,  v1.h[1]
         smlal           v3.4s,   v18.4h,  v1.h[2]
@@ -2165,6 +2196,7 @@
         smlal           v3.4s,   v21.4h,  v1.h[5]
         smlal           v3.4s,   v22.4h,  v1.h[6]
         smlal           v3.4s,   v24.4h,  v1.h[7]
+.endif
 
         srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
         sqxtun          v3.4h,   v3.4s
@@ -2184,7 +2216,7 @@
 0:
         ret             x15
 
-L(\type\()_8tap_filter_2):
+L(\type\()_\taps\()_filter_2):
         ld1             {v25.8h},  [\sr2], \s_strd
         ld1             {v27.8h},  [\src], \s_strd
         ext             v26.16b, v25.16b, v25.16b, #2
@@ -2234,12 +2266,12 @@
         // (at the cost of a smaller slowdown on in-order cores such as A53).
         xtn             v16.4h,  v16.4s
 
-        bl              L(\type\()_8tap_filter_4)
+        bl              L(\type\()_\taps\()_filter_4)
         mov             v17.8b,  v24.8b
         mov             v18.8b,  v25.8b
 
 4:
-        bl              L(\type\()_8tap_filter_4)
+        bl              L(\type\()_\taps\()_filter_4)
         smull           v2.4s,   v16.4h,  v1.h[0]
         smlal           v2.4s,   v17.4h,  v1.h[1]
         smlal           v2.4s,   v18.4h,  v1.h[2]
@@ -2272,8 +2304,13 @@
 480:    // 4x8, 4x16, 4x32 hv
         ld1             {v1.8b},  [\xmy]
         sub             \src, \src, #2
+.ifc \taps, 6tap
+        sub             \sr2, \src, \s_strd
+        sub             \src, \src, \s_strd, lsl #1
+.else
         sub             \sr2, \src, \s_strd, lsl #1
         sub             \src, \sr2, \s_strd
+.endif
         add             \ds2, \dst, \d_strd
         lsl             \s_strd, \s_strd, #1
         lsl             \d_strd, \d_strd, #1
@@ -2294,20 +2331,38 @@
         // any bias; we could just as well keep them as .4s, but narrowing
         // them to .4h gives a significant speedup on out of order cores
         // (at the cost of a smaller slowdown on in-order cores such as A53).
+.ifc \taps, 6tap
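+        // 6-tap: the first filtered row lands in v18, where its shorter
+        // vertical chain (taps 1-6) begins.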
+        xtn             v18.4h,  v16.4s
+.else
         xtn             v16.4h,  v16.4s
 
-        bl              L(\type\()_8tap_filter_4)
+        bl              L(\type\()_\taps\()_filter_4)
         mov             v17.8b,  v24.8b
         mov             v18.8b,  v25.8b
-        bl              L(\type\()_8tap_filter_4)
+.endif
+        bl              L(\type\()_\taps\()_filter_4)
         mov             v19.8b,  v24.8b
         mov             v20.8b,  v25.8b
-        bl              L(\type\()_8tap_filter_4)
+        bl              L(\type\()_\taps\()_filter_4)
         mov             v21.8b,  v24.8b
         mov             v22.8b,  v25.8b
 
 48:
-        bl              L(\type\()_8tap_filter_4)
+        bl              L(\type\()_\taps\()_filter_4)
+.ifc \taps, 6tap
+        smull           v3.4s,   v18.4h,  v1.h[1]
+        smlal           v3.4s,   v19.4h,  v1.h[2]
+        smlal           v3.4s,   v20.4h,  v1.h[3]
+        smlal           v3.4s,   v21.4h,  v1.h[4]
+        smlal           v3.4s,   v22.4h,  v1.h[5]
+        smlal           v3.4s,   v24.4h,  v1.h[6]
+        smull           v4.4s,   v19.4h,  v1.h[1]
+        smlal           v4.4s,   v20.4h,  v1.h[2]
+        smlal           v4.4s,   v21.4h,  v1.h[3]
+        smlal           v4.4s,   v22.4h,  v1.h[4]
+        smlal           v4.4s,   v24.4h,  v1.h[5]
+        smlal           v4.4s,   v25.4h,  v1.h[6]
+.else   // 8tap
         smull           v3.4s,   v16.4h,  v1.h[0]
         smlal           v3.4s,   v17.4h,  v1.h[1]
         smlal           v3.4s,   v18.4h,  v1.h[2]
@@ -2324,6 +2379,7 @@
         smlal           v4.4s,   v22.4h,  v1.h[5]
         smlal           v4.4s,   v24.4h,  v1.h[6]
         smlal           v4.4s,   v25.4h,  v1.h[7]
+.endif
 .ifc \type, put
         srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
         srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
@@ -2339,8 +2395,10 @@
         st1             {v3.d}[0], [\dst], \d_strd
         st1             {v3.d}[1], [\ds2], \d_strd
         b.le            0f
+.ifc \taps, 8tap
         mov             v16.8b,  v18.8b
         mov             v17.8b,  v19.8b
+.endif
         mov             v18.8b,  v20.8b
         mov             v19.8b,  v21.8b
         mov             v20.8b,  v22.8b
@@ -2350,7 +2408,7 @@
 0:
         ret             x15
 
-L(\type\()_8tap_filter_4):
+L(\type\()_\taps\()_filter_4):
         ld1             {v24.8h}, [\sr2], \s_strd
         ld1             {v25.8h}, [\src], \s_strd
         ext             v26.16b, v24.16b, v24.16b, #2
@@ -2411,14 +2469,14 @@
         // and conserves register space (no need to clobber v8-v15).
         uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
 
-        bl              L(\type\()_8tap_filter_8)
+        bl              L(\type\()_\taps\()_filter_8)
         mov             v17.16b, v23.16b
         mov             v18.16b, v24.16b
 
 8:
         smull           v2.4s,   v16.4h,  v1.h[0]
         smull2          v3.4s,   v16.8h,  v1.h[0]
-        bl              L(\type\()_8tap_filter_8)
+        bl              L(\type\()_\taps\()_filter_8)
         smull           v4.4s,   v17.4h,  v1.h[0]
         smull2          v5.4s,   v17.8h,  v1.h[0]
         smlal           v2.4s,   v17.4h,  v1.h[1]
@@ -2480,7 +2538,9 @@
         ld1             {v0.8b},  [\xmx]
         ld1             {v1.8b},  [\xmy]
         sub             \src,  \src,  #6
+.ifc \taps, 8tap
         sub             \src,  \src,  \s_strd
+.endif
         sub             \src,  \src,  \s_strd, lsl #1
         sxtl            v0.8h,   v0.8b
         sxtl            v1.8h,   v1.8b
@@ -2494,6 +2554,16 @@
         lsl             \s_strd, \s_strd, #1
 
         ld1             {v27.8h, v28.8h},  [\src], \s_strd
+.ifc \taps, 6tap
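+        // 6-tap: filter the first row with taps 1-6 only.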
+        ext             v26.16b, v27.16b, v28.16b, #2
+        smull           v24.4s,  v26.4h,  v0.h[1]
+        smull2          v25.4s,  v26.8h,  v0.h[1]
+.irpc i, 23456
+        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
+        smlal           v24.4s,  v26.4h,  v0.h[\i]
+        smlal2          v25.4s,  v26.8h,  v0.h[\i]
+.endr
+.else   // 8tap
         smull           v24.4s,  v27.4h,  v0.h[0]
         smull2          v25.4s,  v27.8h,  v0.h[0]
 .irpc i, 1234567
@@ -2501,6 +2571,7 @@
         smlal           v24.4s,  v26.4h,  v0.h[\i]
         smlal2          v25.4s,  v26.8h,  v0.h[\i]
 .endr
+.endif
         srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
         srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
         // The intermediates from the horizontal pass fit in 16 bit without
@@ -2508,22 +2579,53 @@
         // them to .4h gives a significant speedup on out of order cores
         // (at the cost of a smaller slowdown on in-order cores such as A53),
         // and conserves register space (no need to clobber v8-v15).
+.ifc \taps, 6tap
+        uzp1            v18.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
+.else
         uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
 
-        bl              L(\type\()_8tap_filter_8)
+        bl              L(\type\()_\taps\()_filter_8)
         mov             v17.16b, v23.16b
         mov             v18.16b, v24.16b
-        bl              L(\type\()_8tap_filter_8)
+.endif
+        bl              L(\type\()_\taps\()_filter_8)
         mov             v19.16b, v23.16b
         mov             v20.16b, v24.16b
-        bl              L(\type\()_8tap_filter_8)
+        bl              L(\type\()_\taps\()_filter_8)
         mov             v21.16b, v23.16b
         mov             v22.16b, v24.16b
 
 88:
+.ifc \taps, 6tap
+        smull           v2.4s,   v18.4h,  v1.h[1]
+        smull2          v3.4s,   v18.8h,  v1.h[1]
+        bl              L(\type\()_\taps\()_filter_8)
+        smull           v4.4s,   v19.4h,  v1.h[1]
+        smull2          v5.4s,   v19.8h,  v1.h[1]
+        smlal           v2.4s,   v19.4h,  v1.h[2]
+        smlal2          v3.4s,   v19.8h,  v1.h[2]
+        smlal           v4.4s,   v20.4h,  v1.h[2]
+        smlal2          v5.4s,   v20.8h,  v1.h[2]
+        smlal           v2.4s,   v20.4h,  v1.h[3]
+        smlal2          v3.4s,   v20.8h,  v1.h[3]
+        smlal           v4.4s,   v21.4h,  v1.h[3]
+        smlal2          v5.4s,   v21.8h,  v1.h[3]
+        smlal           v2.4s,   v21.4h,  v1.h[4]
+        smlal2          v3.4s,   v21.8h,  v1.h[4]
+        smlal           v4.4s,   v22.4h,  v1.h[4]
+        smlal2          v5.4s,   v22.8h,  v1.h[4]
+        smlal           v2.4s,   v22.4h,  v1.h[5]
+        smlal2          v3.4s,   v22.8h,  v1.h[5]
+        smlal           v4.4s,   v23.4h,  v1.h[5]
+        smlal2          v5.4s,   v23.8h,  v1.h[5]
+        smlal           v2.4s,   v23.4h,  v1.h[6]
+        smlal2          v3.4s,   v23.8h,  v1.h[6]
+        smlal           v4.4s,   v24.4h,  v1.h[6]
+        smlal2          v5.4s,   v24.8h,  v1.h[6]
+.else   // 8tap
         smull           v2.4s,   v16.4h,  v1.h[0]
         smull2          v3.4s,   v16.8h,  v1.h[0]
-        bl              L(\type\()_8tap_filter_8)
+        bl              L(\type\()_\taps\()_filter_8)
         smull           v4.4s,   v17.4h,  v1.h[0]
         smull2          v5.4s,   v17.8h,  v1.h[0]
         smlal           v2.4s,   v17.4h,  v1.h[1]
@@ -2554,6 +2656,7 @@
         smlal2          v3.4s,   v23.8h,  v1.h[7]
         smlal           v4.4s,   v24.4h,  v1.h[7]
         smlal2          v5.4s,   v24.8h,  v1.h[7]
+.endif
 .ifc \type, put
         srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
         srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
@@ -2577,8 +2680,10 @@
         st1             {v2.8h}, [\dst], \d_strd
         st1             {v3.8h}, [\ds2], \d_strd
         b.le            9f
+.ifc \taps, 8tap
         mov             v16.16b, v18.16b
         mov             v17.16b, v19.16b
+.endif
         mov             v18.16b, v20.16b
         mov             v19.16b, v21.16b
         mov             v20.16b, v22.16b
@@ -2596,13 +2701,32 @@
         mov             \h,  \my
         add             \src,  \src,  #16
         add             \dst,  \dst,  #16
+.ifc \taps, 6tap
+        add             \src,  \src,  \s_strd,  lsl #1
+.endif
         b               168b
 0:
         ret             x15
 
-L(\type\()_8tap_filter_8):
+L(\type\()_\taps\()_filter_8):
         ld1             {v4.8h, v5.8h},  [\sr2], \s_strd
         ld1             {v6.8h, v7.8h},  [\src], \s_strd
+.ifc \taps, 6tap
+        ext             v23.16b, v4.16b,  v5.16b,  #2
+        ext             v24.16b, v6.16b,  v7.16b,  #2
+        smull           v25.4s,  v23.4h,  v0.h[1]
+        smull2          v26.4s,  v23.8h,  v0.h[1]
+        smull           v27.4s,  v24.4h,  v0.h[1]
+        smull2          v28.4s,  v24.8h,  v0.h[1]
+.irpc i, 23456
+        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i)
+        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i)
+        smlal           v25.4s,  v23.4h,  v0.h[\i]
+        smlal2          v26.4s,  v23.8h,  v0.h[\i]
+        smlal           v27.4s,  v24.4h,  v0.h[\i]
+        smlal2          v28.4s,  v24.8h,  v0.h[\i]
+.endr
+.else   // 8tap
         smull           v25.4s,  v4.4h,   v0.h[0]
         smull2          v26.4s,  v4.8h,   v0.h[0]
         smull           v27.4s,  v6.4h,   v0.h[0]
@@ -2615,6 +2739,7 @@
         smlal           v27.4s,  v24.4h,  v0.h[\i]
         smlal2          v28.4s,  v24.8h,  v0.h[\i]
 .endr
+.endif
         srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
         srshl           v26.4s,  v26.4s,  v30.4s // -(6-intermediate_bits)
         srshl           v27.4s,  v27.4s,  v30.4s // -(6-intermediate_bits)
@@ -2623,18 +2748,20 @@
         uzp1            v24.8h,  v27.8h,  v28.8h // Ditto
         ret
 
-L(\type\()_8tap_hv_tbl):
-        .hword L(\type\()_8tap_hv_tbl) - 1280b
-        .hword L(\type\()_8tap_hv_tbl) -  640b
-        .hword L(\type\()_8tap_hv_tbl) -  320b
-        .hword L(\type\()_8tap_hv_tbl) -  160b
-        .hword L(\type\()_8tap_hv_tbl) -   80b
-        .hword L(\type\()_8tap_hv_tbl) -   40b
-        .hword L(\type\()_8tap_hv_tbl) -   20b
+L(\type\()_\taps\()_hv_tbl):
+        .hword L(\type\()_\taps\()_hv_tbl) - 1280b
+        .hword L(\type\()_\taps\()_hv_tbl) -  640b
+        .hword L(\type\()_\taps\()_hv_tbl) -  320b
+        .hword L(\type\()_\taps\()_hv_tbl) -  160b
+        .hword L(\type\()_\taps\()_hv_tbl) -   80b
+        .hword L(\type\()_\taps\()_hv_tbl) -   40b
+        .hword L(\type\()_\taps\()_hv_tbl) -   20b
         .hword 0
 endfunc
+.endm
 
 
+.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
 function \type\()_bilin_16bpc_neon, export=1
 .ifc \bdmax, w8
         ldr             w8,  [sp]
@@ -3236,8 +3363,34 @@
 endfunc
 .endm
 
-filter_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
-filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+make_8tap_fn    put,  regular_sharp,  REGULAR, SHARP,   8tap
+make_8tap_fn    put,  smooth_sharp,   SMOOTH,  SHARP,   8tap
+make_8tap_fn    put,  sharp,          SHARP,   SHARP,   8tap
+make_8tap_fn    put,  sharp_regular,  SHARP,   REGULAR, 8tap
+make_8tap_fn    put,  sharp_smooth,   SHARP,   SMOOTH,  8tap
+filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap
+
+make_8tap_fn    put,  regular,        REGULAR, REGULAR, 6tap
+make_8tap_fn    put,  regular_smooth, REGULAR, SMOOTH,  6tap
+make_8tap_fn    put,  smooth,         SMOOTH,  SMOOTH,  6tap
+make_8tap_fn    put,  smooth_regular, SMOOTH,  REGULAR, 6tap
+filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap
+filter_bilin_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
+
+make_8tap_fn    prep, regular_sharp,  REGULAR, SHARP,   8tap
+make_8tap_fn    prep, smooth_sharp,   SMOOTH,  SHARP,   8tap
+make_8tap_fn    prep, sharp,          SHARP,   SHARP,   8tap
+make_8tap_fn    prep, sharp_regular,  SHARP,   REGULAR, 8tap
+make_8tap_fn    prep, sharp_smooth,   SHARP,   SMOOTH,  8tap
+filter_fn       prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap
+
+make_8tap_fn    prep,  regular,        REGULAR, REGULAR, 6tap
+make_8tap_fn    prep,  regular_smooth, REGULAR, SMOOTH,  6tap
+make_8tap_fn    prep,  smooth,         SMOOTH,  SMOOTH,  6tap
+make_8tap_fn    prep,  smooth_regular, SMOOTH,  REGULAR, 6tap
+filter_fn       prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap
+filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
+
 
 .macro load_filter_row dst, src, inc
         asr             w13, \src, #10
diff --git a/src/arm/64/msac.S b/src/arm/64/msac.S
index 3a6cf90..7bef924 100644
--- a/src/arm/64/msac.S
+++ b/src/arm/64/msac.S
@@ -208,60 +208,66 @@
         sub             w4,  w4,  w3           // rng = u - v
         clz             w5,  w4                // clz(rng)
         eor             w5,  w5,  #16          // d = clz(rng) ^ 16
-        mvn             x7,  x7                // ~dif
-        add             x7,  x7,  x3, lsl #48  // ~dif + (v << 48)
+        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
 L(renorm2):
         lsl             w4,  w4,  w5           // rng << d
         subs            w6,  w6,  w5           // cnt -= d
-        lsl             x7,  x7,  x5           // (~dif + (v << 48)) << d
+        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
         str             w4,  [x0, #RNG]
-        mvn             x7,  x7                // ~dif
-        b.hs            9f
+        b.hs            4f
 
         // refill
         ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
         add             x5,  x3,  #8
-        cmp             x5,  x4
-        b.gt            2f
+        subs            x5,  x5,  x4
+        b.hi            6f
 
-        ldr             x3,  [x3]              // next_bits
-        add             w8,  w6,  #23          // shift_bits = cnt + 23
-        add             w6,  w6,  #16          // cnt += 16
-        rev             x3,  x3                // next_bits = bswap(next_bits)
-        sub             x5,  x5,  x8, lsr #3   // buf_pos -= shift_bits >> 3
-        and             w8,  w8,  #24          // shift_bits &= 24
-        lsr             x3,  x3,  x8           // next_bits >>= shift_bits
-        sub             w8,  w8,  w6           // shift_bits -= 16 + cnt
-        str             x5,  [x0, #BUF_POS]
-        lsl             x3,  x3,  x8           // next_bits <<= shift_bits
-        mov             w4,  #48
-        sub             w6,  w4,  w8           // cnt = cnt + 64 - shift_bits
-        eor             x7,  x7,  x3           // dif ^= next_bits
-        b               9f
+        ldr             x8,  [x3]              // next_bits
+        add             w4,  w6,  #-48         // shift_bits = cnt + 16 (- 64)
+        mvn             x8,  x8
+        neg             w5,  w4
+        rev             x8,  x8                // next_bits = bswap(next_bits)
+        lsr             w5,  w5,  #3           // num_bytes_read
+        lsr             x8,  x8,  x4           // next_bits >>= (shift_bits & 63)
 
-2:      // refill_eob
-        mov             w14, #40
-        sub             w5,  w14, w6           // c = 40 - cnt
-3:
-        cmp             x3,  x4
-        b.ge            4f
-        ldrb            w8,  [x3], #1
-        lsl             x8,  x8,  x5
-        eor             x7,  x7,  x8
-        subs            w5,  w5,  #8
-        b.ge            3b
-
-4:      // refill_eob_end
+2:      // refill_end
+        add             x3,  x3,  x5
+        add             w6,  w6,  w5, lsl #3   // cnt += num_bits_read
         str             x3,  [x0, #BUF_POS]
-        sub             w6,  w14, w5           // cnt = 40 - c
 
-9:
+3:      // refill_end2
+        orr             x7,  x7,  x8           // dif |= next_bits
+
+4:      // end
         str             w6,  [x0, #CNT]
         str             x7,  [x0, #DIF]
 
         mov             w0,  w15
         add             sp,  sp,  #48
         ret
+
+5:      // pad_with_ones
+        add             w8,  w6,  #-16
+        ror             x8,  x8,  x8
+        b               3b
+
+6:      // refill_eob
+        cmp             x3,  x4
+        b.hs            5b
+
+        ldr             x8,  [x4, #-8]
+        lsl             w5,  w5,  #3
+        lsr             x8,  x8,  x5
+        add             w5,  w6,  #-48
+        mvn             x8,  x8
+        sub             w4,  w4,  w3           // num_bytes_left
+        rev             x8,  x8
+        lsr             x8,  x8,  x5
+        neg             w5,  w5
+        lsr             w5,  w5,  #3
+        cmp             w5,  w4
+        csel            w5,  w5,  w4,  lo      // num_bytes_read
+        b               2b
 endfunc
 
 function msac_decode_symbol_adapt8_neon, export=1
@@ -334,54 +340,37 @@
         sub             w4,  w4,  w3           // rng = u - v
         clz             w5,  w4                // clz(rng)
         eor             w5,  w5,  #16          // d = clz(rng) ^ 16
-        mvn             x7,  x7                // ~dif
-        add             x7,  x7,  x3, lsl #48  // ~dif + (v << 48)
+        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
         lsl             w4,  w4,  w5           // rng << d
         subs            w6,  w6,  w5           // cnt -= d
-        lsl             x7,  x7,  x5           // (~dif + (v << 48)) << d
+        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
         str             w4,  [x0, #RNG]
         dup             v3.4h,   w4
-        mvn             x7,  x7                // ~dif
-        b.hs            9f
+        b.hs            5f
 
         // refill
         ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
         add             x5,  x3,  #8
-        cmp             x5,  x4
-        b.gt            2f
+        subs            x5,  x5,  x4
+        b.hi            7f
 
-        ldr             x3,  [x3]              // next_bits
-        add             w8,  w6,  #23          // shift_bits = cnt + 23
-        add             w6,  w6,  #16          // cnt += 16
-        rev             x3,  x3                // next_bits = bswap(next_bits)
-        sub             x5,  x5,  x8, lsr #3   // buf_pos -= shift_bits >> 3
-        and             w8,  w8,  #24          // shift_bits &= 24
-        lsr             x3,  x3,  x8           // next_bits >>= shift_bits
-        sub             w8,  w8,  w6           // shift_bits -= 16 + cnt
-        str             x5,  [x0, #BUF_POS]
-        lsl             x3,  x3,  x8           // next_bits <<= shift_bits
-        mov             w4,  #48
-        sub             w6,  w4,  w8           // cnt = cnt + 64 - shift_bits
-        eor             x7,  x7,  x3           // dif ^= next_bits
-        b               9f
+        ldr             x8,  [x3]              // next_bits
+        add             w4,  w6,  #-48         // shift_bits = cnt + 16 (- 64)
+        mvn             x8,  x8
+        neg             w5,  w4
+        rev             x8,  x8                // next_bits = bswap(next_bits)
+        lsr             w5,  w5,  #3           // num_bytes_read
+        lsr             x8,  x8,  x4           // next_bits >>= (shift_bits & 63)
 
-2:      // refill_eob
-        mov             w14, #40
-        sub             w5,  w14, w6           // c = 40 - cnt
-3:
-        cmp             x3,  x4
-        b.ge            4f
-        ldrb            w8,  [x3], #1
-        lsl             x8,  x8,  x5
-        eor             x7,  x7,  x8
-        subs            w5,  w5,  #8
-        b.ge            3b
-
-4:      // refill_eob_end
+3:      // refill_end
+        add             x3,  x3,  x5
+        add             w6,  w6,  w5, lsl #3   // cnt += num_bits_read
         str             x3,  [x0, #BUF_POS]
-        sub             w6,  w14, w5           // cnt = 40 - c
 
-9:
+4:      // refill_end2
+        orr             x7,  x7,  x8           // dif |= next_bits
+
+5:      // end
         lsl             w15, w15, #1
         sub             w15, w15, #5
         lsr             x12, x7,  #48
@@ -394,6 +383,29 @@
         str             x7,  [x0, #DIF]
         lsr             w0,  w13, #1
         ret
+
+6:      // pad_with_ones
+        add             w8,  w6,  #-16
+        ror             x8,  x8,  x8
+        b               4b
+
+7:      // refill_eob
+        cmp             x3,  x4
+        b.hs            6b
+
+        ldr             x8,  [x4, #-8]
+        lsl             w5,  w5,  #3
+        lsr             x8,  x8,  x5
+        add             w5,  w6,  #-48
+        mvn             x8,  x8
+        sub             w4,  w4,  w3           // num_bytes_left
+        rev             x8,  x8
+        lsr             x8,  x8,  x5
+        neg             w5,  w5
+        lsr             w5,  w5,  #3
+        cmp             w5,  w4
+        csel            w5,  w5,  w4,  lo      // num_bytes_read
+        b               3b
 endfunc
 
 function msac_decode_bool_equi_neon, export=1
@@ -410,7 +422,6 @@
         csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;
 
         clz             w5,  w4                // clz(rng)
-        mvn             x7,  x7                // ~dif
         eor             w5,  w5,  #16          // d = clz(rng) ^ 16
         b               L(renorm2)
 endfunc
@@ -431,7 +442,6 @@
         csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;
 
         clz             w5,  w4                // clz(rng)
-        mvn             x7,  x7                // ~dif
         eor             w5,  w5,  #16          // d = clz(rng) ^ 16
         b               L(renorm2)
 endfunc
@@ -455,7 +465,6 @@
         ldr             w10, [x0, #ALLOW_UPDATE_CDF]
 
         clz             w5,  w4                // clz(rng)
-        mvn             x7,  x7                // ~dif
         eor             w5,  w5,  #16          // d = clz(rng) ^ 16
 
         cbz             w10, L(renorm2)
diff --git a/src/arm/64/util.S b/src/arm/64/util.S
index 9013fd4..64d73e3 100644
--- a/src/arm/64/util.S
+++ b/src/arm/64/util.S
@@ -149,6 +149,35 @@
         trn2            \r7\().2d,  \t9\().2d,  \r7\().2d
 .endm
 
+.macro transpose_8x8h_mov r0, r1, r2, r3, r4, r5, r6, r7, t8, t9, o0, o1, o2, o3, o4, o5, o6, o7
+        trn1            \t8\().8h,  \r0\().8h,  \r1\().8h
+        trn2            \t9\().8h,  \r0\().8h,  \r1\().8h
+        trn1            \r1\().8h,  \r2\().8h,  \r3\().8h
+        trn2            \r3\().8h,  \r2\().8h,  \r3\().8h
+        trn1            \r0\().8h,  \r4\().8h,  \r5\().8h
+        trn2            \r5\().8h,  \r4\().8h,  \r5\().8h
+        trn1            \r2\().8h,  \r6\().8h,  \r7\().8h
+        trn2            \r7\().8h,  \r6\().8h,  \r7\().8h
+
+        trn1            \r4\().4s,  \r0\().4s,  \r2\().4s
+        trn2            \r2\().4s,  \r0\().4s,  \r2\().4s
+        trn1            \r6\().4s,  \r5\().4s,  \r7\().4s
+        trn2            \r7\().4s,  \r5\().4s,  \r7\().4s
+        trn1            \r5\().4s,  \t9\().4s,  \r3\().4s
+        trn2            \t9\().4s,  \t9\().4s,  \r3\().4s
+        trn1            \r3\().4s,  \t8\().4s,  \r1\().4s
+        trn2            \t8\().4s,  \t8\().4s,  \r1\().4s
+
+        trn1            \o0\().2d,  \r3\().2d,  \r4\().2d
+        trn2            \o4\().2d,  \r3\().2d,  \r4\().2d
+        trn1            \o1\().2d,  \r5\().2d,  \r6\().2d
+        trn2            \o5\().2d,  \r5\().2d,  \r6\().2d
+        trn2            \o6\().2d,  \t8\().2d,  \r2\().2d
+        trn1            \o2\().2d,  \t8\().2d,  \r2\().2d
+        trn1            \o3\().2d,  \t9\().2d,  \r7\().2d
+        trn2            \o7\().2d,  \t9\().2d,  \r7\().2d
+.endm
+
 .macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9
         trn1            \t8\().16b, \r0\().16b, \r1\().16b
         trn2            \t9\().16b, \r0\().16b, \r1\().16b
@@ -226,4 +255,16 @@
         trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
 .endm
 
+.macro  transpose_4x8h_mov r0, r1, r2, r3, t4, t5, t6, t7, o0, o1, o2, o3
+        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
+        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
+        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
+        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h
+
+        trn1            \o0\().4s,  \t4\().4s,  \t6\().4s
+        trn2            \o2\().4s,  \t4\().4s,  \t6\().4s
+        trn1            \o1\().4s,  \t5\().4s,  \t7\().4s
+        trn2            \o3\().4s,  \t5\().4s,  \t7\().4s
+.endm
+
 #endif /* DAV1D_SRC_ARM_64_UTIL_S */
diff --git a/src/arm/asm.S b/src/arm/asm.S
index dc50415..fed73b3 100644
--- a/src/arm/asm.S
+++ b/src/arm/asm.S
@@ -34,6 +34,50 @@
 #define x18 do_not_use_x18
 #define w18 do_not_use_w18
 
+#if HAVE_AS_ARCH_DIRECTIVE
+        .arch AS_ARCH_LEVEL
+#endif
+
+#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
+#define ENABLE_DOTPROD  .arch_extension dotprod
+#define DISABLE_DOTPROD .arch_extension nodotprod
+#else
+#define ENABLE_DOTPROD
+#define DISABLE_DOTPROD
+#endif
+#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
+#define ENABLE_I8MM  .arch_extension i8mm
+#define DISABLE_I8MM .arch_extension noi8mm
+#else
+#define ENABLE_I8MM
+#define DISABLE_I8MM
+#endif
+#if HAVE_AS_ARCHEXT_SVE_DIRECTIVE
+#define ENABLE_SVE  .arch_extension sve
+#define DISABLE_SVE .arch_extension nosve
+#else
+#define ENABLE_SVE
+#define DISABLE_SVE
+#endif
+#if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE
+#define ENABLE_SVE2  .arch_extension sve2
+#define DISABLE_SVE2 .arch_extension nosve2
+#else
+#define ENABLE_SVE2
+#define DISABLE_SVE2
+#endif
+
+/* If we do support the .arch_extension directives, disable support for all
+ * the extensions that we may use, in case they were implicitly enabled by
+ * the .arch level. This makes it clear if we try to assemble an instruction
+ * from an unintended extension set; we only allow assembling such instructions
+ * within regions where we explicitly enable those extensions. */
+DISABLE_DOTPROD
+DISABLE_I8MM
+DISABLE_SVE
+DISABLE_SVE2
+
+
 /* Support macros for
  *   - Armv8.3-A Pointer Authentication and
  *   - Armv8.5-A Branch Target Identification
diff --git a/src/arm/cpu.c b/src/arm/cpu.c
index b7a0d3a..d9b1751 100644
--- a/src/arm/cpu.c
+++ b/src/arm/cpu.c
@@ -31,22 +31,95 @@
 
 #include "src/arm/cpu.h"
 
-#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
-// NEON is always available; runtime tests are not needed.
-#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
+#if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO)
 #include <sys/auxv.h>
 
+#if ARCH_AARCH64
+
+#define HWCAP_AARCH64_ASIMDDP (1 << 20)
+#define HWCAP_AARCH64_SVE     (1 << 22)
+#define HWCAP2_AARCH64_SVE2   (1 << 1)
+#define HWCAP2_AARCH64_I8MM   (1 << 13)
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+#ifdef HAVE_GETAUXVAL
+    unsigned long hw_cap = getauxval(AT_HWCAP);
+    unsigned long hw_cap2 = getauxval(AT_HWCAP2);
+#else
+    unsigned long hw_cap = 0;
+    unsigned long hw_cap2 = 0;
+    elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+    elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2));
+#endif
+
+    unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+    flags |= (hw_cap & HWCAP_AARCH64_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
+    flags |= (hw_cap2 & HWCAP2_AARCH64_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
+    flags |= (hw_cap & HWCAP_AARCH64_SVE) ? DAV1D_ARM_CPU_FLAG_SVE : 0;
+    flags |= (hw_cap2 & HWCAP2_AARCH64_SVE2) ? DAV1D_ARM_CPU_FLAG_SVE2 : 0;
+    return flags;
+}
+#else  /* !ARCH_AARCH64 */
+
 #ifndef HWCAP_ARM_NEON
-#define HWCAP_ARM_NEON (1 << 12)
+#define HWCAP_ARM_NEON    (1 << 12)
 #endif
-#define NEON_HWCAP HWCAP_ARM_NEON
+#define HWCAP_ARM_ASIMDDP (1 << 24)
+#define HWCAP_ARM_I8MM    (1 << 27)
 
-#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
-#include <sys/auxv.h>
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+#ifdef HAVE_GETAUXVAL
+    unsigned long hw_cap = getauxval(AT_HWCAP);
+#else
+    unsigned long hw_cap = 0;
+    elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
+#endif
 
-#define NEON_HWCAP HWCAP_NEON
+    unsigned flags = (hw_cap & HWCAP_ARM_NEON) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+    flags |= (hw_cap & HWCAP_ARM_ASIMDDP) ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
+    flags |= (hw_cap & HWCAP_ARM_I8MM) ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
+    return flags;
+}
+#endif /* ARCH_AARCH64 */
+
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+
+static int have_feature(const char *feature) {
+    int supported = 0;
+    size_t size = sizeof(supported);
+    if (sysctlbyname(feature, &supported, &size, NULL, 0) != 0) {
+        return 0;
+    }
+    return supported;
+}
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+    unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+    if (have_feature("hw.optional.arm.FEAT_DotProd"))
+        flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
+    if (have_feature("hw.optional.arm.FEAT_I8MM"))
+        flags |= DAV1D_ARM_CPU_FLAG_I8MM;
+    /* No SVE or SVE2 feature detection is available on Apple platforms. */
+    return flags;
+}
+
+#elif defined(_WIN32)
+#include <windows.h>
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+    unsigned flags = DAV1D_ARM_CPU_FLAG_NEON;
+#ifdef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
+    if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE))
+        flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
+#endif
+    /* No I8MM or SVE feature detection available on Windows at the time of
+     * writing. */
+    return flags;
+}
 
 #elif defined(__ANDROID__)
+#include <ctype.h>
 #include <stdio.h>
 #include <string.h>
 
@@ -58,18 +131,25 @@
     char line_buffer[120];
     const char *line;
 
+    size_t flaglen = strlen(flag);
     while ((line = fgets(line_buffer, sizeof(line_buffer), file))) {
-        if (strstr(line, flag)) {
-            fclose(file);
-            return 1;
+        // check all occurrences as whole words
+        const char *found = line;
+        while ((found = strstr(found, flag))) {
+            if ((found == line_buffer || !isgraph(found[-1])) &&
+                (isspace(found[flaglen]) || feof(file))) {
+                fclose(file);
+                return 1;
+            }
+            found += flaglen;
         }
         // if line is incomplete seek back to avoid splitting the search
         // string into two buffers
-        if (!strchr(line, '\n') && strlen(line) > strlen(flag)) {
+        if (!strchr(line, '\n') && strlen(line) > flaglen) {
             // use fseek since the 64 bit fseeko is only available since
             // Android API level 24 and meson defines _FILE_OFFSET_BITS
             // by default 64
-            if (fseek(file, -strlen(flag), SEEK_CUR))
+            if (fseek(file, -flaglen, SEEK_CUR))
                 break;
         }
     }
@@ -78,22 +158,23 @@
 
     return 0;
 }
-#endif
 
 COLD unsigned dav1d_get_cpu_flags_arm(void) {
-    unsigned flags = 0;
-#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
-    flags |= DAV1D_ARM_CPU_FLAG_NEON;
-#elif defined(HAVE_GETAUXVAL) && ARCH_ARM
-    unsigned long hw_cap = getauxval(AT_HWCAP);
-    flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
-#elif defined(HAVE_ELF_AUX_INFO) && ARCH_ARM
-    unsigned long hw_cap = 0;
-    elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
-    flags |= (hw_cap & NEON_HWCAP) ? DAV1D_ARM_CPU_FLAG_NEON : 0;
-#elif defined(__ANDROID__)
-    flags |= parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
-#endif
-
+    unsigned flags = parse_proc_cpuinfo("neon") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+    flags |= parse_proc_cpuinfo("asimd") ? DAV1D_ARM_CPU_FLAG_NEON : 0;
+    flags |= parse_proc_cpuinfo("asimddp") ? DAV1D_ARM_CPU_FLAG_DOTPROD : 0;
+    flags |= parse_proc_cpuinfo("i8mm") ? DAV1D_ARM_CPU_FLAG_I8MM : 0;
+#if ARCH_AARCH64
+    flags |= parse_proc_cpuinfo("sve") ? DAV1D_ARM_CPU_FLAG_SVE : 0;
+    flags |= parse_proc_cpuinfo("sve2") ? DAV1D_ARM_CPU_FLAG_SVE2 : 0;
+#endif /* ARCH_AARCH64 */
     return flags;
 }
+
+#else  /* Unsupported OS */
+
+COLD unsigned dav1d_get_cpu_flags_arm(void) {
+    return 0;
+}
+
+#endif
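Note on the /proc/cpuinfo change above: the Android fallback now matches feature names only as whole words, so probing for "neon" or "sve" cannot spuriously hit a longer token such as "sve2". A minimal standalone C sketch of the same test (it assumes a single complete line and omits the buffer-refill handling, so it is an illustration, not dav1d's exact code):

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* Whole-word search, mirroring the check added to parse_proc_cpuinfo(). */
    static int has_flag_word(const char *line, const char *flag) {
        const size_t flaglen = strlen(flag);
        const char *found = line;
        while ((found = strstr(found, flag))) {
            if ((found == line || !isgraph((unsigned char)found[-1])) &&
                (isspace((unsigned char)found[flaglen]) || found[flaglen] == '\0'))
                return 1;
            found += flaglen;   /* keep scanning past partial matches */
        }
        return 0;
    }

    int main(void) {
        const char *features = "Features : fp asimd asimddp sve2\n";
        printf("sve2: %d\n", has_flag_word(features, "sve2")); /* prints 1 */
        printf("sve : %d\n", has_flag_word(features, "sve"));  /* prints 0 */
        return 0;
    }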
diff --git a/src/arm/cpu.h b/src/arm/cpu.h
index 8c10a1b..de9bde6 100644
--- a/src/arm/cpu.h
+++ b/src/arm/cpu.h
@@ -30,6 +30,10 @@
 
 enum CpuFlags {
     DAV1D_ARM_CPU_FLAG_NEON = 1 << 0,
+    DAV1D_ARM_CPU_FLAG_DOTPROD = 1 << 1,
+    DAV1D_ARM_CPU_FLAG_I8MM = 1 << 2,
+    DAV1D_ARM_CPU_FLAG_SVE = 1 << 3,
+    DAV1D_ARM_CPU_FLAG_SVE2 = 1 << 4,
 };
 
 unsigned dav1d_get_cpu_flags_arm(void);
diff --git a/src/arm/itx.h b/src/arm/itx.h
index 2ecd086..17234e0 100644
--- a/src/arm/itx.h
+++ b/src/arm/itx.h
@@ -117,9 +117,11 @@
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
+    assign_itx_fn( , 4, 4, wht_wht,           WHT_WHT,           neon);
+
     if (BITDEPTH == 16 && bpc != 10) return;
 
-    assign_itx17_fn( ,  4,  4, neon);
+    assign_itx16_fn( ,  4,  4, neon);
     assign_itx16_fn(R,  4,  8, neon);
     assign_itx16_fn(R,  4, 16, neon);
     assign_itx16_fn(R,  8,  4, neon);
diff --git a/src/arm/msac.h b/src/arm/msac.h
index 9db0bf8..6eee0da 100644
--- a/src/arm/msac.h
+++ b/src/arm/msac.h
@@ -39,7 +39,7 @@
 unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
 unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
 
-#if ARCH_AARCH64 || defined(__ARM_NEON)
+#if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
 #define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_neon
 #define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_neon
 #define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
diff --git a/src/cpu.h b/src/cpu.h
index c9009c7..d20c5f0 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -64,6 +64,20 @@
 #if defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32) || ARCH_AARCH64
     flags |= DAV1D_ARM_CPU_FLAG_NEON;
 #endif
+#ifdef __ARM_FEATURE_DOTPROD
+    flags |= DAV1D_ARM_CPU_FLAG_DOTPROD;
+#endif
+#ifdef __ARM_FEATURE_MATMUL_INT8
+    flags |= DAV1D_ARM_CPU_FLAG_I8MM;
+#endif
+#if ARCH_AARCH64
+#ifdef __ARM_FEATURE_SVE
+    flags |= DAV1D_ARM_CPU_FLAG_SVE;
+#endif
+#ifdef __ARM_FEATURE_SVE2
+    flags |= DAV1D_ARM_CPU_FLAG_SVE2;
+#endif
+#endif /* ARCH_AARCH64 */
 #elif ARCH_PPC64LE
 #if defined(__VSX__)
     flags |= DAV1D_PPC_CPU_FLAG_VSX;
diff --git a/src/ext/x86/x86inc.asm b/src/ext/x86/x86inc.asm
index 68b1f74..2282d9b 100644
--- a/src/ext/x86/x86inc.asm
+++ b/src/ext/x86/x86inc.asm
@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* x86inc.asm: x86 abstraction layer
 ;*****************************************************************************
-;* Copyright (C) 2005-2022 x264 project
+;* Copyright (C) 2005-2024 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Henrik Gramner <henrik@gramner.com>
@@ -104,7 +104,7 @@
 %endif
 
 %define HAVE_PRIVATE_EXTERN 1
-%ifdef __NASM_VER__
+%ifdef __NASM_VERSION_ID__
     %use smartalign
     %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
         %define HAVE_PRIVATE_EXTERN 0
@@ -845,9 +845,26 @@
     %1: %2
 %endmacro
 
-; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
 %if FORMAT_ELF
+    ; The GNU linker assumes the stack is executable by default.
     [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
+
+    %ifdef __NASM_VERSION_ID__
+        %if __NASM_VERSION_ID__ >= 0x020e0300 ; 2.14.03
+            %if ARCH_X86_64
+                ; Control-flow Enforcement Technology (CET) properties.
+                [SECTION .note.gnu.property alloc noexec nowrite note align=gprsize]
+                dd 0x00000004  ; n_namesz
+                dd gprsize + 8 ; n_descsz
+                dd 0x00000005  ; n_type = NT_GNU_PROPERTY_TYPE_0
+                db "GNU",0     ; n_name
+                dd 0xc0000002  ; pr_type = GNU_PROPERTY_X86_FEATURE_1_AND
+                dd 0x00000004  ; pr_datasz
+                dd 0x00000002  ; pr_data = GNU_PROPERTY_X86_FEATURE_1_SHSTK
+                dd 0x00000000  ; pr_padding
+            %endif
+        %endif
+    %endif
 %endif
 
 ; Tell debuggers how large the function was.
@@ -883,21 +900,22 @@
 %assign cpuflags_sse4      (1<<10) | cpuflags_ssse3
 %assign cpuflags_sse42     (1<<11) | cpuflags_sse4
 %assign cpuflags_aesni     (1<<12) | cpuflags_sse42
-%assign cpuflags_gfni      (1<<13) | cpuflags_sse42
-%assign cpuflags_avx       (1<<14) | cpuflags_sse42
-%assign cpuflags_xop       (1<<15) | cpuflags_avx
-%assign cpuflags_fma4      (1<<16) | cpuflags_avx
-%assign cpuflags_fma3      (1<<17) | cpuflags_avx
-%assign cpuflags_bmi1      (1<<18) | cpuflags_avx|cpuflags_lzcnt
-%assign cpuflags_bmi2      (1<<19) | cpuflags_bmi1
-%assign cpuflags_avx2      (1<<20) | cpuflags_fma3|cpuflags_bmi2
-%assign cpuflags_avx512    (1<<21) | cpuflags_avx2 ; F, CD, BW, DQ, VL
-%assign cpuflags_avx512icl (1<<22) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
+%assign cpuflags_clmul     (1<<13) | cpuflags_sse42
+%assign cpuflags_gfni      (1<<14) | cpuflags_aesni|cpuflags_clmul
+%assign cpuflags_avx       (1<<15) | cpuflags_sse42
+%assign cpuflags_xop       (1<<16) | cpuflags_avx
+%assign cpuflags_fma4      (1<<17) | cpuflags_avx
+%assign cpuflags_fma3      (1<<18) | cpuflags_avx
+%assign cpuflags_bmi1      (1<<19) | cpuflags_avx|cpuflags_lzcnt
+%assign cpuflags_bmi2      (1<<20) | cpuflags_bmi1
+%assign cpuflags_avx2      (1<<21) | cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512    (1<<22) | cpuflags_avx2 ; F, CD, BW, DQ, VL
+%assign cpuflags_avx512icl (1<<23) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ
 
-%assign cpuflags_cache32   (1<<23)
-%assign cpuflags_cache64   (1<<24)
-%assign cpuflags_aligned   (1<<25) ; not a cpu feature, but a function variant
-%assign cpuflags_atom      (1<<26)
+%assign cpuflags_cache32   (1<<24)
+%assign cpuflags_cache64   (1<<25)
+%assign cpuflags_aligned   (1<<26) ; not a cpu feature, but a function variant
+%assign cpuflags_atom      (1<<27)
 
 ; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
 %define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
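For reference, the cpuflag(x) expression above is a branchless "all required bits present" test. A C rendering of the same arithmetic (an illustrative sketch, not part of the patch; it relies on unsigned wraparound and assumes the flag bits stay below bit 31, as in the table above):

    #include <stdint.h>

    /* Returns 1 only when every bit of `need` is set in `have`:
     * (have & need) ^ need is 0 exactly in that case, and subtracting 1
     * from 0 wraps to all-ones, so bit 31 becomes the boolean result. */
    static int cpuflag_c(uint32_t have, uint32_t need) {
        return ((((have & need) ^ need) - 1) >> 31) & 1;
    }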
@@ -939,13 +957,13 @@
     %endif
 
     %if ARCH_X86_64 || cpuflag(sse2)
-        %ifdef __NASM_VER__
+        %ifdef __NASM_VERSION_ID__
             ALIGNMODE p6
         %else
             CPU amdnop
         %endif
     %else
-        %ifdef __NASM_VER__
+        %ifdef __NASM_VERSION_ID__
             ALIGNMODE nop
         %else
             CPU basicnop
@@ -1035,6 +1053,7 @@
     %if WIN64
         AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
     %endif
+    %xdefine bcstw 1to8
     %xdefine bcstd 1to4
     %xdefine bcstq 1to2
 %endmacro
@@ -1050,6 +1069,7 @@
     INIT_CPUFLAGS %1
     DEFINE_MMREGS ymm
     AVX512_MM_PERMUTATION
+    %xdefine bcstw 1to16
     %xdefine bcstd 1to8
     %xdefine bcstq 1to4
 %endmacro
@@ -1065,6 +1085,7 @@
     INIT_CPUFLAGS %1
     DEFINE_MMREGS zmm
     AVX512_MM_PERMUTATION
+    %xdefine bcstw 1to32
     %xdefine bcstd 1to16
     %xdefine bcstq 1to8
 %endmacro
@@ -1607,11 +1628,11 @@
 AVX_INSTR pavgw, mmx2, 0, 0, 1
 AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding
 AVX_INSTR pblendw, sse4, 0, 1, 0
-AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
-AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
-AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
-AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
-AVX_INSTR pclmulqdq, fnord, 0, 1, 0
+AVX_INSTR pclmulhqhqdq, clmul, 0, 0, 0
+AVX_INSTR pclmulhqlqdq, clmul, 0, 0, 0
+AVX_INSTR pclmullqhqdq, clmul, 0, 0, 0
+AVX_INSTR pclmullqlqdq, clmul, 0, 0, 0
+AVX_INSTR pclmulqdq, clmul, 0, 1, 0
 AVX_INSTR pcmpeqb, mmx, 0, 0, 1
 AVX_INSTR pcmpeqd, mmx, 0, 0, 1
 AVX_INSTR pcmpeqq, sse4, 0, 0, 1
@@ -1766,6 +1787,7 @@
 GPR_INSTR blsmsk, bmi1
 GPR_INSTR blsr, bmi1
 GPR_INSTR bzhi, bmi2
+GPR_INSTR crc32, sse42
 GPR_INSTR mulx, bmi2
 GPR_INSTR pdep, bmi2
 GPR_INSTR pext, bmi2
diff --git a/src/itx_1d.c b/src/itx_1d.c
index ca14fc8..8f75c65 100644
--- a/src/itx_1d.c
+++ b/src/itx_1d.c
@@ -1016,6 +1016,10 @@
         c[stride * i] *= 4;
 }
 
+#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
+  ARCH_AARCH64 || \
+  (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
+))
 void dav1d_inv_wht4_1d_c(int32_t *const c, const ptrdiff_t stride) {
     assert(stride > 0);
     const int in0 = c[0 * stride], in1 = c[1 * stride];
@@ -1032,3 +1036,4 @@
     c[2 * stride] = t1;
     c[3 * stride] = t2 + t1;
 }
+#endif
diff --git a/src/itx_tmpl.c b/src/itx_tmpl.c
index 8ff245a..a226223 100644
--- a/src/itx_tmpl.c
+++ b/src/itx_tmpl.c
@@ -159,6 +159,10 @@
 inv_txfm_fn64(64, 32, 1)
 inv_txfm_fn64(64, 64, 2)
 
+#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
+  ARCH_AARCH64 || \
+  (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
+))
 static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
                                        coef *const coeff, const int eob
                                        HIGHBD_DECL_SUFFIX)
@@ -179,6 +183,7 @@
         for (int x = 0; x < 4; x++)
             dst[x] = iclip_pixel(dst[x] + *c++);
 }
+#endif
 
 #if HAVE_ASM
 #if ARCH_AARCH64 || ARCH_ARM
@@ -236,7 +241,12 @@
     c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
         inv_txfm_add_identity_adst_##w##x##h##_c; \
 
+#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
+  ARCH_AARCH64 || \
+  (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
+))
     c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
+#endif
     assign_itx_all_fn84( 4,  4, );
     assign_itx_all_fn84( 4,  8, R);
     assign_itx_all_fn84( 4, 16, R);
diff --git a/src/loongarch/msac.S b/src/loongarch/msac.S
index c371eba..5bf1825 100644
--- a/src/loongarch/msac.S
+++ b/src/loongarch/msac.S
@@ -133,55 +133,58 @@
     slli.d          t4,      t4,      48
     vpickve2gr.d    t6,      vr2,     0
     sub.d           t6,      t6,      t4   // dif
-    addi.d          t6,      t6,      1
     clz.w           t4,      t5            // d
     xori            t4,      t4,      16   // d
     sll.d           t6,      t6,      t4
-    addi.d          t6,      t6,      -1   // dif
     addi.d          a5,      a0,      28   // cnt
-    ld.w            t7,      a5,      0
-    sub.w           t7,      t7,      t4   // cnt-d
+    ld.w            t0,      a5,      0
     sll.w           t5,      t5,      t4
+    sub.w           t7,      t0,      t4   // cnt-d
     st.w            t5,      a4,      0    // store rng
-    bge             t7,      zero,    9f
+    bgeu            t0,      t4,      9f
 
     // refill
     ld.d            t0,      a0,      0    // buf_pos
-    addi.d          t1,      a0,      8
-    ld.d            t1,      t1,      0    // buf_end
+    ld.d            t1,      a0,      8    // buf_end
     addi.d          t2,      t0,      8
-    blt             t1,      t2,      1f
+    bltu            t1,      t2,      2f
 
-    ld.d            t0,      t0,      0    // next_bits
-    addi.w          t3,      t7,      23   // shift_bits = cnt + 23
-    addi.w          t7,      t7,      16   // cnt += 16
-    revb.d          t0,      t0            // next_bits = bswap(next_bits)
-    srli.w          t4,      t3,      3
-    sub.d           t2,      t2,      t4   // buf_pos -= shift_bits >> 3
-    st.d            t2,      a0,      0
-    andi            t3,      t3,      24   // shift_bits &= 24
-    srl.d           t0,      t0,      t3   // next_bits >>= shift_bits
-    sub.w           t3,      t3,      t7   // shift_bits -= 16 + cnt
-    sll.d           t0,      t0,      t3   // next_bits <<= shift_bits
-    li.w            t5,      48
-    sub.w           t7,      t5,      t3   // cnt = cnt + 64 - shift_bits
-    xor             t6,      t6,      t0   // dif ^= next_bits
-    b               9f
+    ld.d            t3,      t0,      0    // next_bits
+    addi.w          t1,      t7,      -48  // shift_bits = cnt + 16 (- 64)
+    nor             t3,      t3,      t3
+    sub.w           t2,      zero,    t1
+    revb.d          t3,      t3            // next_bits = bswap(next_bits)
+    srli.w          t2,      t2,      3    // num_bytes_read
+    srl.d           t3,      t3,      t1   // next_bits >>= (shift_bits & 63)
+    b               3f
 1:
-    li.w            t4,      40
-    sub.w           t5,      t4,      t7   // c = 40 - cnt
+    addi.w          t3,      t7,      -48
+    srl.d           t3,      t3,      t3   // pad with ones
+    b               4f
 2:
-    bge             t0,      t1,      3f
-    ld.bu           t2,      t0,      0
-    addi.d          t0,      t0,      1
-    sll.d           t2,      t2,      t5
-    xor             t6,      t6,      t2
-    addi.w          t5,      t5,      -8
-    bge             t5,      zero,    2b
-    // refill_eob_end
+    bgeu            t0,      t1,      1b
+    ld.d            t3,      t1,      -8   // next_bits
+    sub.w           t2,      t2,      t1
+    sub.w           t1,      t1,      t0   // num_bytes_left
+    slli.w          t2,      t2,      3
+    srl.d           t3,      t3,      t2
+    addi.w          t2,      t7,      -48
+    nor             t3,      t3,      t3
+    sub.w           t4,      zero,    t2
+    revb.d          t3,      t3
+    srli.w          t4,      t4,      3
+    srl.d           t3,      t3,      t2
+    sltu            t2,      t1,      t4
+    maskeqz         t1,      t1,      t2
+    masknez         t2,      t4,      t2
+    or              t2,      t2,      t1   // num_bytes_read
 3:
-    st.d            t0,      a0,      0    // s->buf_pos = buf_pos
-    sub.w           t7,      t4,      t5   // cnt = 40 - c
+    slli.w          t1,      t2,      3
+    add.d           t0,      t0,      t2
+    add.w           t7,      t7,      t1   // cnt += num_bits_read
+    st.d            t0,      a0,      0
+4:
+    or              t6,      t6,      t3   // dif |= next_bits
 9:
     st.w            t7,      a5,      0    // store cnt
     st.d            t6,      a6,      0    // store dif
@@ -208,7 +211,6 @@
     srli.w          t2,      t0,      8    // r >> 8
     mul.w           t2,      t2,      a1
     ld.w            a5,      a0,      28   // cnt
-    addi.d          t1,      t1,      1    // dif + 1
     srli.w          t2,      t2,      1
     addi.w          t2,      t2,      4    // v
     slli.d          t3,      t2,      48   // vw
@@ -226,49 +228,53 @@
     clz.w           t4,      t5            // d
     xori            t4,      t4,      16   // d
     sll.d           t6,      t6,      t4
-    addi.d          t6,      t6,      -1   // dif
-    sub.w           t7,      a5,      t4   // cnt-d
     sll.w           t5,      t5,      t4
+    sub.w           t7,      a5,      t4   // cnt-d
     st.w            t5,      a0,      24   // store rng
-    bge             t7,      zero,    9f
+    bgeu            a5,      t4,      9f
 
     // refill
     ld.d            t0,      a0,      0    // buf_pos
-    addi.d          t1,      a0,      8
-    ld.d            t1,      t1,      0    // buf_end
+    ld.d            t1,      a0,      8    // buf_end
     addi.d          t2,      t0,      8
-    blt             t1,      t2,      1f
+    bltu            t1,      t2,      2f
 
-    ld.d            t0,      t0,      0    // next_bits
-    addi.w          t3,      t7,      23   // shift_bits = cnt + 23
-    addi.w          t7,      t7,      16   // cnt += 16
-    revb.d          t0,      t0            // next_bits = bswap(next_bits)
-    srli.w          t4,      t3,      3
-    sub.d           t2,      t2,      t4   // buf_pos -= shift_bits >> 3
-    st.d            t2,      a0,      0
-    andi            t3,      t3,      24   // shift_bits &= 24
-    srl.d           t0,      t0,      t3   // next_bits >>= shift_bits
-    sub.w           t3,      t3,      t7   // shift_bits -= 16 + cnt
-    sll.d           t0,      t0,      t3   // next_bits <<= shift_bits
-    li.w            t5,      48
-    sub.w           t7,      t5,      t3   // cnt = cnt + 64 - shift_bits
-    xor             t6,      t6,      t0   // dif ^= next_bits
-    b               9f
+    ld.d            t3,      t0,      0    // next_bits
+    addi.w          t1,      t7,      -48  // shift_bits = cnt + 16 (- 64)
+    nor             t3,      t3,      t3
+    sub.w           t2,      zero,    t1
+    revb.d          t3,      t3            // next_bits = bswap(next_bits)
+    srli.w          t2,      t2,      3    // num_bytes_read
+    srl.d           t3,      t3,      t1   // next_bits >>= (shift_bits & 63)
+    b               3f
 1:
-    li.w            t4,      40
-    sub.w           t5,      t4,      t7   // c = 40 - cnt
+    addi.w          t3,      t7,      -48
+    srl.d           t3,      t3,      t3   // pad with ones
+    b               4f
 2:
-    bge             t0,      t1,      3f
-    ld.bu           t2,      t0,      0
-    addi.d          t0,      t0,      1
-    sll.d           t2,      t2,      t5
-    xor             t6,      t6,      t2
-    addi.w          t5,      t5,      -8
-    bge             t5,      zero,    2b
-    // refill_eob_end
+    bgeu            t0,      t1,      1b
+    ld.d            t3,      t1,      -8   // next_bits
+    sub.w           t2,      t2,      t1
+    sub.w           t1,      t1,      t0   // num_bytes_left
+    slli.w          t2,      t2,      3
+    srl.d           t3,      t3,      t2
+    addi.w          t2,      t7,      -48
+    nor             t3,      t3,      t3
+    sub.w           t4,      zero,    t2
+    revb.d          t3,      t3
+    srli.w          t4,      t4,      3
+    srl.d           t3,      t3,      t2
+    sltu            t2,      t1,      t4
+    maskeqz         t1,      t1,      t2
+    masknez         t2,      t4,      t2
+    or              t2,      t2,      t1   // num_bytes_read
 3:
-    st.d            t0,      a0,      0    // s->buf_pos = buf_pos
-    sub.w           t7,      t4,      t5   // cnt = 40 - c
+    slli.w          t1,      t2,      3
+    add.d           t0,      t0,      t2
+    add.w           t7,      t7,      t1   // cnt += num_bits_read
+    st.d            t0,      a0,      0
+4:
+    or              t6,      t6,      t3   // dif |= next_bits
 9:
     st.w            t7,      a0,      28   // store cnt
     st.d            t6,      a0,      16   // store dif
@@ -313,54 +319,56 @@
     st.h            t0,      a1,      2
 
 .renorm:
-    // renorm
-    addi.d          t6,      t6,      1
     clz.w           t4,      t5            // d
     xori            t4,      t4,      16   // d
     sll.d           t6,      t6,      t4
-    addi.d          t6,      t6,      -1   // dif
-    sub.w           t7,      a5,      t4   // cnt-d
     sll.w           t5,      t5,      t4
+    sub.w           t7,      a5,      t4   // cnt-d
     st.w            t5,      a0,      24   // store rng
-    bge             t7,      zero,    9f
+    bgeu            a5,      t4,      9f
 
     // refill
     ld.d            t0,      a0,      0    // buf_pos
-    addi.d          t1,      a0,      8
-    ld.d            t1,      t1,      0    // buf_end
+    ld.d            t1,      a0,      8    // buf_end
     addi.d          t2,      t0,      8
-    blt             t1,      t2,      1f
+    bltu            t1,      t2,      2f
 
-    ld.d            t0,      t0,      0    // next_bits
-    addi.w          t3,      t7,      23   // shift_bits = cnt + 23
-    addi.w          t7,      t7,      16   // cnt += 16
-    revb.d          t0,      t0            // next_bits = bswap(next_bits)
-    srli.w          t4,      t3,      3
-    sub.d           t2,      t2,      t4   // buf_pos -= shift_bits >> 3
-    st.d            t2,      a0,      0
-    andi            t3,      t3,      24   // shift_bits &= 24
-    srl.d           t0,      t0,      t3   // next_bits >>= shift_bits
-    sub.w           t3,      t3,      t7   // shift_bits -= 16 + cnt
-    sll.d           t0,      t0,      t3   // next_bits <<= shift_bits
-    li.w            t5,      48
-    sub.w           t7,      t5,      t3   // cnt = cnt + 64 - shift_bits
-    xor             t6,      t6,      t0   // dif ^= next_bits
-    b               9f
+    ld.d            t3,      t0,      0    // next_bits
+    addi.w          t1,      t7,      -48  // shift_bits = cnt + 16 (- 64)
+    nor             t3,      t3,      t3
+    sub.w           t2,      zero,    t1
+    revb.d          t3,      t3            // next_bits = bswap(next_bits)
+    srli.w          t2,      t2,      3    // num_bytes_read
+    srl.d           t3,      t3,      t1   // next_bits >>= (shift_bits & 63)
+    b               3f
 1:
-    li.w            t4,      40
-    sub.w           t5,      t4,      t7   // c = 40 - cnt
+    addi.w          t3,      t7,      -48
+    srl.d           t3,      t3,      t3   // pad with ones
+    b               4f
 2:
-    bge             t0,      t1,      3f
-    ld.bu           t2,      t0,      0
-    addi.d          t0,      t0,      1
-    sll.d           t2,      t2,      t5
-    xor             t6,      t6,      t2
-    addi.w          t5,      t5,      -8
-    bge             t5,      zero,    2b
-    // refill_eob_end
+    bgeu            t0,      t1,      1b
+    ld.d            t3,      t1,      -8   // next_bits
+    sub.w           t2,      t2,      t1
+    sub.w           t1,      t1,      t0   // num_bytes_left
+    slli.w          t2,      t2,      3
+    srl.d           t3,      t3,      t2
+    addi.w          t2,      t7,      -48
+    nor             t3,      t3,      t3
+    sub.w           t4,      zero,    t2
+    revb.d          t3,      t3
+    srli.w          t4,      t4,      3
+    srl.d           t3,      t3,      t2
+    sltu            t2,      t1,      t4
+    maskeqz         t1,      t1,      t2
+    masknez         t2,      t4,      t2
+    or              t2,      t2,      t1   // num_bytes_read
 3:
-    st.d            t0,      a0,      0    // s->buf_pos = buf_pos
-    sub.w           t7,      t4,      t5   // cnt = 40 - c
+    slli.w          t1,      t2,      3
+    add.d           t0,      t0,      t2
+    add.w           t7,      t7,      t1   // cnt += num_bits_read
+    st.d            t0,      a0,      0
+4:
+    or              t6,      t6,      t3   // dif |= next_bits
 9:
     st.w            t7,      a0,      28   // store cnt
     st.d            t6,      a0,      16   // store dif
diff --git a/src/msac.c b/src/msac.c
index 43d8ae5..971ba85 100644
--- a/src/msac.c
+++ b/src/msac.c
@@ -43,15 +43,40 @@
     const uint8_t *buf_end = s->buf_end;
     int c = EC_WIN_SIZE - s->cnt - 24;
     ec_win dif = s->dif;
-    while (c >= 0 && buf_pos < buf_end) {
-        dif ^= ((ec_win)*buf_pos++) << c;
+    do {
+        if (buf_pos >= buf_end) {
+            // set remaining bits to 1;
+            dif |= ~(~(ec_win)0xff << c);
+            break;
+        }
+        dif |= (ec_win)(*buf_pos++ ^ 0xff) << c;
         c -= 8;
-    }
+    } while (c >= 0);
     s->dif = dif;
     s->cnt = EC_WIN_SIZE - c - 24;
     s->buf_pos = buf_pos;
 }
 
+int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
+                             const int n, unsigned k)
+{
+    assert(n >> k == 8);
+
+    unsigned a = 0;
+    if (dav1d_msac_decode_bool_equi(s)) {
+        if (dav1d_msac_decode_bool_equi(s))
+            k += dav1d_msac_decode_bool_equi(s) + 1;
+        a = 1 << k;
+    }
+    const unsigned v = dav1d_msac_decode_bools(s, k) + a;
+    return ref * 2 <= n ? inv_recenter(ref, v) :
+                          n - 1 - inv_recenter(n - 1 - ref, v);
+}
+
+#if !(HAVE_ASM && TRIM_DSP_FUNCTIONS && ( \
+  ARCH_AARCH64 || \
+  (ARCH_ARM && (defined(__ARM_NEON) || defined(__APPLE__) || defined(_WIN32))) \
+))
 /* Takes updated dif and range values, renormalizes them so that
  * 32768 <= rng < 65536 (reading more bytes from the stream into dif if
  * necessary), and stores them back in the decoder context.
@@ -61,11 +86,13 @@
                             const unsigned rng)
 {
     const int d = 15 ^ (31 ^ clz(rng));
+    const int cnt = s->cnt;
     assert(rng <= 65535U);
-    s->cnt -= d;
-    s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */
+    s->dif = dif << d;
     s->rng = rng << d;
-    if (s->cnt < 0)
+    s->cnt = cnt - d;
+    // unsigned compare avoids redundant refills at eob
+    if ((unsigned)cnt < (unsigned)d)
         ctx_refill(s);
 }
 
@@ -100,22 +127,6 @@
     return !ret;
 }
 
-int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
-                             const int n, unsigned k)
-{
-    assert(n >> k == 8);
-
-    unsigned a = 0;
-    if (dav1d_msac_decode_bool_equi(s)) {
-        if (dav1d_msac_decode_bool_equi(s))
-            k += dav1d_msac_decode_bool_equi(s) + 1;
-        a = 1 << k;
-    }
-    const unsigned v = dav1d_msac_decode_bools(s, k) + a;
-    return ref * 2 <= n ? inv_recenter(ref, v) :
-                          n - 1 - inv_recenter(n - 1 - ref, v);
-}
-
 /* Decodes a symbol given an inverse cumulative distribution function (CDF)
  * table in Q15. */
 unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
@@ -188,13 +199,14 @@
     }
     return tok;
 }
+#endif
 
 void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
                      const size_t sz, const int disable_cdf_update_flag)
 {
     s->buf_pos = data;
     s->buf_end = data + sz;
-    s->dif = ((ec_win)1 << (EC_WIN_SIZE - 1)) - 1;
+    s->dif = 0;
     s->rng = 0x8000;
     s->cnt = -15;
     s->allow_update_cdf = !disable_cdf_update_flag;
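The msac.c hunk above switches the refill scheme: dif now starts at 0, refilled bytes are complemented (^ 0xff) as they are OR-ed in, the end-of-buffer case pads the remaining low bits with ones, and the unsigned compare of cnt against d skips redundant refills at the end of the buffer. A minimal arithmetic sketch of the padding mask used in ctx_refill() (standalone, with c = 40 chosen purely for illustration):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        /* With a 64-bit window and c = 40 (the next byte would land in
         * bits 40..47), the mask covers bits 0..47, so OR-ing it in sets
         * every bit at and below that byte position to 1. */
        const int c = 40;                     /* example value only */
        const uint64_t pad = ~(~(uint64_t)0xff << c);
        assert(pad == 0x0000ffffffffffffULL);
        uint64_t dif = 0xabcd000000000000ULL; /* hypothetical window state */
        dif |= pad;                           /* "set remaining bits to 1" */
        assert(dif == 0xabcdffffffffffffULL);
        return 0;
    }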
diff --git a/src/ppc/cdef_tmpl.c b/src/ppc/cdef_tmpl.c
index e2e7598..6ef87ad 100644
--- a/src/ppc/cdef_tmpl.c
+++ b/src/ppc/cdef_tmpl.c
@@ -29,11 +29,10 @@
 
 #if BITDEPTH == 8
 static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
-                               const int damping)
+                               const uint16_t shift)
 {
     const i16x8 zero = vec_splat_s16(0);
     if (!threshold) return zero;
-    const uint16_t shift = imax(0, damping - ulog2(threshold));
     const i16x8 abs_diff = vec_abs(diff);
     const b16x8 mask = vec_cmplt(diff, zero);
     const i16x8 thr = vec_splats(threshold);
@@ -44,7 +43,7 @@
     return vec_sel(min, neg, mask);
 }
 
-static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+static inline void copy4xN(uint16_t *tmp,
                            const uint8_t *src, const ptrdiff_t src_stride,
                            const uint8_t (*left)[2], const uint8_t *const top,
                            const uint8_t *const bottom, const int w, const int h,
@@ -114,7 +113,7 @@
     }
 }
 
-static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
+static inline void copy8xN(uint16_t *tmp,
                            const uint8_t *src, const ptrdiff_t src_stride,
                            const uint8_t (*left)[2], const uint8_t *const top,
                            const uint8_t *const bottom, const int w, const int h,
@@ -218,16 +217,12 @@
 
 #define LOAD_PIX(addr) \
     const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \
-    i16x8 max = px; \
-    i16x8 min = px; \
     i16x8 sum = vec_splat_s16(0);
 
 #define LOAD_PIX4(addr) \
     const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \
-    const i16x8 b = (i16x8)vec_vsx_ld(0, addr + tmp_stride); \
+    const i16x8 b = (i16x8)vec_vsx_ld(0, addr + 8); \
     const i16x8 px = vec_xxpermdi(a, b, 0); \
-    i16x8 max = px; \
-    i16x8 min = px; \
     i16x8 sum = vec_splat_s16(0);
 
 #define LOAD_DIR(p, addr, o0, o1) \
@@ -238,22 +233,26 @@
 
 #define LOAD_DIR4(p, addr, o0, o1) \
     LOAD_DIR(p ## a, addr, o0, o1) \
-    LOAD_DIR(p ## b, addr + tmp_stride, o0, o1) \
+    LOAD_DIR(p ## b, addr + 8, o0, o1) \
     const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \
     const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \
     const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \
     const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0);
 
-#define CONSTRAIN(p, strength) \
+#define CONSTRAIN(p, strength, shift) \
     const i16x8 p ## _d0 = vec_sub(p ## 0, px); \
     const i16x8 p ## _d1 = vec_sub(p ## 1, px); \
     const i16x8 p ## _d2 = vec_sub(p ## 2, px); \
     const i16x8 p ## _d3 = vec_sub(p ## 3, px); \
 \
-    i16x8 p ## _c0 = vconstrain(p ## _d0, strength, damping); \
-    i16x8 p ## _c1 = vconstrain(p ## _d1, strength, damping); \
-    i16x8 p ## _c2 = vconstrain(p ## _d2, strength, damping); \
-    i16x8 p ## _c3 = vconstrain(p ## _d3, strength, damping);
+    i16x8 p ## _c0 = vconstrain(p ## _d0, strength, shift); \
+    i16x8 p ## _c1 = vconstrain(p ## _d1, strength, shift); \
+    i16x8 p ## _c2 = vconstrain(p ## _d2, strength, shift); \
+    i16x8 p ## _c3 = vconstrain(p ## _d3, strength, shift);
+
+#define SETUP_MINMAX \
+    i16x8 max = px; \
+    i16x8 min = px; \
 
 #define MIN_MAX(p) \
     max = max_mask(p ## 0, max); \
@@ -265,19 +264,16 @@
     max = max_mask(p ## 3, max); \
     min = vec_min(p ## 3, min);
 
-#define PRI_0(p) \
-    p ## _c0 = vec_add(vec_sl(p ## _c0, vec_splat_u16(1)), vec_sl(p ## _c0, vec_splats(tap_even))); \
-    p ## _c1 = vec_add(vec_sl(p ## _c1, vec_splat_u16(1)), vec_sl(p ## _c1, vec_splats(tap_even)));
+#define MAKE_TAPS \
+    const int16_t tap_odd = (pri_strength >> bitdepth_min_8) & 1; \
+    const i16x8 tap0 = vec_splats((int16_t)(4 - tap_odd)); \
+    const i16x8 tap1 = vec_splats((int16_t)(2 + tap_odd));
 
-#define PRI_1(p) \
-    p ## _c2 = vec_sub(vec_sl(p ## _c2, vec_splat_u16(2)), vec_sl(p ## _c2, vec_splats(tap_even))); \
-    p ## _c3 = vec_sub(vec_sl(p ## _c3, vec_splat_u16(2)), vec_sl(p ## _c3, vec_splats(tap_even)));
-
-#define SEC_0(p) \
-    p ## _c0 = vec_sl(p ## _c0, vec_splat_u16(1)); \
-    p ## _c1 = vec_sl(p ## _c1, vec_splat_u16(1)); \
-    p ## _c2 = vec_sl(p ## _c2, vec_splat_u16(1)); \
-    p ## _c3 = vec_sl(p ## _c3, vec_splat_u16(1));
+#define PRI_0_UPDATE_SUM(p) \
+    sum = vec_madd(tap0, p ## _c0, sum); \
+    sum = vec_madd(tap0, p ## _c1, sum); \
+    sum = vec_madd(tap1, p ## _c2, sum); \
+    sum = vec_madd(tap1, p ## _c3, sum);
 
 #define UPDATE_SUM(p) \
     const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \
@@ -285,36 +281,152 @@
     sum = vec_add(sum, p ## sum0); \
     sum = vec_add(sum, p ## sum1);
 
+#define SEC_0_UPDATE_SUM(p) \
+    sum = vec_madd(vec_splat_s16(2), p ## _c0, sum); \
+    sum = vec_madd(vec_splat_s16(2), p ## _c1, sum); \
+    sum = vec_madd(vec_splat_s16(2), p ## _c2, sum); \
+    sum = vec_madd(vec_splat_s16(2), p ## _c3, sum);
+
+#define BIAS \
+    i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1)); \
+    bias = vec_sub(vec_splat_s16(8), bias); \
+
+#define STORE4 \
+    dst[0] = vdst[0]; \
+    dst[1] = vdst[1]; \
+    dst[2] = vdst[2]; \
+    dst[3] = vdst[3]; \
+\
+    tmp += 8; \
+    dst += PXSTRIDE(dst_stride); \
+    dst[0] = vdst[4]; \
+    dst[1] = vdst[5]; \
+    dst[2] = vdst[6]; \
+    dst[3] = vdst[7]; \
+\
+    tmp += 8; \
+    dst += PXSTRIDE(dst_stride);
+
+#define STORE4_CLAMPED \
+    BIAS \
+    i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+    i16x8 vdst = vec_max(vec_min(unclamped, max), min); \
+    STORE4
+
+#define STORE4_UNCLAMPED \
+    BIAS \
+    i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+    STORE4
+
+#define STORE8 \
+    dst[0] = vdst[0]; \
+    dst[1] = vdst[1]; \
+    dst[2] = vdst[2]; \
+    dst[3] = vdst[3]; \
+    dst[4] = vdst[4]; \
+    dst[5] = vdst[5]; \
+    dst[6] = vdst[6]; \
+    dst[7] = vdst[7]; \
+\
+    tmp += 16; \
+    dst += PXSTRIDE(dst_stride);
+
+#define STORE8_CLAMPED \
+    BIAS \
+    i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+    i16x8 vdst = vec_max(vec_min(unclamped, max), min); \
+    STORE8
+
+#define STORE8_UNCLAMPED \
+    BIAS \
+    i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
+    STORE8
+
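The BIAS and STORE*_CLAMPED macros defined above implement CDEF's usual rounding, px + ((sum + 8 - (sum < 0)) >> 4), clamped to the min/max of the sampled taps. A scalar C sketch of that step (illustration only; the function name and signature are hypothetical, and the shift is assumed to be arithmetic, matching vec_sra):

    #include <stdint.h>

    static int16_t cdef_round_clamp(int16_t px, int16_t sum,
                                    int16_t lo, int16_t hi) {
        const int16_t bias = (int16_t)(8 - (sum < 0)); /* 8, or 7 if sum < 0 */
        int16_t v = (int16_t)(px + ((sum + bias) >> 4));
        if (v < lo) v = lo;
        if (v > hi) v = hi;
        return v;
    }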
+#define DIRECTIONS(w, tmp_stride) \
+    static const int8_t cdef_directions##w[8 /* dir */][2 /* pass */] = { \
+        { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, \
+        {  0 * tmp_stride + 1, -1 * tmp_stride + 2 }, \
+        {  0 * tmp_stride + 1,  0 * tmp_stride + 2 }, \
+        {  0 * tmp_stride + 1,  1 * tmp_stride + 2 }, \
+        {  1 * tmp_stride + 1,  2 * tmp_stride + 2 }, \
+        {  1 * tmp_stride + 0,  2 * tmp_stride + 1 }, \
+        {  1 * tmp_stride + 0,  2 * tmp_stride + 0 }, \
+        {  1 * tmp_stride + 0,  2 * tmp_stride - 1 } \
+    };
+
+DIRECTIONS(4, 8)
+DIRECTIONS(8, 16)
+
 static inline void
 filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
            const pixel (*left)[2], const pixel *const top,
            const pixel *const bottom, const int w, const int h,
            const int pri_strength, const int sec_strength, const int dir,
-           const int damping, const enum CdefEdgeFlags edges,
-           const ptrdiff_t tmp_stride, uint16_t *tmp)
+           const int pri_shift, const int sec_shift,
+           const enum CdefEdgeFlags edges, uint16_t *tmp)
 {
-    const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
-        { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
-        {  0 * tmp_stride + 1, -1 * tmp_stride + 2 },
-        {  0 * tmp_stride + 1,  0 * tmp_stride + 2 },
-        {  0 * tmp_stride + 1,  1 * tmp_stride + 2 },
-        {  1 * tmp_stride + 1,  2 * tmp_stride + 2 },
-        {  1 * tmp_stride + 0,  2 * tmp_stride + 1 },
-        {  1 * tmp_stride + 0,  2 * tmp_stride + 0 },
-        {  1 * tmp_stride + 0,  2 * tmp_stride - 1 }
-    };
     const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
-    const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
-    const int off1 = cdef_directions[dir][0];
-    const int off1_1 = cdef_directions[dir][1];
+    const int off1 = cdef_directions4[dir][0];
+    const int off1_1 = cdef_directions4[dir][1];
 
-    const int off2 = cdef_directions[(dir + 2) & 7][0];
-    const int off3 = cdef_directions[(dir + 6) & 7][0];
+    const int off2 = cdef_directions4[(dir + 2) & 7][0];
+    const int off3 = cdef_directions4[(dir + 6) & 7][0];
 
-    const int off2_1 = cdef_directions[(dir + 2) & 7][1];
-    const int off3_1 = cdef_directions[(dir + 6) & 7][1];
+    const int off2_1 = cdef_directions4[(dir + 2) & 7][1];
+    const int off3_1 = cdef_directions4[(dir + 6) & 7][1];
 
-    copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+    MAKE_TAPS
+
+    for (int y = 0; y < h / 2; y++) {
+        LOAD_PIX4(tmp)
+
+        SETUP_MINMAX
+
+        // Primary pass
+        LOAD_DIR4(p, tmp, off1, off1_1)
+
+        CONSTRAIN(p, pri_strength, pri_shift)
+
+        MIN_MAX(p)
+
+        PRI_0_UPDATE_SUM(p)
+
+        // Secondary pass 1
+        LOAD_DIR4(s, tmp, off2, off3)
+
+        CONSTRAIN(s, sec_strength, sec_shift)
+
+        MIN_MAX(s)
+
+        SEC_0_UPDATE_SUM(s)
+
+        // Secondary pass 2
+        LOAD_DIR4(s2, tmp, off2_1, off3_1)
+
+        CONSTRAIN(s2, sec_strength, sec_shift)
+
+        MIN_MAX(s2)
+
+        UPDATE_SUM(s2)
+
+        // Store
+        STORE4_CLAMPED
+    }
+}
+
+static inline void
+filter_4xN_pri(pixel *dst, const ptrdiff_t dst_stride,
+           const pixel (*left)[2], const pixel *const top,
+           const pixel *const bottom, const int w, const int h,
+           const int pri_strength, const int dir,
+           const int pri_shift, const enum CdefEdgeFlags edges,
+           uint16_t *tmp)
+{
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    const int off1 = cdef_directions4[dir][0];
+    const int off1_1 = cdef_directions4[dir][1];
+
+    MAKE_TAPS
 
     for (int y = 0; y < h / 2; y++) {
         LOAD_PIX4(tmp)
@@ -322,55 +434,45 @@
         // Primary pass
         LOAD_DIR4(p, tmp, off1, off1_1)
 
-        CONSTRAIN(p, pri_strength)
+        CONSTRAIN(p, pri_strength, pri_shift)
 
-        MIN_MAX(p)
+        PRI_0_UPDATE_SUM(p)
 
-        PRI_0(p)
-        PRI_1(p)
+        STORE4_UNCLAMPED
+    }
+}
 
-        UPDATE_SUM(p)
+static inline void
+filter_4xN_sec(pixel *dst, const ptrdiff_t dst_stride,
+           const pixel (*left)[2], const pixel *const top,
+           const pixel *const bottom, const int w, const int h,
+           const int sec_strength, const int dir,
+           const int sec_shift, const enum CdefEdgeFlags edges,
+           uint16_t *tmp)
+{
+    const int off2 = cdef_directions4[(dir + 2) & 7][0];
+    const int off3 = cdef_directions4[(dir + 6) & 7][0];
 
+    const int off2_1 = cdef_directions4[(dir + 2) & 7][1];
+    const int off3_1 = cdef_directions4[(dir + 6) & 7][1];
+
+    for (int y = 0; y < h / 2; y++) {
+        LOAD_PIX4(tmp)
         // Secondary pass 1
         LOAD_DIR4(s, tmp, off2, off3)
 
-        CONSTRAIN(s, sec_strength)
+        CONSTRAIN(s, sec_strength, sec_shift)
 
-        MIN_MAX(s)
-
-        SEC_0(s)
-
-        UPDATE_SUM(s)
+        SEC_0_UPDATE_SUM(s)
 
         // Secondary pass 2
         LOAD_DIR4(s2, tmp, off2_1, off3_1)
 
-        CONSTRAIN(s2, sec_strength)
-
-        MIN_MAX(s2)
+        CONSTRAIN(s2, sec_strength, sec_shift)
 
         UPDATE_SUM(s2)
 
-        // Store
-        i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
-        bias = vec_sub(vec_splat_s16(8), bias);
-        i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
-        i16x8 vdst = vec_max(vec_min(unclamped, max), min);
-
-        dst[0] = vdst[0];
-        dst[1] = vdst[1];
-        dst[2] = vdst[2];
-        dst[3] = vdst[3];
-
-        tmp += tmp_stride;
-        dst += PXSTRIDE(dst_stride);
-        dst[0] = vdst[4];
-        dst[1] = vdst[5];
-        dst[2] = vdst[6];
-        dst[3] = vdst[7];
-
-        tmp += tmp_stride;
-        dst += PXSTRIDE(dst_stride);
+        STORE4_UNCLAMPED
     }
 }
 
@@ -379,33 +481,73 @@
            const pixel (*left)[2], const pixel *const top,
            const pixel *const bottom, const int w, const int h,
            const int pri_strength, const int sec_strength, const int dir,
-           const int damping, const enum CdefEdgeFlags edges,
-           const ptrdiff_t tmp_stride, uint16_t *tmp)
+           const int pri_shift, const int sec_shift, const enum CdefEdgeFlags edges,
+           uint16_t *tmp)
 {
-    const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
-        { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
-        {  0 * tmp_stride + 1, -1 * tmp_stride + 2 },
-        {  0 * tmp_stride + 1,  0 * tmp_stride + 2 },
-        {  0 * tmp_stride + 1,  1 * tmp_stride + 2 },
-        {  1 * tmp_stride + 1,  2 * tmp_stride + 2 },
-        {  1 * tmp_stride + 0,  2 * tmp_stride + 1 },
-        {  1 * tmp_stride + 0,  2 * tmp_stride + 0 },
-        {  1 * tmp_stride + 0,  2 * tmp_stride - 1 }
-    };
     const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
 
+    const int off1 = cdef_directions8[dir][0];
+    const int off1_1 = cdef_directions8[dir][1];
 
-    const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
-    const int off1 = cdef_directions[dir][0];
-    const int off1_1 = cdef_directions[dir][1];
+    const int off2 = cdef_directions8[(dir + 2) & 7][0];
+    const int off3 = cdef_directions8[(dir + 6) & 7][0];
 
-    const int off2 = cdef_directions[(dir + 2) & 7][0];
-    const int off3 = cdef_directions[(dir + 6) & 7][0];
+    const int off2_1 = cdef_directions8[(dir + 2) & 7][1];
+    const int off3_1 = cdef_directions8[(dir + 6) & 7][1];
 
-    const int off2_1 = cdef_directions[(dir + 2) & 7][1];
-    const int off3_1 = cdef_directions[(dir + 6) & 7][1];
+    MAKE_TAPS
 
-    copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, bottom, w, h, edges);
+    for (int y = 0; y < h; y++) {
+        LOAD_PIX(tmp)
+
+        SETUP_MINMAX
+
+        // Primary pass
+        LOAD_DIR(p, tmp, off1, off1_1)
+
+        CONSTRAIN(p, pri_strength, pri_shift)
+
+        MIN_MAX(p)
+
+        PRI_0_UPDATE_SUM(p)
+
+        // Secondary pass 1
+        LOAD_DIR(s, tmp, off2, off3)
+
+        CONSTRAIN(s, sec_strength, sec_shift)
+
+        MIN_MAX(s)
+
+        SEC_0_UPDATE_SUM(s)
+
+        // Secondary pass 2
+        LOAD_DIR(s2, tmp, off2_1, off3_1)
+
+        CONSTRAIN(s2, sec_strength, sec_shift)
+
+        MIN_MAX(s2)
+
+        UPDATE_SUM(s2)
+
+        // Store
+        STORE8_CLAMPED
+    }
+
+}
+
+static inline void
+filter_8xN_pri(pixel *dst, const ptrdiff_t dst_stride,
+           const pixel (*left)[2], const pixel *const top,
+           const pixel *const bottom, const int w, const int h,
+           const int pri_strength, const int dir,
+           const int pri_shift, const enum CdefEdgeFlags edges,
+           uint16_t *tmp)
+{
+    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
+    const int off1 = cdef_directions8[dir][0];
+    const int off1_1 = cdef_directions8[dir][1];
+
+    MAKE_TAPS
 
     for (int y = 0; y < h; y++) {
         LOAD_PIX(tmp)
@@ -413,54 +555,47 @@
         // Primary pass
         LOAD_DIR(p, tmp, off1, off1_1)
 
-        CONSTRAIN(p, pri_strength)
+        CONSTRAIN(p, pri_strength, pri_shift)
 
-        MIN_MAX(p)
+        PRI_0_UPDATE_SUM(p)
 
-        PRI_0(p)
-        PRI_1(p)
+        STORE8_UNCLAMPED
+    }
+}
 
-        UPDATE_SUM(p)
+static inline void
+filter_8xN_sec(pixel *dst, const ptrdiff_t dst_stride,
+           const pixel (*left)[2], const pixel *const top,
+           const pixel *const bottom, const int w, const int h,
+           const int sec_strength, const int dir,
+           const int sec_shift, const enum CdefEdgeFlags edges,
+           uint16_t *tmp)
+{
+    const int off2 = cdef_directions8[(dir + 2) & 7][0];
+    const int off3 = cdef_directions8[(dir + 6) & 7][0];
+
+    const int off2_1 = cdef_directions8[(dir + 2) & 7][1];
+    const int off3_1 = cdef_directions8[(dir + 6) & 7][1];
+
+    for (int y = 0; y < h; y++) {
+        LOAD_PIX(tmp)
 
         // Secondary pass 1
         LOAD_DIR(s, tmp, off2, off3)
 
-        CONSTRAIN(s, sec_strength)
+        CONSTRAIN(s, sec_strength, sec_shift)
 
-        MIN_MAX(s)
-
-        SEC_0(s)
-
-        UPDATE_SUM(s)
+        SEC_0_UPDATE_SUM(s)
 
         // Secondary pass 2
         LOAD_DIR(s2, tmp, off2_1, off3_1)
 
-        CONSTRAIN(s2, sec_strength)
-
-        MIN_MAX(s2)
+        CONSTRAIN(s2, sec_strength, sec_shift)
 
         UPDATE_SUM(s2)
 
-        // Store
-        i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
-        bias = vec_sub(vec_splat_s16(8), bias);
-        i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
-        i16x8 vdst = vec_max(vec_min(unclamped, max), min);
-
-        dst[0] = vdst[0];
-        dst[1] = vdst[1];
-        dst[2] = vdst[2];
-        dst[3] = vdst[3];
-        dst[4] = vdst[4];
-        dst[5] = vdst[5];
-        dst[6] = vdst[6];
-        dst[7] = vdst[7];
-
-        tmp += tmp_stride;
-        dst += PXSTRIDE(dst_stride);
+        STORE8_UNCLAMPED
     }
-
 }
 
 #define cdef_fn(w, h, tmp_stride) \
@@ -477,8 +612,22 @@
 { \
     ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
     uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
-    filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
-                   sec_strength, dir, damping, edges, tmp_stride, tmp); \
+    copy##w##xN(tmp - 2, dst, dst_stride, left, top, bottom, w, h, edges); \
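+    /* Dispatch on which strengths are nonzero: combined primary+secondary, \
+       primary-only, or secondary-only filtering. */ \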
+    if (pri_strength) { \
+        const int pri_shift = imax(0, damping - ulog2(pri_strength)); \
+        if (sec_strength) { \
+            const int sec_shift = damping - ulog2(sec_strength); \
+            filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
+                           sec_strength, dir, pri_shift, sec_shift, edges, tmp); \
+        } else { \
+            filter_##w##xN_pri(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
+                               dir, pri_shift, edges, tmp); \
+        } \
+    } else { \
+        const int sec_shift = damping - ulog2(sec_strength); \
+        filter_##w##xN_sec(dst, dst_stride, left, top, bottom, w, h, sec_strength, \
+                           dir, sec_shift, edges, tmp); \
+    } \
 }
 
 cdef_fn(4, 4, 8);
diff --git a/src/riscv/64/itx.S b/src/riscv/64/itx.S
index 5677cf4..dfec548 100644
--- a/src/riscv/64/itx.S
+++ b/src/riscv/64/itx.S
@@ -163,48 +163,48 @@
   vssub.vv \o3, v16, v20
 .endm
 
-.macro iadst_4 o0, o1, o2, o3
+.macro iadst_4 o0, o1, o2, o3, lm2, lm
   li t1, 1321
   li t2, 3803
   li t3, 2482
 
-  vwmul.vx v4, v0, t1
-  vwmul.vx v5, v0, t3
+  vwmul.vx v16, v0, t1
+  vwmul.vx v18, v0, t3
   neg t1, t1
-  vwmacc.vx v4, t2, v2
-  vwmacc.vx v5, t1, v2
+  vwmacc.vx v16, t2, v2
+  vwmacc.vx v18, t1, v2
   neg t2, t2
-  vwmacc.vx v4, t3, v3
-  vwmacc.vx v5, t2, v3
+  vwmacc.vx v16, t3, v3
+  vwmacc.vx v18, t2, v3
 
-  vwsub.vv v6, v0, v2
-  vwadd.wv v6, v6, v3
+  vwsub.vv v20,  v0, v2
+  vwadd.wv v20, v20, v3
 
   li t1, 3344
-  vwmul.vx v7, v1, t1
+  vwmul.vx v22, v1, t1
 
-  vsetvli zero, zero, e32, m1, ta, ma
+  vsetvli zero, zero, e32, \lm2, ta, ma
 
-  vmul.vx v6, v6, t1
+  vmul.vx v20, v20, t1
 
-  vadd.vv v8, v4, v5
-  vadd.vv v4, v4, v7
-  vadd.vv v5, v5, v7
-  vsub.vv v7, v8, v7
+  vadd.vv v24, v16, v18
+  vadd.vv v16, v16, v22
+  vadd.vv v18, v18, v22
+  vsub.vv v22, v24, v22
 
   li t1, 2048
 
-  vadd.vx v4, v4, t1
-  vadd.vx v5, v5, t1
-  vadd.vx v6, v6, t1
-  vadd.vx v7, v7, t1
+  vadd.vx v16, v16, t1
+  vadd.vx v18, v18, t1
+  vadd.vx v20, v20, t1
+  vadd.vx v22, v22, t1
 
-  vsetvli zero, zero, e16, mf2, ta, ma
+  vsetvli zero, zero, e16, \lm, ta, ma
 
-  vnsra.wi \o0, v4, 12
-  vnsra.wi \o1, v5, 12
-  vnsra.wi \o2, v6, 12
-  vnsra.wi \o3, v7, 12
+  vnsra.wi \o0, v16, 12
+  vnsra.wi \o1, v18, 12
+  vnsra.wi \o2, v20, 12
+  vnsra.wi \o3, v22, 12
 .endm
 
 function inv_dct_e16_x4_rvv, export=1, ext=v
@@ -213,12 +213,22 @@
 endfunc
 
 function inv_adst_e16_x4_rvv, export=1, ext=v
-  iadst_4 v0, v1, v2, v3
+  iadst_4 v0, v1, v2, v3, m1, mf2
   jr t0
 endfunc
 
 function inv_flipadst_e16_x4_rvv, export=1, ext=v
-  iadst_4 v3, v2, v1, v0
+  iadst_4 v3, v2, v1, v0, m1, mf2
+  jr t0
+endfunc
+
+function inv_adst_e16_x4w_rvv, export=1, ext=v
+  iadst_4 v0, v1, v2, v3, m2, m1
+  jr t0
+endfunc
+
+function inv_flipadst_e16_x4w_rvv, export=1, ext=v
+  iadst_4 v3, v2, v1, v0, m2, m1
   jr t0
 endfunc
 
@@ -328,6 +338,8 @@
 
 .ifc \variant, identity_
   // The identity vsadd.vv and downshift vssra.vi 1 cancel out
+
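+  // nothing to compute here, so jump straight to the shared epilog in the
+  // non-identity variant below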
+  j L(itx_8x8_epilog)
 .else
   jalr t0, a4
 
@@ -339,8 +351,8 @@
   vssra.vi v5, v5, 1
   vssra.vi v6, v6, 1
   vssra.vi v7, v7, 1
-.endif
 
+L(itx_8x8_epilog):
   vsseg8e16.v v0, (a2)
   vle16.v v0, (a2)
   addi t0, a2, 16
@@ -374,9 +386,7 @@
   vmv.v.x v8, zero
   vse16.v v8, (a2)
 
-.ifc \variant, identity_
 itx_8x8_end:
-.endif
   vsetivli zero, 8, e8, mf2, ta, ma
   vle8.v v8, (a0)
   add t0, a0, a1
@@ -403,7 +413,7 @@
   vwaddu.wv v6, v6, v14
   vwaddu.wv v7, v7, v15
 
-  vsetvli zero, zero, e16, m1
+  vsetvli zero, zero, e16, m1, ta, ma
   vmax.vx v0, v0, zero
   vmax.vx v1, v1, zero
   vmax.vx v2, v2, zero
@@ -441,11 +451,12 @@
   vse8.v v15, (a0)
 
   ret
+.endif
 endfunc
 .endm
 
-def_fn_8x8_base
 def_fn_8x8_base identity_
+def_fn_8x8_base
 
 function inv_identity_e16_x8_rvv, export=1, ext=v
   vsadd.vv v0, v0, v0
@@ -530,23 +541,23 @@
   li t5, 2598
   li t6, 3166
 
-  vwmul.vx v8, v7, t1
+  vwmul.vx v16, v7, t1
   neg t1, t1
-  vwmul.vx v10, v7, t2
-  vwmacc.vx v8, t2, v0
-  vwmacc.vx v10, t1, v0
+  vwmul.vx v18, v7, t2
+  vwmacc.vx v16, t2, v0
+  vwmacc.vx v18, t1, v0
 
-  vwmul.vx v12, v5, t3
+  vwmul.vx v20, v5, t3
   neg t3, t3
-  vwmul.vx v14, v5, t4
-  vwmacc.vx v12, t4, v2
-  vwmacc.vx v14, t3, v2
+  vwmul.vx v22, v5, t4
+  vwmacc.vx v20, t4, v2
+  vwmacc.vx v22, t3, v2
 
-  vwmul.vx v16, v3, t5
+  vwmul.vx v24, v3, t5
   neg t5, t5
-  vwmul.vx v18, v3, t6
-  vwmacc.vx v16, t6, v4
-  vwmacc.vx v18, t5, v4
+  vwmul.vx v26, v3, t6
+  vwmacc.vx v24, t6, v4
+  vwmacc.vx v26, t5, v4
 
   li t1, 2048
   li t2, 1189
@@ -555,95 +566,95 @@
   li t5, 3784
   li t6, 2896
 
-  vwmul.vx v20, v1, t2
+  vwmul.vx v28, v1, t2
   neg t2, t2
-  vwmul.vx v22, v1, t3
-  vwmacc.vx v20, t3, v6
-  vwmacc.vx v22, t2, v6
+  vwmul.vx v30, v1, t3
+  vwmacc.vx v28, t3, v6
+  vwmacc.vx v30, t2, v6
 
-  vwadd.wx v8, v8, t1
-  vwadd.wx v10, v10, t1
-  vwadd.wx v12, v12, t1
-  vwadd.wx v14, v14, t1
+  vwadd.wx v16, v16, t1
+  vwadd.wx v18, v18, t1
+  vwadd.wx v20, v20, t1
+  vwadd.wx v22, v22, t1
+  vwadd.wx v24, v24, t1
+  vwadd.wx v26, v26, t1
+  vwadd.wx v28, v28, t1
+  vwadd.wx v30, v30, t1
+
+  vnsra.wi v16, v16, 12
+  vnsra.wi v18, v18, 12
+  vnsra.wi v20, v20, 12
+  vnsra.wi v22, v22, 12
+  vnsra.wi v24, v24, 12
+  vnsra.wi v26, v26, 12
+  vnsra.wi v28, v28, 12
+  vnsra.wi v30, v30, 12
+
+  vssub.vv  v4, v16, v24
+  vsadd.vv v16, v16, v24
+  vsadd.vv  v1, v18, v26
+  vsadd.vv  v2, v20, v28
+  vsadd.vv  v3, v22, v30
+  vssub.vv  v5, v18, v26
+  vssub.vv  v6, v20, v28
+  vssub.vv v30, v22, v30
+
+  vsadd.vv \o0, v16, v2
+  vsadd.vv \o7,  v1, v3
+  vssub.vv  v2, v16, v2
+  vssub.vv  v3,  v1, v3
+
+  vwmul.vx v16,  v4, t5
+  vwmul.vx v18,  v4, t4
+  vwmul.vx v20, v30, t5
+  vwmul.vx v22, v30, t4
+  vwmacc.vx v16, t4, v5
+  neg t4, t4
+  vwmacc.vx v22, t5, v6
+  neg t5, t5
+  vwmacc.vx v20, t4, v6
+  vwmacc.vx v18, t5, v5
+
   vwadd.wx v16, v16, t1
   vwadd.wx v18, v18, t1
   vwadd.wx v20, v20, t1
   vwadd.wx v22, v22, t1
 
-  vnsra.wi v8, v8, 12
-  vnsra.wi v10, v10, 12
-  vnsra.wi v12, v12, 12
-  vnsra.wi v14, v14, 12
   vnsra.wi v16, v16, 12
   vnsra.wi v18, v18, 12
   vnsra.wi v20, v20, 12
   vnsra.wi v22, v22, 12
 
-  vssub.vv v4, v8, v16
-  vsadd.vv v8, v8, v16
-  vsadd.vv v1, v10, v18
-  vsadd.vv v2, v12, v20
-  vsadd.vv v3, v14, v22
-  vssub.vv v5, v10, v18
-  vssub.vv v6, v12, v20
-  vssub.vv v22, v14, v22
+  vsadd.vv \o1, v16, v20
+  vsadd.vv \o6, v18, v22
+  vssub.vv v16, v16, v20
+  vssub.vv v17, v18, v22
 
-  vsadd.vv \o0, v8, v2
-  vsadd.vv \o7, v1, v3
-  vssub.vv v2, v8, v2
-  vssub.vv v3, v1, v3
-
-  vwmul.vx v8, v4, t5
-  vwmul.vx v10, v4, t4
-  vwmul.vx v12, v22, t5
-  vwmul.vx v14, v22, t4
-  vwmacc.vx v8, t4, v5
-  neg t4, t4
-  vwmacc.vx v14, t5, v6
-  neg t5, t5
-  vwmacc.vx v12, t4, v6
-  vwmacc.vx v10, t5, v5
-
-  vwadd.wx v8, v8, t1
-  vwadd.wx v10, v10, t1
-  vwadd.wx v12, v12, t1
-  vwadd.wx v14, v14, t1
-
-  vnsra.wi v8, v8, 12
-  vnsra.wi v10, v10, 12
-  vnsra.wi v12, v12, 12
-  vnsra.wi v14, v14, 12
-
-  vsadd.vv \o1, v8, v12
-  vsadd.vv \o6, v10, v14
-  vssub.vv v8, v8, v12
-  vssub.vv v9, v10, v14
-
-  vwmul.vx v10, v2, t6
-  vwmul.vx v12, v2, t6
-  vwmul.vx v14, v8, t6
-  vwmul.vx v16, v8, t6
-  vwmacc.vx v10, t6, v3
-  vwmacc.vx v14, t6, v9
+  vwmul.vx v18, v2, t6
+  vwmul.vx v20, v2, t6
+  vwmul.vx v22, v16, t6
+  vwmul.vx v24, v16, t6
+  vwmacc.vx v18, t6, v3
+  vwmacc.vx v22, t6, v17
   neg t6, t6
-  vwmacc.vx v12, t6, v3
-  vwmacc.vx v16, t6, v9
+  vwmacc.vx v20, t6, v3
+  vwmacc.vx v24, t6, v17
 
-  vwadd.wx v10, v10, t1
-  vwadd.wx v12, v12, t1
-  vwadd.wx v14, v14, t1
-  vwadd.wx v16, v16, t1
+  vwadd.wx v18, v18, t1
+  vwadd.wx v20, v20, t1
+  vwadd.wx v22, v22, t1
+  vwadd.wx v24, v24, t1
 
-  vnsra.wi \o3, v10, 12
-  vnsra.wi \o4, v12, 12
-  vnsra.wi \o2, v14, 12
-  vnsra.wi \o5, v16, 12
+  vnsra.wi \o3, v18, 12
+  vnsra.wi \o4, v20, 12
+  vnsra.wi \o2, v22, 12
+  vnsra.wi \o5, v24, 12
 
-  vmv.v.x v8, zero
-  vssub.vv \o1, v8, \o1
-  vssub.vv \o3, v8, \o3
-  vssub.vv \o5, v8, \o5
-  vssub.vv \o7, v8, \o7
+  vmv.v.x v16, zero
+  vssub.vv \o1, v16, \o1
+  vssub.vv \o3, v16, \o3
+  vssub.vv \o5, v16, \o5
+  vssub.vv \o7, v16, \o7
 .endm
 
 function inv_dct_e16_x8_rvv, export=1, ext=v
@@ -714,6 +725,206 @@
 def_fn_8x8 identity, adst
 def_fn_8x8 identity, flipadst
 
+function inv_txfm_add_4x8_rvv, export=1, ext=v
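+  // 4x8: scale the input by 2896/4096 (~1/sqrt(2)), run the 4-point first
+  // pass (a4), transpose via segmented stores, run the 8-point second pass
+  // (a5), then round, add to dst and clamp.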
+  csrw vxrm, zero
+
+  vsetivli zero, 8, e16, m1, ta, ma
+  vle16.v v0, (a2)
+  addi t0, a2, 16
+  vle16.v v1, (t0)
+  addi t0, t0, 16
+  vle16.v v2, (t0)
+  addi t0, t0, 16
+  vle16.v v3, (t0)
+
+  li t1, 2896*8
+.irp i, 0, 1, 2, 3
+  vsmul.vx v\i, v\i, t1
+.endr
+
+  jalr t0, a4
+
+  vsseg4e16.v v0, (a2)
+
+  vsetivli zero, 4, e16, mf2, ta, ma
+  vmv.v.x v8, zero
+  vle16.v v0, (a2)
+  vse16.v v8, (a2)
+.irp i, 1, 2, 3, 4, 5, 6, 7
+  addi a2, a2, 8
+  vle16.v v\i, (a2)
+  vse16.v v8, (a2)
+.endr
+
+  jalr t0, a5
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+  vssra.vi v\i, v\i, 4
+.endr
+
+  vsetvli zero, zero, e8, mf4, ta, ma
+  vle8.v v8, (a0)
+  add t0, a0, a1
+  vle8.v v9, (t0)
+.irp i, 10, 11, 12, 13, 14, 15
+  add t0, t0, a1
+  vle8.v v\i, (t0)
+.endr
+
+  vwaddu.wv v0, v0,  v8
+  vwaddu.wv v1, v1,  v9
+  vwaddu.wv v2, v2, v10
+  vwaddu.wv v3, v3, v11
+  vwaddu.wv v4, v4, v12
+  vwaddu.wv v5, v5, v13
+  vwaddu.wv v6, v6, v14
+  vwaddu.wv v7, v7, v15
+
+  vsetvli zero, zero, e16, mf2, ta, ma
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+  vmax.vx v\i, v\i, zero
+.endr
+
+  vsetvli zero, zero, e8, mf4, ta, ma
+
+  vnclipu.wi  v8, v0, 0
+  vnclipu.wi  v9, v1, 0
+  vnclipu.wi v10, v2, 0
+  vnclipu.wi v11, v3, 0
+  vnclipu.wi v12, v4, 0
+  vnclipu.wi v13, v5, 0
+  vnclipu.wi v14, v6, 0
+  vnclipu.wi v15, v7, 0
+
+  vse8.v v8, (a0)
+.irp i, 9, 10, 11, 12, 13, 14, 15
+  add a0, a0, a1
+  vse8.v v\i, (a0)
+.endr
+
+  ret
+endfunc
+
+function inv_txfm_add_8x4_rvv, export=1, ext=v
+  csrw vxrm, zero
+
+  vsetivli zero, 4, e16, mf2, ta, ma
+  vle16.v v0, (a2)
+  addi t0, a2, 8
+  vle16.v v1, (t0)
+.irp i, 2, 3, 4, 5, 6, 7
+  addi t0, t0, 8
+  vle16.v v\i, (t0)
+.endr
+
+  li t1, 2896*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+  vsmul.vx v\i, v\i, t1
+.endr
+
+  jalr t0, a4
+
+  vsseg8e16.v v0, (a2)
+
+  vsetivli zero, 8, e16, m1, ta, ma
+  vmv.v.x v4, zero
+  vle16.v v0, (a2)
+  vse16.v v4, (a2)
+.irp i, 1, 2, 3
+  addi a2, a2, 16
+  vle16.v v\i, (a2)
+  vse16.v v4, (a2)
+.endr
+
+  jalr t0, a5
+
+  vssra.vi v0, v0, 4
+  vssra.vi v1, v1, 4
+  vssra.vi v2, v2, 4
+  vssra.vi v3, v3, 4
+
+  vsetvli zero, zero, e8, mf2, ta, ma
+  vle8.v v4, (a0)
+  add t0, a0, a1
+  vle8.v v5, (t0)
+  add t0, t0, a1
+  vle8.v v6, (t0)
+  add t0, t0, a1
+  vle8.v v7, (t0)
+
+  vwaddu.wv v0, v0, v4
+  vwaddu.wv v1, v1, v5
+  vwaddu.wv v2, v2, v6
+  vwaddu.wv v3, v3, v7
+
+  vsetvli zero, zero, e16, m1, ta, ma
+  vmax.vx v0, v0, zero
+  vmax.vx v1, v1, zero
+  vmax.vx v2, v2, zero
+  vmax.vx v3, v3, zero
+
+  vsetvli zero, zero, e8, mf2, ta, ma
+
+  vnclipu.wi v4, v0, 0
+  vnclipu.wi v5, v1, 0
+  vnclipu.wi v6, v2, 0
+  vnclipu.wi v7, v3, 0
+
+  vse8.v v4, (a0)
+  add a0, a0, a1
+  vse8.v v5, (a0)
+  add a0, a0, a1
+  vse8.v v6, (a0)
+  add a0, a0, a1
+  vse8.v v7, (a0)
+
+  ret
+endfunc
+
+/* Define numeric symbols for the txfm names so they can be compared in .if statements */
+.equ dct, 1
+.equ identity, 2
+.equ adst, 3
+.equ flipadst, 4
+
+.macro def_fn_48 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
+.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
+  la a4, inv_\txfm1\()_e16_x\w\()w_rvv
+.else
+  la a4, inv_\txfm1\()_e16_x\w\()_rvv
+.endif
+.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
+  la a5, inv_\txfm2\()_e16_x\h\()w_rvv
+.else
+  la a5, inv_\txfm2\()_e16_x\h\()_rvv
+.endif
+  j inv_txfm_add_\w\()x\h\()_rvv
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct
+def_fn_48 \w, \h, identity, identity
+def_fn_48 \w, \h, dct, adst
+def_fn_48 \w, \h, dct, flipadst
+def_fn_48 \w, \h, dct, identity
+def_fn_48 \w, \h, adst, dct
+def_fn_48 \w, \h, adst, adst
+def_fn_48 \w, \h, adst, flipadst
+def_fn_48 \w, \h, flipadst, dct
+def_fn_48 \w, \h, flipadst, adst
+def_fn_48 \w, \h, flipadst, flipadst
+def_fn_48 \w, \h, identity, dct
+def_fn_48 \w, \h, adst, identity
+def_fn_48 \w, \h, flipadst, identity
+def_fn_48 \w, \h, identity, adst
+def_fn_48 \w, \h, identity, flipadst
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
 function inv_identity_e16_x16_rvv, export=1, ext=v
   li t1, 2*(5793-4096)*8
 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@@ -1196,10 +1407,12 @@
 .macro def_horz_16 variant
 function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
   vmv.v.x v16, zero
-.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  vle16.v v0, (t4)
+  vse16.v v16, (t4)
+.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  add t4, t4, t6
   vle16.v v\i, (t4)
   vse16.v v16, (t4)
-  add t4, t4, t6
 .endr
 .ifc \variant, _identity
   li t1, 2*(5793-4096)*8
@@ -1208,29 +1421,35 @@
   vsra.vi v16, v16, 1
   vaadd.vv v\i, v\i, v16
 .endr
+  j L(horz_16x8_epilog)
 .else
   jalr t0, a4
 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   vssra.vi v\i, v\i, 2
 .endr
-.endif
-.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-  vsse16.v v\i, (t5), t6
+L(horz_16x8_epilog):
+  vsse16.v v0, (t5), t6
+.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   addi t5, t5, 2
+  vsse16.v v\i, (t5), t6
 .endr
   jr a7
+.endif
 endfunc
 .endm
 
-def_horz_16
 def_horz_16 _identity
+def_horz_16
 
 function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
   vsetivli zero, 8, e16, m1, ta, ma
-.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-  vle16.v v\i, (t4)
+
+  vle16.v v0, (t4)
+.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   add t4, t4, t6
+  vle16.v v\i, (t4)
 .endr
+
   jalr t0, a5
 
 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@@ -1238,10 +1457,13 @@
 .endr
 
   vsetivli zero, 8, e8, mf2, ta, ma
-  mv t0, t5
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-  vle8.v v\i, (t0)
+
+  vle8.v v16, (t5)
+  add t0, t5, a1
+  vle8.v v17, (t0)
+.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   add t0, t0, a1
+  vle8.v v\i, (t0)
 .endr
 
   vwaddu.wv v0, v0, v16
@@ -1261,7 +1483,7 @@
   vwaddu.wv v14, v14, v30
   vwaddu.wv v15, v15, v31
 
-  vsetvli zero, zero, e16, m1
+  vsetvli zero, zero, e16, m1, ta, ma
 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   vmax.vx v\i, v\i, zero
 .endr
@@ -1284,9 +1506,10 @@
   vnclipu.wi v30, v14, 0
   vnclipu.wi v31, v15, 0
 
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-  vse8.v v\i, (t5)
+  vse8.v v16, (t5)
+.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   add t5, t5, a1
+  vse8.v v\i, (t5)
 .endr
 
   jr a7
@@ -1296,11 +1519,26 @@
   csrw vxrm, zero
   vsetivli zero, 8, e16, m1, ta, ma
   addi sp, sp, -16*32
-.irp i, 0, 8
+.irp i, 8, 0
   addi t4, a2, \i*2
   addi t5, sp, \i*16*2
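+  // a7 holds the eob_half threshold: below it the second (i == 8) half of
+  // the coefficients is all zero, so skip its horizontal pass and just
+  // clear that half of the intermediate buffer.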
+.if \i == 8
+  blt a3, a7, 1f
+.endif
   li t6, 16*2
   jalr a7, a6
+.if \i == 8
+  j 2f
+1:
+  li t1, 64
+  vsetvli zero, t1, e16, m8, ta, ma
+  vmv.v.x v0, zero
+  vse16.v v0, (t5)
+  addi t5, t5, 128
+  vse16.v v0, (t5)
+  vsetivli zero, 8, e16, m1, ta, ma
+2:
+.endif
 .endr
 .irp i, 0, 8
   addi t4, sp, \i*2
@@ -1312,7 +1550,7 @@
   ret
 endfunc
 
-.macro def_fn_16x16 txfm1, txfm2
+.macro def_fn_16x16 txfm1, txfm2, eob_half
 function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
 .ifc \txfm1, identity
   la a6, inv_txfm_horz_identity_16x8_rvv
@@ -1321,19 +1559,558 @@
   la a4, inv_\txfm1\()_e16_x16_rvv
 .endif
   la a5, inv_\txfm2\()_e16_x16_rvv
+  li a7, \eob_half
   j inv_txfm_add_16x16_rvv
 endfunc
 .endm
 
-def_fn_16x16 dct, dct
-def_fn_16x16 identity, identity
-def_fn_16x16 dct, adst
-def_fn_16x16 dct, flipadst
-def_fn_16x16 dct, identity
-def_fn_16x16 adst, dct
-def_fn_16x16 adst, adst
-def_fn_16x16 adst, flipadst
-def_fn_16x16 flipadst, dct
-def_fn_16x16 flipadst, adst
-def_fn_16x16 flipadst, flipadst
-def_fn_16x16 identity, dct
+def_fn_16x16 dct, dct, 36
+def_fn_16x16 identity, identity, 36
+def_fn_16x16 dct, adst, 36
+def_fn_16x16 dct, flipadst, 36
+def_fn_16x16 dct, identity, 8
+def_fn_16x16 adst, dct, 36
+def_fn_16x16 adst, adst, 36
+def_fn_16x16 adst, flipadst, 36
+def_fn_16x16 flipadst, dct, 36
+def_fn_16x16 flipadst, adst, 36
+def_fn_16x16 flipadst, flipadst, 36
+def_fn_16x16 identity, dct, 8
+
+.macro def_fn_416_base variant
+function inv_txfm_\variant\()add_4x16_rvv, export=1, ext=v
+  csrw vxrm, zero
+
+  vsetivli zero, 8, e16, m1, ta, ma
+
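+  // a6 holds the eob threshold: below it the second half of the
+  // coefficients is all zero, so v4-v7 are simply cleared instead of
+  // being transformed.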
+  blt a3, a6, 1f
+
+  addi t0, a2, 16
+  vle16.v v0, (t0)
+  addi t0, t0, 32
+  vle16.v v1, (t0)
+  addi t0, t0, 32
+  vle16.v v2, (t0)
+  addi t0, t0, 32
+  vle16.v v3, (t0)
+
+.ifc \variant, identity_
+  li t1, (5793-4096)*8
+  vsmul.vx v8, v0, t1
+  vaadd.vv v4, v0, v8
+  vsmul.vx v8, v1, t1
+  vaadd.vv v5, v1, v8
+  vsmul.vx v8, v2, t1
+  vaadd.vv v6, v2, v8
+  vsmul.vx v8, v3, t1
+  vaadd.vv v7, v3, v8
+.else
+  jalr t0, a4
+
+  vssra.vi v4, v0, 1
+  vssra.vi v5, v1, 1
+  vssra.vi v6, v2, 1
+  vssra.vi v7, v3, 1
+.endif
+
+  j 2f
+
+1:
+.irp i, 4, 5, 6, 7
+  vmv.v.x v\i, zero
+.endr
+
+2:
+  vle16.v v0, (a2)
+  addi t0, a2, 32
+  vle16.v v1, (t0)
+  addi t0, t0, 32
+  vle16.v v2, (t0)
+  addi t0, t0, 32
+  vle16.v v3, (t0)
+
+.ifc \variant, identity_
+  li t1, (5793-4096)*8
+.irp i, 0, 1, 2, 3
+  vsmul.vx v8, v\i, t1
+  vaadd.vv v\i, v\i, v8
+.endr
+
+  j L(itx_4x16_epilog)
+.else
+  jalr t0, a4
+
+  vssra.vi v0, v0, 1
+  vssra.vi v1, v1, 1
+  vssra.vi v2, v2, 1
+  vssra.vi v3, v3, 1
+
+L(itx_4x16_epilog):
+  vsseg4e16.v v0, (a2)
+  addi t0, a2, 64
+  vsseg4e16.v v4, (t0)
+
+  vsetivli zero, 4, e16, mf2, ta, ma
+
+  vmv.v.x v16, zero
+  vle16.v v0, (a2)
+  vse16.v v16, (a2)
+  addi t0, a2, 8
+  vle16.v v1, (t0)
+  vse16.v v16, (t0)
+.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  addi t0, t0, 8
+  vle16.v v\i, (t0)
+  vse16.v v16, (t0)
+.endr
+
+  jalr t0, a5
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  vssra.vi v\i, v\i, 4
+.endr
+
+  vsetvli zero, zero, e8, mf4, ta, ma
+
+  vle8.v v16, (a0)
+  add t0, a0, a1
+  vle8.v v17, (t0)
+.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+  add t0, t0, a1
+  vle8.v v\i, (t0)
+.endr
+
+  vwaddu.wv  v0,  v0, v16
+  vwaddu.wv  v1,  v1, v17
+  vwaddu.wv  v2,  v2, v18
+  vwaddu.wv  v3,  v3, v19
+  vwaddu.wv  v4,  v4, v20
+  vwaddu.wv  v5,  v5, v21
+  vwaddu.wv  v6,  v6, v22
+  vwaddu.wv  v7,  v7, v23
+  vwaddu.wv  v8,  v8, v24
+  vwaddu.wv  v9,  v9, v25
+  vwaddu.wv v10, v10, v26
+  vwaddu.wv v11, v11, v27
+  vwaddu.wv v12, v12, v28
+  vwaddu.wv v13, v13, v29
+  vwaddu.wv v14, v14, v30
+  vwaddu.wv v15, v15, v31
+
+  vsetvli zero, zero, e16, mf2, ta, ma
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  vmax.vx v\i, v\i, zero
+.endr
+
+  vsetvli zero, zero, e8, mf4, ta, ma
+
+  vnclipu.wi v16,  v0, 0
+  vnclipu.wi v17,  v1, 0
+  vnclipu.wi v18,  v2, 0
+  vnclipu.wi v19,  v3, 0
+  vnclipu.wi v20,  v4, 0
+  vnclipu.wi v21,  v5, 0
+  vnclipu.wi v22,  v6, 0
+  vnclipu.wi v23,  v7, 0
+  vnclipu.wi v24,  v8, 0
+  vnclipu.wi v25,  v9, 0
+  vnclipu.wi v26, v10, 0
+  vnclipu.wi v27, v11, 0
+  vnclipu.wi v28, v12, 0
+  vnclipu.wi v29, v13, 0
+  vnclipu.wi v30, v14, 0
+  vnclipu.wi v31, v15, 0
+
+  vse8.v v16, (a0)
+.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+  add a0, a0, a1
+  vse8.v v\i, (a0)
+.endr
+
+  ret
+.endif
+endfunc
+
+function inv_txfm_\variant\()add_16x4_rvv, export=1, ext=v
+  csrw vxrm, zero
+
+  vsetivli zero, 4, e16, mf2, ta, ma
+  vle16.v v0, (a2)
+  addi t0, a2, 8
+  vle16.v v1, (t0)
+.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  addi t0, t0, 8
+  vle16.v v\i, (t0)
+.endr
+
+.ifc \variant, identity_
+  li t1, 2*(5793-4096)*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  vsmul.vx v16, v\i, t1
+  vssra.vi v16, v16, 1
+  vsadd.vv v\i, v\i, v16
+.endr
+
+  j L(itx_16x4_epilog)
+.else
+  jalr t0, a4
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  vssra.vi v\i, v\i, 1
+.endr
+
+L(itx_16x4_epilog):
+  li t0, 32
+  vssseg8e16.v v0, (a2), t0
+  addi t1, a2, 16
+  vssseg8e16.v v8, (t1), t0
+
+.irp j, 0, 8
+  vsetivli zero, 8, e16, m1, ta, ma
+
+  vmv.v.x v4, zero
+  addi t0, a2, \j*2
+  vle16.v v0, (t0)
+  vse16.v v4, (t0)
+.irp i, 1, 2, 3
+  addi t0, t0, 32
+  vle16.v v\i, (t0)
+  vse16.v v4, (t0)
+.endr
+
+  jalr t0, a5
+
+  vssra.vi v0, v0, 4
+  vssra.vi v1, v1, 4
+  vssra.vi v2, v2, 4
+  vssra.vi v3, v3, 4
+
+  vsetvli zero, zero, e8, mf2, ta, ma
+  addi t0, a0, \j
+  vle8.v v4, (t0)
+  add t0, t0, a1
+  vle8.v v5, (t0)
+  add t0, t0, a1
+  vle8.v v6, (t0)
+  add t0, t0, a1
+  vle8.v v7, (t0)
+
+  vwaddu.wv v0, v0, v4
+  vwaddu.wv v1, v1, v5
+  vwaddu.wv v2, v2, v6
+  vwaddu.wv v3, v3, v7
+
+  vsetvli zero, zero, e16, m1, ta, ma
+  vmax.vx v0, v0, zero
+  vmax.vx v1, v1, zero
+  vmax.vx v2, v2, zero
+  vmax.vx v3, v3, zero
+
+  vsetvli zero, zero, e8, mf2, ta, ma
+
+  vnclipu.wi v4, v0, 0
+  vnclipu.wi v5, v1, 0
+  vnclipu.wi v6, v2, 0
+  vnclipu.wi v7, v3, 0
+
+  addi t0, a0, \j
+  vse8.v v4, (t0)
+  add t0, t0, a1
+  vse8.v v5, (t0)
+  add t0, t0, a1
+  vse8.v v6, (t0)
+  add t0, t0, a1
+  vse8.v v7, (t0)
+.endr
+
+  ret
+.endif
+endfunc
+.endm
+
+def_fn_416_base identity_
+def_fn_416_base
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
+.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
+  la a4, inv_\txfm1\()_e16_x\w\()w_rvv
+.elseif \txfm1 != identity
+  la a4, inv_\txfm1\()_e16_x\w\()_rvv
+.endif
+.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
+  la a5, inv_\txfm2\()_e16_x\h\()w_rvv
+.else
+  la a5, inv_\txfm2\()_e16_x\h\()_rvv
+.endif
+.if \w == 4
+  li a6, \eob_half
+.endif
+.ifc \txfm1, identity
+  j inv_txfm_identity_add_\w\()x\h\()_rvv
+.else
+  j inv_txfm_add_\w\()x\h\()_rvv
+.endif
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct, 29
+def_fn_416 \w, \h, identity, identity, 29
+def_fn_416 \w, \h, dct, adst, 29
+def_fn_416 \w, \h, dct, flipadst, 29
+def_fn_416 \w, \h, dct, identity, 8
+def_fn_416 \w, \h, adst, dct, 29
+def_fn_416 \w, \h, adst, adst, 29
+def_fn_416 \w, \h, adst, flipadst, 29
+def_fn_416 \w, \h, flipadst, dct, 29
+def_fn_416 \w, \h, flipadst, adst, 29
+def_fn_416 \w, \h, flipadst, flipadst, 29
+def_fn_416 \w, \h, identity, dct, 32
+def_fn_416 \w, \h, adst, identity, 8
+def_fn_416 \w, \h, flipadst, identity, 8
+def_fn_416 \w, \h, identity, adst, 32
+def_fn_416 \w, \h, identity, flipadst, 32
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+.macro def_fn_816_base variant
+function inv_txfm_\variant\()add_8x16_rvv, export=1, ext=v
+  csrw vxrm, zero
+
+  vsetivli zero, 8, e16, m1, ta, ma
+
+  blt a3, a6, 1f
+
+  vmv.v.x v16, zero
+  addi t0, a2, 16
+  vle16.v v0, (t0)
+  vse16.v v16, (t0)
+.irp i, 1, 2, 3, 4, 5, 6, 7
+  addi t0, t0, 32
+  vle16.v v\i, (t0)
+  vse16.v v16, (t0)
+.endr
+
+  li t1, 2896*8
+.ifc \variant, identity_
+  vsmul.vx  v8, v0, t1
+  vsmul.vx  v9, v1, t1
+  vsmul.vx v10, v2, t1
+  vsmul.vx v11, v3, t1
+  vsmul.vx v12, v4, t1
+  vsmul.vx v13, v5, t1
+  vsmul.vx v14, v6, t1
+  vsmul.vx v15, v7, t1
+.else
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+  vsmul.vx v\i, v\i, t1
+.endr
+
+  jalr t0, a4
+
+  vssra.vi  v8, v0, 1
+  vssra.vi  v9, v1, 1
+  vssra.vi v10, v2, 1
+  vssra.vi v11, v3, 1
+  vssra.vi v12, v4, 1
+  vssra.vi v13, v5, 1
+  vssra.vi v14, v6, 1
+  vssra.vi v15, v7, 1
+.endif
+
+  j 2f
+
+1:
+.irp i, 8, 9, 10, 11, 12, 13, 14, 15
+  vmv.v.x v\i, zero
+.endr
+
+2:
+  vmv.v.x v16, zero
+  vle16.v v0, (a2)
+  vse16.v v16, (a2)
+  addi t0, a2, 32
+  vle16.v v1, (t0)
+  vse16.v v16, (t0)
+.irp i, 2, 3, 4, 5, 6, 7
+  addi t0, t0, 32
+  vle16.v v\i, (t0)
+  vse16.v v16, (t0)
+.endr
+
+  li t1, 2896*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+  vsmul.vx v\i, v\i, t1
+.endr
+
+.ifc \variant, identity_
+  j L(itx_8x16_epilog)
+.else
+  jalr t0, a4
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+  vssra.vi v\i, v\i, 1
+.endr
+
+L(itx_8x16_epilog):
+  addi t4, sp, -8*32
+  vsseg8e16.v v0, (t4)
+  addi t0, t4, 8*16
+  vsseg8e16.v v8, (t0)
+
+  mv t5, a0
+  li t6, 16
+  jal a7, inv_txfm_add_vert_8x16_rvv
+
+  ret
+.endif
+endfunc
+
+function inv_txfm_\variant\()add_16x8_rvv, export=1, ext=v
+  csrw vxrm, zero
+
+  vsetivli zero, 8, e16, m1, ta, ma
+  vle16.v v0, (a2)
+  addi t0, a2, 16
+  vle16.v v1, (t0)
+.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  addi t0, t0, 16
+  vle16.v v\i, (t0)
+.endr
+
+  li t1, 2896*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  vsmul.vx v\i, v\i, t1
+.endr
+
+.ifc \variant, identity_
+  li t1, 2*(5793-4096)*8
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  vsmul.vx v16, v\i, t1
+  vssra.vi v16, v16, 1
+  vsadd.vv v\i, v\i, v16
+.endr
+
+  j L(itx_16x8_epilog)
+.else
+  jalr t0, a4
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  vssra.vi v\i, v\i, 1
+.endr
+
+L(itx_16x8_epilog):
+  li t0, 32
+  vssseg8e16.v v0, (a2), t0
+  addi t1, a2, 16
+  vssseg8e16.v v8, (t1), t0
+
+.irp j, 0, 8
+  vsetivli zero, 8, e16, m1, ta, ma
+
+  vmv.v.x v8, zero
+  addi t0, a2, \j*2
+  vle16.v v0, (t0)
+  vse16.v v8, (t0)
+.irp i, 1, 2, 3, 4, 5, 6, 7
+  addi t0, t0, 32
+  vle16.v v\i, (t0)
+  vse16.v v8, (t0)
+.endr
+
+  jalr t0, a5
+
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+  vssra.vi v\i, v\i, 4
+.endr
+
+  vsetvli zero, zero, e8, mf2, ta, ma
+  addi t0, a0, \j
+  vle8.v v8, (t0)
+.irp i, 9, 10, 11, 12, 13, 14, 15
+  add t0, t0, a1
+  vle8.v v\i, (t0)
+.endr
+
+  vwaddu.wv v0, v0, v8
+  vwaddu.wv v1, v1, v9
+  vwaddu.wv v2, v2, v10
+  vwaddu.wv v3, v3, v11
+  vwaddu.wv v4, v4, v12
+  vwaddu.wv v5, v5, v13
+  vwaddu.wv v6, v6, v14
+  vwaddu.wv v7, v7, v15
+
+  vsetvli zero, zero, e16, m1, ta, ma
+.irp i, 0, 1, 2, 3, 4, 5, 6, 7
+  vmax.vx v\i, v\i, zero
+.endr
+
+  vsetvli zero, zero, e8, mf2, ta, ma
+
+  vnclipu.wi  v8, v0, 0
+  vnclipu.wi  v9, v1, 0
+  vnclipu.wi v10, v2, 0
+  vnclipu.wi v11, v3, 0
+  vnclipu.wi v12, v4, 0
+  vnclipu.wi v13, v5, 0
+  vnclipu.wi v14, v6, 0
+  vnclipu.wi v15, v7, 0
+
+  addi t0, a0, \j
+  vse8.v v8, (t0)
+.irp i, 9, 10, 11, 12, 13, 14, 15
+  add t0, t0, a1
+  vse8.v v\i, (t0)
+.endr
+.endr
+
+  ret
+.endif
+endfunc
+.endm
+
+def_fn_816_base identity_
+def_fn_816_base
+
+.macro def_fn_816 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
+.ifnc \txfm1, identity
+  la a4, inv_\txfm1\()_e16_x\w\()_rvv
+.endif
+  la a5, inv_\txfm2\()_e16_x\h\()_rvv
+.if \w == 8
+  li a6, \eob_half
+.endif
+.ifc \txfm1, identity
+  j inv_txfm_identity_add_\w\()x\h\()_rvv
+.else
+  j inv_txfm_add_\w\()x\h\()_rvv
+.endif
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct, 43
+def_fn_816 \w, \h, identity, identity, 43
+def_fn_816 \w, \h, dct, adst, 43
+def_fn_816 \w, \h, dct, flipadst, 43
+def_fn_816 \w, \h, dct, identity, 8
+def_fn_816 \w, \h, adst, dct, 43
+def_fn_816 \w, \h, adst, adst, 43
+def_fn_816 \w, \h, adst, flipadst, 43
+def_fn_816 \w, \h, flipadst, dct, 43
+def_fn_816 \w, \h, flipadst, adst, 43
+def_fn_816 \w, \h, flipadst, flipadst, 43
+def_fn_816 \w, \h, identity, dct, 64
+def_fn_816 \w, \h, adst, identity, 8
+def_fn_816 \w, \h, flipadst, identity, 8
+def_fn_816 \w, \h, identity, adst, 64
+def_fn_816 \w, \h, identity, flipadst, 64
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
diff --git a/src/riscv/asm.S b/src/riscv/asm.S
index 2435170..eed4d67 100644
--- a/src/riscv/asm.S
+++ b/src/riscv/asm.S
@@ -123,4 +123,6 @@
         end_thread_local
 .endm
 
+#define L(x) .L ## x
+
 #endif /* DAV1D_SRC_RISCV_ASM_S */
diff --git a/src/riscv/itx.h b/src/riscv/itx.h
index 28c5e54..d3f9a03 100644
--- a/src/riscv/itx.h
+++ b/src/riscv/itx.h
@@ -58,7 +58,13 @@
 
 #define decl_itx_fns(ext) \
 decl_itx17_fns( 4,  4, ext); \
+decl_itx16_fns( 4,  8, ext); \
+decl_itx16_fns( 4, 16, ext); \
+decl_itx16_fns( 8,  4, ext); \
 decl_itx16_fns( 8,  8, ext); \
+decl_itx16_fns( 8, 16, ext); \
+decl_itx16_fns(16,  4, ext); \
+decl_itx16_fns(16,  8, ext); \
 decl_itx16_fns(16, 16, ext)
 
 decl_itx_fns(rvv);
@@ -105,7 +111,13 @@
 
 #if BITDEPTH == 8
   assign_itx17_fn( ,  4,  4, rvv);
+  assign_itx16_fn(R,  4,  8, rvv);
+  assign_itx16_fn(R,  4, 16, rvv);
+  assign_itx16_fn(R,  8,  4, rvv);
   assign_itx16_fn( ,  8,  8, rvv);
+  assign_itx16_fn(R,  8, 16, rvv);
+  assign_itx16_fn(R, 16,  4, rvv);
+  assign_itx16_fn(R, 16,  8, rvv);
   assign_itx12_fn( , 16, 16, rvv);
 #endif
 }
diff --git a/src/x86/looprestoration_sse.asm b/src/x86/looprestoration_sse.asm
index 01eb6fa..b5c73a5 100644
--- a/src/x86/looprestoration_sse.asm
+++ b/src/x86/looprestoration_sse.asm
@@ -42,7 +42,6 @@
 pb_right_ext_mask: times 24 db 0xff
                    times 8 db 0
 pb_1:          times 16 db 1
-pb_3:          times 16 db 3
 pw_256:        times 8 dw 256
 pw_2056:       times 8 dw 2056
 pw_m16380:     times 8 dw -16380
@@ -290,7 +289,7 @@
     call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v
     jmp .v1
 .extend_right:
-    movd            m2, [lpfq-4]
+    movd            m2, [lpfq-1]
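+    ; the last pixel is now in the low byte of m2, so it is broadcast from lane 0 below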
 %if ARCH_X86_64
     push            r0
     lea             r0, [pb_right_ext_mask+21]
@@ -302,10 +301,11 @@
     movu            m1, [r6+xq+8]
 %endif
 %if cpuflag(ssse3)
-    pshufb          m2, [base+pb_3]
+    pxor            m3, m3
+    pshufb          m2, m3
 %else
     punpcklbw       m2, m2
-    pshuflw         m2, m2, q3333
+    pshuflw         m2, m2, q0000
     punpcklqdq      m2, m2
 %endif
     pand            m4, m0
diff --git a/src/x86/msac.asm b/src/x86/msac.asm
index 9f05c92..4156efe 100644
--- a/src/x86/msac.asm
+++ b/src/x86/msac.asm
@@ -143,10 +143,9 @@
     mov           esp, [esp]
 %endif
 %endif
-    not            t4
     sub           t2d, t1d ; rng
     shl            t1, gprsize*8-16
-    add            t4, t1  ; ~dif
+    sub            t4, t1  ; dif - v
 .renorm3:
     mov           t1d, [t0+msac.cnt]
     movifnidn      t7, t0
@@ -157,33 +156,31 @@
     shl           t2d, cl
     shl            t4, cl
     mov [t7+msac.rng], t2d
-    not            t4
     sub           t1d, ecx
     jae .end ; no refill required
 
 ; refill:
-    mov            t2, [t7+msac.buf]
-    mov           rcx, [t7+msac.end]
 %if ARCH_X86_64 == 0
     push           t5
 %endif
-    lea            t5, [t2+gprsize]
-    cmp            t5, rcx
+    mov            t2, [t7+msac.buf]
+    mov            t5, [t7+msac.end]
+    lea           rcx, [t2+gprsize]
+    sub           rcx, t5
     ja .refill_eob
-    mov            t2, [t2]
-    lea           ecx, [t1+23]
-    add           t1d, 16
-    shr           ecx, 3   ; shift_bytes
-    bswap          t2
-    sub            t5, rcx
-    shl           ecx, 3   ; shift_bits
-    shr            t2, cl
-    sub           ecx, t1d ; shift_bits - 16 - cnt
-    mov           t1d, gprsize*8-16
-    shl            t2, cl
-    mov [t7+msac.buf], t5
-    sub           t1d, ecx ; cnt + gprsize*8 - shift_bits
-    xor            t4, t2
+    mov            t5, [t2]
+    lea           ecx, [t1+16-gprsize*8]
+    not            t5
+    bswap          t5
+    shr            t5, cl
+    neg           ecx
+    shr           ecx, 3 ; num_bytes_read
+    or             t4, t5
+.refill_end:
+    add            t2, rcx
+    lea           t1d, [t1+rcx*8] ; cnt += num_bits_read
+    mov [t7+msac.buf], t2
+.refill_end2:
 %if ARCH_X86_64 == 0
     pop            t5
 %endif
@@ -191,29 +188,35 @@
     mov [t7+msac.cnt], t1d
     mov [t7+msac.dif], t4
     RET
-.refill_eob: ; avoid overreading the input buffer
-    mov            t5, rcx
-    mov           ecx, gprsize*8-24
-    sub           ecx, t1d ; c
-.refill_eob_loop:
-    cmp            t2, t5
-    jae .refill_eob_end    ; eob reached
-    movzx         t1d, byte [t2]
-    inc            t2
-    shl            t1, cl
-    xor            t4, t1
-    sub           ecx, 8
-    jge .refill_eob_loop
-.refill_eob_end:
-    mov           t1d, gprsize*8-24
-%if ARCH_X86_64 == 0
-    pop            t5
+.pad_with_ones:
+    lea           ecx, [t1-16]
+%if ARCH_X86_64
+    ror           rcx, cl
+%else
+    shr           ecx, cl
 %endif
-    sub           t1d, ecx
-    mov [t7+msac.buf], t2
-    mov [t7+msac.dif], t4
-    mov [t7+msac.cnt], t1d
-    RET
+    or             t4, rcx
+    jmp .refill_end2
+.refill_eob: ; avoid overreading the input buffer
+    cmp            t2, t5
+    jae .pad_with_ones ; eob reached
+    ; We can safely do a register-sized load of the last bytes of the buffer
+    ; as this code is only reached if the msac buffer size is >= gprsize.
+    mov            t5, [t5-gprsize]
+    shl           ecx, 3
+    shr            t5, cl
+    lea           ecx, [t1+16-gprsize*8]
+    not            t5
+    bswap          t5
+    shr            t5, cl
+    neg           ecx
+    or             t4, t5
+    mov           t5d, [t7+msac.end]
+    shr           ecx, 3
+    sub           t5d, t2d ; num_bytes_left
+    cmp           ecx, t5d
+    cmovae        ecx, t5d ; num_bytes_read
+    jmp .refill_end
 
 cglobal msac_decode_symbol_adapt8, 0, 6, 6
     DECODE_SYMBOL_ADAPT_INIT
@@ -366,7 +369,6 @@
 %if ARCH_X86_64 == 0
     movzx         eax, al
 %endif
-    not            t4
     test          t3d, t3d
     jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3
 %if UNIX64 == 0
@@ -420,7 +422,6 @@
     mov           ecx, 0xbfff
     setb           al ; the upper 32 bits contain garbage but that's OK
     sub           ecx, t2d
-    not            t4
     ; In the case of this function, (d =) 16 - clz(v) = 2 - (v >> 14)
     ;   i.e. (0 <= d <= 2) and v < (3 << 14)
     shr           ecx, 14           ; d
@@ -447,7 +448,6 @@
     cmovb         t2d, t1d
     cmovb          t4, t3
     setb           al
-    not            t4
 %if ARCH_X86_64 == 0
     movzx         eax, al
 %endif
@@ -497,48 +497,45 @@
     tzcnt         eax, eax
     movzx         ecx, word [buf+rax+16]
     movzx         t2d, word [buf+rax+14]
-    not            t4
 %if ARCH_X86_64
     add           t6d, 5
 %endif
     sub           eax, 5   ; setup for merging the tok_br and tok branches
     sub           t2d, ecx
     shl           rcx, gprsize*8-16
-    add            t4, rcx
+    sub            t4, rcx
     bsr           ecx, t2d
     xor           ecx, 15
     shl           t2d, cl
     shl            t4, cl
     movd           m2, t2d
     mov [t7+msac.rng], t2d
-    not            t4
     sub           t5d, ecx
     jae %%end
-    mov            t2, [t7+msac.buf]
-    mov           rcx, [t7+msac.end]
 %if UNIX64 == 0
     push           t8
 %endif
-    lea            t8, [t2+gprsize]
-    cmp            t8, rcx
+    mov            t2, [t7+msac.buf]
+    mov            t8, [t7+msac.end]
+    lea           rcx, [t2+gprsize]
+    sub           rcx, t8
     ja %%refill_eob
-    mov            t2, [t2]
-    lea           ecx, [t5+23]
-    add           t5d, 16
+    mov            t8, [t2]
+    lea           ecx, [t5+16-gprsize*8]
+    not            t8
+    bswap          t8
+    shr            t8, cl
+    neg           ecx
     shr           ecx, 3
-    bswap          t2
-    sub            t8, rcx
-    shl           ecx, 3
-    shr            t2, cl
-    sub           ecx, t5d
-    mov           t5d, gprsize*8-16
-    shl            t2, cl
-    mov [t7+msac.buf], t8
+    or             t4, t8
+%%refill_end:
+    add            t2, rcx
+    lea           t5d, [t5+rcx*8]
+    mov [t7+msac.buf], t2
+%%refill_end2:
 %if UNIX64 == 0
     pop            t8
 %endif
-    sub           t5d, ecx
-    xor            t4, t2
 %%end:
     movp           m3, t4
 %if ARCH_X86_64
@@ -559,27 +556,34 @@
     shr           eax, 1
     mov [t7+msac.cnt], t5d
     RET
-%%refill_eob:
-    mov            t8, rcx
-    mov           ecx, gprsize*8-24
-    sub           ecx, t5d
-%%refill_eob_loop:
-    cmp            t2, t8
-    jae %%refill_eob_end
-    movzx         t5d, byte [t2]
-    inc            t2
-    shl            t5, cl
-    xor            t4, t5
-    sub           ecx, 8
-    jge %%refill_eob_loop
-%%refill_eob_end:
-%if UNIX64 == 0
-    pop            t8
+%%pad_with_ones:
+    ; ensure that dif is padded with at least 15 bits of ones at the end
+    lea           ecx, [t5-16]
+%if ARCH_X86_64
+    ror           rcx, cl
+%else
+    shr           ecx, cl
 %endif
-    mov           t5d, gprsize*8-24
-    mov [t7+msac.buf], t2
-    sub           t5d, ecx
-    jmp %%end
+    or             t4, rcx
+    jmp %%refill_end2
+%%refill_eob:
+    cmp            t2, t8
+    jae %%pad_with_ones
+    mov            t8, [t8-gprsize]
+    shl           ecx, 3
+    shr            t8, cl
+    lea           ecx, [t5+16-gprsize*8]
+    not            t8
+    bswap          t8
+    shr            t8, cl
+    neg           ecx
+    or             t4, t8
+    mov           t8d, [t7+msac.end]
+    shr           ecx, 3
+    sub           t8d, t2d
+    cmp           ecx, t8d
+    cmovae        ecx, t8d
+    jmp %%refill_end
 %endmacro
 
 cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
diff --git a/tests/checkasm/arm/checkasm_64.S b/tests/checkasm/arm/checkasm_64.S
index 2574914..d0d7ec4 100644
--- a/tests/checkasm/arm/checkasm_64.S
+++ b/tests/checkasm/arm/checkasm_64.S
@@ -209,3 +209,13 @@
         ldp             x29, x30, [sp], #16
         ret
 endfunc
+
+#if HAVE_SVE
+ENABLE_SVE
+function sve_length, export=1
+        cntb            x0
+        lsl             x0,  x0,  #3
+        ret
+endfunc
+DISABLE_SVE
+#endif
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 844ae44..9a01da7 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -102,6 +102,12 @@
     { "AVX-512 (Ice Lake)", "avx512icl", DAV1D_X86_CPU_FLAG_AVX512ICL },
 #elif ARCH_AARCH64 || ARCH_ARM
     { "NEON",               "neon",      DAV1D_ARM_CPU_FLAG_NEON },
+    { "DOTPROD",            "dotprod",   DAV1D_ARM_CPU_FLAG_DOTPROD },
+    { "I8MM",               "i8mm",      DAV1D_ARM_CPU_FLAG_I8MM },
+#if ARCH_AARCH64
+    { "SVE",                "sve",       DAV1D_ARM_CPU_FLAG_SVE },
+    { "SVE2",               "sve2",      DAV1D_ARM_CPU_FLAG_SVE2 },
+#endif /* ARCH_AARCH64 */
 #elif ARCH_LOONGARCH
     { "LSX",                "lsx",       DAV1D_LOONGARCH_CPU_FLAG_LSX },
     { "LASX",               "lasx",      DAV1D_LOONGARCH_CPU_FLAG_LASX },
@@ -113,6 +119,12 @@
     { 0 }
 };
 
+#if ARCH_AARCH64 && HAVE_SVE
+int checkasm_sve_length(void);
+#elif ARCH_RISCV
+int checkasm_get_vlenb(void);
+#endif
+
 typedef struct CheckasmFuncVersion {
     struct CheckasmFuncVersion *next;
     void *func;
@@ -130,6 +142,13 @@
     char name[];
 } CheckasmFunc;
 
+typedef enum {
+    RUN_NORMAL = 0,
+    RUN_BENCHMARK,
+    RUN_CPUFLAG_LISTING,
+    RUN_FUNCTION_LISTING,
+} CheckasmRunMode;
+
 /* Internal state */
 static struct {
     CheckasmFunc *funcs;
@@ -144,9 +163,8 @@
     const char *test_pattern;
     const char *function_pattern;
     unsigned seed;
-    int bench;
+    CheckasmRunMode run_mode;
     int verbose;
-    int function_listing;
     volatile sig_atomic_t catch_signals;
     int suffix_length;
     int max_function_name_length;
@@ -252,18 +270,18 @@
 
-/* Print colored text to stderr if the terminal supports it */
+/* Print colored text to the given stream if the terminal supports it */
 static int use_printf_color;
-static void color_printf(const int color, const char *const fmt, ...) {
+static void color_fprintf(FILE *const f, const int color, const char *const fmt, ...) {
     va_list arg;
 
     if (use_printf_color)
-        fprintf(stderr, "\x1b[0;%dm", color);
+        fprintf(f, "\x1b[0;%dm", color);
 
     va_start(arg, fmt);
-    vfprintf(stderr, fmt, arg);
+    vfprintf(f, fmt, arg);
     va_end(arg);
 
     if (use_printf_color)
-        fprintf(stderr, "\x1b[0m");
+        fprintf(f, "\x1b[0m");
 }
 
 /* Deallocate a tree */
@@ -532,7 +550,7 @@
 /* Print the name of the current CPU flag, but only do it once */
 static void print_cpu_name(void) {
     if (state.cpu_flag_name) {
-        color_printf(COLOR_YELLOW, "%s:\n", state.cpu_flag_name);
+        color_fprintf(stderr, COLOR_YELLOW, "%s:\n", state.cpu_flag_name);
         state.cpu_flag_name = NULL;
     }
 }
@@ -571,6 +589,7 @@
                     "    --test=<pattern> -t        Test only <pattern>\n"
                     "    --function=<pattern> -f    Test only the functions matching <pattern>\n"
                     "    --bench -b                 Benchmark the tested functions\n"
+                    "    --list-cpuflags            List available cpu flags\n"
                     "    --list-functions           List available functions\n"
                     "    --list-tests               List available tests\n"
                     "    --verbose -v               Print verbose output\n");
@@ -581,7 +600,7 @@
                     "checkasm: --bench is not supported on your system\n");
             return 1;
 #endif
-            state.bench = 1;
+            state.run_mode = RUN_BENCHMARK;
         } else if (!strncmp(argv[1], "--test=", 7)) {
             state.test_pattern = argv[1] + 7;
         } else if (!strcmp(argv[1], "-t")) {
@@ -594,8 +613,11 @@
             state.function_pattern = argc > 1 ? argv[2] : "";
             argc--;
             argv++;
+        } else if (!strcmp(argv[1], "--list-cpuflags")) {
+            state.run_mode = RUN_CPUFLAG_LISTING;
+            break;
         } else if (!strcmp(argv[1], "--list-functions")) {
-            state.function_listing = 1;
+            state.run_mode = RUN_FUNCTION_LISTING;
         } else if (!strcmp(argv[1], "--list-tests")) {
             for (int i = 0; tests[i].name; i++)
                 printf("%s\n", tests[i].name);
@@ -671,7 +693,8 @@
 #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
     AddVectoredExceptionHandler(0, signal_handler);
 
-    HANDLE con = GetStdHandle(STD_ERROR_HANDLE);
+    HANDLE con = GetStdHandle(state.run_mode >= RUN_CPUFLAG_LISTING ?
+                              STD_OUTPUT_HANDLE : STD_ERROR_HANDLE);
     DWORD con_mode = 0;
     use_printf_color = con && con != INVALID_HANDLE_VALUE &&
                        GetConsoleMode(con, &con_mode) &&
@@ -683,12 +706,14 @@
     sigaction(SIGILL,  &signal_handler_act, NULL);
     sigaction(SIGSEGV, &signal_handler_act, NULL);
 
-    const char *const term = getenv("TERM");
-    use_printf_color = term && strcmp(term, "dumb") && isatty(2);
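+    /* the listing modes print to stdout (fd 1), everything else goes to stderr (fd 2) */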
+    if (isatty(state.run_mode >= RUN_CPUFLAG_LISTING ? 1 : 2)) {
+        const char *const term = getenv("TERM");
+        use_printf_color = term && strcmp(term, "dumb");
+    }
 #endif
 
 #ifdef readtime
-    if (state.bench) {
+    if (state.run_mode == RUN_BENCHMARK) {
         if (!checkasm_save_context()) {
             checkasm_set_signal_handler_state(1);
             readtime();
@@ -702,11 +727,22 @@
 
     int ret = 0;
 
-    if (!state.function_listing) {
+    if (state.run_mode != RUN_FUNCTION_LISTING) {
+        const unsigned cpu_flags = dav1d_get_cpu_flags();
+        if (state.run_mode == RUN_CPUFLAG_LISTING) {
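+            /* cpus[] ends with a {0} sentinel, so the last real entry is at count - 2 */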
+            const int last_i = (int)(sizeof(cpus) / sizeof(*cpus)) - 2;
+            for (int i = 0; i <= last_i; i++) {
+                if (cpus[i].flag & cpu_flags)
+                    color_fprintf(stdout, COLOR_GREEN, "%s", cpus[i].suffix);
+                else
+                    color_fprintf(stdout, COLOR_RED, "~%s", cpus[i].suffix);
+                printf(i == last_i ? "\n" : ", ");
+            }
+            return 0;
+        }
 #if ARCH_X86_64
         void checkasm_warmup_avx2(void);
         void checkasm_warmup_avx512(void);
-        const unsigned cpu_flags = dav1d_get_cpu_flags();
         if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL)
             state.simd_warmup = checkasm_warmup_avx512;
         else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2)
@@ -720,6 +756,18 @@
         for (size_t len = strlen(name); len && name[len-1] == ' '; len--)
             name[len-1] = '\0'; /* trim trailing whitespace */
         fprintf(stderr, "checkasm: %s (%08X) using random seed %u\n", name, cpuid, state.seed);
+#elif ARCH_RISCV
+        char buf[32] = "";
+        if (cpu_flags & DAV1D_RISCV_CPU_FLAG_V) {
+            const int vlen = 8*checkasm_get_vlenb();
+            snprintf(buf, sizeof(buf), "VLEN=%i bits, ", vlen);
+        }
+        fprintf(stderr, "checkasm: %susing random seed %u\n", buf, state.seed);
+#elif ARCH_AARCH64 && HAVE_SVE
+        char buf[48] = "";
+        if (cpu_flags & DAV1D_ARM_CPU_FLAG_SVE)
+            snprintf(buf, sizeof(buf), "SVE %d bits, ", checkasm_sve_length());
+        fprintf(stderr, "checkasm: %susing random seed %u\n", buf, state.seed);
 #else
         fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
 #endif
@@ -729,7 +777,7 @@
     for (int i = 0; cpus[i].flag; i++)
         check_cpu_flag(cpus[i].name, cpus[i].flag);
 
-    if (state.function_listing) {
+    if (state.run_mode == RUN_FUNCTION_LISTING) {
         print_functions(state.funcs);
     } else if (state.num_failed) {
         fprintf(stderr, "checkasm: %d of %d tests failed\n",
@@ -741,7 +789,7 @@
         else
             fprintf(stderr, "checkasm: no tests to perform\n");
 #ifdef readtime
-        if (state.bench && state.max_function_name_length) {
+        if (state.run_mode == RUN_BENCHMARK && state.max_function_name_length) {
             state.nop_time = measure_nop_time();
             if (state.verbose)
                 printf("nop:%*.1f\n", state.max_function_name_length + 6, state.nop_time);
@@ -801,7 +849,7 @@
     v->ok = 1;
     v->cpu = state.cpu_flag;
     state.current_func_ver = v;
-    if (state.function_listing) /* Save function names without running tests */
+    if (state.run_mode == RUN_FUNCTION_LISTING) /* Save function names without running tests */
         return NULL;
 
     xor128_srand(state.seed);
@@ -814,7 +862,7 @@
 
 /* Decide whether or not the current function needs to be benchmarked */
 int checkasm_bench_func(void) {
-    return !state.num_failed && state.bench;
+    return !state.num_failed && state.run_mode == RUN_BENCHMARK;
 }
 
 /* Indicate that the current test has failed, return whether verbose printing
@@ -863,9 +911,9 @@
         fprintf(stderr, "%*c", imax(pad_length, 0) + 2, '[');
 
         if (state.num_failed == prev_failed)
-            color_printf(COLOR_GREEN, "OK");
+            color_fprintf(stderr, COLOR_GREEN, "OK");
         else
-            color_printf(COLOR_RED, "FAILED");
+            color_fprintf(stderr, COLOR_RED, "FAILED");
         fprintf(stderr, "]\n");
 
         prev_checked = state.num_checked;
diff --git a/tests/checkasm/mc.c b/tests/checkasm/mc.c
index 047ef7b..f1f5dc3 100644
--- a/tests/checkasm/mc.c
+++ b/tests/checkasm/mc.c
@@ -98,6 +98,7 @@
                                                     w, h, "dst");
 
                         if (filter == FILTER_2D_8TAP_REGULAR ||
+                            filter == FILTER_2D_8TAP_SHARP ||
                             filter == FILTER_2D_BILINEAR)
                         {
                             bench_new(a_dst, a_dst_stride, src, src_stride, w, h,
@@ -155,6 +156,7 @@
                                                 w, h, "tmp");
 
                         if (filter == FILTER_2D_8TAP_REGULAR ||
+                            filter == FILTER_2D_8TAP_SHARP ||
                             filter == FILTER_2D_BILINEAR)
                         {
                             bench_new(a_tmp, src, src_stride, w, h,
diff --git a/tests/checkasm/msac.c b/tests/checkasm/msac.c
index 81fd593..26d4a56 100644
--- a/tests/checkasm/msac.c
+++ b/tests/checkasm/msac.c
@@ -33,7 +33,7 @@
 #include <stdio.h>
 #include <string.h>
 
-#define BUF_SIZE 8192
+#define BUF_SIZE 128
 
 /* The normal code doesn't use function pointers */
 typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf,
@@ -64,9 +64,16 @@
 
 /* memcmp() on structs can have weird behavior due to padding etc. */
 static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
-    return a->buf_pos != b->buf_pos || a->buf_end != b->buf_end ||
-           a->dif != b->dif || a->rng != b->rng || a->cnt != b->cnt ||
-           a->allow_update_cdf != b->allow_update_cdf;
+    if (a->buf_pos != b->buf_pos || a->buf_end != b->buf_end ||
+        a->rng != b->rng || a->cnt != b->cnt ||
+        a->allow_update_cdf != b->allow_update_cdf)
+    {
+        return 1;
+    }
+
+    /* Only check valid dif bits, ignoring partial bytes at the end */
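+    /* (e.g. with a 64-bit ec_win and cnt == 4, only the top 20 bits of dif are compared) */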
+    const ec_win dif_mask = ~((~(ec_win)0) >> (imax(a->cnt, 0) + 16));
+    return !!((a->dif ^ b->dif) & dif_mask);
 }
 
 static void msac_dump(unsigned c_res, unsigned a_res,
@@ -86,7 +93,7 @@
         fprintf(stderr, "rng %u vs %u\n", a->rng, b->rng);
     if (a->cnt != b->cnt)
         fprintf(stderr, "cnt %d vs %d\n", a->cnt, b->cnt);
-    if (a->allow_update_cdf)
+    if (a->allow_update_cdf != b->allow_update_cdf)
         fprintf(stderr, "allow_update_cdf %d vs %d\n",
                 a->allow_update_cdf, b->allow_update_cdf);
     if (num_cdf && memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * (num_cdf + 1))) {
@@ -113,7 +120,7 @@
                 s_a = s_c;                                                 \
                 randomize_cdf(cdf[0], ns);                                 \
                 memcpy(cdf[1], cdf[0], sizeof(*cdf));                      \
-                for (int i = 0; i < 64; i++) {                             \
+                while (s_c.cnt >= 0) {                                     \
                     unsigned c_res = call_ref(&s_c, cdf[0], ns);           \
                     unsigned a_res = call_new(&s_a, cdf[1], ns);           \
                     if (c_res != a_res || msac_cmp(&s_c, &s_a) ||          \
@@ -154,7 +161,7 @@
             s_a = s_c;
             cdf[0][0] = cdf[1][0] = rnd() % 32767 + 1;
             cdf[0][1] = cdf[1][1] = 0;
-            for (int i = 0; i < 64; i++) {
+            while (s_c.cnt >= 0) {
                 unsigned c_res = call_ref(&s_c, cdf[0]);
                 unsigned a_res = call_new(&s_a, cdf[1]);
                 if (c_res != a_res || msac_cmp(&s_c, &s_a) ||
@@ -177,7 +184,7 @@
     if (check_func(c->decode_bool_equi, "msac_decode_bool_equi")) {
         dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
         s_a = s_c;
-        for (int i = 0; i < 64; i++) {
+        while (s_c.cnt >= 0) {
             unsigned c_res = call_ref(&s_c);
             unsigned a_res = call_new(&s_a);
             if (c_res != a_res || msac_cmp(&s_c, &s_a)) {
@@ -196,7 +203,7 @@
     if (check_func(c->decode_bool, "msac_decode_bool")) {
         dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
         s_a = s_c;
-        for (int i = 0; i < 64; i++) {
+        while (s_c.cnt >= 0) {
             const unsigned f = rnd() & 0x7fff;
             unsigned c_res = call_ref(&s_c, f);
             unsigned a_res = call_new(&s_a, f);
@@ -228,7 +235,7 @@
             s_a = s_c;
             randomize_cdf(cdf[0], 3);
             memcpy(cdf[1], cdf[0], sizeof(*cdf));
-            for (int i = 0; i < 64; i++) {
+            while (s_c.cnt >= 0) {
                 unsigned c_res = call_ref(&s_c, cdf[0]);
                 unsigned a_res = call_new(&s_a, cdf[1]);
                 if (c_res != a_res || msac_cmp(&s_c, &s_a) ||
diff --git a/tests/checkasm/riscv/checkasm_64.S b/tests/checkasm/riscv/checkasm_64.S
index 0d02e5f..8557eab 100644
--- a/tests/checkasm/riscv/checkasm_64.S
+++ b/tests/checkasm/riscv/checkasm_64.S
@@ -83,6 +83,11 @@
 
 thread_local saved_regs, quads=29 # 5 + 12 + 12
 
+function get_vlenb, export=1
+  csrr a0, vlenb
+  ret
+endfunc
+
 function checked_call, export=1, ext=v
   /* Save the function ptr, RA, SP, unallocatable and callee-saved registers */
   la.tls.ie t0, saved_regs
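The new get_vlenb helper just reads the vlenb CSR, which reports the vector register length in bytes, so VLEN in bits is vlenb * 8. A hedged C sketch of the same read via inline assembly (assumes a RISC-V toolchain with the V extension enabled in -march; read_vlenb is an illustrative name, not part of the checkasm API):

    #include <stdio.h>

    /* Read the vlenb CSR (vector register length in bytes), mirroring the
     * csrr done by the assembly helper above. */
    static unsigned long read_vlenb(void) {
        unsigned long vlenb;
        __asm__ volatile("csrr %0, vlenb" : "=r"(vlenb));
        return vlenb;
    }

    int main(void) {
        printf("VLEN = %lu bits\n", read_vlenb() * 8);
        return 0;
    }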
diff --git a/tests/dav1d_argon.bash b/tests/dav1d_argon.bash
index 27a8d61..954dad8 100755
--- a/tests/dav1d_argon.bash
+++ b/tests/dav1d_argon.bash
@@ -4,8 +4,8 @@
 ARGON_DIR='.'
 FILMGRAIN=1
 CPUMASK=-1
-THREADS=0
-JOBS=1
+THREADS=1
+JOBS=0
 
 usage() {
     NAME=$(basename "$0")
@@ -19,8 +19,8 @@
         printf " -a dir    path to argon dir (default: 'tests/argon' if found; '.' otherwise)\n"
         printf " -g \$num   enable filmgrain (default: 1)\n"
         printf " -c \$mask  use restricted cpumask (default: -1)\n"
-        printf " -t \$num   number of threads per dav1d (default: 0)\n"
-        printf " -j \$num   number of parallel dav1d processes (default: 1)\n\n"
+        printf " -t \$num   number of threads per dav1d (default: 1)\n"
+        printf " -j \$num   number of parallel dav1d processes (default: 0)\n\n"
     } >&2
     exit 1
 }
@@ -110,6 +110,14 @@
 done
 shift $((OPTIND-1))
 
+if [ "$JOBS" -eq 0 ]; then
+    if [ "$THREADS" -gt 0 ]; then
+        JOBS="$((($( (nproc || sysctl -n hw.logicalcpu || getconf _NPROCESSORS_ONLN || echo 1) 2>/dev/null)+THREADS-1)/THREADS))"
+    else
+        JOBS=1
+    fi
+fi
+
 if [ "$#" -eq 0 ]; then
     # Everything except large scale tiles and stress files.
     dirs=("$ARGON_DIR/profile0_core"       "$ARGON_DIR/profile0_core_special"
diff --git a/tools/dav1d_cli_parse.c b/tools/dav1d_cli_parse.c
index 5fdbab3..39fe54d 100644
--- a/tools/dav1d_cli_parse.c
+++ b/tools/dav1d_cli_parse.c
@@ -219,7 +219,13 @@
 
 static const EnumParseTable cpu_mask_tbl[] = {
 #if ARCH_AARCH64 || ARCH_ARM
-    { "neon", DAV1D_ARM_CPU_FLAG_NEON },
+    { "neon",    DAV1D_ARM_CPU_FLAG_NEON },
+    { "dotprod", DAV1D_ARM_CPU_FLAG_DOTPROD },
+    { "i8mm",    DAV1D_ARM_CPU_FLAG_I8MM },
+#if ARCH_AARCH64
+    { "sve",     DAV1D_ARM_CPU_FLAG_SVE },
+    { "sve2",    DAV1D_ARM_CPU_FLAG_SVE2 },
+#endif /* ARCH_AARCH64 */
 #elif ARCH_LOONGARCH
     { "lsx", DAV1D_LOONGARCH_CPU_FLAG_LSX },
     { "lasx", DAV1D_LOONGARCH_CPU_FLAG_LASX },