external/libgav1: update to v0.16.3 am: 81461368d4

Original change: https://android-review.googlesource.com/c/platform/external/libgav1/+/1676768

Change-Id: I69cad99291a6817597bcd9e22fdfb62ed183f089
diff --git a/AUTHORS b/AUTHORS
deleted file mode 100644
index d92ea0a..0000000
--- a/AUTHORS
+++ /dev/null
@@ -1,6 +0,0 @@
-# This is the list of libgav1 authors for copyright purposes.
-#
-# This does not necessarily list everyone who has contributed code, since in
-# some cases, their employer may be the copyright holder.  To see the full list
-# of contributors, see the revision history in source control.
-Google LLC
diff --git a/Android.bp b/Android.bp
index d3ddd1a..3ea8b18 100644
--- a/Android.bp
+++ b/Android.bp
@@ -50,10 +50,10 @@
         "libgav1",
     ],
 
-    // Note: if optimizations are required for x86 the sse4 files should be
-    // split to their own target to receive the correct flagging. All files in
-    // the library can be built for any target without producing empty object
-    // files.
+    // Note: if optimizations are required for x86 the avx2 & sse4 files should
+    // be split to their own targets to receive the correct flagging. All files
+    // in the library can be built for any target without producing empty
+    // object files.
     srcs: [
         "libgav1/src/buffer_pool.cc",
         "libgav1/src/decoder.cc",
@@ -67,9 +67,10 @@
         "libgav1/src/dsp/arm/intra_edge_neon.cc",
         "libgav1/src/dsp/arm/intrapred_cfl_neon.cc",
         "libgav1/src/dsp/arm/intrapred_directional_neon.cc",
-        "libgav1/src/dsp/arm/intrapred_filter_intra_neon.cc",
+        "libgav1/src/dsp/arm/intrapred_filter_neon.cc",
         "libgav1/src/dsp/arm/intrapred_neon.cc",
         "libgav1/src/dsp/arm/intrapred_smooth_neon.cc",
+        "libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc",
         "libgav1/src/dsp/arm/inverse_transform_neon.cc",
         "libgav1/src/dsp/arm/loop_filter_neon.cc",
         "libgav1/src/dsp/arm/loop_restoration_neon.cc",
@@ -89,6 +90,10 @@
         "libgav1/src/dsp/film_grain.cc",
         "libgav1/src/dsp/intra_edge.cc",
         "libgav1/src/dsp/intrapred.cc",
+        "libgav1/src/dsp/intrapred_cfl.cc",
+        "libgav1/src/dsp/intrapred_directional.cc",
+        "libgav1/src/dsp/intrapred_filter.cc",
+        "libgav1/src/dsp/intrapred_smooth.cc",
         "libgav1/src/dsp/inverse_transform.cc",
         "libgav1/src/dsp/loop_filter.cc",
         "libgav1/src/dsp/loop_restoration.cc",
@@ -100,15 +105,23 @@
         "libgav1/src/dsp/warp.cc",
         "libgav1/src/dsp/weight_mask.cc",
         "libgav1/src/dsp/x86/average_blend_sse4.cc",
+        "libgav1/src/dsp/x86/cdef_avx2.cc",
         "libgav1/src/dsp/x86/cdef_sse4.cc",
+        "libgav1/src/dsp/x86/convolve_avx2.cc",
         "libgav1/src/dsp/x86/convolve_sse4.cc",
         "libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc",
+        "libgav1/src/dsp/x86/film_grain_sse4.cc",
         "libgav1/src/dsp/x86/intra_edge_sse4.cc",
         "libgav1/src/dsp/x86/intrapred_cfl_sse4.cc",
+        "libgav1/src/dsp/x86/intrapred_directional_sse4.cc",
+        "libgav1/src/dsp/x86/intrapred_filter_sse4.cc",
         "libgav1/src/dsp/x86/intrapred_smooth_sse4.cc",
         "libgav1/src/dsp/x86/intrapred_sse4.cc",
         "libgav1/src/dsp/x86/inverse_transform_sse4.cc",
         "libgav1/src/dsp/x86/loop_filter_sse4.cc",
+        "libgav1/src/dsp/x86/loop_restoration_10bit_avx2.cc",
+        "libgav1/src/dsp/x86/loop_restoration_10bit_sse4.cc",
+        "libgav1/src/dsp/x86/loop_restoration_avx2.cc",
         "libgav1/src/dsp/x86/loop_restoration_sse4.cc",
         "libgav1/src/dsp/x86/mask_blend_sse4.cc",
         "libgav1/src/dsp/x86/motion_field_projection_sse4.cc",
@@ -140,8 +153,8 @@
         "libgav1/src/tile/bitstream/partition.cc",
         "libgav1/src/tile/bitstream/transform_size.cc",
         "libgav1/src/tile/prediction.cc",
-        "libgav1/src/tile_scratch_buffer.cc",
         "libgav1/src/tile/tile.cc",
+        "libgav1/src/tile_scratch_buffer.cc",
         "libgav1/src/utils/bit_reader.cc",
         "libgav1/src/utils/block_parameters_holder.cc",
         "libgav1/src/utils/constants.cc",
@@ -149,7 +162,6 @@
         "libgav1/src/utils/entropy_decoder.cc",
         "libgav1/src/utils/executor.cc",
         "libgav1/src/utils/logging.cc",
-        "libgav1/src/utils/parameter_tree.cc",
         "libgav1/src/utils/raw_bit_reader.cc",
         "libgav1/src/utils/segmentation.cc",
         "libgav1/src/utils/segmentation_map.cc",
diff --git a/README.version b/README.version
index b65b65a..89f9d10 100644
--- a/README.version
+++ b/README.version
@@ -1,5 +1,5 @@
 URL: https://chromium.googlesource.com/codecs/libgav1
-Version: v0.16.0
+Version: v0.16.3
 BugComponent: 324837
 Local Modifications:
 None
diff --git a/libgav1/CMakeLists.txt b/libgav1/CMakeLists.txt
index f033bae..5e9e17a 100644
--- a/libgav1/CMakeLists.txt
+++ b/libgav1/CMakeLists.txt
@@ -36,6 +36,26 @@
 set(libgav1_examples "${libgav1_root}/examples")
 set(libgav1_source "${libgav1_root}/src")
 
+include("${libgav1_root}/cmake/libgav1_options.cmake")
+
+libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING
+               "Enables optimized code." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_AVX2 HELPSTRING "Enables avx2 optimizations."
+               VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations."
+               VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING
+               "Enables sse4.1 optimizations." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_TESTS HELPSTRING "Enables tests." VALUE ON)
+libgav1_option(
+  NAME LIBGAV1_VERBOSE HELPSTRING
+  "Enables verbose build system output. Higher numbers are more verbose." VALUE
+  OFF)
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
 include(FindThreads)
 
 include("${libgav1_examples}/libgav1_examples.cmake")
@@ -45,29 +65,14 @@
 include("${libgav1_root}/cmake/libgav1_helpers.cmake")
 include("${libgav1_root}/cmake/libgav1_install.cmake")
 include("${libgav1_root}/cmake/libgav1_intrinsics.cmake")
-include("${libgav1_root}/cmake/libgav1_options.cmake")
 include("${libgav1_root}/cmake/libgav1_sanitizer.cmake")
 include("${libgav1_root}/cmake/libgav1_targets.cmake")
 include("${libgav1_root}/cmake/libgav1_variables.cmake")
+include("${libgav1_root}/tests/libgav1_tests.cmake")
 include("${libgav1_source}/dsp/libgav1_dsp.cmake")
 include("${libgav1_source}/libgav1_decoder.cmake")
 include("${libgav1_source}/utils/libgav1_utils.cmake")
 
-libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING
-               "Enables optimized code." VALUE ON)
-libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations."
-               VALUE ON)
-libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING
-               "Enables sse4.1 optimizations." VALUE ON)
-libgav1_option(
-  NAME LIBGAV1_VERBOSE HELPSTRING
-  "Enables verbose build system output. Higher numbers are more verbose." VALUE
-  OFF)
-
-if(NOT CMAKE_BUILD_TYPE)
-  set(CMAKE_BUILD_TYPE Release)
-endif()
-
 libgav1_optimization_detect()
 libgav1_set_build_definitions()
 libgav1_set_cxx_flags()
@@ -107,13 +112,27 @@
   separate_arguments(LIBGAV1_EXE_LINKER_FLAGS)
 endif()
 
-add_subdirectory("${libgav1_root}/third_party/abseil-cpp"
-                 "${libgav1_abseil_build}" EXCLUDE_FROM_ALL)
+# Set test-only flags based on LIBGAV1_CXX_FLAGS.
+libgav1_set_test_flags()
+
+set(libgav1_abseil "${libgav1_root}/third_party/abseil-cpp")
+if(NOT EXISTS "${libgav1_abseil}")
+  message(
+    FATAL_ERROR
+      "Abseil not found. This dependency is required by the"
+      " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is"
+      " not defined. To continue, download the Abseil repository to"
+      " third_party/abseil-cpp:\n  git \\\n    -C ${libgav1_root} \\\n"
+      "    clone \\\n"
+      "    https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp")
+endif()
+add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}" EXCLUDE_FROM_ALL)
 
 libgav1_reset_target_lists()
 libgav1_add_dsp_targets()
 libgav1_add_decoder_targets()
 libgav1_add_examples_targets()
+libgav1_add_tests_targets()
 libgav1_add_utils_targets()
 libgav1_setup_install_target()
 
diff --git a/libgav1/README.md b/libgav1/README.md
index b935679..3155970 100644
--- a/libgav1/README.md
+++ b/libgav1/README.md
@@ -20,7 +20,18 @@
     From within the libgav1 directory:
 
     ```shell
-      $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp
+    $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp
+    ```
+
+    Note: Abseil is required by the examples and tests. libgav1 will depend on
+    it if `LIBGAV1_THREADPOOL_USE_STD_MUTEX` is set to `0` (see below).
+
+4.  (Optional) [GoogleTest](https://github.com/google/googletest)
+
+    From within the libgav1 directory:
+
+    ```shell
+    $ git clone https://github.com/google/googletest.git third_party/googletest
     ```
 
 ### Compile
@@ -39,10 +50,13 @@
     [symbol reduction](#symbol-reduction) in an optimized build to keep all
     versions of dsp functions available. Automatically defined in
     `src/dsp/dsp.h` if unset.
+*   `LIBGAV1_ENABLE_AVX2`: define to a non-zero value to enable avx2
+    optimizations. Automatically defined in `src/utils/cpu.h` if unset.
 *   `LIBGAV1_ENABLE_NEON`: define to a non-zero value to enable NEON
-    optimizations. Automatically defined in `src/dsp/dsp.h` if unset.
+    optimizations. Automatically defined in `src/utils/cpu.h` if unset.
 *   `LIBGAV1_ENABLE_SSE4_1`: define to a non-zero value to enable sse4.1
-    optimizations. Automatically defined in `src/dsp/dsp.h` if unset.
+    optimizations. Automatically defined in `src/utils/cpu.h` if unset. Note
+    setting this to 0 will also disable AVX2.
 *   `LIBGAV1_ENABLE_LOGGING`: define to 0/1 to control debug logging.
     Automatically defined in `src/utils/logging.h` if unset.
 *   `LIBGAV1_EXAMPLES_ENABLE_LOGGING`: define to 0/1 to control error logging in
@@ -55,10 +69,11 @@
 *   `LIBGAV1_THREADPOOL_USE_STD_MUTEX`: controls use of std::mutex and
     absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil
     dependency from the core library. Automatically defined in
-    `src/utils/threadpool.h` if unset.
+    `src/utils/threadpool.h` if unset. Defaults to 1 on Android & iOS, 0
+    otherwise.
 *   `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is
-    allowed to create. Has to be an integer > 0. Otherwise this is ignored.
-    The default value is 128.
+    allowed to create. Has to be an integer > 0. Otherwise this is ignored. The
+    default value is 128.
 *   `LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER`: the threshold multiplier that
     is used to determine when to use frame parallel decoding. Frame parallel
     decoding will be used if |threads| > |tile_count| * this multiplier. Has to
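
These knobs are plain preprocessor defines resolved at compile time. A minimal sketch of the default-if-unset guard the README describes for `LIBGAV1_ENABLE_AVX2` (illustrative only; the real logic lives in `src/utils/cpu.h` and also keys off `LIBGAV1_ENABLE_SSE4_1`, as noted above):

```c++
// Illustrative sketch, not the actual contents of src/utils/cpu.h.
#ifndef LIBGAV1_ENABLE_AVX2
#define LIBGAV1_ENABLE_AVX2 1  // "automatically defined ... if unset"
#endif

#if LIBGAV1_ENABLE_AVX2
// AVX2 variants are compiled in and selected at runtime when supported.
#endif
```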
diff --git a/libgav1/cmake/libgav1_build_definitions.cmake b/libgav1/cmake/libgav1_build_definitions.cmake
index 930d8f5..fc83490 100644
--- a/libgav1/cmake/libgav1_build_definitions.cmake
+++ b/libgav1/cmake/libgav1_build_definitions.cmake
@@ -21,7 +21,24 @@
   string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase)
 
   libgav1_load_version_info()
-  set(LIBGAV1_SOVERSION 0)
+
+  # Library version info. See the libtool docs for updating the values:
+  # https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info
+  #
+  # c=<current>, r=<revision>, a=<age>
+  #
+  # libtool generates a .so file as .so.[c-a].a.r, while -version-info c:r:a is
+  # passed to libtool.
+  #
+  # We set LIBGAV1_SOVERSION = [c-a].a.r
+  set(LT_CURRENT 0)
+  set(LT_REVISION 0)
+  set(LT_AGE 0)
+  math(EXPR LIBGAV1_SOVERSION_MAJOR "${LT_CURRENT} - ${LT_AGE}")
+  set(LIBGAV1_SOVERSION "${LIBGAV1_SOVERSION_MAJOR}.${LT_AGE}.${LT_REVISION}")
+  unset(LT_CURRENT)
+  unset(LT_REVISION)
+  unset(LT_AGE)
 
   list(APPEND libgav1_include_paths "${libgav1_root}" "${libgav1_root}/src"
               "${libgav1_build}" "${libgav1_root}/third_party/abseil-cpp")
@@ -89,9 +106,7 @@
   endif()
 
   if(build_type_lowercase MATCHES "rel")
-    # TODO(tomfinegan): this value is only a concern for the core library and
-    # can be made smaller if the test targets are avoided.
-    list(APPEND libgav1_base_cxx_flags "-Wstack-usage=196608")
+    list(APPEND libgav1_base_cxx_flags "-Wframe-larger-than=196608")
   endif()
 
   list(APPEND libgav1_msvc_cxx_flags
@@ -144,6 +159,7 @@
 
   # Source file names ending in these suffixes will have the appropriate
   # compiler flags added to their compile commands to enable intrinsics.
+  set(libgav1_avx2_source_file_suffix "avx2.cc")
   set(libgav1_neon_source_file_suffix "neon.cc")
   set(libgav1_sse4_source_file_suffix "sse4.cc")
 endmacro()
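
A quick worked example of the libtool mapping described in the comment above (hypothetical values; the patch itself sets current, revision, and age all to 0, giving SOVERSION 0.0.0):

```c++
// Hypothetical libtool triple, only to illustrate the arithmetic above.
constexpr int kLtCurrent = 1, kLtRevision = 2, kLtAge = 1;
constexpr int kSoVersionMajor = kLtCurrent - kLtAge;  // 0
// -version-info 1:2:1 -> libgav1.so.0.1.2, i.e. [c-a].a.r (major.age.revision).
```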
diff --git a/libgav1/cmake/libgav1_cpu_detection.cmake b/libgav1/cmake/libgav1_cpu_detection.cmake
index 6972d34..e17e27c 100644
--- a/libgav1/cmake/libgav1_cpu_detection.cmake
+++ b/libgav1/cmake/libgav1_cpu_detection.cmake
@@ -24,10 +24,17 @@
     if(cpu_lowercase MATCHES "^arm|^aarch64")
       set(libgav1_have_neon ON)
     elseif(cpu_lowercase MATCHES "^x86|amd64")
+      set(libgav1_have_avx2 ON)
       set(libgav1_have_sse4 ON)
     endif()
   endif()
 
+  if(libgav1_have_avx2 AND LIBGAV1_ENABLE_AVX2)
+    list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=1")
+  else()
+    list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=0")
+  endif()
+
   if(libgav1_have_neon AND LIBGAV1_ENABLE_NEON)
     list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=1")
   else()
diff --git a/libgav1/cmake/libgav1_flags.cmake b/libgav1/cmake/libgav1_flags.cmake
index 0b8df60..a5408e2 100644
--- a/libgav1/cmake/libgav1_flags.cmake
+++ b/libgav1/cmake/libgav1_flags.cmake
@@ -118,6 +118,12 @@
     if(cxx_flags)
       message("--- Testing flags from $cxx_flags: " "${cxx_flags}")
       foreach(cxx_flag ${cxx_flags})
+        # Between 3.17.0 and 3.18.2 check_cxx_compiler_flag() sets a normal
+        # variable at parent scope while check_cxx_source_compiles() continues
+        # to set an internal cache variable, so we unset both to avoid the
+        # failure / success state persisting between checks. See
+        # https://gitlab.kitware.com/cmake/cmake/-/issues/21207.
+        unset(cxx_flag_test_passed)
         unset(cxx_flag_test_passed CACHE)
         message("--- Testing flag: ${cxx_flag}")
         check_cxx_compiler_flag("${cxx_flag}" cxx_flag_test_passed)
@@ -199,7 +205,7 @@
 
   # Restore cached global exe linker flags.
   if(cached_CMAKE_EXE_LINKER_FLAGS)
-    set(CMAKE_EXE_LINKER_FLAGS cached_CMAKE_EXE_LINKER_FLAGS)
+    set(CMAKE_EXE_LINKER_FLAGS ${cached_CMAKE_EXE_LINKER_FLAGS})
   else()
     unset(CMAKE_EXE_LINKER_FLAGS)
   endif()
@@ -243,3 +249,15 @@
 
   libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES ${cxx_flag_lists})
 endmacro()
+
+# Sets LIBGAV1_TEST_C_FLAGS and LIBGAV1_TEST_CXX_FLAGS.
+#
+# Note: libgav1_set_cxx_flags() must be called before this macro. Furthermore,
+# the call to this macro should be made after all additions to LIBGAV1_CXX_FLAGS
+# are complete.
+macro(libgav1_set_test_flags)
+  if(LIBGAV1_ENABLE_TESTS)
+    set(LIBGAV1_TEST_CXX_FLAGS ${LIBGAV1_CXX_FLAGS})
+    list(FILTER LIBGAV1_TEST_CXX_FLAGS EXCLUDE REGEX "-Wframe-larger-than")
+  endif()
+endmacro()
diff --git a/libgav1/cmake/libgav1_helpers.cmake b/libgav1/cmake/libgav1_helpers.cmake
index 76d8d67..ac16257 100644
--- a/libgav1/cmake/libgav1_helpers.cmake
+++ b/libgav1/cmake/libgav1_helpers.cmake
@@ -20,7 +20,13 @@
 # Kills build generation using message(FATAL_ERROR) and outputs all data passed
 # to the console via use of $ARGN.
 macro(libgav1_die)
-  message(FATAL_ERROR ${ARGN})
+  # macro parameters are not variables so a temporary is needed to work with
+  # list().
+  set(msg ${ARGN})
+  # message(${ARGN}) will merge all list elements with no separator while
+  # "${ARGN}" will output the list as a ';' delimited string.
+  list(JOIN msg " " msg)
+  message(FATAL_ERROR "${msg}")
 endmacro()
 
 # Converts semi-colon delimited list variable(s) to string. Output is written to
@@ -94,10 +100,10 @@
       "${dummy_source_dir}/libgav1_${cdsf_TARGET}_${cdsf_BASENAME}.cc")
   set(dummy_source_code
       "// Generated file. DO NOT EDIT!\n"
-      "// C++ source file created for target ${cdsf_TARGET}. \n"
-      "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void);\n"
+      "// C++ source file created for target ${cdsf_TARGET}.\n"
+      "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void)\;\n"
       "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void) {}\n")
-  file(WRITE "${dummy_source_file}" "${dummy_source_code}")
+  file(WRITE "${dummy_source_file}" ${dummy_source_code})
 
   target_sources(${cdsf_TARGET} PRIVATE ${dummy_source_file})
 
diff --git a/libgav1/cmake/libgav1_intrinsics.cmake b/libgav1/cmake/libgav1_intrinsics.cmake
index 039ef35..a2e9ddb 100644
--- a/libgav1/cmake/libgav1_intrinsics.cmake
+++ b/libgav1/cmake/libgav1_intrinsics.cmake
@@ -38,6 +38,12 @@
     if(NOT MSVC)
       set(${intrinsics_VARIABLE} "${LIBGAV1_NEON_INTRINSICS_FLAG}")
     endif()
+  elseif(intrinsics_SUFFIX MATCHES "avx2")
+    if(MSVC)
+      set(${intrinsics_VARIABLE} "/arch:AVX2")
+    else()
+      set(${intrinsics_VARIABLE} "-mavx2")
+    endif()
   elseif(intrinsics_SUFFIX MATCHES "sse4")
     if(NOT MSVC)
       set(${intrinsics_VARIABLE} "-msse4.1")
@@ -57,7 +63,7 @@
 # necessary: libgav1_process_intrinsics_sources(SOURCES <sources>)
 #
 # Detects requirement for intrinsics flags using source file name suffix.
-# Currently supports only SSE4.1.
+# Currently supports AVX2 and SSE4.1.
 macro(libgav1_process_intrinsics_sources)
   unset(arg_TARGET)
   unset(arg_SOURCES)
@@ -71,6 +77,25 @@
                         "SOURCES required.")
   endif()
 
+  if(LIBGAV1_ENABLE_AVX2 AND libgav1_have_avx2)
+    unset(avx2_sources)
+    list(APPEND avx2_sources ${arg_SOURCES})
+
+    list(FILTER avx2_sources INCLUDE REGEX
+         "${libgav1_avx2_source_file_suffix}$")
+
+    if(avx2_sources)
+      unset(avx2_flags)
+      libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+                                             ${libgav1_avx2_source_file_suffix}
+                                             VARIABLE avx2_flags)
+      if(avx2_flags)
+        libgav1_set_compiler_flags_for_sources(SOURCES ${avx2_sources} FLAGS
+                                               ${avx2_flags})
+      endif()
+    endif()
+  endif()
+
   if(LIBGAV1_ENABLE_SSE4_1 AND libgav1_have_sse4)
     unset(sse4_sources)
     list(APPEND sse4_sources ${arg_SOURCES})
diff --git a/libgav1/cmake/libgav1_sanitizer.cmake b/libgav1/cmake/libgav1_sanitizer.cmake
index 4bb2263..2f9ee07 100644
--- a/libgav1/cmake/libgav1_sanitizer.cmake
+++ b/libgav1/cmake/libgav1_sanitizer.cmake
@@ -39,7 +39,9 @@
     list(APPEND LIBGAV1_CXX_FLAGS "-fno-omit-frame-pointer"
                 "-fno-optimize-sibling-calls")
 
-    libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED)
+    # Check the linker flags first as they may be required in the compile check
+    # to avoid undefined symbols related to the sanitizer.
     libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME LIBGAV1_EXE_LINKER_FLAGS)
+    libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED)
   endif()
 endmacro()
diff --git a/libgav1/cmake/libgav1_targets.cmake b/libgav1/cmake/libgav1_targets.cmake
index 78b4865..997f8bd 100644
--- a/libgav1/cmake/libgav1_targets.cmake
+++ b/libgav1/cmake/libgav1_targets.cmake
@@ -29,7 +29,7 @@
 
 # Creates an executable target. The target name is passed as a parameter to the
 # NAME argument, and the sources passed as a parameter to the SOURCES argument:
-# libgav1_add_test(NAME <name> SOURCES <sources> [optional args])
+# libgav1_add_executable(NAME <name> SOURCES <sources> [optional args])
 #
 # Optional args:
 # cmake-format: off
@@ -115,15 +115,35 @@
     target_include_directories(${exe_NAME} PRIVATE ${exe_INCLUDES})
   endif()
 
-  if(exe_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+  unset(exe_LIBGAV1_COMPILE_FLAGS)
+  if(exe_TEST)
+    list(FILTER exe_SOURCES INCLUDE REGEX "\\.c$")
+    list(LENGTH exe_SOURCES exe_SOURCES_length)
+    if(exe_SOURCES_length EQUAL 0)
+      set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_CXX_FLAGS})
+    else()
+      set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_C_FLAGS})
+    endif()
+  else()
+    set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_CXX_FLAGS})
+  endif()
+
+  if(exe_COMPILE_FLAGS OR exe_LIBGAV1_COMPILE_FLAGS)
     target_compile_options(${exe_NAME}
-                           PRIVATE ${exe_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+                           PRIVATE ${exe_COMPILE_FLAGS}
+                                   ${exe_LIBGAV1_COMPILE_FLAGS})
   endif()
 
   if(exe_LINK_FLAGS OR LIBGAV1_EXE_LINKER_FLAGS)
-    set_target_properties(${exe_NAME}
-                          PROPERTIES LINK_FLAGS ${exe_LINK_FLAGS}
-                                     ${LIBGAV1_EXE_LINKER_FLAGS})
+    list(APPEND exe_LINK_FLAGS "${LIBGAV1_EXE_LINKER_FLAGS}")
+    if(${CMAKE_VERSION} VERSION_LESS "3.13")
+      # LINK_FLAGS is managed as a string.
+      libgav1_set_and_stringify(SOURCE "${exe_LINK_FLAGS}" DEST exe_LINK_FLAGS)
+      set_target_properties(${exe_NAME}
+                            PROPERTIES LINK_FLAGS "${exe_LINK_FLAGS}")
+    else()
+      target_link_options(${exe_NAME} PRIVATE ${exe_LINK_FLAGS})
+    endif()
   endif()
 
   if(exe_OBJLIB_DEPS)
@@ -137,7 +157,7 @@
   endif()
 
   if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
-    target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+    target_compile_definitions(${exe_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
   endif()
 
   if(exe_LIB_DEPS)
@@ -321,7 +341,9 @@
   endif()
 
   if(lib_TYPE STREQUAL SHARED AND NOT MSVC)
-    set_target_properties(${lib_NAME} PROPERTIES SOVERSION ${LIBGAV1_SOVERSION})
+    set_target_properties(${lib_NAME}
+                          PROPERTIES VERSION ${LIBGAV1_SOVERSION} SOVERSION
+                                     ${LIBGAV1_SOVERSION_MAJOR})
   endif()
 
   if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
diff --git a/libgav1/examples/gav1_decode.cc b/libgav1/examples/gav1_decode.cc
index e7d3246..1408e8c 100644
--- a/libgav1/examples/gav1_decode.cc
+++ b/libgav1/examples/gav1_decode.cc
@@ -370,16 +370,15 @@
 
     const libgav1::DecoderBuffer* buffer;
     status = decoder.DequeueFrame(&buffer);
-    if (status != libgav1::kStatusOk &&
-        status != libgav1::kStatusNothingToDequeue) {
-      fprintf(stderr, "Unable to dequeue frame: %s\n",
-              libgav1::GetErrorString(status));
-      return EXIT_FAILURE;
-    }
     if (status == libgav1::kStatusNothingToDequeue) {
       dequeue_finished = true;
       continue;
     }
+    if (status != libgav1::kStatusOk) {
+      fprintf(stderr, "Unable to dequeue frame: %s\n",
+              libgav1::GetErrorString(status));
+      return EXIT_FAILURE;
+    }
     dequeue_finished = false;
     if (buffer == nullptr) continue;
     ++decoded_frames;
@@ -420,6 +419,9 @@
         input_buffers.ReleaseInputBuffer(input_buffer);
       }
       input_buffer = nullptr;
+      // Clear any in progress frames to ensure the output frame limit is
+      // respected.
+      decoder.SignalEOS();
     }
   } while (input_buffer != nullptr ||
            (!file_reader->IsEndOfFile() && !limit_reached) ||
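
With this reordering, `kStatusNothingToDequeue` is treated as an expected outcome rather than an error, and `SignalEOS()` flushes in-flight frames once the output limit is reached. A minimal sketch of the resulting dequeue handling (assumes an initialized `libgav1::Decoder` with input already enqueued; error handling trimmed):

```c++
#include <cstdio>

#include "gav1/decoder.h"

// Sketch of the per-iteration dequeue logic in gav1_decode.cc after this
// change: "nothing to dequeue" is benign, any other non-OK status is fatal.
bool DequeueOne(libgav1::Decoder& decoder) {
  const libgav1::DecoderBuffer* buffer;
  const libgav1::StatusCode status = decoder.DequeueFrame(&buffer);
  if (status == libgav1::kStatusNothingToDequeue) return true;
  if (status != libgav1::kStatusOk) {
    fprintf(stderr, "Unable to dequeue frame: %s\n",
            libgav1::GetErrorString(status));
    return false;
  }
  if (buffer != nullptr) {
    // A decoded frame is available; the example would write/verify it here.
  }
  return true;
}
```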
diff --git a/libgav1/examples/logging.h b/libgav1/examples/logging.h
index c0bcad7..cf5a09f 100644
--- a/libgav1/examples/logging.h
+++ b/libgav1/examples/logging.h
@@ -46,7 +46,7 @@
 #define LIBGAV1_EXAMPLES_LOG_ERROR(error_string)                              \
   do {                                                                        \
     constexpr const char* libgav1_examples_basename =                         \
-        ::libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1);        \
+        libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1);          \
     fprintf(stderr, "%s:%d (%s): %s.\n", libgav1_examples_basename, __LINE__, \
             __func__, error_string);                                          \
   } while (false)
diff --git a/libgav1/src/decoder_impl.cc b/libgav1/src/decoder_impl.cc
index e40c692..e23903c 100644
--- a/libgav1/src/decoder_impl.cc
+++ b/libgav1/src/decoder_impl.cc
@@ -31,13 +31,11 @@
 #include "src/obu_parser.h"
 #include "src/post_filter.h"
 #include "src/prediction_mask.h"
-#include "src/quantizer.h"
 #include "src/threading_strategy.h"
 #include "src/utils/blocking_counter.h"
 #include "src/utils/common.h"
 #include "src/utils/constants.h"
 #include "src/utils/logging.h"
-#include "src/utils/parameter_tree.h"
 #include "src/utils/raw_bit_reader.h"
 #include "src/utils/segmentation.h"
 #include "src/utils/threadpool.h"
@@ -632,10 +630,6 @@
 }
 
 StatusCode DecoderImpl::Init() {
-  if (!GenerateWedgeMask(&wedge_masks_)) {
-    LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed.");
-    return kStatusOutOfMemory;
-  }
   if (!output_frame_queue_.Init(kMaxLayers)) {
     LIBGAV1_DLOG(ERROR, "output_frame_queue_.Init() failed.");
     return kStatusOutOfMemory;
@@ -854,6 +848,14 @@
       LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
       return status;
     }
+    if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) {
+      LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
+      return kStatusOutOfMemory;
+    }
+    if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) {
+      LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed.");
+      return kStatusOutOfMemory;
+    }
     if (IsNewSequenceHeader(*obu)) {
       const ObuSequenceHeader& sequence_header = obu->sequence_header();
       const Libgav1ImageFormat image_format =
@@ -1043,6 +1045,14 @@
       LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
       return status;
     }
+    if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) {
+      LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
+      return kStatusOutOfMemory;
+    }
+    if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) {
+      LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed.");
+      return kStatusOutOfMemory;
+    }
     if (IsNewSequenceHeader(*obu)) {
       const ObuSequenceHeader& sequence_header = obu->sequence_header();
       const Libgav1ImageFormat image_format =
@@ -1145,7 +1155,7 @@
   buffer_.bitdepth = yuv_buffer->bitdepth();
   const int num_planes =
       yuv_buffer->is_monochrome() ? kMaxPlanesMonochrome : kMaxPlanes;
-  int plane = 0;
+  int plane = kPlaneY;
   for (; plane < num_planes; ++plane) {
     buffer_.stride[plane] = yuv_buffer->stride(plane);
     buffer_.plane[plane] = yuv_buffer->data(plane);
@@ -1188,6 +1198,12 @@
                  "Failed to allocate memory for loop restoration info units.");
     return kStatusOutOfMemory;
   }
+  ThreadingStrategy& threading_strategy =
+      frame_scratch_buffer->threading_strategy;
+  if (!is_frame_parallel_ &&
+      !threading_strategy.Reset(frame_header, settings_.threads)) {
+    return kStatusOutOfMemory;
+  }
   const bool do_cdef =
       PostFilter::DoCdef(frame_header, settings_.post_filter_mask);
   const int num_planes = sequence_header.color_config.is_monochrome
@@ -1198,15 +1214,11 @@
   const bool do_superres =
       PostFilter::DoSuperRes(frame_header, settings_.post_filter_mask);
   // Use kBorderPixels for the left, right, and top borders. Only the bottom
-  // border may need to be bigger. SuperRes border is needed only if we are
-  // applying SuperRes in-place which is being done only in single threaded
-  // mode.
+  // border may need to be bigger. Cdef border is needed only if we apply Cdef
+  // without multithreading.
   const int bottom_border = GetBottomBorderPixels(
-      do_cdef, do_restoration,
-      do_superres &&
-          frame_scratch_buffer->threading_strategy.post_filter_thread_pool() ==
-              nullptr,
-      sequence_header.color_config.subsampling_y);
+      do_cdef && threading_strategy.post_filter_thread_pool() == nullptr,
+      do_restoration, do_superres, sequence_header.color_config.subsampling_y);
   current_frame->set_chroma_sample_position(
       sequence_header.color_config.chroma_sample_position);
   if (!current_frame->Realloc(sequence_header.color_config.bitdepth,
@@ -1269,8 +1281,7 @@
   // without having to check for boundary conditions.
   if (!frame_scratch_buffer->block_parameters_holder.Reset(
           frame_header.rows4x4 + kMaxBlockHeight4x4,
-          frame_header.columns4x4 + kMaxBlockWidth4x4,
-          sequence_header.use_128x128_superblock)) {
+          frame_header.columns4x4 + kMaxBlockWidth4x4)) {
     return kStatusOutOfMemory;
   }
   const dsp::Dsp* const dsp =
@@ -1288,12 +1299,6 @@
     LIBGAV1_DLOG(ERROR, "tiles.reserve(%d) failed.\n", tile_count);
     return kStatusOutOfMemory;
   }
-  ThreadingStrategy& threading_strategy =
-      frame_scratch_buffer->threading_strategy;
-  if (!is_frame_parallel_ &&
-      !threading_strategy.Reset(frame_header, settings_.threads)) {
-    return kStatusOutOfMemory;
-  }
 
   if (threading_strategy.row_thread_pool(0) != nullptr || is_frame_parallel_) {
     if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
@@ -1318,43 +1323,36 @@
     }
   }
 
-  if (threading_strategy.post_filter_thread_pool() != nullptr &&
-      (do_cdef || do_restoration)) {
-    const int window_buffer_width = PostFilter::GetWindowBufferWidth(
-        threading_strategy.post_filter_thread_pool(), frame_header);
-    size_t threaded_window_buffer_size =
-        window_buffer_width *
-        PostFilter::GetWindowBufferHeight(
-            threading_strategy.post_filter_thread_pool(), frame_header) *
-        (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
-                                                    : sizeof(uint16_t));
-    if (do_cdef) {
-      // TODO(chengchen): for cdef U, V planes, if there's subsampling, we can
-      // use smaller buffer.
-      threaded_window_buffer_size *= num_planes;
-    }
-    // To avoid false sharing, PostFilter's window width in bytes should be a
-    // multiple of the cache line size. For simplicity, we check the window
-    // width in pixels.
-    assert(window_buffer_width % kCacheLineSize == 0);
-    if (!frame_scratch_buffer->threaded_window_buffer.Resize(
-            threaded_window_buffer_size)) {
-      LIBGAV1_DLOG(ERROR,
-                   "Failed to resize threaded loop restoration buffer.\n");
+  if (threading_strategy.post_filter_thread_pool() != nullptr && do_cdef) {
+    // We need to store 4 rows per 64x64 unit.
+    const int num_units =
+        MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4));
+    // subsampling_y is set to zero irrespective of the actual frame's
+    // subsampling since we need to store exactly |num_units| rows of the loop
+    // restoration border pixels.
+    if (!frame_scratch_buffer->cdef_border.Realloc(
+            sequence_header.color_config.bitdepth,
+            sequence_header.color_config.is_monochrome,
+            MultiplyBy4(frame_header.columns4x4), num_units,
+            sequence_header.color_config.subsampling_x,
+            /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+            kBorderPixels, nullptr, nullptr, nullptr)) {
       return kStatusOutOfMemory;
     }
   }
 
-  if (do_cdef && do_restoration) {
+  if (do_restoration &&
+      (do_cdef || threading_strategy.post_filter_thread_pool() != nullptr)) {
     // We need to store 4 rows per 64x64 unit.
-    const int num_deblock_units = MultiplyBy4(Ceil(frame_header.rows4x4, 16));
+    const int num_units =
+        MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4));
     // subsampling_y is set to zero irrespective of the actual frame's
-    // subsampling since we need to store exactly |num_deblock_units| rows of
-    // the deblocked pixels.
-    if (!frame_scratch_buffer->deblock_buffer.Realloc(
+    // subsampling since we need to store exactly |num_units| rows of the loop
+    // restoration border pixels.
+    if (!frame_scratch_buffer->loop_restoration_border.Realloc(
             sequence_header.color_config.bitdepth,
             sequence_header.color_config.is_monochrome,
-            frame_header.upscaled_width, num_deblock_units,
+            frame_header.upscaled_width, num_units,
             sequence_header.color_config.subsampling_x,
             /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
             kBorderPixels, nullptr, nullptr, nullptr)) {
@@ -1363,18 +1361,45 @@
   }
 
   if (do_superres) {
+    const int pixel_size = sequence_header.color_config.bitdepth == 8
+                               ? sizeof(uint8_t)
+                               : sizeof(uint16_t);
+    if (!frame_scratch_buffer->superres_coefficients[kPlaneTypeY].Resize(
+            kSuperResFilterTaps * Align(frame_header.upscaled_width, 16) *
+            pixel_size)) {
+      LIBGAV1_DLOG(ERROR,
+                   "Failed to Resize superres_coefficients[kPlaneTypeY].");
+      return kStatusOutOfMemory;
+    }
+    if (!sequence_header.color_config.is_monochrome &&
+        sequence_header.color_config.subsampling_x != 0 &&
+        !frame_scratch_buffer->superres_coefficients[kPlaneTypeUV].Resize(
+            kSuperResFilterTaps *
+            Align(SubsampledValue(frame_header.upscaled_width, 1), 16) *
+            pixel_size)) {
+      LIBGAV1_DLOG(ERROR,
+                   "Failed to Resize superres_coefficients[kPlaneTypeUV].");
+      return kStatusOutOfMemory;
+    }
+  }
+
+  if (do_superres && threading_strategy.post_filter_thread_pool() != nullptr) {
     const int num_threads =
-        1 + ((threading_strategy.post_filter_thread_pool() == nullptr)
-                 ? 0
-                 : threading_strategy.post_filter_thread_pool()->num_threads());
-    const size_t superres_line_buffer_size =
-        num_threads *
-        (MultiplyBy4(frame_header.columns4x4) +
-         MultiplyBy2(kSuperResHorizontalBorder) + kSuperResHorizontalPadding) *
-        (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
-                                                    : sizeof(uint16_t));
-    if (!frame_scratch_buffer->superres_line_buffer.Resize(
-            superres_line_buffer_size)) {
+        threading_strategy.post_filter_thread_pool()->num_threads() + 1;
+    // subsampling_y is set to zero irrespective of the actual frame's
+    // subsampling since we need to store exactly |num_threads| rows of the
+    // down-scaled pixels.
+    // Left and right borders are for line extension. They are doubled for the Y
+    // plane to make sure the U and V planes have enough space after possible
+    // subsampling.
+    if (!frame_scratch_buffer->superres_line_buffer.Realloc(
+            sequence_header.color_config.bitdepth,
+            sequence_header.color_config.is_monochrome,
+            MultiplyBy4(frame_header.columns4x4), num_threads,
+            sequence_header.color_config.subsampling_x,
+            /*subsampling_y=*/0, 2 * kSuperResHorizontalBorder,
+            2 * (kSuperResHorizontalBorder + kSuperResHorizontalPadding), 0, 0,
+            nullptr, nullptr, nullptr)) {
       LIBGAV1_DLOG(ERROR, "Failed to resize superres line buffer.\n");
       return kStatusOutOfMemory;
     }
@@ -1384,14 +1409,11 @@
                          current_frame->buffer(), dsp,
                          settings_.post_filter_mask);
 
-  if (is_frame_parallel_) {
+  if (is_frame_parallel_ && !IsIntraFrame(frame_header.frame_type)) {
     // We can parse the current frame if all the reference frames have been
     // parsed.
-    for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
-      if (!state.reference_valid[i] || state.reference_frame[i] == nullptr) {
-        continue;
-      }
-      if (!state.reference_frame[i]->WaitUntilParsed()) {
+    for (const int index : frame_header.reference_frame_index) {
+      if (!state.reference_frame[index]->WaitUntilParsed()) {
         return kStatusUnknownError;
       }
     }
@@ -1434,7 +1456,7 @@
     }
     IntraPredictionBuffer* const intra_prediction_buffers =
         frame_scratch_buffer->intra_prediction_buffers.get();
-    for (int plane = 0; plane < num_planes; ++plane) {
+    for (int plane = kPlaneY; plane < num_planes; ++plane) {
       const int subsampling =
           (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x;
       const size_t intra_prediction_buffer_size =
@@ -1462,9 +1484,9 @@
         tile_number, tile_buffers[tile_number].data,
         tile_buffers[tile_number].size, sequence_header, frame_header,
         current_frame, state, frame_scratch_buffer, wedge_masks_,
-        &saved_symbol_decoder_context, prev_segment_ids, &post_filter, dsp,
-        threading_strategy.row_thread_pool(tile_number), &pending_tiles,
-        is_frame_parallel_, use_intra_prediction_buffer);
+        quantizer_matrix_, &saved_symbol_decoder_context, prev_segment_ids,
+        &post_filter, dsp, threading_strategy.row_thread_pool(tile_number),
+        &pending_tiles, is_frame_parallel_, use_intra_prediction_buffer);
     if (tile == nullptr) {
       LIBGAV1_DLOG(ERROR, "Failed to create tile.");
       return kStatusOutOfMemory;
@@ -1626,4 +1648,27 @@
   return sequence_header_changed;
 }
 
+bool DecoderImpl::MaybeInitializeWedgeMasks(FrameType frame_type) {
+  if (IsIntraFrame(frame_type) || wedge_masks_initialized_) {
+    return true;
+  }
+  if (!GenerateWedgeMask(&wedge_masks_)) {
+    return false;
+  }
+  wedge_masks_initialized_ = true;
+  return true;
+}
+
+bool DecoderImpl::MaybeInitializeQuantizerMatrix(
+    const ObuFrameHeader& frame_header) {
+  if (quantizer_matrix_initialized_ || !frame_header.quantizer.use_matrix) {
+    return true;
+  }
+  if (!InitializeQuantizerMatrix(&quantizer_matrix_)) {
+    return false;
+  }
+  quantizer_matrix_initialized_ = true;
+  return true;
+}
+
 }  // namespace libgav1
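
For reference, the "4 rows per 64x64 unit" sizing used above for `cdef_border` and `loop_restoration_border` works out as follows for a hypothetical 1080-row frame, assuming `RightShiftWithCeiling(x, 4)` is a ceiling division by 16:

```c++
// Hypothetical 1920x1080 frame; the numbers only illustrate the formula above.
constexpr int rows4x4 = 1080 / 4;                     // 270 rows of 4x4 blocks
constexpr int num_units = 4 * ((rows4x4 + 15) / 16);  // 4 * 17 = 68 border rows
static_assert(num_units == 68, "68 rows of border pixels are allocated");
```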
diff --git a/libgav1/src/decoder_impl.h b/libgav1/src/decoder_impl.h
index df1b091..b52ecdf 100644
--- a/libgav1/src/decoder_impl.h
+++ b/libgav1/src/decoder_impl.h
@@ -32,6 +32,7 @@
 #include "src/gav1/decoder_settings.h"
 #include "src/gav1/status_code.h"
 #include "src/obu_parser.h"
+#include "src/quantizer.h"
 #include "src/residual_buffer_pool.h"
 #include "src/symbol_decoder_context.h"
 #include "src/tile.h"
@@ -57,7 +58,7 @@
         temporal_unit(nullptr),
         frame(frame),
         position_in_temporal_unit(position_in_temporal_unit) {
-    obu->MoveTileBuffer(&tile_buffers);
+    obu->MoveTileBuffers(&tile_buffers);
     frame->MarkFrameAsStarted();
   }
 
@@ -210,6 +211,14 @@
     return failure_status_ != kStatusOk;
   }
 
+  // Initializes the |quantizer_matrix_| if necessary and sets
+  // |quantizer_matrix_initialized_| to true.
+  bool MaybeInitializeQuantizerMatrix(const ObuFrameHeader& frame_header);
+
+  // Allocates and generates the |wedge_masks_| if necessary and sets
+  // |wedge_masks_initialized_| to true.
+  bool MaybeInitializeWedgeMasks(FrameType frame_type);
+
   // Elements in this queue cannot be moved with std::move since the
   // |EncodedFrame.temporal_unit| stores a pointer to elements in this queue.
   Queue<TemporalUnit> temporal_units_;
@@ -228,6 +237,9 @@
 
   BufferPool buffer_pool_;
   WedgeMaskArray wedge_masks_;
+  bool wedge_masks_initialized_ = false;
+  QuantizerMatrix quantizer_matrix_;
+  bool quantizer_matrix_initialized_ = false;
   FrameScratchBufferPool frame_scratch_buffer_pool_;
 
   // Used to synchronize the accesses into |temporal_units_| in order to update
diff --git a/libgav1/src/decoder_state.h b/libgav1/src/decoder_state.h
index 897c99f..ea5c792 100644
--- a/libgav1/src/decoder_state.h
+++ b/libgav1/src/decoder_state.h
@@ -33,7 +33,6 @@
     for (int ref_index = 0, mask = refresh_frame_flags; mask != 0;
          ++ref_index, mask >>= 1) {
       if ((mask & 1) != 0) {
-        reference_valid[ref_index] = true;
         reference_frame_id[ref_index] = current_frame_id;
         reference_frame[ref_index] = current_frame;
         reference_order_hint[ref_index] = order_hint;
@@ -43,7 +42,6 @@
 
   // Clears all the reference frames.
   void ClearReferenceFrames() {
-    reference_valid = {};
     reference_frame_id = {};
     reference_order_hint = {};
     for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) {
@@ -51,12 +49,11 @@
     }
   }
 
-  // reference_valid and reference_frame_id are used only if
-  // sequence_header_.frame_id_numbers_present is true.
-  // The reference_valid array is indexed by a reference picture slot number.
-  // A value (boolean) in the array signifies whether the corresponding
-  // reference picture slot is valid for use as a reference picture.
-  std::array<bool, kNumReferenceFrameTypes> reference_valid = {};
+  // reference_frame_id and current_frame_id have meaningful values and are used
+  // in checks only if sequence_header_.frame_id_numbers_present is true. If
+  // sequence_header_.frame_id_numbers_present is false, reference_frame_id and
+  // current_frame_id are assigned the default value 0 and are not used in
+  // checks.
   std::array<uint16_t, kNumReferenceFrameTypes> reference_frame_id = {};
   // A valid value of current_frame_id is an unsigned integer of at most 16
   // bits. -1 indicates current_frame_id is not initialized.
@@ -81,6 +78,11 @@
   // * |true| indicates that the reference frame is a backwards reference.
   // Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not used.
   std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {};
+  // The RefValid[i] variable in the spec does not need to be stored explicitly.
+  // If the RefValid[i] variable in the spec is 0, then reference_frame[i] is a
+  // null pointer. (Whenever the spec sets the RefValid[i] variable to 0, we set
+  // reference_frame[i] to a null pointer.) If the RefValid[i] variable in the
+  // spec is 1, then reference_frame[i] contains a frame buffer pointer.
   std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame;
 };
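
A small illustration of the convention documented above (sketch only; `IsReferenceValid` is a hypothetical helper, not part of the library):

```c++
#include "src/decoder_state.h"

// RefValid[i] == 1 in the spec corresponds exactly to reference_frame[i]
// holding a frame buffer; a null pointer means RefValid[i] == 0.
bool IsReferenceValid(const libgav1::DecoderState& state, int ref_index) {
  return state.reference_frame[ref_index] != nullptr;
}
```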
 
diff --git a/libgav1/src/dsp/arm/average_blend_neon.cc b/libgav1/src/dsp/arm/average_blend_neon.cc
index d946d70..5b4c094 100644
--- a/libgav1/src/dsp/arm/average_blend_neon.cc
+++ b/libgav1/src/dsp/arm/average_blend_neon.cc
@@ -35,6 +35,11 @@
 constexpr int kInterPostRoundBit =
     kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
 
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
 inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0,
                                   const int16_t* prediction_1) {
   const int16x8_t pred0 = vld1q_s16(prediction_0);
@@ -46,19 +51,24 @@
 inline void AverageBlendLargeRow(const int16_t* prediction_0,
                                  const int16_t* prediction_1, const int width,
                                  uint8_t* dest) {
-  int x = 0;
+  int x = width;
   do {
-    const int16x8_t pred_00 = vld1q_s16(&prediction_0[x]);
-    const int16x8_t pred_01 = vld1q_s16(&prediction_1[x]);
+    const int16x8_t pred_00 = vld1q_s16(prediction_0);
+    const int16x8_t pred_01 = vld1q_s16(prediction_1);
+    prediction_0 += 8;
+    prediction_1 += 8;
     const int16x8_t res0 = vaddq_s16(pred_00, pred_01);
     const uint8x8_t res_out0 = vqrshrun_n_s16(res0, kInterPostRoundBit + 1);
-    const int16x8_t pred_10 = vld1q_s16(&prediction_0[x + 8]);
-    const int16x8_t pred_11 = vld1q_s16(&prediction_1[x + 8]);
+    const int16x8_t pred_10 = vld1q_s16(prediction_0);
+    const int16x8_t pred_11 = vld1q_s16(prediction_1);
+    prediction_0 += 8;
+    prediction_1 += 8;
     const int16x8_t res1 = vaddq_s16(pred_10, pred_11);
     const uint8x8_t res_out1 = vqrshrun_n_s16(res1, kInterPostRoundBit + 1);
-    vst1q_u8(dest + x, vcombine_u8(res_out0, res_out1));
-    x += 16;
-  } while (x < width);
+    vst1q_u8(dest, vcombine_u8(res_out0, res_out1));
+    dest += 16;
+    x -= 16;
+  } while (x != 0);
 }
 
 void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
@@ -123,13 +133,139 @@
 }
 
 }  // namespace
+}  // namespace low_bitdepth
 
-void AverageBlendInit_NEON() { Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x8_t AverageBlend8Row(const uint16_t* prediction_0,
+                                   const uint16_t* prediction_1,
+                                   const int32x4_t compound_offset,
+                                   const uint16x8_t v_bitdepth) {
+  const uint16x8_t pred0 = vld1q_u16(prediction_0);
+  const uint16x8_t pred1 = vld1q_u16(prediction_1);
+  const uint32x4_t pred_lo =
+      vaddl_u16(vget_low_u16(pred0), vget_low_u16(pred1));
+  const uint32x4_t pred_hi =
+      vaddl_u16(vget_high_u16(pred0), vget_high_u16(pred1));
+  const int32x4_t offset_lo =
+      vsubq_s32(vreinterpretq_s32_u32(pred_lo), compound_offset);
+  const int32x4_t offset_hi =
+      vsubq_s32(vreinterpretq_s32_u32(pred_hi), compound_offset);
+  const uint16x4_t res_lo = vqrshrun_n_s32(offset_lo, kInterPostRoundBit + 1);
+  const uint16x4_t res_hi = vqrshrun_n_s32(offset_hi, kInterPostRoundBit + 1);
+  return vminq_u16(vcombine_u16(res_lo, res_hi), v_bitdepth);
+}
+
+inline void AverageBlendLargeRow(const uint16_t* prediction_0,
+                                 const uint16_t* prediction_1, const int width,
+                                 uint16_t* dest,
+                                 const int32x4_t compound_offset,
+                                 const uint16x8_t v_bitdepth) {
+  int x = width;
+  do {
+    vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+                                     compound_offset, v_bitdepth));
+    prediction_0 += 8;
+    prediction_1 += 8;
+    dest += 8;
+
+    vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+                                     compound_offset, v_bitdepth));
+    prediction_0 += 8;
+    prediction_1 += 8;
+    dest += 8;
+
+    x -= 16;
+  } while (x != 0);
+}
+
+void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
+                       const int width, const int height, void* const dest,
+                       const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y = height;
+
+  const ptrdiff_t dst_stride = dest_stride >> 1;
+  const int32x4_t compound_offset =
+      vdupq_n_s32(static_cast<int32_t>(kCompoundOffset + kCompoundOffset));
+  const uint16x8_t v_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+  if (width == 4) {
+    do {
+      const uint16x8_t result =
+          AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth);
+      pred_0 += 8;
+      pred_1 += 8;
+
+      vst1_u16(dst, vget_low_u16(result));
+      dst += dst_stride;
+      vst1_u16(dst, vget_high_u16(result));
+      dst += dst_stride;
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+
+  if (width == 8) {
+    do {
+      vst1q_u16(dst,
+                AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+      dst += dst_stride;
+      pred_0 += 8;
+      pred_1 += 8;
+
+      vst1q_u16(dst,
+                AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+      dst += dst_stride;
+      pred_0 += 8;
+      pred_1 += 8;
+
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+
+  do {
+    AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+                         v_bitdepth);
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+
+    AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+                         v_bitdepth);
+    dst += dst_stride;
+    pred_0 += width;
+    pred_1 += width;
+
+    y -= 2;
+  } while (y != 0);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->average_blend = AverageBlend_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void AverageBlendInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 
 namespace libgav1 {
 namespace dsp {
diff --git a/libgav1/src/dsp/arm/cdef_neon.cc b/libgav1/src/dsp/arm/cdef_neon.cc
index 968b0ff..60c72d6 100644
--- a/libgav1/src/dsp/arm/cdef_neon.cc
+++ b/libgav1/src/dsp/arm/cdef_neon.cc
@@ -265,7 +265,7 @@
   // 05 15 25 35 45 55 65 75  00 00 00 00 00 00 00 00
   // 06 16 26 36 46 56 66 76  00 00 00 00 00 00 00 00
   // 07 17 27 37 47 57 67 77  00 00 00 00 00 00 00 00
-  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), partial_lo[2], 0);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), vdupq_n_u16(0), 0);
   partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1);
   partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2);
   partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3);
@@ -285,9 +285,8 @@
   // 50 51 52 53 54 55 56 57  00 00 00 00 00 00 00 00
   // 60 61 62 63 64 65 66 67  00 00 00 00 00 00 00 00
   // 70 71 72 73 74 75 76 77  00 00 00 00 00 00 00 00
-  const uint8x8_t v_zero = vdup_n_u8(0);
-  partial_lo[6] = vaddl_u8(v_zero, v_src[0]);
-  for (int i = 1; i < 8; ++i) {
+  partial_lo[6] = vaddl_u8(v_src[0], v_src[1]);
+  for (int i = 2; i < 8; ++i) {
     partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]);
   }
 
@@ -360,7 +359,7 @@
 }
 
 void CdefDirection_NEON(const void* const source, ptrdiff_t stride,
-                        int* const direction, int* const variance) {
+                        uint8_t* const direction, int* const variance) {
   assert(direction != nullptr);
   assert(variance != nullptr);
   const auto* src = static_cast<const uint8_t*>(source);
@@ -451,7 +450,7 @@
 
 int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
                     const uint16x8_t threshold, const int16x8_t damping) {
-  // If reference > pixel, the difference will be negative, so covert to 0 or
+  // If reference > pixel, the difference will be negative, so convert to 0 or
   // -1.
   const uint16x8_t sign = vcgtq_u16(reference, pixel);
   const uint16x8_t abs_diff = vabdq_u16(pixel, reference);
@@ -686,7 +685,7 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {
 
diff --git a/libgav1/src/dsp/arm/common_neon.h b/libgav1/src/dsp/arm/common_neon.h
index e8367ab..05e0d05 100644
--- a/libgav1/src/dsp/arm/common_neon.h
+++ b/libgav1/src/dsp/arm/common_neon.h
@@ -28,8 +28,7 @@
 
 #if 0
 #include <cstdio>
-
-#include "absl/strings/str_cat.h"
+#include <string>
 
 constexpr bool kEnablePrintRegs = true;
 
@@ -86,11 +85,11 @@
 
 inline void PrintReg(const int32x4x2_t val, const std::string& name) {
   DebugRegisterQ r;
-  vst1q_u32(r.u32, val.val[0]);
-  const std::string name0 = absl::StrCat(name, ".val[0]").c_str();
+  vst1q_s32(r.i32, val.val[0]);
+  const std::string name0 = name + std::string(".val[0]");
   PrintVectQ(r, name0.c_str(), 32);
-  vst1q_u32(r.u32, val.val[1]);
-  const std::string name1 = absl::StrCat(name, ".val[1]").c_str();
+  vst1q_s32(r.i32, val.val[1]);
+  const std::string name1 = name + std::string(".val[1]");
   PrintVectQ(r, name1.c_str(), 32);
 }
 
@@ -169,14 +168,14 @@
 // Print an individual (non-vector) value in decimal format.
 inline void PrintReg(const int x, const char* name) {
   if (kEnablePrintRegs) {
-    printf("%s: %d\n", name, x);
+    fprintf(stderr, "%s: %d\n", name, x);
   }
 }
 
 // Print an individual (non-vector) value in hexadecimal format.
 inline void PrintHex(const int x, const char* name) {
   if (kEnablePrintRegs) {
-    printf("%s: %x\n", name, x);
+    fprintf(stderr, "%s: %x\n", name, x);
   }
 }
 
@@ -277,22 +276,32 @@
   ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
 }
 
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store4(void* const buf, const uint16x4_t val) {
+  vst1_u16(static_cast<uint16_t*>(buf), val);
+}
+
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store8(void* const buf, const uint16x8_t val) {
+  vst1q_u16(static_cast<uint16_t*>(buf), val);
+}
+
 //------------------------------------------------------------------------------
 // Bit manipulation.
 
 // vshXX_n_XX() requires an immediate.
 template <int shift>
-inline uint8x8_t LeftShift(const uint8x8_t vector) {
+inline uint8x8_t LeftShiftVector(const uint8x8_t vector) {
   return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
 }
 
 template <int shift>
-inline uint8x8_t RightShift(const uint8x8_t vector) {
+inline uint8x8_t RightShiftVector(const uint8x8_t vector) {
   return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
 }
 
 template <int shift>
-inline int8x8_t RightShift(const int8x8_t vector) {
+inline int8x8_t RightShiftVector(const int8x8_t vector) {
   return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift));
 }
 
@@ -387,6 +396,15 @@
 #endif  // defined(__aarch64__)
 }
 
+inline uint32_t SumVector(const uint32x2_t a) {
+#if defined(__aarch64__)
+  return vaddv_u32(a);
+#else
+  const uint64x1_t b = vpaddl_u32(a);
+  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif  // defined(__aarch64__)
+}
+
 inline uint32_t SumVector(const uint32x4_t a) {
 #if defined(__aarch64__)
   return vaddvq_u32(a);
@@ -447,6 +465,36 @@
 }
 
 // Input:
+// 00 01 02 03
+// 10 11 12 13
+// 20 21 22 23
+// 30 31 32 33
+inline void Transpose4x4(uint16x4_t a[4]) {
+  // b:
+  // 00 10 02 12
+  // 01 11 03 13
+  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
+  // c:
+  // 20 30 22 32
+  // 21 31 23 33
+  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
+  // d:
+  // 00 10 20 30
+  // 02 12 22 32
+  const uint32x2x2_t d =
+      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
+  // e:
+  // 01 11 21 31
+  // 03 13 23 33
+  const uint32x2x2_t e =
+      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
+  a[0] = vreinterpret_u16_u32(d.val[0]);
+  a[1] = vreinterpret_u16_u32(e.val[0]);
+  a[2] = vreinterpret_u16_u32(d.val[1]);
+  a[3] = vreinterpret_u16_u32(e.val[1]);
+}
+
+// Input:
 // a: 00 01 02 03 10 11 12 13
 // b: 20 21 22 23 30 31 32 33
 // Output:
@@ -587,6 +635,28 @@
   a[7] = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
 }
 
+inline void Transpose8x8(uint8x8_t in[8], uint8x16_t out[4]) {
+  const uint8x16x2_t a0 =
+      vtrnq_u8(vcombine_u8(in[0], in[4]), vcombine_u8(in[1], in[5]));
+  const uint8x16x2_t a1 =
+      vtrnq_u8(vcombine_u8(in[2], in[6]), vcombine_u8(in[3], in[7]));
+
+  const uint16x8x2_t b0 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[0]),
+                                    vreinterpretq_u16_u8(a1.val[0]));
+  const uint16x8x2_t b1 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[1]),
+                                    vreinterpretq_u16_u8(a1.val[1]));
+
+  const uint32x4x2_t c0 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[0]),
+                                    vreinterpretq_u32_u16(b1.val[0]));
+  const uint32x4x2_t c1 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[1]),
+                                    vreinterpretq_u32_u16(b1.val[1]));
+
+  out[0] = vreinterpretq_u8_u32(c0.val[0]);
+  out[1] = vreinterpretq_u8_u32(c1.val[0]);
+  out[2] = vreinterpretq_u8_u32(c0.val[1]);
+  out[3] = vreinterpretq_u8_u32(c1.val[1]);
+}
+
 // Input:
 // a[0]: 00 01 02 03 04 05 06 07
 // a[1]: 10 11 12 13 14 15 16 17
@@ -667,6 +737,83 @@
   a[7] = d3.val[1];
 }
 
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07  80 81 82 83 84 85 86 87
+// a[1]: 10 11 12 13 14 15 16 17  90 91 92 93 94 95 96 97
+// a[2]: 20 21 22 23 24 25 26 27  a0 a1 a2 a3 a4 a5 a6 a7
+// a[3]: 30 31 32 33 34 35 36 37  b0 b1 b2 b3 b4 b5 b6 b7
+// a[4]: 40 41 42 43 44 45 46 47  c0 c1 c2 c3 c4 c5 c6 c7
+// a[5]: 50 51 52 53 54 55 56 57  d0 d1 d2 d3 d4 d5 d6 d7
+// a[6]: 60 61 62 63 64 65 66 67  e0 e1 e2 e3 e4 e5 e6 e7
+// a[7]: 70 71 72 73 74 75 76 77  f0 f1 f2 f3 f4 f5 f6 f7
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70  80 90 a0 b0 c0 d0 e0 f0
+// a[1]: 01 11 21 31 41 51 61 71  81 91 a1 b1 c1 d1 e1 f1
+// a[2]: 02 12 22 32 42 52 62 72  82 92 a2 b2 c2 d2 e2 f2
+// a[3]: 03 13 23 33 43 53 63 73  83 93 a3 b3 c3 d3 e3 f3
+// a[4]: 04 14 24 34 44 54 64 74  84 94 a4 b4 c4 d4 e4 f4
+// a[5]: 05 15 25 35 45 55 65 75  85 95 a5 b5 c5 d5 e5 f5
+// a[6]: 06 16 26 36 46 56 66 76  86 96 a6 b6 c6 d6 e6 f6
+// a[7]: 07 17 27 37 47 57 67 77  87 97 a7 b7 c7 d7 e7 f7
+inline void Transpose8x16(uint8x16_t a[8]) {
+  // b0.val[0]: 00 10 02 12 04 14 06 16  80 90 82 92 84 94 86 96
+  // b0.val[1]: 01 11 03 13 05 15 07 17  81 91 83 93 85 95 87 97
+  // b1.val[0]: 20 30 22 32 24 34 26 36  a0 b0 a2 b2 a4 b4 a6 b6
+  // b1.val[1]: 21 31 23 33 25 35 27 37  a1 b1 a3 b3 a5 b5 a7 b7
+  // b2.val[0]: 40 50 42 52 44 54 46 56  c0 d0 c2 d2 c4 d4 c6 d6
+  // b2.val[1]: 41 51 43 53 45 55 47 57  c1 d1 c3 d3 c5 d5 c7 d7
+  // b3.val[0]: 60 70 62 72 64 74 66 76  e0 f0 e2 f2 e4 f4 e6 f6
+  // b3.val[1]: 61 71 63 73 65 75 67 77  e1 f1 e3 f3 e5 f5 e7 f7
+  const uint8x16x2_t b0 = vtrnq_u8(a[0], a[1]);
+  const uint8x16x2_t b1 = vtrnq_u8(a[2], a[3]);
+  const uint8x16x2_t b2 = vtrnq_u8(a[4], a[5]);
+  const uint8x16x2_t b3 = vtrnq_u8(a[6], a[7]);
+
+  // c0.val[0]: 00 10 20 30 04 14 24 34  80 90 a0 b0 84 94 a4 b4
+  // c0.val[1]: 02 12 22 32 06 16 26 36  82 92 a2 b2 86 96 a6 b6
+  // c1.val[0]: 01 11 21 31 05 15 25 35  81 91 a1 b1 85 95 a5 b5
+  // c1.val[1]: 03 13 23 33 07 17 27 37  83 93 a3 b3 87 97 a7 b7
+  // c2.val[0]: 40 50 60 70 44 54 64 74  c0 d0 e0 f0 c4 d4 e4 f4
+  // c2.val[1]: 42 52 62 72 46 56 66 76  c2 d2 e2 f2 c6 d6 e6 f6
+  // c3.val[0]: 41 51 61 71 45 55 65 75  c1 d1 e1 f1 c5 d5 e5 f5
+  // c3.val[1]: 43 53 63 73 47 57 67 77  c3 d3 e3 f3 c7 d7 e7 f7
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
+                                    vreinterpretq_u16_u8(b3.val[0]));
+  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
+                                    vreinterpretq_u16_u8(b3.val[1]));
+
+  // d0.val[0]: 00 10 20 30 40 50 60 70  80 90 a0 b0 c0 d0 e0 f0
+  // d0.val[1]: 04 14 24 34 44 54 64 74  84 94 a4 b4 c4 d4 e4 f4
+  // d1.val[0]: 01 11 21 31 41 51 61 71  81 91 a1 b1 c1 d1 e1 f1
+  // d1.val[1]: 05 15 25 35 45 55 65 75  85 95 a5 b5 c5 d5 e5 f5
+  // d2.val[0]: 02 12 22 32 42 52 62 72  82 92 a2 b2 c2 d2 e2 f2
+  // d2.val[1]: 06 16 26 36 46 56 66 76  86 96 a6 b6 c6 d6 e6 f6
+  // d3.val[0]: 03 13 23 33 43 53 63 73  83 93 a3 b3 c3 d3 e3 f3
+  // d3.val[1]: 07 17 27 37 47 57 67 77  87 97 a7 b7 c7 d7 e7 f7
+  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+                                    vreinterpretq_u32_u16(c2.val[0]));
+  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
+                                    vreinterpretq_u32_u16(c3.val[0]));
+  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+                                    vreinterpretq_u32_u16(c2.val[1]));
+  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
+                                    vreinterpretq_u32_u16(c3.val[1]));
+
+  a[0] = vreinterpretq_u8_u32(d0.val[0]);
+  a[1] = vreinterpretq_u8_u32(d1.val[0]);
+  a[2] = vreinterpretq_u8_u32(d2.val[0]);
+  a[3] = vreinterpretq_u8_u32(d3.val[0]);
+  a[4] = vreinterpretq_u8_u32(d0.val[1]);
+  a[5] = vreinterpretq_u8_u32(d1.val[1]);
+  a[6] = vreinterpretq_u8_u32(d2.val[1]);
+  a[7] = vreinterpretq_u8_u32(d3.val[1]);
+}
+
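As the lane comments indicate, each input register carries two rows (k in the low half, k + 8 in the high half) of a 16x8 block, and each output register ends up holding one column across all sixteen rows. A scalar sketch of that 16x8 -> 8x16 transpose, illustrative only:

#include <cstdint>

// Row r, column c of the input becomes row c, column r of the output.
inline void Transpose16x8Scalar(const uint8_t in[16][8], uint8_t out[8][16]) {
  for (int r = 0; r < 16; ++r) {
    for (int c = 0; c < 8; ++c) {
      out[c][r] = in[r][c];
    }
  }
}
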
 inline int16x8_t ZeroExtend(const uint8x8_t in) {
   return vreinterpretq_s16_u16(vmovl_u8(in));
 }
diff --git a/libgav1/src/dsp/arm/convolve_neon.cc b/libgav1/src/dsp/arm/convolve_neon.cc
index 2c2557f..331bfe2 100644
--- a/libgav1/src/dsp/arm/convolve_neon.cc
+++ b/libgav1/src/dsp/arm/convolve_neon.cc
@@ -35,9 +35,8 @@
 namespace low_bitdepth {
 namespace {
 
-constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
-constexpr int kHorizontalOffset = 3;
-constexpr int kFilterIndexShift = 6;
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/convolve.inc"
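The constants removed above now come from the shared convolve.inc include. Their definitions are not shown in this patch, so the following is only a reconstruction from the deleted lines:

// Presumed contents of src/dsp/convolve.inc as used by this file; an
// assumption based on the constants removed above, not verified.
constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
constexpr int kHorizontalOffset = 3;
constexpr int kFilterIndexShift = 6;
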
 
 // Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
 // sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
@@ -102,245 +101,278 @@
   return vreinterpretq_s16_u16(sum);
 }
 
-template <int filter_index, bool negative_outside_taps>
-int16x8_t SumHorizontalTaps(const uint8_t* const src,
-                            const uint8x8_t* const v_tap) {
-  uint8x8_t v_src[8];
-  const uint8x16_t src_long = vld1q_u8(src);
-  int16x8_t sum;
-
-  if (filter_index < 2) {
-    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
-    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
-    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
-    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
-    v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
-    v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
-    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1);
-  } else if (filter_index == 2) {
-    v_src[0] = vget_low_u8(src_long);
-    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
-    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
-    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
-    v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
-    v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
-    v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
-    v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
-    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
-  } else if (filter_index == 3) {
-    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
-    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
-    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3);
-  } else if (filter_index > 3) {
-    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
-    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
-    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
-    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
-    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2);
-  }
-  return sum;
-}
-
-template <int filter_index, bool negative_outside_taps>
-uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
-                               const uint8x8_t* const v_tap) {
-  int16x8_t sum =
-      SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
-
-  // Normally the Horizontal pass does the downshift in two passes:
-  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
-  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
-  // requires adding the rounding offset from the skipped shift.
-  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
-  sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
-  return vqrshrun_n_s16(sum, kFilterBits - 1);
-}
-
-template <int filter_index, bool negative_outside_taps>
-uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
-                               const uint8x8_t* const v_tap) {
-  const int16x8_t sum =
-      SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
-
-  return vreinterpretq_u16_s16(
-      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
-}
-
-template <int filter_index>
-int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
-                               const uint8x8_t* const v_tap) {
-  uint16x8_t sum;
-  const uint8x8_t input0 = vld1_u8(src);
-  src += src_stride;
-  const uint8x8_t input1 = vld1_u8(src);
-  uint8x8x2_t input = vzip_u8(input0, input1);
-
-  if (filter_index == 3) {
-    // tap signs : + +
-    sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
-    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
-  } else if (filter_index == 4) {
-    // tap signs : - + + -
-    sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
-    sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
-    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
-    sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
-  } else {
-    // tap signs : + + + +
-    sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
-    sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
-    sum = vmlal_u8(sum, input.val[1], v_tap[4]);
-    sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
-  }
-
-  return vreinterpretq_s16_u16(sum);
-}
-
-template <int filter_index>
-uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
-                                  const ptrdiff_t src_stride,
-                                  const uint8x8_t* const v_tap) {
-  int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
-  // Normally the Horizontal pass does the downshift in two passes:
-  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
-  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
-  // requires adding the rounding offset from the skipped shift.
-  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
-  sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
-  return vqrshrun_n_s16(sum, kFilterBits - 1);
-}
-
-template <int filter_index>
-uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
-                                   const ptrdiff_t src_stride,
-                                   const uint8x8_t* const v_tap) {
-  const int16x8_t sum =
-      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
-  return vreinterpretq_u16_s16(
-      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
-}
-
-template <int num_taps, int step, int filter_index,
-          bool negative_outside_taps = true, bool is_2d = false,
-          bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
-                      void* const dest, const ptrdiff_t pred_stride,
-                      const int width, const int height,
-                      const uint8x8_t* const v_tap) {
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+          bool is_compound>
+void FilterHorizontalWidth8AndUp(const uint8_t* src, const ptrdiff_t src_stride,
+                                 void* const dest, const ptrdiff_t pred_stride,
+                                 const int width, const int height,
+                                 const uint8x8_t* const v_tap) {
   auto* dest8 = static_cast<uint8_t*>(dest);
   auto* dest16 = static_cast<uint16_t*>(dest);
-
-  // 4 tap filters are never used when width > 4.
-  if (num_taps != 4 && width > 4) {
-    int y = 0;
+  if (!is_2d) {
+    int y = height;
     do {
       int x = 0;
-      do {
-        if (is_2d || is_compound) {
-          const uint16x8_t v_sum =
-              HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x],
-                                                                       v_tap);
+      do {  // Increasing loop counter x is better.
+        const uint8x16_t src_long = vld1q_u8(src + x);
+        uint8x8_t v_src[8];
+        int16x8_t sum;
+        if (filter_index < 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+                                                                    v_tap + 1);
+        } else if (filter_index == 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+          v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+        } else if (filter_index == 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+        } else if (filter_index > 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+        }
+        if (is_compound) {
+          const uint16x8_t v_sum = vreinterpretq_u16_s16(
+              vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
           vst1q_u16(&dest16[x], v_sum);
         } else {
-          const uint8x8_t result =
-              SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x],
-                                                                        v_tap);
+          // Normally the Horizontal pass does the downshift in two passes:
+          // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+          // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+          // Combining them requires adding the rounding offset from the skipped
+          // shift.
+          constexpr int first_shift_rounding_bit =
+              1 << (kInterRoundBitsHorizontal - 2);
+          sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+          const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
           vst1_u8(&dest8[x], result);
         }
-        x += step;
+        x += 8;
       } while (x < width);
       src += src_stride;
       dest8 += pred_stride;
       dest16 += pred_stride;
-    } while (++y < height);
+    } while (--y != 0);
+  } else {
+    int x = 0;
+    do {
+      const uint8_t* s = src + x;
+      int y = height;
+      do {  // x is advanced by the outer loop.
+        const uint8x16_t src_long = vld1q_u8(s);
+        uint8x8_t v_src[8];
+        int16x8_t sum;
+        if (filter_index < 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+                                                                    v_tap + 1);
+        } else if (filter_index == 2) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+          v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+          v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+          v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+        } else if (filter_index == 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+        } else if (filter_index > 3) {
+          v_src[0] = vget_low_u8(src_long);
+          v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+          v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+          v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+          sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+        }
+        const uint16x8_t v_sum = vreinterpretq_u16_s16(
+            vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+        vst1q_u16(dest16, v_sum);
+        s += src_stride;
+        dest16 += 8;
+      } while (--y != 0);
+      x += 8;
+    } while (x < width);
+  }
+}
+
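The non-compound store above folds the usual two-stage rounding into a single vqrshrun_n_s16 by pre-adding the first stage's rounding offset, exactly as the in-code comment describes. A standalone check of that equivalence, assuming the 8-bit-path constants kFilterBits == 7 and kInterRoundBitsHorizontal == 3 (these values are an assumption here, taken from libgav1's shared constants):

#include <cassert>

int main() {
  constexpr int kFilterBits = 7;
  constexpr int kInterRoundBitsHorizontal = 3;
  for (int sum = 0; sum < (1 << 14); ++sum) {
    // Two rounding shifts: kInterRoundBitsHorizontal - 1 first, then
    // kFilterBits - kInterRoundBitsHorizontal (the taps are pre-shifted by 1,
    // so the total shift is kFilterBits - 1).
    const int stage1 = (sum + (1 << (kInterRoundBitsHorizontal - 2))) >>
                       (kInterRoundBitsHorizontal - 1);
    const int two_stage =
        (stage1 + (1 << (kFilterBits - kInterRoundBitsHorizontal - 1))) >>
        (kFilterBits - kInterRoundBitsHorizontal);
    // Single rounding shift by kFilterBits - 1 after adding the skipped
    // first-stage offset, matching the vector code.
    const int combined = (sum + (1 << (kInterRoundBitsHorizontal - 2)) +
                          (1 << (kFilterBits - 2))) >>
                         (kFilterBits - 1);
    assert(two_stage == combined);
  }
  return 0;
}
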
+template <int filter_index, bool is_2d, bool is_compound>
+void FilterHorizontalWidth4(const uint8_t* src, const ptrdiff_t src_stride,
+                            void* const dest, const ptrdiff_t pred_stride,
+                            const int height, const uint8x8_t* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  int y = height;
+  do {
+    uint8x8_t v_src[4];
+    int16x8_t sum;
+    v_src[0] = vld1_u8(src);
+    if (filter_index == 3) {
+      v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+      sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+    } else {
+      v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+      v_src[2] = RightShiftVector<2 * 8>(v_src[0]);
+      v_src[3] = RightShiftVector<3 * 8>(v_src[0]);
+      sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+    }
+    if (is_2d || is_compound) {
+      const uint16x4_t v_sum = vreinterpret_u16_s16(
+          vrshr_n_s16(vget_low_s16(sum), kInterRoundBitsHorizontal - 1));
+      vst1_u16(dest16, v_sum);
+    } else {
+      constexpr int first_shift_rounding_bit =
+          1 << (kInterRoundBitsHorizontal - 2);
+      sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+      const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
+      StoreLo4(&dest8[0], result);
+    }
+    src += src_stride;
+    dest8 += pred_stride;
+    dest16 += pred_stride;
+  } while (--y != 0);
+}
+
+template <int filter_index, bool is_2d>
+void FilterHorizontalWidth2(const uint8_t* src, const ptrdiff_t src_stride,
+                            void* const dest, const ptrdiff_t pred_stride,
+                            const int height, const uint8x8_t* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+  int y = height >> 1;
+  do {
+    const uint8x8_t input0 = vld1_u8(src);
+    const uint8x8_t input1 = vld1_u8(src + src_stride);
+    const uint8x8x2_t input = vzip_u8(input0, input1);
+    uint16x8_t sum;
+    if (filter_index == 3) {
+      // tap signs : + +
+      sum = vmull_u8(input.val[0], v_tap[3]);
+      sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 2), v_tap[4]);
+    } else if (filter_index == 4) {
+      // tap signs : - + + -
+      sum = vmull_u8(RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+      sum = vmlsl_u8(sum, input.val[0], v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+      sum = vmlsl_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+    } else {
+      // tap signs : + + + +
+      sum = vmull_u8(input.val[0], v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+      sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+      sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+    }
+    int16x8_t s = vreinterpretq_s16_u16(sum);
+    if (is_2d) {
+      const uint16x8_t v_sum =
+          vreinterpretq_u16_s16(vrshrq_n_s16(s, kInterRoundBitsHorizontal - 1));
+      dest16[0] = vgetq_lane_u16(v_sum, 0);
+      dest16[1] = vgetq_lane_u16(v_sum, 2);
+      dest16 += pred_stride;
+      dest16[0] = vgetq_lane_u16(v_sum, 1);
+      dest16[1] = vgetq_lane_u16(v_sum, 3);
+      dest16 += pred_stride;
+    } else {
+      // Normally the Horizontal pass does the downshift in two passes:
+      // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+      // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+      // Combining them requires adding the rounding offset from the skipped
+      // shift.
+      constexpr int first_shift_rounding_bit =
+          1 << (kInterRoundBitsHorizontal - 2);
+      s = vaddq_s16(s, vdupq_n_s16(first_shift_rounding_bit));
+      const uint8x8_t result = vqrshrun_n_s16(s, kFilterBits - 1);
+      dest8[0] = vget_lane_u8(result, 0);
+      dest8[1] = vget_lane_u8(result, 2);
+      dest8 += pred_stride;
+      dest8[0] = vget_lane_u8(result, 1);
+      dest8[1] = vget_lane_u8(result, 3);
+      dest8 += pred_stride;
+    }
+    src += src_stride << 1;
+  } while (--y != 0);
+
+  // The 2d filters have an odd |height| because the horizontal pass
+  // generates context for the vertical pass.
+  if (is_2d) {
+    assert(height % 2 == 1);
+    const uint8x8_t input = vld1_u8(src);
+    uint16x8_t sum;
+    if (filter_index == 3) {
+      sum = vmull_u8(input, v_tap[3]);
+      sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[4]);
+    } else if (filter_index == 4) {
+      sum = vmull_u8(RightShiftVector<1 * 8>(input), v_tap[3]);
+      sum = vmlsl_u8(sum, input, v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+      sum = vmlsl_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+    } else {
+      assert(filter_index == 5);
+      sum = vmull_u8(input, v_tap[2]);
+      sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[3]);
+      sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+      sum = vmlal_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+    }
+    // |sum| contains an int16_t value.
+    sum = vreinterpretq_u16_s16(vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+                                             kInterRoundBitsHorizontal - 1));
+    Store2<0>(dest16, sum);
+  }
+}
+
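A note on the 2-wide path above: vzip_u8 interleaves the two rows so that even lanes come from the first row and odd lanes from the second, which is why the stores pick lanes {0, 2} for one output row and {1, 3} for the next. Schematically:

// After vzip_u8(input0, input1):
//   lane:        0      1      2      3      4      5      6      7
//   val[0]:  r0[0]  r1[0]  r0[1]  r1[1]  r0[2]  r1[2]  r0[3]  r1[3]
// RightShiftVector<2 * 8> therefore advances both rows by one source pixel at
// once, so each filtered lane is a finished output pixel.
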
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+          bool is_compound>
+void FilterHorizontal(const uint8_t* const src, const ptrdiff_t src_stride,
+                      void* const dest, const ptrdiff_t pred_stride,
+                      const int width, const int height,
+                      const uint8x8_t* const v_tap) {
+  assert(width < 8 || filter_index <= 3);
+  // Don't simplify the redundant if conditions with the template parameters;
+  // leaving them in helps the compiler generate compact code.
+  if (width >= 8 && filter_index <= 3) {
+    FilterHorizontalWidth8AndUp<filter_index, negative_outside_taps, is_2d,
+                                is_compound>(src, src_stride, dest, pred_stride,
+                                             width, height, v_tap);
     return;
   }
 
-  // Horizontal passes only needs to account for |num_taps| 2 and 4 when
+  // Horizontal passes only need to account for 2 and 4 tap filters when
   // |width| <= 4.
   assert(width <= 4);
-  assert(num_taps <= 4);
-  if (num_taps <= 4) {
+  assert(filter_index >= 3 && filter_index <= 5);
+  if (filter_index >= 3 && filter_index <= 5) {
     if (width == 4) {
-      int y = 0;
-      do {
-        if (is_2d || is_compound) {
-          const uint16x8_t v_sum =
-              HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
-                                                                       v_tap);
-          vst1_u16(dest16, vget_low_u16(v_sum));
-        } else {
-          const uint8x8_t result =
-              SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
-                                                                        v_tap);
-          StoreLo4(&dest8[0], result);
-        }
-        src += src_stride;
-        dest8 += pred_stride;
-        dest16 += pred_stride;
-      } while (++y < height);
+      FilterHorizontalWidth4<filter_index, is_2d, is_compound>(
+          src, src_stride, dest, pred_stride, height, v_tap);
       return;
     }
-
+    assert(width == 2);
     if (!is_compound) {
-      int y = 0;
-      do {
-        if (is_2d) {
-          const uint16x8_t sum =
-              HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
-          dest16[0] = vgetq_lane_u16(sum, 0);
-          dest16[1] = vgetq_lane_u16(sum, 2);
-          dest16 += pred_stride;
-          dest16[0] = vgetq_lane_u16(sum, 1);
-          dest16[1] = vgetq_lane_u16(sum, 3);
-          dest16 += pred_stride;
-        } else {
-          const uint8x8_t sum =
-              SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
-          dest8[0] = vget_lane_u8(sum, 0);
-          dest8[1] = vget_lane_u8(sum, 2);
-          dest8 += pred_stride;
-
-          dest8[0] = vget_lane_u8(sum, 1);
-          dest8[1] = vget_lane_u8(sum, 3);
-          dest8 += pred_stride;
-        }
-
-        src += src_stride << 1;
-        y += 2;
-      } while (y < height - 1);
-
-      // The 2d filters have an odd |height| because the horizontal pass
-      // generates context for the vertical pass.
-      if (is_2d) {
-        assert(height % 2 == 1);
-        uint16x8_t sum;
-        const uint8x8_t input = vld1_u8(src);
-        if (filter_index == 3) {  // |num_taps| == 2
-          sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
-          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
-        } else if (filter_index == 4) {
-          sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
-          sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
-          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
-          sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
-        } else {
-          assert(filter_index == 5);
-          sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
-          sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
-          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
-          sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
-        }
-        // |sum| contains an int16_t value.
-        sum = vreinterpretq_u16_s16(vrshrq_n_s16(
-            vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
-        Store2<0>(dest16, sum);
-      }
+      FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
+                                                  pred_stride, height, v_tap);
     }
   }
 }
@@ -452,78 +484,85 @@
 }
 
 template <int num_taps, bool is_compound = false>
-void Filter2DVertical(const uint16_t* src, void* const dst,
-                      const ptrdiff_t dst_stride, const int width,
-                      const int height, const int16x8_t taps) {
+void Filter2DVerticalWidth8AndUp(const uint16_t* src, void* const dst,
+                                 const ptrdiff_t dst_stride, const int width,
+                                 const int height, const int16x8_t taps) {
   assert(width >= 8);
   constexpr int next_row = num_taps - 1;
-  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
-  const ptrdiff_t src_stride = width;
-
-  auto* dst8 = static_cast<uint8_t*>(dst);
-  auto* dst16 = static_cast<uint16_t*>(dst);
+  auto* const dst8 = static_cast<uint8_t*>(dst);
+  auto* const dst16 = static_cast<uint16_t*>(dst);
 
   int x = 0;
   do {
-    int16x8_t srcs[8];
-    const uint16_t* src_x = src + x;
-    srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-    src_x += src_stride;
+    int16x8_t srcs[9];
+    srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
     if (num_taps >= 4) {
-      srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-      src_x += src_stride;
-      srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-      src_x += src_stride;
+      srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
       if (num_taps >= 6) {
-        srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-        src_x += src_stride;
-        srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-        src_x += src_stride;
+        srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src));
+        src += 8;
+        srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+        src += 8;
         if (num_taps == 8) {
-          srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-          src_x += src_stride;
-          srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-          src_x += src_stride;
+          srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src));
+          src += 8;
+          srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+          src += 8;
         }
       }
     }
 
-    int y = 0;
+    uint8_t* d8 = dst8 + x;
+    uint16_t* d16 = dst16 + x;
+    int y = height;
     do {
-      srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
-      src_x += src_stride;
-
-      const int16x8_t sum =
-          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+      srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      srcs[next_row + 1] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      const int16x8_t sum0 =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps);
+      const int16x8_t sum1 =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps);
       if (is_compound) {
-        vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
+        vst1q_u16(d16, vreinterpretq_u16_s16(sum0));
+        d16 += dst_stride;
+        vst1q_u16(d16, vreinterpretq_u16_s16(sum1));
+        d16 += dst_stride;
       } else {
-        vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
+        vst1_u8(d8, vqmovun_s16(sum0));
+        d8 += dst_stride;
+        vst1_u8(d8, vqmovun_s16(sum1));
+        d8 += dst_stride;
       }
-
-      srcs[0] = srcs[1];
+      srcs[0] = srcs[2];
       if (num_taps >= 4) {
-        srcs[1] = srcs[2];
-        srcs[2] = srcs[3];
+        srcs[1] = srcs[3];
+        srcs[2] = srcs[4];
         if (num_taps >= 6) {
-          srcs[3] = srcs[4];
-          srcs[4] = srcs[5];
+          srcs[3] = srcs[5];
+          srcs[4] = srcs[6];
           if (num_taps == 8) {
-            srcs[5] = srcs[6];
-            srcs[6] = srcs[7];
+            srcs[5] = srcs[7];
+            srcs[6] = srcs[8];
           }
         }
       }
-    } while (++y < height);
+      y -= 2;
+    } while (y != 0);
     x += 8;
   } while (x < width);
 }
 
 // Take advantage of |src_stride| == |width| to process two rows at a time.
 template <int num_taps, bool is_compound = false>
-void Filter2DVertical4xH(const uint16_t* src, void* const dst,
-                         const ptrdiff_t dst_stride, const int height,
-                         const int16x8_t taps) {
+void Filter2DVerticalWidth4(const uint16_t* src, void* const dst,
+                            const ptrdiff_t dst_stride, const int height,
+                            const int16x8_t taps) {
   auto* dst8 = static_cast<uint8_t*>(dst);
   auto* dst16 = static_cast<uint16_t*>(dst);
 
@@ -546,7 +585,7 @@
     }
   }
 
-  int y = 0;
+  int y = height;
   do {
     srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
     src += 8;
@@ -581,15 +620,15 @@
         }
       }
     }
-    y += 2;
-  } while (y < height);
+    y -= 2;
+  } while (y != 0);
 }
 
 // Take advantage of |src_stride| == |width| to process four rows at a time.
 template <int num_taps>
-void Filter2DVertical2xH(const uint16_t* src, void* const dst,
-                         const ptrdiff_t dst_stride, const int height,
-                         const int16x8_t taps) {
+void Filter2DVerticalWidth2(const uint16_t* src, void* const dst,
+                            const ptrdiff_t dst_stride, const int height,
+                            const int16x8_t taps) {
   constexpr int next_row = (num_taps < 6) ? 4 : 8;
 
   auto* dst8 = static_cast<uint8_t*>(dst);
@@ -662,11 +701,10 @@
 LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
     const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
     const ptrdiff_t dst_stride, const int width, const int height,
-    const int subpixel, const int filter_index) {
+    const int filter_id, const int filter_index) {
   // Duplicate the absolute value for each tap.  Negative taps are corrected
   // by using the vmlsl_u8 instruction.  Positive taps use vmlal_u8.
   uint8x8_t v_tap[kSubPixelTaps];
-  const int filter_id = (subpixel >> 6) & kSubPixelMask;
   assert(filter_id != 0);
 
   for (int k = 0; k < kSubPixelTaps; ++k) {
@@ -674,67 +712,58 @@
   }
 
   if (filter_index == 2) {  // 8 tap.
-    FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
+    FilterHorizontal<2, true, is_2d, is_compound>(
         src, src_stride, dst, dst_stride, width, height, v_tap);
   } else if (filter_index == 1) {  // 6 tap.
     // Check if outside taps are positive.
     if ((filter_id == 1) | (filter_id == 15)) {
-      FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
-          src, src_stride, dst, dst_stride, width, height, v_tap);
+      FilterHorizontal<1, false, is_2d, is_compound>(
+          src + 1, src_stride, dst, dst_stride, width, height, v_tap);
     } else {
-      FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
-          src, src_stride, dst, dst_stride, width, height, v_tap);
+      FilterHorizontal<1, true, is_2d, is_compound>(
+          src + 1, src_stride, dst, dst_stride, width, height, v_tap);
     }
   } else if (filter_index == 0) {  // 6 tap.
-    FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
-        src, src_stride, dst, dst_stride, width, height, v_tap);
+    FilterHorizontal<0, true, is_2d, is_compound>(
+        src + 1, src_stride, dst, dst_stride, width, height, v_tap);
   } else if (filter_index == 4) {  // 4 tap.
-    FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
-        src, src_stride, dst, dst_stride, width, height, v_tap);
+    FilterHorizontal<4, true, is_2d, is_compound>(
+        src + 2, src_stride, dst, dst_stride, width, height, v_tap);
   } else if (filter_index == 5) {  // 4 tap.
-    FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
-        src, src_stride, dst, dst_stride, width, height, v_tap);
+    FilterHorizontal<5, true, is_2d, is_compound>(
+        src + 2, src_stride, dst, dst_stride, width, height, v_tap);
   } else {  // 2 tap.
-    FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
-        src, src_stride, dst, dst_stride, width, height, v_tap);
+    FilterHorizontal<3, true, is_2d, is_compound>(
+        src + 3, src_stride, dst, dst_stride, width, height, v_tap);
   }
 }
 
-int GetNumTapsInFilter(const int filter_index) {
-  if (filter_index < 2) {
-    // Despite the names these only use 6 taps.
-    // kInterpolationFilterEightTap
-    // kInterpolationFilterEightTapSmooth
-    return 6;
+template <int vertical_taps>
+void Filter2DVertical(const uint16_t* const intermediate_result,
+                      const int width, const int height, const int16x8_t taps,
+                      void* const prediction, const ptrdiff_t pred_stride) {
+  auto* const dest = static_cast<uint8_t*>(prediction);
+  if (width >= 8) {
+    Filter2DVerticalWidth8AndUp<vertical_taps>(
+        intermediate_result, dest, pred_stride, width, height, taps);
+  } else if (width == 4) {
+    Filter2DVerticalWidth4<vertical_taps>(intermediate_result, dest,
+                                          pred_stride, height, taps);
+  } else {
+    assert(width == 2);
+    Filter2DVerticalWidth2<vertical_taps>(intermediate_result, dest,
+                                          pred_stride, height, taps);
   }
-
-  if (filter_index == 2) {
-    // kInterpolationFilterEightTapSharp
-    return 8;
-  }
-
-  if (filter_index == 3) {
-    // kInterpolationFilterBilinear
-    return 2;
-  }
-
-  assert(filter_index > 3);
-  // For small sizes (width/height <= 4) the large filters are replaced with 4
-  // tap options.
-  // If the original filters were |kInterpolationFilterEightTap| or
-  // |kInterpolationFilterEightTapSharp| then it becomes
-  // |kInterpolationFilterSwitchable|.
-  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
-  // tap filter.
-  return 4;
 }
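GetNumTapsInFilter() is still called above even though its body is deleted here; presumably it now comes from the shared convolve.inc. For reference, the mapping the removed code implemented was:

// Reference reconstruction of the removed GetNumTapsInFilter() logic.
inline int GetNumTapsInFilterReference(const int filter_index) {
  if (filter_index < 2) return 6;   // Both "8 tap" variants use only 6 taps.
  if (filter_index == 2) return 8;  // kInterpolationFilterEightTapSharp.
  if (filter_index == 3) return 2;  // kInterpolationFilterBilinear.
  return 4;  // 4 tap replacements used for width/height <= 4.
}
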
 
 void Convolve2D_NEON(const void* const reference,
                      const ptrdiff_t reference_stride,
                      const int horizontal_filter_index,
-                     const int vertical_filter_index, const int subpixel_x,
-                     const int subpixel_y, const int width, const int height,
-                     void* prediction, const ptrdiff_t pred_stride) {
+                     const int vertical_filter_index,
+                     const int horizontal_filter_id,
+                     const int vertical_filter_id, const int width,
+                     const int height, void* const prediction,
+                     const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
@@ -744,68 +773,31 @@
       intermediate_result[kMaxSuperBlockSizeInPixels *
                           (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
   const int intermediate_height = height + vertical_taps - 1;
-
   const ptrdiff_t src_stride = reference_stride;
-  const auto* src = static_cast<const uint8_t*>(reference) -
-                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+  const auto* const src = static_cast<const uint8_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
 
   DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
-                                   width, intermediate_height, subpixel_x,
-                                   horiz_filter_index);
+                                   width, intermediate_height,
+                                   horizontal_filter_id, horiz_filter_index);
 
   // Vertical filter.
-  auto* dest = static_cast<uint8_t*>(prediction);
-  const ptrdiff_t dest_stride = pred_stride;
-  const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
-  assert(filter_id != 0);
-
-  const int16x8_t taps =
-      vmovl_s8(vld1_s8(kHalfSubPixelFilters[vert_filter_index][filter_id]));
-
+  assert(vertical_filter_id != 0);
+  const int16x8_t taps = vmovl_s8(
+      vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
   if (vertical_taps == 8) {
-    if (width == 2) {
-      Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else if (width == 4) {
-      Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else {
-      Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
-                          taps);
-    }
+    Filter2DVertical<8>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
   } else if (vertical_taps == 6) {
-    if (width == 2) {
-      Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else if (width == 4) {
-      Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else {
-      Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
-                          taps);
-    }
+    Filter2DVertical<6>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
   } else if (vertical_taps == 4) {
-    if (width == 2) {
-      Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else if (width == 4) {
-      Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else {
-      Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
-                          taps);
-    }
+    Filter2DVertical<4>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
   } else {  // |vertical_taps| == 2
-    if (width == 2) {
-      Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else if (width == 4) {
-      Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
-                             taps);
-    } else {
-      Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
-                          taps);
-    }
+    Filter2DVertical<2>(intermediate_result, width, height, taps, prediction,
+                        pred_stride);
   }
 }
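The signature change from subpixel values to filter ids moves the id computation to the caller. Based on the formulas removed above, a caller that previously passed subpixel_x / subpixel_y would now do roughly this (a hypothetical call-site sketch, not the actual libgav1 caller):

// Derive the new filter-id arguments the way the removed code used to.
const int horizontal_filter_id = (subpixel_x >> 6) & kSubPixelMask;
const int vertical_filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
Convolve2D_NEON(reference, reference_stride, horizontal_filter_index,
                vertical_filter_index, horizontal_filter_id,
                vertical_filter_id, width, height, prediction, pred_stride);
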
 
@@ -818,7 +810,7 @@
 // increments. The first load covers the initial elements of src_x, while the
 // final load covers the taps.
 template <int grade_x>
-inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
+inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) {
   uint8x8x3_t ret;
   const uint8x16_t src_val = vld1q_u8(src_x);
   ret.val[0] = vget_low_u8(src_val);
@@ -841,7 +833,7 @@
 }
 
 template <int grade_x>
-inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
+inline void ConvolveKernelHorizontal2Tap(const uint8_t* const src,
                                          const ptrdiff_t src_stride,
                                          const int width, const int subpixel_x,
                                          const int step_x,
@@ -873,7 +865,7 @@
     // on x.
     const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
                                VQTbl1U8(filter_taps1, filter_indices)};
-    int y = 0;
+    int y = intermediate_height;
     do {
       // Load a pool of samples to select from using stepped indices.
       const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -890,7 +882,7 @@
                              kInterRoundBitsHorizontal - 1));
       src_x += src_stride;
       intermediate += kIntermediateStride;
-    } while (++y < intermediate_height);
+    } while (--y != 0);
     return;
   }
 
@@ -913,7 +905,7 @@
     // on x.
     const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
                                VQTbl1U8(filter_taps1, filter_indices)};
-    int y = 0;
+    int y = intermediate_height;
     do {
       // Load a pool of samples to select from using stepped indices.
       const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -930,7 +922,7 @@
                              kInterRoundBitsHorizontal - 1));
       src_x += src_stride;
       intermediate_x += kIntermediateStride;
-    } while (++y < intermediate_height);
+    } while (--y != 0);
     x += 8;
     p += step_x8;
   } while (x < width);
@@ -951,7 +943,7 @@
 
 // This filter is only possible when width <= 4.
 void ConvolveKernelHorizontalPositive4Tap(
-    const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+    const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x,
     const int step_x, const int intermediate_height, int16_t* intermediate) {
   const int kernel_offset = 2;
   const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -980,7 +972,7 @@
 
   const uint8x8_t src_indices =
       vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
-  int y = 0;
+  int y = intermediate_height;
   do {
     // Load a pool of samples to select from using stepped index vectors.
     const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -1000,7 +992,7 @@
 
     src_x += src_stride;
     intermediate += kIntermediateStride;
-  } while (++y < intermediate_height);
+  } while (--y != 0);
 }
 
 // Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
@@ -1018,7 +1010,7 @@
 
 // This filter is only possible when width <= 4.
 inline void ConvolveKernelHorizontalSigned4Tap(
-    const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+    const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x,
     const int step_x, const int intermediate_height, int16_t* intermediate) {
   const int kernel_offset = 2;
   const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -1055,7 +1047,7 @@
                                     vadd_u8(src_indices_base, vdup_n_u8(2)),
                                     vadd_u8(src_indices_base, vdup_n_u8(3))};
 
-  int y = 0;
+  int y = intermediate_height;
   do {
     // Load a pool of samples to select from using stepped indices.
     const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -1072,7 +1064,7 @@
                            kInterRoundBitsHorizontal - 1));
     src_x += src_stride;
     intermediate += kIntermediateStride;
-  } while (++y < intermediate_height);
+  } while (--y != 0);
 }
 
 // Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
@@ -1093,9 +1085,9 @@
 // This filter is only possible when width >= 8.
 template <int grade_x>
 inline void ConvolveKernelHorizontalSigned6Tap(
-    const uint8_t* src, const ptrdiff_t src_stride, const int width,
+    const uint8_t* const src, const ptrdiff_t src_stride, const int width,
     const int subpixel_x, const int step_x, const int intermediate_height,
-    int16_t* intermediate) {
+    int16_t* const intermediate) {
   const int kernel_offset = 1;
   const uint8x8_t one = vdup_n_u8(1);
   const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1137,7 +1129,7 @@
     for (int i = 0; i < 6; ++i) {
       taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
     }
-    int y = 0;
+    int y = intermediate_height;
     do {
       // Load a pool of samples to select from using stepped indices.
       const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1152,7 +1144,7 @@
                              kInterRoundBitsHorizontal - 1));
       src_x += src_stride;
       intermediate_x += kIntermediateStride;
-    } while (++y < intermediate_height);
+    } while (--y != 0);
     x += 8;
     p += step_x8;
   } while (x < width);
@@ -1186,9 +1178,9 @@
 // This filter is only possible when width >= 8.
 template <int grade_x>
 inline void ConvolveKernelHorizontalMixed6Tap(
-    const uint8_t* src, const ptrdiff_t src_stride, const int width,
+    const uint8_t* const src, const ptrdiff_t src_stride, const int width,
     const int subpixel_x, const int step_x, const int intermediate_height,
-    int16_t* intermediate) {
+    int16_t* const intermediate) {
   const int kernel_offset = 1;
   const uint8x8_t one = vdup_n_u8(1);
   const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1235,7 +1227,7 @@
     mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
     mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
 
-    int y = 0;
+    int y = intermediate_height;
     do {
       // Load a pool of samples to select from using stepped indices.
       const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1254,7 +1246,7 @@
                                              kInterRoundBitsHorizontal - 1));
       src_x += src_stride;
       intermediate_x += kIntermediateStride;
-    } while (++y < intermediate_height);
+    } while (--y != 0);
     x += 8;
     p += step_x8;
   } while (x < width);
@@ -1280,9 +1272,9 @@
 // This filter is only possible when width >= 8.
 template <int grade_x>
 inline void ConvolveKernelHorizontalSigned8Tap(
-    const uint8_t* src, const ptrdiff_t src_stride, const int width,
+    const uint8_t* const src, const ptrdiff_t src_stride, const int width,
     const int subpixel_x, const int step_x, const int intermediate_height,
-    int16_t* intermediate) {
+    int16_t* const intermediate) {
   const uint8x8_t one = vdup_n_u8(1);
   const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
   const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -1320,7 +1312,7 @@
       taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
     }
 
-    int y = 0;
+    int y = intermediate_height;
     do {
       // Load a pool of samples to select from using stepped indices.
       const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1336,7 +1328,7 @@
                              kInterRoundBitsHorizontal - 1));
       src_x += src_stride;
       intermediate_x += kIntermediateStride;
-    } while (++y < intermediate_height);
+    } while (--y != 0);
     x += 8;
     p += step_x8;
   } while (x < width);
@@ -1344,9 +1336,9 @@
 
 // This function handles blocks of width 2 or 4.
 template <int num_taps, int grade_y, int width, bool is_compound>
-void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
+void ConvolveVerticalScale4xH(const int16_t* const src, const int subpixel_y,
                               const int filter_index, const int step_y,
-                              const int height, void* dest,
+                              const int height, void* const dest,
                               const ptrdiff_t dest_stride) {
   constexpr ptrdiff_t src_stride = kIntermediateStride;
   const int16_t* src_y = src;
@@ -1357,8 +1349,8 @@
 
   int p = subpixel_y & 1023;
   int prev_p = p;
-  int y = 0;
-  do {  // y < height
+  int y = height;
+  do {
     for (int i = 0; i < num_taps; ++i) {
       s[i] = vld1_s16(src_y + i * src_stride);
     }
@@ -1411,16 +1403,16 @@
     prev_p = p;
     dest16_y += dest_stride;
     dest_y += dest_stride;
-
-    y += 2;
-  } while (y < height);
+    y -= 2;
+  } while (y != 0);
 }
 
 template <int num_taps, int grade_y, bool is_compound>
-inline void ConvolveVerticalScale(const int16_t* src, const int width,
+inline void ConvolveVerticalScale(const int16_t* const src, const int width,
                                   const int subpixel_y, const int filter_index,
                                   const int step_y, const int height,
-                                  void* dest, const ptrdiff_t dest_stride) {
+                                  void* const dest,
+                                  const ptrdiff_t dest_stride) {
   constexpr ptrdiff_t src_stride = kIntermediateStride;
   // A possible improvement is to use arithmetic to decide how many times to
   // apply filters to the same source before checking whether to load new srcs.
@@ -1431,15 +1423,15 @@
   uint8_t* dest_y;
 
   int x = 0;
-  do {  // x < width
-    const int16_t* src_x = src + x;
+  do {
+    const int16_t* const src_x = src + x;
     const int16_t* src_y = src_x;
     dest16_y = static_cast<uint16_t*>(dest) + x;
     dest_y = static_cast<uint8_t*>(dest) + x;
     int p = subpixel_y & 1023;
     int prev_p = p;
-    int y = 0;
-    do {  // y < height
+    int y = height;
+    do {
       for (int i = 0; i < num_taps; ++i) {
         s[i] = vld1q_s16(src_y + i * src_stride);
       }
@@ -1478,9 +1470,8 @@
       prev_p = p;
       dest16_y += dest_stride;
       dest_y += dest_stride;
-
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
     x += 8;
   } while (x < width);
 }
@@ -1492,7 +1483,7 @@
                           const int vertical_filter_index, const int subpixel_x,
                           const int subpixel_y, const int step_x,
                           const int step_y, const int width, const int height,
-                          void* prediction, const ptrdiff_t pred_stride) {
+                          void* const prediction, const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
   assert(step_x <= 2048);
@@ -1727,16 +1718,18 @@
                              const ptrdiff_t reference_stride,
                              const int horizontal_filter_index,
                              const int /*vertical_filter_index*/,
-                             const int subpixel_x, const int /*subpixel_y*/,
-                             const int width, const int height,
-                             void* prediction, const ptrdiff_t pred_stride) {
+                             const int horizontal_filter_id,
+                             const int /*vertical_filter_id*/, const int width,
+                             const int height, void* const prediction,
+                             const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   // Set |src| to the outermost tap.
-  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
-  auto* dest = static_cast<uint8_t*>(prediction);
+  const auto* const src =
+      static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* const dest = static_cast<uint8_t*>(prediction);
 
   DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
-                   subpixel_x, filter_index);
+                   horizontal_filter_id, filter_index);
 }
 
 // The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
@@ -1748,14 +1741,14 @@
 
 template <int filter_index, bool is_compound = false,
           bool negative_outside_taps = false>
-void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+void FilterVertical(const uint8_t* const src, const ptrdiff_t src_stride,
                     void* const dst, const ptrdiff_t dst_stride,
                     const int width, const int height,
                     const uint8x8_t* const taps) {
   const int num_taps = GetNumTapsInFilter(filter_index);
   const int next_row = num_taps - 1;
-  auto* dst8 = static_cast<uint8_t*>(dst);
-  auto* dst16 = static_cast<uint16_t*>(dst);
+  auto* const dst8 = static_cast<uint8_t*>(dst);
+  auto* const dst16 = static_cast<uint16_t*>(dst);
   assert(width >= 8);
 
   int x = 0;
@@ -1783,6 +1776,9 @@
       }
     }
 
+    // Decreasing the y loop counter produces worse code with clang.
+    // Don't unroll this loop since it generates too much code and the decoder
+    // is even slower.
     int y = 0;
     do {
       srcs[next_row] = vld1_u8(src_x);
@@ -1833,7 +1829,7 @@
     srcs[0] = Load4(src);
     src += src_stride;
 
-    int y = 0;
+    int y = height;
     do {
       srcs[0] = Load4<1>(src, srcs[0]);
       src += src_stride;
@@ -1858,8 +1854,8 @@
       }
 
       srcs[0] = srcs[2];
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   } else if (num_taps == 4) {
     srcs[4] = vdup_n_u8(0);
 
@@ -1871,7 +1867,7 @@
     src += src_stride;
     srcs[1] = vext_u8(srcs[0], srcs[2], 4);
 
-    int y = 0;
+    int y = height;
     do {
       srcs[2] = Load4<1>(src, srcs[2]);
       src += src_stride;
@@ -1898,8 +1894,8 @@
       srcs[0] = srcs[2];
       srcs[1] = srcs[3];
       srcs[2] = srcs[4];
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   } else if (num_taps == 6) {
     srcs[6] = vdup_n_u8(0);
 
@@ -1916,7 +1912,7 @@
     src += src_stride;
     srcs[3] = vext_u8(srcs[2], srcs[4], 4);
 
-    int y = 0;
+    int y = height;
     do {
       srcs[4] = Load4<1>(src, srcs[4]);
       src += src_stride;
@@ -1945,8 +1941,8 @@
       srcs[2] = srcs[4];
       srcs[3] = srcs[5];
       srcs[4] = srcs[6];
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   } else if (num_taps == 8) {
     srcs[8] = vdup_n_u8(0);
 
@@ -1968,7 +1964,7 @@
     src += src_stride;
     srcs[5] = vext_u8(srcs[4], srcs[6], 4);
 
-    int y = 0;
+    int y = height;
     do {
       srcs[6] = Load4<1>(src, srcs[6]);
       src += src_stride;
@@ -1999,8 +1995,8 @@
       srcs[4] = srcs[6];
       srcs[5] = srcs[7];
       srcs[6] = srcs[8];
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   }
 }
 
@@ -2213,22 +2209,23 @@
                            const ptrdiff_t reference_stride,
                            const int /*horizontal_filter_index*/,
                            const int vertical_filter_index,
-                           const int /*subpixel_x*/, const int subpixel_y,
-                           const int width, const int height, void* prediction,
+                           const int /*horizontal_filter_id*/,
+                           const int vertical_filter_id, const int width,
+                           const int height, void* const prediction,
                            const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
   const auto* src = static_cast<const uint8_t*>(reference) -
                     (vertical_taps / 2 - 1) * src_stride;
-  auto* dest = static_cast<uint8_t*>(prediction);
+  auto* const dest = static_cast<uint8_t*>(prediction);
   const ptrdiff_t dest_stride = pred_stride;
-  const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
-  assert(filter_id != 0);
+  assert(vertical_filter_id != 0);
 
   uint8x8_t taps[8];
   for (int k = 0; k < kSubPixelTaps; ++k) {
-    taps[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
+    taps[k] =
+        vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
   }
 
   if (filter_index == 0) {  // 6 tap.
@@ -2242,8 +2239,8 @@
       FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
                         taps + 1);
     }
-  } else if ((filter_index == 1) &
-             ((filter_id == 1) | (filter_id == 15))) {  // 5 tap.
+  } else if ((filter_index == 1) & ((vertical_filter_id == 1) |
+                                    (vertical_filter_id == 15))) {  // 5 tap.
     if (width == 2) {
       FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
                            taps + 1);
@@ -2255,8 +2252,8 @@
                         taps + 1);
     }
   } else if ((filter_index == 1) &
-             ((filter_id == 7) | (filter_id == 8) |
-              (filter_id == 9))) {  // 6 tap with weird negative taps.
+             ((vertical_filter_id == 7) | (vertical_filter_id == 8) |
+              (vertical_filter_id == 9))) {  // 6 tap with weird negative taps.
     if (width == 2) {
       FilterVertical2xH<1,
                         /*negative_outside_taps=*/true>(
@@ -2302,14 +2299,15 @@
                         taps + 2);
     }
   } else {
-    // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
-    // to 4 tap filters.
+    // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+    // below map to 4 tap filters.
     assert(filter_index == 5 ||
            (filter_index == 1 &&
-            (filter_id == 2 || filter_id == 3 || filter_id == 4 ||
-             filter_id == 5 || filter_id == 6 || filter_id == 10 ||
-             filter_id == 11 || filter_id == 12 || filter_id == 13 ||
-             filter_id == 14)));
+            (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+             vertical_filter_id == 4 || vertical_filter_id == 5 ||
+             vertical_filter_id == 6 || vertical_filter_id == 10 ||
+             vertical_filter_id == 11 || vertical_filter_id == 12 ||
+             vertical_filter_id == 13 || vertical_filter_id == 14)));
     // According to GetNumTapsInFilter() this has 6 taps but here we are
     // treating it as though it has 4.
     if (filter_index == 1) src += src_stride;
@@ -2329,8 +2327,9 @@
 void ConvolveCompoundCopy_NEON(
     const void* const reference, const ptrdiff_t reference_stride,
     const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
-    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t /*pred_stride*/) {
   const auto* src = static_cast<const uint8_t*>(reference);
   const ptrdiff_t src_stride = reference_stride;
   auto* dest = static_cast<uint16_t*>(prediction);
@@ -2338,7 +2337,7 @@
       kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
 
   if (width >= 16) {
-    int y = 0;
+    int y = height;
     do {
       int x = 0;
       do {
@@ -2354,20 +2353,20 @@
       } while (x < width);
       src += src_stride;
       dest += width;
-    } while (++y < height);
+    } while (--y != 0);
   } else if (width == 8) {
-    int y = 0;
+    int y = height;
     do {
       const uint8x8_t v_src = vld1_u8(&src[0]);
       const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
       vst1q_u16(&dest[0], v_dest);
       src += src_stride;
       dest += width;
-    } while (++y < height);
-  } else { /* width == 4 */
+    } while (--y != 0);
+  } else {  // width == 4
     uint8x8_t v_src = vdup_n_u8(0);
 
-    int y = 0;
+    int y = height;
     do {
       v_src = Load4<0>(&src[0], v_src);
       src += src_stride;
@@ -2376,28 +2375,29 @@
       const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
       vst1q_u16(&dest[0], v_dest);
       dest += 4 << 1;
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   }
 }
 
 void ConvolveCompoundVertical_NEON(
     const void* const reference, const ptrdiff_t reference_stride,
     const int /*horizontal_filter_index*/, const int vertical_filter_index,
-    const int /*subpixel_x*/, const int subpixel_y, const int width,
-    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+    const int /*horizontal_filter_id*/, const int vertical_filter_id,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
   const auto* src = static_cast<const uint8_t*>(reference) -
                     (vertical_taps / 2 - 1) * src_stride;
-  auto* dest = static_cast<uint16_t*>(prediction);
-  const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
-  assert(filter_id != 0);
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  assert(vertical_filter_id != 0);
 
   uint8x8_t taps[8];
   for (int k = 0; k < kSubPixelTaps; ++k) {
-    taps[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
+    taps[k] =
+        vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
   }
 
   if (filter_index == 0) {  // 6 tap.
@@ -2408,8 +2408,8 @@
       FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps + 1);
     }
-  } else if ((filter_index == 1) &
-             ((filter_id == 1) | (filter_id == 15))) {  // 5 tap.
+  } else if ((filter_index == 1) & ((vertical_filter_id == 1) |
+                                    (vertical_filter_id == 15))) {  // 5 tap.
     if (width == 4) {
       FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
                                                  height, taps + 1);
@@ -2418,8 +2418,8 @@
                                               width, height, taps + 1);
     }
   } else if ((filter_index == 1) &
-             ((filter_id == 7) | (filter_id == 8) |
-              (filter_id == 9))) {  // 6 tap with weird negative taps.
+             ((vertical_filter_id == 7) | (vertical_filter_id == 8) |
+              (vertical_filter_id == 9))) {  // 6 tap with weird negative taps.
     if (width == 4) {
       FilterVertical4xH<1, /*is_compound=*/true,
                         /*negative_outside_taps=*/true>(src, src_stride, dest,
@@ -2457,10 +2457,11 @@
     // to 4 tap filters.
     assert(filter_index == 5 ||
            (filter_index == 1 &&
-            (filter_id == 2 || filter_id == 3 || filter_id == 4 ||
-             filter_id == 5 || filter_id == 6 || filter_id == 10 ||
-             filter_id == 11 || filter_id == 12 || filter_id == 13 ||
-             filter_id == 14)));
+            (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+             vertical_filter_id == 4 || vertical_filter_id == 5 ||
+             vertical_filter_id == 6 || vertical_filter_id == 10 ||
+             vertical_filter_id == 11 || vertical_filter_id == 12 ||
+             vertical_filter_id == 13 || vertical_filter_id == 14)));
     // According to GetNumTapsInFilter() this has 6 taps but here we are
     // treating it as though it has 4.
     if (filter_index == 1) src += src_stride;
@@ -2477,22 +2478,41 @@
 void ConvolveCompoundHorizontal_NEON(
     const void* const reference, const ptrdiff_t reference_stride,
     const int horizontal_filter_index, const int /*vertical_filter_index*/,
-    const int subpixel_x, const int /*subpixel_y*/, const int width,
-    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+    const int horizontal_filter_id, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
-  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
-  auto* dest = static_cast<uint16_t*>(prediction);
+  const auto* const src =
+      static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* const dest = static_cast<uint16_t*>(prediction);
 
   DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
-      src, reference_stride, dest, width, width, height, subpixel_x,
+      src, reference_stride, dest, width, width, height, horizontal_filter_id,
       filter_index);
 }
 
-void ConvolveCompound2D_NEON(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int horizontal_filter_index, const int vertical_filter_index,
-    const int subpixel_x, const int subpixel_y, const int width,
-    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+template <int vertical_taps>
+void Compound2DVertical(const uint16_t* const intermediate_result,
+                        const int width, const int height, const int16x8_t taps,
+                        void* const prediction) {
+  auto* const dest = static_cast<uint16_t*>(prediction);
+  if (width == 4) {
+    Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
+        intermediate_result, dest, width, height, taps);
+  } else {
+    Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>(
+        intermediate_result, dest, width, width, height, taps);
+  }
+}
+
+void ConvolveCompound2D_NEON(const void* const reference,
+                             const ptrdiff_t reference_stride,
+                             const int horizontal_filter_index,
+                             const int vertical_filter_index,
+                             const int horizontal_filter_id,
+                             const int vertical_filter_id, const int width,
+                             const int height, void* const prediction,
+                             const ptrdiff_t /*pred_stride*/) {
   // The output of the horizontal filter, i.e. the intermediate_result, is
   // guaranteed to fit in int16_t.
   uint16_t
@@ -2512,56 +2532,26 @@
   const auto* const src = static_cast<const uint8_t*>(reference) -
                           (vertical_taps / 2 - 1) * src_stride -
                           kHorizontalOffset;
-
   DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
       src, src_stride, intermediate_result, width, width, intermediate_height,
-      subpixel_x, horiz_filter_index);
+      horizontal_filter_id, horiz_filter_index);
 
   // Vertical filter.
-  auto* dest = static_cast<uint16_t*>(prediction);
-  const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
-  assert(filter_id != 0);
-
-  const ptrdiff_t dest_stride = width;
-  const int16x8_t taps =
-      vmovl_s8(vld1_s8(kHalfSubPixelFilters[vert_filter_index][filter_id]));
-
+  assert(vertical_filter_id != 0);
+  const int16x8_t taps = vmovl_s8(
+      vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
   if (vertical_taps == 8) {
-    if (width == 4) {
-      Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
-                                                   dest_stride, height, taps);
-    } else {
-      Filter2DVertical<8, /*is_compound=*/true>(
-          intermediate_result, dest, dest_stride, width, height, taps);
-    }
+    Compound2DVertical<8>(intermediate_result, width, height, taps, prediction);
   } else if (vertical_taps == 6) {
-    if (width == 4) {
-      Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
-                                                   dest_stride, height, taps);
-    } else {
-      Filter2DVertical<6, /*is_compound=*/true>(
-          intermediate_result, dest, dest_stride, width, height, taps);
-    }
+    Compound2DVertical<6>(intermediate_result, width, height, taps, prediction);
   } else if (vertical_taps == 4) {
-    if (width == 4) {
-      Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
-                                                   dest_stride, height, taps);
-    } else {
-      Filter2DVertical<4, /*is_compound=*/true>(
-          intermediate_result, dest, dest_stride, width, height, taps);
-    }
+    Compound2DVertical<4>(intermediate_result, width, height, taps, prediction);
   } else {  // |vertical_taps| == 2
-    if (width == 4) {
-      Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
-                                                   dest_stride, height, taps);
-    } else {
-      Filter2DVertical<2, /*is_compound=*/true>(
-          intermediate_result, dest, dest_stride, width, height, taps);
-    }
+    Compound2DVertical<2>(intermediate_result, width, height, taps, prediction);
   }
 }
 
-inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+inline void HalfAddHorizontal(const uint8_t* const src, uint8_t* const dst) {
   const uint8x16_t left = vld1q_u8(src);
   const uint8x16_t right = vld1q_u8(src + 1);
   vst1q_u8(dst, vrhaddq_u8(left, right));
@@ -2575,7 +2565,7 @@
   const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
   const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
 
-  int y = 0;
+  int y = height;
   do {
     HalfAddHorizontal(src, dst);
     if (width >= 32) {
@@ -2607,7 +2597,7 @@
     }
     src += src_remainder_stride;
     dst += dst_remainder_stride;
-  } while (++y < height);
+  } while (--y != 0);
 }
 
 void ConvolveIntraBlockCopyHorizontal_NEON(
@@ -2631,7 +2621,7 @@
     IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
                                  pred_stride);
   } else if (width == 8) {
-    int y = 0;
+    int y = height;
     do {
       const uint8x8_t left = vld1_u8(src);
       const uint8x8_t right = vld1_u8(src + 1);
@@ -2639,11 +2629,11 @@
 
       src += reference_stride;
       dest += pred_stride;
-    } while (++y < height);
+    } while (--y != 0);
   } else if (width == 4) {
     uint8x8_t left = vdup_n_u8(0);
     uint8x8_t right = vdup_n_u8(0);
-    int y = 0;
+    int y = height;
     do {
       left = Load4<0>(src, left);
       right = Load4<0>(src + 1, right);
@@ -2658,13 +2648,13 @@
       dest += pred_stride;
       StoreHi4(dest, result);
       dest += pred_stride;
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   } else {
     assert(width == 2);
     uint8x8_t left = vdup_n_u8(0);
     uint8x8_t right = vdup_n_u8(0);
-    int y = 0;
+    int y = height;
     do {
       left = Load2<0>(src, left);
       right = Load2<0>(src + 1, right);
@@ -2679,8 +2669,8 @@
       dest += pred_stride;
       Store2<1>(dest, result);
       dest += pred_stride;
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   }
 }
 
@@ -2715,7 +2705,7 @@
   }
   src += src_remainder_stride;
 
-  int y = 0;
+  int y = height;
   do {
     below[0] = vld1q_u8(src);
     if (width >= 32) {
@@ -2770,14 +2760,15 @@
       }
     }
     dst += dst_remainder_stride;
-  } while (++y < height);
+  } while (--y != 0);
 }
 
 void ConvolveIntraBlockCopyVertical_NEON(
     const void* const reference, const ptrdiff_t reference_stride,
     const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
-    const int height, void* const prediction, const ptrdiff_t pred_stride) {
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   auto* dest = static_cast<uint8_t*>(prediction);
 
@@ -2798,7 +2789,7 @@
     row = vld1_u8(src);
     src += reference_stride;
 
-    int y = 0;
+    int y = height;
     do {
       below = vld1_u8(src);
       src += reference_stride;
@@ -2807,13 +2798,13 @@
       dest += pred_stride;
 
       row = below;
-    } while (++y < height);
+    } while (--y != 0);
   } else if (width == 4) {
     uint8x8_t row = Load4(src);
     uint8x8_t below = vdup_n_u8(0);
     src += reference_stride;
 
-    int y = 0;
+    int y = height;
     do {
       below = Load4<0>(src, below);
       src += reference_stride;
@@ -2822,14 +2813,14 @@
       dest += pred_stride;
 
       row = below;
-    } while (++y < height);
+    } while (--y != 0);
   } else {
     assert(width == 2);
     uint8x8_t row = Load2(src);
     uint8x8_t below = vdup_n_u8(0);
     src += reference_stride;
 
-    int y = 0;
+    int y = height;
     do {
       below = Load2<0>(src, below);
       src += reference_stride;
@@ -2838,7 +2829,7 @@
       dest += pred_stride;
 
       row = below;
-    } while (++y < height);
+    } while (--y != 0);
   }
 }
 
@@ -2890,7 +2881,7 @@
   }
   src += src_remainder_stride;
 
-  int y = 0;
+  int y = height;
   do {
     const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
     vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
@@ -3001,14 +2992,15 @@
     }
     src += src_remainder_stride;
     dst += dst_remainder_stride;
-  } while (++y < height);
+  } while (--y != 0);
 }
 
 void ConvolveIntraBlockCopy2D_NEON(
     const void* const reference, const ptrdiff_t reference_stride,
     const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
-    const int height, void* const prediction, const ptrdiff_t pred_stride) {
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   auto* dest = static_cast<uint8_t*>(prediction);
   // Note: allow vertical access to height + 1. Because this function is only
@@ -3032,7 +3024,7 @@
 
     uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
 
-    int y = 0;
+    int y = height;
     do {
       left = Load4<0>(src, left);
       right = Load4<0>(src + 1, right);
@@ -3051,8 +3043,8 @@
       dest += pred_stride;
 
       row = vget_high_u16(below);
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   } else {
     uint8x8_t left = Load2(src);
     uint8x8_t right = Load2(src + 1);
@@ -3060,7 +3052,7 @@
 
     uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
 
-    int y = 0;
+    int y = height;
     do {
       left = Load2<0>(src, left);
       right = Load2<0>(src + 1, right);
@@ -3079,8 +3071,8 @@
       dest += pred_stride;
 
       row = vget_high_u16(below);
-      y += 2;
-    } while (y < height);
+      y -= 2;
+    } while (y != 0);
   }
 }
 
@@ -3112,7 +3104,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 
 namespace libgav1 {
 namespace dsp {
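The signature change running through the convolve_neon.cc hunks above drops the subpixel_x/subpixel_y arguments in favor of precomputed horizontal_filter_id/vertical_filter_id values; the removed lines show the masking each function used to do itself. A small sketch of that derivation, assuming kSubPixelMask is the same library constant the removed lines used (FilterIdFromSubpixel is an editorial name, not part of the patch):

// The id is the 4-bit subpixel phase, exactly as the removed lines computed:
//   (subpixel_y >> 6) & kSubPixelMask            (vertical-only paths)
//   ((subpixel_y & 1023) >> 6) & kSubPixelMask   (2D paths)
inline int FilterIdFromSubpixel(const int subpixel) {
  return (subpixel >> 6) & kSubPixelMask;
}

Passing the ids directly lets each function assert vertical_filter_id != 0 up front instead of re-deriving it.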
diff --git a/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc b/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
index 04952ab..a0cd0ac 100644
--- a/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
+++ b/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
@@ -30,10 +30,12 @@
 
 namespace libgav1 {
 namespace dsp {
-namespace {
 
 constexpr int kInterPostRoundBit = 4;
 
+namespace low_bitdepth {
+namespace {
+
 inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0,
                                          const int16x8_t pred1,
                                          const int16x4_t weights[2]) {
@@ -185,13 +187,167 @@
 }
 
 }  // namespace
+}  // namespace low_bitdepth
 
-void DistanceWeightedBlendInit_NEON() { Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x4x2_t ComputeWeightedAverage8(const uint16x4x2_t pred0,
+                                            const uint16x4x2_t pred1,
+                                            const uint16x4_t weights[2]) {
+  const uint32x4_t wpred0_lo = vmull_u16(weights[0], pred0.val[0]);
+  const uint32x4_t wpred0_hi = vmull_u16(weights[0], pred0.val[1]);
+  const uint32x4_t blended_lo = vmlal_u16(wpred0_lo, weights[1], pred1.val[0]);
+  const uint32x4_t blended_hi = vmlal_u16(wpred0_hi, weights[1], pred1.val[1]);
+  const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16);
+  const int32x4_t res_lo = vsubq_s32(vreinterpretq_s32_u32(blended_lo), offset);
+  const int32x4_t res_hi = vsubq_s32(vreinterpretq_s32_u32(blended_hi), offset);
+  const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1);
+  // Clip the result at (1 << bd) - 1.
+  uint16x4x2_t result;
+  result.val[0] =
+      vmin_u16(vqrshrun_n_s32(res_lo, kInterPostRoundBit + 4), bd_max);
+  result.val[1] =
+      vmin_u16(vqrshrun_n_s32(res_hi, kInterPostRoundBit + 4), bd_max);
+  return result;
+}
+
+inline uint16x4x4_t ComputeWeightedAverage8(const uint16x4x4_t pred0,
+                                            const uint16x4x4_t pred1,
+                                            const uint16x4_t weights[2]) {
+  const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16);
+  const uint32x4_t wpred0 = vmull_u16(weights[0], pred0.val[0]);
+  const uint32x4_t wpred1 = vmull_u16(weights[0], pred0.val[1]);
+  const uint32x4_t blended0 = vmlal_u16(wpred0, weights[1], pred1.val[0]);
+  const uint32x4_t blended1 = vmlal_u16(wpred1, weights[1], pred1.val[1]);
+  const int32x4_t res0 = vsubq_s32(vreinterpretq_s32_u32(blended0), offset);
+  const int32x4_t res1 = vsubq_s32(vreinterpretq_s32_u32(blended1), offset);
+  const uint32x4_t wpred2 = vmull_u16(weights[0], pred0.val[2]);
+  const uint32x4_t wpred3 = vmull_u16(weights[0], pred0.val[3]);
+  const uint32x4_t blended2 = vmlal_u16(wpred2, weights[1], pred1.val[2]);
+  const uint32x4_t blended3 = vmlal_u16(wpred3, weights[1], pred1.val[3]);
+  const int32x4_t res2 = vsubq_s32(vreinterpretq_s32_u32(blended2), offset);
+  const int32x4_t res3 = vsubq_s32(vreinterpretq_s32_u32(blended3), offset);
+  const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1);
+  // Clip the result at (1 << bd) - 1.
+  uint16x4x4_t result;
+  result.val[0] =
+      vmin_u16(vqrshrun_n_s32(res0, kInterPostRoundBit + 4), bd_max);
+  result.val[1] =
+      vmin_u16(vqrshrun_n_s32(res1, kInterPostRoundBit + 4), bd_max);
+  result.val[2] =
+      vmin_u16(vqrshrun_n_s32(res2, kInterPostRoundBit + 4), bd_max);
+  result.val[3] =
+      vmin_u16(vqrshrun_n_s32(res3, kInterPostRoundBit + 4), bd_max);
+
+  return result;
+}
+
+// We could use vld1_u16_x2, but for compatibility reasons, use this function
+// instead. The compiler optimizes to the correct instruction.
+inline uint16x4x2_t LoadU16x4_x2(uint16_t const* ptr) {
+  uint16x4x2_t x;
+  // gcc/clang (64 bit) optimizes the following to ldp.
+  x.val[0] = vld1_u16(ptr);
+  x.val[1] = vld1_u16(ptr + 4);
+  return x;
+}
+
+// We could use vld1_u16_x4, but for compatibility reasons, use this function
+// instead. The compiler optimizes to a pair of vld1_u16_x2, which showed better
+// performance in the speed tests.
+inline uint16x4x4_t LoadU16x4_x4(uint16_t const* ptr) {
+  uint16x4x4_t x;
+  x.val[0] = vld1_u16(ptr);
+  x.val[1] = vld1_u16(ptr + 4);
+  x.val[2] = vld1_u16(ptr + 8);
+  x.val[3] = vld1_u16(ptr + 12);
+  return x;
+}
+
+void DistanceWeightedBlend_NEON(const void* prediction_0,
+                                const void* prediction_1,
+                                const uint8_t weight_0, const uint8_t weight_1,
+                                const int width, const int height,
+                                void* const dest, const ptrdiff_t dest_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+  const uint16x4_t weights[2] = {vdup_n_u16(weight_0), vdup_n_u16(weight_1)};
+
+  if (width == 4) {
+    int y = height;
+    do {
+      const uint16x4x2_t src0 = LoadU16x4_x2(pred_0);
+      const uint16x4x2_t src1 = LoadU16x4_x2(pred_1);
+      const uint16x4x2_t res = ComputeWeightedAverage8(src0, src1, weights);
+      vst1_u16(dst, res.val[0]);
+      vst1_u16(dst + dst_stride, res.val[1]);
+      dst += dst_stride << 1;
+      pred_0 += 8;
+      pred_1 += 8;
+      y -= 2;
+    } while (y != 0);
+  } else if (width == 8) {
+    int y = height;
+    do {
+      const uint16x4x4_t src0 = LoadU16x4_x4(pred_0);
+      const uint16x4x4_t src1 = LoadU16x4_x4(pred_1);
+      const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights);
+      vst1_u16(dst, res.val[0]);
+      vst1_u16(dst + 4, res.val[1]);
+      vst1_u16(dst + dst_stride, res.val[2]);
+      vst1_u16(dst + dst_stride + 4, res.val[3]);
+      dst += dst_stride << 1;
+      pred_0 += 16;
+      pred_1 += 16;
+      y -= 2;
+    } while (y != 0);
+  } else {
+    int y = height;
+    do {
+      int x = 0;
+      do {
+        const uint16x4x4_t src0 = LoadU16x4_x4(pred_0 + x);
+        const uint16x4x4_t src1 = LoadU16x4_x4(pred_1 + x);
+        const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights);
+        vst1_u16(dst + x, res.val[0]);
+        vst1_u16(dst + x + 4, res.val[1]);
+        vst1_u16(dst + x + 8, res.val[2]);
+        vst1_u16(dst + x + 12, res.val[3]);
+        x += 16;
+      } while (x < width);
+      dst += dst_stride;
+      pred_0 += width;
+      pred_1 += width;
+    } while (--y != 0);
+  }
+}
+
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->distance_weighted_blend = DistanceWeightedBlend_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void DistanceWeightedBlendInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 
 namespace libgav1 {
 namespace dsp {
diff --git a/libgav1/src/dsp/arm/distance_weighted_blend_neon.h b/libgav1/src/dsp/arm/distance_weighted_blend_neon.h
index 4d8824c..94a799c 100644
--- a/libgav1/src/dsp/arm/distance_weighted_blend_neon.h
+++ b/libgav1/src/dsp/arm/distance_weighted_blend_neon.h
@@ -34,6 +34,8 @@
 #if LIBGAV1_ENABLE_NEON
 #define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
 
+#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+
 #endif  // LIBGAV1_ENABLE_NEON
 
 #endif  // LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
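A scalar model of the per-pixel math in the new 10bpp ComputeWeightedAverage8() overloads above (distance_weighted_blend_neon.cc) may help when reading the vector code. It assumes the libgav1 constants used there (kCompoundOffset, kInterPostRoundBit, kBitdepth10); BlendPixel10bpp is an editorial name, not part of the patch:

#include <algorithm>
#include <cstdint>

inline uint16_t BlendPixel10bpp(const uint16_t pred0, const uint16_t pred1,
                                const uint8_t weight0, const uint8_t weight1) {
  // vmull_u16/vmlal_u16: weighted sum, minus the same kCompoundOffset * 16
  // constant the vector code builds.
  const int blended =
      weight0 * pred0 + weight1 * pred1 - kCompoundOffset * 16;
  // vqrshrun_n_s32(..., kInterPostRoundBit + 4): rounded narrowing shift;
  // negative results saturate to zero.
  const int rounded =
      (blended + (1 << (kInterPostRoundBit + 3))) >> (kInterPostRoundBit + 4);
  // vmin_u16 with (1 << kBitdepth10) - 1 applies the final 10-bit clamp.
  return static_cast<uint16_t>(
      std::min(std::max(rounded, 0), (1 << kBitdepth10) - 1));
}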
diff --git a/libgav1/src/dsp/arm/film_grain_neon.cc b/libgav1/src/dsp/arm/film_grain_neon.cc
index 2612466..8ee3745 100644
--- a/libgav1/src/dsp/arm/film_grain_neon.cc
+++ b/libgav1/src/dsp/arm/film_grain_neon.cc
@@ -1176,7 +1176,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 
 namespace libgav1 {
 namespace dsp {
diff --git a/libgav1/src/dsp/arm/intra_edge_neon.cc b/libgav1/src/dsp/arm/intra_edge_neon.cc
index 00b186a..074283f 100644
--- a/libgav1/src/dsp/arm/intra_edge_neon.cc
+++ b/libgav1/src/dsp/arm/intra_edge_neon.cc
@@ -25,7 +25,7 @@
 #include "src/dsp/arm/common_neon.h"
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
-#include "src/utils/common.h"  // RightShiftWithRounding()
+#include "src/utils/common.h"
 
 namespace libgav1 {
 namespace dsp {
@@ -35,6 +35,11 @@
 // required.
 constexpr int kKernelsNEON[3][2] = {{4, 8}, {5, 6}};
 
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
 void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
   assert(strength == 1 || strength == 2 || strength == 3);
   const int kernel_index = strength - 1;
@@ -44,6 +49,8 @@
   // elements written is |size| - 1.
   if (size == 1) return;
 
+  const uint8x16_t v_index = vcombine_u8(vcreate_u8(0x0706050403020100),
+                                         vcreate_u8(0x0f0e0d0c0b0a0908));
   // |strength| 1 and 2 use a 3 tap filter.
   if (strength < 3) {
     // The last value requires extending the buffer (duplicating
@@ -89,7 +96,6 @@
     // |remainder| == 1 then we don't have to do anything.
     const int remainder = (size - 1) & 0xf;
     if (remainder > 1) {
-      uint8_t temp[16];
       const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
       const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
 
@@ -102,9 +108,11 @@
 
       const uint8x16_t result =
           vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
-
-      vst1q_u8(temp, result);
-      memcpy(dst_buffer + i, temp, remainder);
+      const uint8x16_t v_remainder = vdupq_n_u8(remainder);
+      // Create overwrite mask.
+      const uint8x16_t mask = vcleq_u8(v_remainder, v_index);
+      const uint8x16_t dst_remainder = vbslq_u8(mask, src_1, result);
+      vst1q_u8(dst_buffer + i, dst_remainder);
     }
 
     dst_buffer[size - 1] = last_val;
@@ -173,7 +181,6 @@
   // Like the 3 tap but if there are two remaining values we have already
   // calculated them.
   if (remainder > 2) {
-    uint8_t temp[16];
     const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
     const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
     const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
@@ -193,9 +200,11 @@
 
     const uint8x16_t result =
         vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
-
-    vst1q_u8(temp, result);
-    memcpy(dst_buffer + i, temp, remainder);
+    const uint8x16_t v_remainder = vdupq_n_u8(remainder);
+    // Create overwrite mask.
+    const uint8x16_t mask = vcleq_u8(v_remainder, v_index);
+    const uint8x16_t dst_remainder = vbslq_u8(mask, src_2, result);
+    vst1q_u8(dst_buffer + i, dst_remainder);
   }
 
   dst_buffer[1] = special_vals[0];
@@ -284,13 +293,225 @@
 }
 
 }  // namespace
+}  // namespace low_bitdepth
 
-void IntraEdgeInit_NEON() { Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+const uint16_t kRemainderMask[8][8] = {
+    {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+    {0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+    {0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+    {0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+    {0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000},
+    {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000},
+    {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000},
+    {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000},
+};
+
+void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
+  assert(strength == 1 || strength == 2 || strength == 3);
+  const int kernel_index = strength - 1;
+  auto* const dst_buffer = static_cast<uint16_t*>(buffer);
+
+  // The first element is not written out (but it is input) so the number of
+  // elements written is |size| - 1.
+  if (size == 1) return;
+
+  // |strength| 1 and 2 use a 3 tap filter.
+  if (strength < 3) {
+    // The last value requires extending the buffer (duplicating
+    // |dst_buffer[size - 1]). Calculate it here to avoid extra processing in
+    // neon.
+    const uint16_t last_val = RightShiftWithRounding(
+        kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] +
+            kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] +
+            kKernelsNEON[kernel_index][0] * dst_buffer[size - 1],
+        4);
+
+    const uint16_t krn0 = kKernelsNEON[kernel_index][0];
+    const uint16_t krn1 = kKernelsNEON[kernel_index][1];
+
+    // The first value we need gets overwritten by the output from the
+    // previous iteration.
+    uint16x8_t src_0 = vld1q_u16(dst_buffer);
+    int i = 1;
+
+    // Process blocks until fewer than 8 values remain.
+    for (; i < size - 7; i += 8) {
+      // Loading these at the end of the block with |src_0| will read past the
+      // end of |top_row_data[160]|, the source of |buffer|.
+      const uint16x8_t src_1 = vld1q_u16(dst_buffer + i);
+      const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1);
+      const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0);
+      const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1);
+      const uint16x8_t result = vrshrq_n_u16(sum, 4);
+      // Load the next row before overwriting. This loads an extra 7 values
+      // past |size| on the trailing iteration.
+      src_0 = vld1q_u16(dst_buffer + i + 7);
+      vst1q_u16(dst_buffer + i, result);
+    }
+
+    // The last output value |last_val| was already calculated so if
+    // |remainder| == 1 then we don't have to do anything.
+    const int remainder = (size - 1) & 0x7;
+    if (remainder > 1) {
+      const uint16x8_t src_1 = vld1q_u16(dst_buffer + i);
+      const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1);
+      const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0);
+      const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1);
+      const uint16x8_t result = vrshrq_n_u16(sum, 4);
+      const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]);
+      const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_1);
+      vst1q_u16(dst_buffer + i, dst_remainder);
+    }
+
+    dst_buffer[size - 1] = last_val;
+    return;
+  }
+
+  assert(strength == 3);
+  // 5 tap filter. The first element requires duplicating |buffer[0]| and the
+  // last two elements require duplicating |buffer[size - 1]|.
+  uint16_t special_vals[3];
+  special_vals[0] = RightShiftWithRounding(
+      (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) +
+          (dst_buffer[2] << 2) + (dst_buffer[3] << 1),
+      4);
+  // Clamp index for very small |size| values.
+  const int first_index_min = std::max(size - 4, 0);
+  const int second_index_min = std::max(size - 3, 0);
+  const int third_index_min = std::max(size - 2, 0);
+  special_vals[1] = RightShiftWithRounding(
+      (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) +
+          (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) +
+          (dst_buffer[size - 1] << 1),
+      4);
+  special_vals[2] = RightShiftWithRounding(
+      (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) +
+          // x << 2 + x << 2 == x << 3
+          (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1),
+      4);
+
+  // The first two values we need get overwritten by the output from the
+  // previous iteration.
+  uint16x8_t src_0 = vld1q_u16(dst_buffer - 1);
+  uint16x8_t src_1 = vld1q_u16(dst_buffer);
+  int i = 1;
+
+  for (; i < size - 7; i += 8) {
+    // Loading these at the end of the block with |src_[01]| will read past
+    // the end of |top_row_data[160]|, the source of |buffer|.
+    const uint16x8_t src_2 = vld1q_u16(dst_buffer + i);
+    const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1);
+    const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2);
+    const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1);
+    const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3);
+    const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2));
+    const uint16x8_t result = vrshrq_n_u16(sum, 4);
+
+    // Load the next values before overwriting.
+    src_0 = vld1q_u16(dst_buffer + i + 6);
+    src_1 = vld1q_u16(dst_buffer + i + 7);
+
+    vst1q_u16(dst_buffer + i, result);
+  }
+
+  const int remainder = (size - 1) & 0x7;
+  // Like the 3 tap but if there are two remaining values we have already
+  // calculated them.
+  if (remainder > 2) {
+    const uint16x8_t src_2 = vld1q_u16(dst_buffer + i);
+    const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1);
+    const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2);
+    const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1);
+    const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3);
+    const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2));
+    const uint16x8_t result = vrshrq_n_u16(sum, 4);
+    const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]);
+    const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_2);
+    vst1q_u16(dst_buffer + i, dst_remainder);
+  }
+
+  dst_buffer[1] = special_vals[0];
+  // Avoid overwriting |dst_buffer[0]|.
+  if (size > 2) dst_buffer[size - 2] = special_vals[1];
+  dst_buffer[size - 1] = special_vals[2];
+}
+
+void IntraEdgeUpsampler_NEON(void* buffer, const int size) {
+  assert(size % 4 == 0 && size <= 16);
+  auto* const pixel_buffer = static_cast<uint16_t*>(buffer);
+
+  // Extend first/last samples
+  pixel_buffer[-2] = pixel_buffer[-1];
+  pixel_buffer[size] = pixel_buffer[size - 1];
+
+  const int16x8_t src_lo = vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2));
+  const int16x8_t src_hi =
+      vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2 + 8));
+  const int16x8_t src9_hi = vaddq_s16(src_hi, vshlq_n_s16(src_hi, 3));
+  const int16x8_t src9_lo = vaddq_s16(src_lo, vshlq_n_s16(src_lo, 3));
+
+  int16x8_t sum_lo = vsubq_s16(vextq_s16(src9_lo, src9_hi, 1), src_lo);
+  sum_lo = vaddq_s16(sum_lo, vextq_s16(src9_lo, src9_hi, 2));
+  sum_lo = vsubq_s16(sum_lo, vextq_s16(src_lo, src_hi, 3));
+  sum_lo = vrshrq_n_s16(sum_lo, 4);
+
+  uint16x8x2_t result_lo;
+  result_lo.val[0] =
+      vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_lo, vdupq_n_s16(0))),
+                vdupq_n_u16((1 << kBitdepth10) - 1));
+  result_lo.val[1] = vreinterpretq_u16_s16(vextq_s16(src_lo, src_hi, 2));
+
+  if (size > 8) {
+    const int16x8_t src_hi_extra =
+        vreinterpretq_s16_u16(vld1q_u16(pixel_buffer + 16 - 2));
+    const int16x8_t src9_hi_extra =
+        vaddq_s16(src_hi_extra, vshlq_n_s16(src_hi_extra, 3));
+
+    int16x8_t sum_hi = vsubq_s16(vextq_s16(src9_hi, src9_hi_extra, 1), src_hi);
+    sum_hi = vaddq_s16(sum_hi, vextq_s16(src9_hi, src9_hi_extra, 2));
+    sum_hi = vsubq_s16(sum_hi, vextq_s16(src_hi, src_hi_extra, 3));
+    sum_hi = vrshrq_n_s16(sum_hi, 4);
+
+    uint16x8x2_t result_hi;
+    result_hi.val[0] =
+        vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_hi, vdupq_n_s16(0))),
+                  vdupq_n_u16((1 << kBitdepth10) - 1));
+    result_hi.val[1] =
+        vreinterpretq_u16_s16(vextq_s16(src_hi, src_hi_extra, 2));
+    vst2q_u16(pixel_buffer - 1, result_lo);
+    vst2q_u16(pixel_buffer + 15, result_hi);
+  } else {
+    vst2q_u16(pixel_buffer - 1, result_lo);
+  }
+}
+
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->intra_edge_filter = IntraEdgeFilter_NEON;
+  dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraEdgeInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {
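The intra_edge_neon.cc hunks above replace the old tail handling (write the filtered vector to a temp array, then memcpy |remainder| values) with a masked blend and a full-width store, so nothing past the remainder is modified. A scalar model of what the vbslq blend selects (MaskedTailStore is an editorial name, not part of the patch; the 8bpp code builds the mask with vcleq_u8 against a lane index, the 10bpp code loads it from kRemainderMask[]):

#include <cstdint>

inline void MaskedTailStore(const uint16_t* const filtered,
                            const uint16_t* const original,
                            const int remainder, uint16_t* const output) {
  for (int lane = 0; lane < 8; ++lane) {
    // Lanes below |remainder| take the filtered value; the rest keep the
    // data that was already in the buffer.
    output[lane] = (lane < remainder) ? filtered[lane] : original[lane];
  }
}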
 
diff --git a/libgav1/src/dsp/arm/intra_edge_neon.h b/libgav1/src/dsp/arm/intra_edge_neon.h
index d3bb243..28e3494 100644
--- a/libgav1/src/dsp/arm/intra_edge_neon.h
+++ b/libgav1/src/dsp/arm/intra_edge_neon.h
@@ -34,6 +34,9 @@
 #define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
 
+#define LIBGAV1_Dsp10bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+
 #endif  // LIBGAV1_ENABLE_NEON
 
 #endif  // LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
diff --git a/libgav1/src/dsp/arm/intrapred_cfl_neon.cc b/libgav1/src/dsp/arm/intrapred_cfl_neon.cc
index 45fe33b..8d8748f 100644
--- a/libgav1/src/dsp/arm/intrapred_cfl_neon.cc
+++ b/libgav1/src/dsp/arm/intrapred_cfl_neon.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
 #include "src/utils/cpu.h"
 
 #if LIBGAV1_ENABLE_NEON
@@ -27,45 +27,20 @@
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
 #include "src/utils/common.h"
+#include "src/utils/constants.h"
 
 namespace libgav1 {
 namespace dsp {
-namespace low_bitdepth {
-namespace {
-
-uint8x16_t Set2ValuesQ(const uint8_t* a) {
-  uint16_t combined_values = a[0] | a[1] << 8;
-  return vreinterpretq_u8_u16(vdupq_n_u16(combined_values));
-}
-
-uint32_t SumVector(uint32x2_t a) {
-#if defined(__aarch64__)
-  return vaddv_u32(a);
-#else
-  const uint64x1_t b = vpaddl_u32(a);
-  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
-#endif  // defined(__aarch64__)
-}
-
-uint32_t SumVector(uint32x4_t a) {
-#if defined(__aarch64__)
-  return vaddvq_u32(a);
-#else
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
-  return vget_lane_u32(vreinterpret_u32_u64(c), 0);
-#endif  // defined(__aarch64__)
-}
 
 // Divide by the number of elements.
-uint32_t Average(const uint32_t sum, const int width, const int height) {
+inline uint32_t Average(const uint32_t sum, const int width, const int height) {
   return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height));
 }
 
 // Subtract |val| from every element in |a|.
-void BlockSubtract(const uint32_t val,
-                   int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
-                   const int width, const int height) {
+inline void BlockSubtract(const uint32_t val,
+                          int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
+                          const int width, const int height) {
   assert(val <= INT16_MAX);
   const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val));
 
@@ -94,6 +69,9 @@
   }
 }
 
+namespace low_bitdepth {
+namespace {
+
 template <int block_width, int block_height>
 void CflSubsampler420_NEON(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
@@ -122,26 +100,27 @@
 
     sum = SumVector(running_sum);
   } else if (block_width == 8) {
-    const uint8x16_t x_index = {0, 0, 2,  2,  4,  4,  6,  6,
-                                8, 8, 10, 10, 12, 12, 14, 14};
-    const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
-    const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+    const uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14};
+    const uint16x8_t x_max_index =
+        vdupq_n_u16(max_luma_width == 8 ? max_luma_width - 2 : 16);
+    const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index);
 
     uint32x4_t running_sum = vdupq_n_u32(0);
 
     for (int y = 0; y < block_height; ++y) {
-      const uint8x16_t x_max0 = Set2ValuesQ(src + max_luma_width - 2);
-      const uint8x16_t x_max1 = Set2ValuesQ(src + max_luma_width - 2 + stride);
+      const uint8x16_t row0 = vld1q_u8(src);
+      const uint8x16_t row1 = vld1q_u8(src + stride);
+      const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+      const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1);
 
-      uint8x16_t row0 = vld1q_u8(src);
-      row0 = vbslq_u8(x_mask, row0, x_max0);
-      uint8x16_t row1 = vld1q_u8(src + stride);
-      row1 = vbslq_u8(x_mask, row1, x_max1);
+      // Dup the 2x2 sum at the max luma offset.
+      const uint16x8_t max_luma_sum =
+          vdupq_lane_u16(vget_low_u16(sum_row_shifted), 3);
+      const uint16x8_t final_sum_row =
+          vbslq_u16(x_mask, sum_row_shifted, max_luma_sum);
+      vst1q_s16(luma[y], vreinterpretq_s16_u16(final_sum_row));
 
-      uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
-      sum_row = vshlq_n_u16(sum_row, 1);
-      running_sum = vpadalq_u16(running_sum, sum_row);
-      vst1q_s16(luma[y], vreinterpretq_s16_u16(sum_row));
+      running_sum = vpadalq_u16(running_sum, final_sum_row);
 
       if (y << 1 < max_luma_height - 2) {
         src += stride << 1;
@@ -150,45 +129,35 @@
 
     sum = SumVector(running_sum);
   } else /* block_width >= 16 */ {
-    const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
+    const uint16x8_t x_max_index = vdupq_n_u16(max_luma_width - 2);
     uint32x4_t running_sum = vdupq_n_u32(0);
 
     for (int y = 0; y < block_height; ++y) {
-      uint8x16_t x_index = {0,  2,  4,  6,  8,  10, 12, 14,
-                            16, 18, 20, 22, 24, 26, 28, 30};
-      const uint8x16_t x_max00 = vdupq_n_u8(src[max_luma_width - 2]);
-      const uint8x16_t x_max01 = vdupq_n_u8(src[max_luma_width - 2 + 1]);
-      const uint8x16_t x_max10 = vdupq_n_u8(src[stride + max_luma_width - 2]);
-      const uint8x16_t x_max11 =
-          vdupq_n_u8(src[stride + max_luma_width - 2 + 1]);
-      for (int x = 0; x < block_width; x += 16) {
-        const ptrdiff_t src_x_offset = x << 1;
-        const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
-        const uint8x16x2_t row0 = vld2q_u8(src + src_x_offset);
-        const uint8x16x2_t row1 = vld2q_u8(src + src_x_offset + stride);
-        const uint8x16_t row_masked_00 = vbslq_u8(x_mask, row0.val[0], x_max00);
-        const uint8x16_t row_masked_01 = vbslq_u8(x_mask, row0.val[1], x_max01);
-        const uint8x16_t row_masked_10 = vbslq_u8(x_mask, row1.val[0], x_max10);
-        const uint8x16_t row_masked_11 = vbslq_u8(x_mask, row1.val[1], x_max11);
+      // Calculate the 2x2 sum at the max_luma offset.
+      const uint8_t a00 = src[max_luma_width - 2];
+      const uint8_t a01 = src[max_luma_width - 1];
+      const uint8_t a10 = src[max_luma_width - 2 + stride];
+      const uint8_t a11 = src[max_luma_width - 1 + stride];
+      // Dup the 2x2 sum at the max luma offset.
+      const uint16x8_t max_luma_sum =
+          vdupq_n_u16(static_cast<uint16_t>((a00 + a01 + a10 + a11) << 1));
+      uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14};
 
-        uint16x8_t sum_row_lo =
-            vaddl_u8(vget_low_u8(row_masked_00), vget_low_u8(row_masked_01));
-        sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_10));
-        sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_11));
-        sum_row_lo = vshlq_n_u16(sum_row_lo, 1);
-        running_sum = vpadalq_u16(running_sum, sum_row_lo);
-        vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(sum_row_lo));
+      ptrdiff_t src_x_offset = 0;
+      for (int x = 0; x < block_width; x += 8, src_x_offset += 16) {
+        const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index);
+        const uint8x16_t row0 = vld1q_u8(src + src_x_offset);
+        const uint8x16_t row1 = vld1q_u8(src + src_x_offset + stride);
+        const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+        const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1);
+        const uint16x8_t final_sum_row =
+            vbslq_u16(x_mask, sum_row_shifted, max_luma_sum);
+        vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(final_sum_row));
 
-        uint16x8_t sum_row_hi =
-            vaddl_u8(vget_high_u8(row_masked_00), vget_high_u8(row_masked_01));
-        sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_10));
-        sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_11));
-        sum_row_hi = vshlq_n_u16(sum_row_hi, 1);
-        running_sum = vpadalq_u16(running_sum, sum_row_hi);
-        vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(sum_row_hi));
-
-        x_index = vaddq_u8(x_index, vdupq_n_u8(32));
+        running_sum = vpadalq_u16(running_sum, final_sum_row);
+        x_index = vaddq_u16(x_index, vdupq_n_u16(16));
       }
+
       if (y << 1 < max_luma_height - 2) {
         src += stride << 1;
       }
@@ -209,17 +178,30 @@
   uint32_t sum;
   if (block_width == 4) {
     assert(max_luma_width >= 4);
+    assert(max_luma_height <= block_height);
+    assert((max_luma_height % 2) == 0);
     uint32x4_t running_sum = vdupq_n_u32(0);
     uint8x8_t row = vdup_n_u8(0);
 
-    for (int y = 0; y < block_height; y += 2) {
+    uint16x8_t row_shifted;
+    int y = 0;
+    do {
       row = Load4<0>(src, row);
       row = Load4<1>(src + stride, row);
       if (y < (max_luma_height - 1)) {
         src += stride << 1;
       }
 
-      const uint16x8_t row_shifted = vshll_n_u8(row, 3);
+      row_shifted = vshll_n_u8(row, 3);
+      running_sum = vpadalq_u16(running_sum, row_shifted);
+      vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
+      vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
+      y += 2;
+    } while (y < max_luma_height);
+
+    row_shifted =
+        vcombine_u16(vget_high_u16(row_shifted), vget_high_u16(row_shifted));
+    for (; y < block_height; y += 2) {
       running_sum = vpadalq_u16(running_sum, row_shifted);
       vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
       vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
@@ -463,12 +445,874 @@
 }  // namespace
 }  // namespace low_bitdepth
 
-void IntraPredCflInit_NEON() { low_bitdepth::Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflSubsampler
+#ifndef __aarch64__
+uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
+  return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
+                      vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
+}
+#endif
+
+// This duplicates the last two 16-bit values in |row|.
+inline uint16x8_t LastRowSamples(const uint16x8_t row) {
+  const uint32x2_t a = vget_high_u32(vreinterpretq_u32_u16(row));
+  const uint32x4_t b = vdupq_lane_u32(a, 1);
+  return vreinterpretq_u16_u32(b);
+}
+
+// This duplicates the last unsigned 16-bit value in |row|.
+inline uint16x8_t LastRowResult(const uint16x8_t row) {
+  const uint16x4_t a = vget_high_u16(row);
+  const uint16x8_t b = vdupq_lane_u16(a, 0x3);
+  return b;
+}
+
+// This duplicates the last signed 16-bit value in |row|.
+inline int16x8_t LastRowResult(const int16x8_t row) {
+  const int16x4_t a = vget_high_s16(row);
+  const int16x8_t b = vdupq_lane_s16(a, 0x3);
+  return b;
+}
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
+inline uint16x8_t StoreLumaResults4_420(const uint16x8_t vertical_sum0,
+                                        const uint16x8_t vertical_sum1,
+                                        int16_t* luma_ptr) {
+  const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1);
+  const uint16x8_t result_shifted = vshlq_n_u16(result, 1);
+  vst1_s16(luma_ptr, vreinterpret_s16_u16(vget_low_u16(result_shifted)));
+  vst1_s16(luma_ptr + kCflLumaBufferStride,
+           vreinterpret_s16_u16(vget_high_u16(result_shifted)));
+  return result_shifted;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline uint16x8_t StoreLumaResults8_420(const uint16x8_t vertical_sum0,
+                                        const uint16x8_t vertical_sum1,
+                                        int16_t* luma_ptr) {
+  const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1);
+  const uint16x8_t result_shifted = vshlq_n_u16(result, 1);
+  vst1q_s16(luma_ptr, vreinterpretq_s16_u16(result_shifted));
+  return result_shifted;
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+  static_assert(block_height_log2 <= 4, "");
+  const int block_height = 1 << block_height_log2;
+  const int visible_height = max_luma_height;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  uint16x4_t sum = vdup_n_u16(0);
+  uint16x4_t samples[2];
+  int y = visible_height;
+
+  do {
+    samples[0] = vld1_u16(src);
+    samples[1] = vld1_u16(src + src_stride);
+    src += src_stride << 1;
+    sum = vadd_u16(sum, samples[0]);
+    sum = vadd_u16(sum, samples[1]);
+    y -= 2;
+  } while (y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    samples[1] = vshl_n_u16(samples[1], 1);
+    do {
+      sum = vadd_u16(sum, samples[1]);
+      y += 2;
+    } while (y < block_height);
+  }
+
+  // The left shift by 3 (to increase precision) cancels (log2 of width 4) + 1
+  // bits of the right shift, leaving a shift of block_height_log2 - 1.
+  const uint32_t average_sum =
+      RightShiftWithRounding(SumVector(vpaddl_u16(sum)), block_height_log2 - 1);
+  const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum));
+
+  const auto* ssrc = static_cast<const int16_t*>(source);
+  int16x4_t ssample;
+  luma_ptr = luma[0];
+  y = visible_height;
+  do {
+    ssample = vld1_s16(ssrc);
+    ssample = vshl_n_s16(ssample, 3);
+    vst1_s16(luma_ptr, vsub_s16(ssample, averages));
+    ssrc += src_stride;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    // Replicate last line
+    do {
+      vst1_s16(luma_ptr, vsub_s16(ssample, averages));
+      luma_ptr += kCflLumaBufferStride;
+    } while (++y < block_height);
+  }
+}
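The averaging-shift comments above and below combine two factors: the mean over a 2^w x 2^h block needs a right shift of w + h, and the luma buffer holds values scaled by 8 (<< 3), so the stored average only needs w + h - 3. A compile-time check of the three cases in this file (editorial sketch; ScaledMeanShift is not part of the patch):

constexpr int ScaledMeanShift(const int width_log2, const int height_log2) {
  return width_log2 + height_log2 - 3;
}
// 4xH: (2 + block_height_log2) - 3 == block_height_log2 - 1.
static_assert(ScaledMeanShift(2, 4) == 4 - 1, "4x16");
// 8xH: (3 + block_height_log2) - 3 == block_height_log2.
static_assert(ScaledMeanShift(3, 5) == 5, "8x32");
// WxH: block_width_log2 + block_height_log2 - 3, as written in the code.
static_assert(ScaledMeanShift(5, 5) == 7, "32x32");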
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  static_cast<void>(max_luma_width);
+  static_cast<void>(max_luma_height);
+  static_assert(block_height_log2 <= 4, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  const int block_height = 1 << block_height_log2;
+
+  if (block_height <= max_luma_height) {
+    CflSubsampler444_4xH_NEON<block_height_log2, true>(luma, max_luma_height,
+                                                       source, stride);
+  } else {
+    CflSubsampler444_4xH_NEON<block_height_log2, false>(luma, max_luma_height,
+                                                        source, stride);
+  }
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_8xH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const int visible_height = max_luma_height;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  uint32x4_t sum = vdupq_n_u32(0);
+  uint16x8_t samples;
+  int y = visible_height;
+
+  do {
+    samples = vld1q_u16(src);
+    src += src_stride;
+    sum = vpadalq_u16(sum, samples);
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    do {
+      sum = vpadalq_u16(sum, samples);
+    } while (++y < block_height);
+  }
+
+  // The left shift by 3 (to increase precision) cancels the 3 bits from the
+  // log2 of width 8, leaving a right shift of block_height_log2.
+  const uint32_t average_sum =
+      RightShiftWithRounding(SumVector(sum), block_height_log2);
+  const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+  const auto* ssrc = static_cast<const int16_t*>(source);
+  int16x8_t ssample;
+  luma_ptr = luma[0];
+  y = visible_height;
+  do {
+    ssample = vld1q_s16(ssrc);
+    ssample = vshlq_n_s16(ssample, 3);
+    vst1q_s16(luma_ptr, vsubq_s16(ssample, averages));
+    ssrc += src_stride;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    // Replicate last line
+    do {
+      vst1q_s16(luma_ptr, vsubq_s16(ssample, averages));
+      luma_ptr += kCflLumaBufferStride;
+    } while (++y < block_height);
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  static_cast<void>(max_luma_width);
+  static_cast<void>(max_luma_height);
+  static_assert(block_height_log2 <= 5, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  const int block_height = 1 << block_height_log2;
+  const int block_width = 8;
+
+  const int horz_inside = block_width <= max_luma_width;
+  const int vert_inside = block_height <= max_luma_height;
+  if (horz_inside && vert_inside) {
+    CflSubsampler444_8xH_NEON<block_height_log2, true>(luma, max_luma_height,
+                                                       source, stride);
+  } else {
+    CflSubsampler444_8xH_NEON<block_height_log2, false>(luma, max_luma_height,
+                                                        source, stride);
+  }
+}
+
+template <int block_width_log2, int block_height_log2, bool is_inside>
+void CflSubsampler444_WxH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const int visible_height = max_luma_height;
+  const int block_width = 1 << block_width_log2;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  uint32x4_t sum = vdupq_n_u32(0);
+  uint16x8_t samples[4];
+  int y = visible_height;
+
+  do {
+    samples[0] = vld1q_u16(src);
+    samples[1] =
+        (max_luma_width >= 16) ? vld1q_u16(src + 8) : LastRowResult(samples[0]);
+    uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]);
+    if (block_width == 32) {
+      samples[2] = (max_luma_width >= 24) ? vld1q_u16(src + 16)
+                                          : LastRowResult(samples[1]);
+      samples[3] = (max_luma_width == 32) ? vld1q_u16(src + 24)
+                                          : LastRowResult(samples[2]);
+      inner_sum = vaddq_u16(samples[2], inner_sum);
+      inner_sum = vaddq_u16(samples[3], inner_sum);
+    }
+    sum = vpadalq_u16(sum, inner_sum);
+    src += src_stride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]);
+    if (block_width == 32) {
+      inner_sum = vaddq_u16(samples[2], inner_sum);
+      inner_sum = vaddq_u16(samples[3], inner_sum);
+    }
+    do {
+      sum = vpadalq_u16(sum, inner_sum);
+    } while (++y < block_height);
+  }
+
+  // Here the left shift by 3 (to increase precision) is subtracted in right
+  // shift factor (block_width_log2 + block_height_log2 - 3).
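+  // i.e. (sum << 3) >> (block_width_log2 + block_height_log2)
+  //    == sum >> (block_width_log2 + block_height_log2 - 3).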
+  const uint32_t average_sum = RightShiftWithRounding(
+      SumVector(sum), block_width_log2 + block_height_log2 - 3);
+  const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+  const auto* ssrc = static_cast<const int16_t*>(source);
+  int16x8_t ssamples_ext = vdupq_n_s16(0);
+  int16x8_t ssamples[4];
+  luma_ptr = luma[0];
+  y = visible_height;
+  do {
+    int idx = 0;
+    for (int x = 0; x < block_width; x += 8) {
+      if (max_luma_width > x) {
+        ssamples[idx] = vld1q_s16(&ssrc[x]);
+        ssamples[idx] = vshlq_n_s16(ssamples[idx], 3);
+        ssamples_ext = ssamples[idx];
+      } else {
+        ssamples[idx] = LastRowResult(ssamples_ext);
+      }
+      vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages));
+    }
+    ssrc += src_stride;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    // Replicate last line
+    do {
+      int idx = 0;
+      for (int x = 0; x < block_width; x += 8) {
+        vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages));
+      }
+      luma_ptr += kCflLumaBufferStride;
+    } while (++y < block_height);
+  }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_WxH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  static_assert(block_width_log2 == 4 || block_width_log2 == 5,
+                "This function will only work for block_width 16 and 32.");
+  static_assert(block_height_log2 <= 5, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+
+  const int block_height = 1 << block_height_log2;
+  const int vert_inside = block_height <= max_luma_height;
+  if (vert_inside) {
+    CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, true>(
+        luma, max_luma_width, max_luma_height, source, stride);
+  } else {
+    CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, false>(
+        luma, max_luma_width, max_luma_height, source, stride);
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int /*max_luma_width*/, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int y = luma_height;
+
+  uint32x4_t final_sum = vdupq_n_u32(0);
+  do {
+    const uint16x8_t samples_row0 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t samples_row1 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t luma_sum01 = vaddq_u16(samples_row0, samples_row1);
+
+    const uint16x8_t samples_row2 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t samples_row3 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t luma_sum23 = vaddq_u16(samples_row2, samples_row3);
+    uint16x8_t sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+    luma_ptr += kCflLumaBufferStride << 1;
+
+    const uint16x8_t samples_row4 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t samples_row5 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t luma_sum45 = vaddq_u16(samples_row4, samples_row5);
+
+    const uint16x8_t samples_row6 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t samples_row7 = vld1q_u16(src);
+    src += src_stride;
+    const uint16x8_t luma_sum67 = vaddq_u16(samples_row6, samples_row7);
+    sum =
+        vaddq_u16(sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+    luma_ptr += kCflLumaBufferStride << 1;
+
+    final_sum = vpadalq_u16(final_sum, sum);
+    y -= 4;
+  } while (y != 0);
+
+  const uint16x4_t final_fill =
+      vreinterpret_u16_s16(vld1_s16(luma_ptr - kCflLumaBufferStride));
+  const uint32x4_t final_fill_to_sum = vmovl_u16(final_fill);
+  for (y = luma_height; y < block_height; ++y) {
+    vst1_s16(luma_ptr, vreinterpret_s16_u16(final_fill));
+    luma_ptr += kCflLumaBufferStride;
+    final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+  }
+  const uint32_t average_sum = RightShiftWithRounding(
+      SumVector(final_sum), block_height_log2 + 2 /*log2 of width 4*/);
+  const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum));
+  luma_ptr = luma[0];
+  y = block_height;
+  do {
+    const int16x4_t samples = vld1_s16(luma_ptr);
+    vst1_s16(luma_ptr, vsub_s16(samples, averages));
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int y = luma_height;
+
+  uint32x4_t final_sum = vdupq_n_u32(0);
+  do {
+    const uint16x8_t samples_row00 = vld1q_u16(src);
+    const uint16x8_t samples_row01 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row00);
+    src += src_stride;
+    const uint16x8_t samples_row10 = vld1q_u16(src);
+    const uint16x8_t samples_row11 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row10);
+    src += src_stride;
+    const uint16x8_t luma_sum00 = vaddq_u16(samples_row00, samples_row10);
+    const uint16x8_t luma_sum01 = vaddq_u16(samples_row01, samples_row11);
+    uint16x8_t sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+    luma_ptr += kCflLumaBufferStride;
+
+    const uint16x8_t samples_row20 = vld1q_u16(src);
+    const uint16x8_t samples_row21 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row20);
+    src += src_stride;
+    const uint16x8_t samples_row30 = vld1q_u16(src);
+    const uint16x8_t samples_row31 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row30);
+    src += src_stride;
+    const uint16x8_t luma_sum10 = vaddq_u16(samples_row20, samples_row30);
+    const uint16x8_t luma_sum11 = vaddq_u16(samples_row21, samples_row31);
+    sum =
+        vaddq_u16(sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    const uint16x8_t samples_row40 = vld1q_u16(src);
+    const uint16x8_t samples_row41 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row40);
+    src += src_stride;
+    const uint16x8_t samples_row50 = vld1q_u16(src);
+    const uint16x8_t samples_row51 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row50);
+    src += src_stride;
+    const uint16x8_t luma_sum20 = vaddq_u16(samples_row40, samples_row50);
+    const uint16x8_t luma_sum21 = vaddq_u16(samples_row41, samples_row51);
+    sum =
+        vaddq_u16(sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    const uint16x8_t samples_row60 = vld1q_u16(src);
+    const uint16x8_t samples_row61 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row60);
+    src += src_stride;
+    const uint16x8_t samples_row70 = vld1q_u16(src);
+    const uint16x8_t samples_row71 = (max_luma_width == 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row70);
+    src += src_stride;
+    const uint16x8_t luma_sum30 = vaddq_u16(samples_row60, samples_row70);
+    const uint16x8_t luma_sum31 = vaddq_u16(samples_row61, samples_row71);
+    sum =
+        vaddq_u16(sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    final_sum = vpadalq_u16(final_sum, sum);
+    y -= 4;
+  } while (y != 0);
+
+  // Duplicate the final row downward to fill the rows beyond the visible luma.
+  const uint16x8_t final_fill =
+      vreinterpretq_u16_s16(vld1q_s16(luma_ptr - kCflLumaBufferStride));
+  const uint32x4_t final_fill_to_sum =
+      vaddl_u16(vget_low_u16(final_fill), vget_high_u16(final_fill));
+
+  for (y = luma_height; y < block_height; ++y) {
+    vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill));
+    luma_ptr += kCflLumaBufferStride;
+    final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+  }
+
+  const uint32_t average_sum = RightShiftWithRounding(
+      SumVector(final_sum), block_height_log2 + 3 /*log2 of width 8*/);
+  const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+  luma_ptr = luma[0];
+  y = block_height;
+  do {
+    const int16x8_t samples = vld1q_s16(luma_ptr);
+    vst1q_s16(luma_ptr, vsubq_s16(samples, averages));
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  if (max_luma_width == 8) {
+    CflSubsampler420Impl_8xH_NEON<block_height_log2, 8>(luma, max_luma_height,
+                                                        source, stride);
+  } else {
+    CflSubsampler420Impl_8xH_NEON<block_height_log2, 16>(luma, max_luma_height,
+                                                         source, stride);
+  }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  const int block_height = 1 << block_height_log2;
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int16_t* luma_ptr = luma[0];
+  // Begin first y section, covering width up to 32.
+  int y = luma_height;
+
+  uint16x8_t final_fill0, final_fill1;
+  uint32x4_t final_sum = vdupq_n_u32(0);
+  do {
+    const uint16_t* src_next = src + src_stride;
+    const uint16x8_t samples_row00 = vld1q_u16(src);
+    const uint16x8_t samples_row01 = (max_luma_width >= 16)
+                                         ? vld1q_u16(src + 8)
+                                         : LastRowSamples(samples_row00);
+    const uint16x8_t samples_row02 = (max_luma_width >= 24)
+                                         ? vld1q_u16(src + 16)
+                                         : LastRowSamples(samples_row01);
+    const uint16x8_t samples_row03 = (max_luma_width == 32)
+                                         ? vld1q_u16(src + 24)
+                                         : LastRowSamples(samples_row02);
+    const uint16x8_t samples_row10 = vld1q_u16(src_next);
+    const uint16x8_t samples_row11 = (max_luma_width >= 16)
+                                         ? vld1q_u16(src_next + 8)
+                                         : LastRowSamples(samples_row10);
+    const uint16x8_t samples_row12 = (max_luma_width >= 24)
+                                         ? vld1q_u16(src_next + 16)
+                                         : LastRowSamples(samples_row11);
+    const uint16x8_t samples_row13 = (max_luma_width == 32)
+                                         ? vld1q_u16(src_next + 24)
+                                         : LastRowSamples(samples_row12);
+    const uint16x8_t luma_sum0 = vaddq_u16(samples_row00, samples_row10);
+    const uint16x8_t luma_sum1 = vaddq_u16(samples_row01, samples_row11);
+    const uint16x8_t luma_sum2 = vaddq_u16(samples_row02, samples_row12);
+    const uint16x8_t luma_sum3 = vaddq_u16(samples_row03, samples_row13);
+    final_fill0 = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+    final_fill1 = StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+    const uint16x8_t sum = vaddq_u16(final_fill0, final_fill1);
+
+    final_sum = vpadalq_u16(final_sum, sum);
+
+    // Because max_luma_width is at most 32, any values beyond x=16 will
+    // necessarily be duplicated.
+    if (block_width_log2 == 5) {
+      const uint16x8_t wide_fill = LastRowResult(final_fill1);
+      final_sum = vpadalq_u16(final_sum, vshlq_n_u16(wide_fill, 1));
+    }
+    src += src_stride << 1;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  // Begin second y section.
+  y = luma_height;
+  if (y < block_height) {
+    uint32x4_t wide_fill;
+    if (block_width_log2 == 5) {
+      // There are 16 16-bit fill values per row, folded into 4 widened
+      // accumulator lanes, so each lane carries a << 2 = (a + a) << 1.
+      wide_fill = vshll_n_u16(vget_low_u16(LastRowResult(final_fill1)), 2);
+    }
+    const uint16x8_t final_inner_sum = vaddq_u16(final_fill0, final_fill1);
+    const uint32x4_t final_fill_to_sum = vaddl_u16(
+        vget_low_u16(final_inner_sum), vget_high_u16(final_inner_sum));
+
+    do {
+      vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill0));
+      vst1q_s16(luma_ptr + 8, vreinterpretq_s16_u16(final_fill1));
+      if (block_width_log2 == 5) {
+        final_sum = vaddq_u32(final_sum, wide_fill);
+      }
+      luma_ptr += kCflLumaBufferStride;
+      final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+    } while (++y < block_height);
+  }  // End second y section.
+
+  const uint32_t average_sum = RightShiftWithRounding(
+      SumVector(final_sum), block_width_log2 + block_height_log2);
+  const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+  luma_ptr = luma[0];
+  y = block_height;
+  do {
+    const int16x8_t samples0 = vld1q_s16(luma_ptr);
+    vst1q_s16(luma_ptr, vsubq_s16(samples0, averages));
+    const int16x8_t samples1 = vld1q_s16(luma_ptr + 8);
+    const int16x8_t final_row_result = vsubq_s16(samples1, averages);
+    vst1q_s16(luma_ptr + 8, final_row_result);
+
+    if (block_width_log2 == 5) {
+      const int16x8_t wide_fill = LastRowResult(final_row_result);
+      vst1q_s16(luma_ptr + 16, wide_fill);
+      vst1q_s16(luma_ptr + 24, wide_fill);
+    }
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+}
+
+//------------------------------------------------------------------------------
+// Choose subsampler based on max_luma_width
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_NEON(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  switch (max_luma_width) {
+    case 8:
+      CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 8>(
+          luma, max_luma_height, source, stride);
+      return;
+    case 16:
+      CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 16>(
+          luma, max_luma_height, source, stride);
+      return;
+    case 24:
+      CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 24>(
+          luma, max_luma_height, source, stride);
+      return;
+    default:
+      assert(max_luma_width == 32);
+      CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 32>(
+          luma, max_luma_height, source, stride);
+      return;
+  }
+}
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
+// Clip |dc + ((alpha * luma) >> 6)| to the range [0, (1 << bitdepth) - 1].
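+// For example, with bitdepth 10, alpha = 4, luma = -320, and dc = 512, the
+// result is 512 + ((4 * -320) >> 6) = 492, which is within [0, 1023].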
+inline uint16x8_t Combine8(const int16x8_t luma, const int16x8_t alpha_abs,
+                           const int16x8_t alpha_signed, const int16x8_t dc,
+                           const uint16x8_t max_value) {
+  const int16x8_t luma_abs = vabsq_s16(luma);
+  const int16x8_t luma_alpha_sign =
+      vshrq_n_s16(veorq_s16(luma, alpha_signed), 15);
+  // (alpha * luma) >> 6
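+  // vqrdmulh computes (2 * a * b + (1 << 15)) >> 16; because |alpha_abs|
+  // carries a << 9 scale, the net result is a rounded (|alpha| * |luma|) >> 6,
+  // which fits in int16 for 10-bit input (at most (8184 * 16) >> 6 = 2046).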
+  const int16x8_t la_abs = vqrdmulhq_s16(luma_abs, alpha_abs);
+  // Convert back to signed values.
+  const int16x8_t la =
+      vsubq_s16(veorq_s16(la_abs, luma_alpha_sign), luma_alpha_sign);
+  const int16x8_t result = vaddq_s16(la, dc);
+  const int16x8_t zero = vdupq_n_s16(0);
+  // Clip.
+  return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(result, zero)), max_value);
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor4xN_NEON(
+    void* const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = stride >> 1;
+  const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+  const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+  const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; y += 2) {
+    const int16x4_t luma_row0 = vld1_s16(luma[y]);
+    const int16x4_t luma_row1 = vld1_s16(luma[y + 1]);
+    const int16x8_t combined_luma = vcombine_s16(luma_row0, luma_row1);
+    const uint16x8_t sum =
+        Combine8(combined_luma, alpha_abs, alpha_signed, dc, max_value);
+    vst1_u16(dst, vget_low_u16(sum));
+    dst += dst_stride;
+    vst1_u16(dst, vget_high_u16(sum));
+    dst += dst_stride;
+  }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor8xN_NEON(
+    void* const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = stride >> 1;
+  const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+  const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+  const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; ++y) {
+    const int16x8_t luma_row = vld1q_s16(luma[y]);
+    const uint16x8_t sum =
+        Combine8(luma_row, alpha_abs, alpha_signed, dc, max_value);
+    vst1q_u16(dst, sum);
+    dst += dst_stride;
+  }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor16xN_NEON(
+    void* const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = stride >> 1;
+  const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+  const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+  const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; ++y) {
+    const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+    const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+    const uint16x8_t sum_0 =
+        Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value);
+    const uint16x8_t sum_1 =
+        Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value);
+    vst1q_u16(dst, sum_0);
+    vst1q_u16(dst + 8, sum_1);
+    dst += dst_stride;
+  }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor32xN_NEON(
+    void* const dest, const ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = stride >> 1;
+  const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+  const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+  const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+  const int16x8_t dc = vdupq_n_s16(dst[0]);
+  for (int y = 0; y < block_height; ++y) {
+    const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+    const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+    const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16);
+    const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24);
+    const uint16x8_t sum_0 =
+        Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value);
+    const uint16x8_t sum_1 =
+        Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value);
+    const uint16x8_t sum_2 =
+        Combine8(luma_row_2, alpha_abs, alpha_signed, dc, max_value);
+    const uint16x8_t sum_3 =
+        Combine8(luma_row_3, alpha_abs, alpha_signed, dc, max_value);
+    vst1q_u16(dst, sum_0);
+    vst1q_u16(dst + 8, sum_1);
+    vst1q_u16(dst + 16, sum_2);
+    vst1q_u16(dst + 24, sum_3);
+    dst += dst_stride;
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler420_4xH_NEON<2>;
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler420_4xH_NEON<3>;
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler420_4xH_NEON<4>;
+
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler420_8xH_NEON<2>;
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler420_8xH_NEON<3>;
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler420_8xH_NEON<4>;
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler420_8xH_NEON<5>;
+
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler420_WxH_NEON<4, 2>;
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler420_WxH_NEON<4, 3>;
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler420_WxH_NEON<4, 4>;
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler420_WxH_NEON<4, 5>;
+
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler420_WxH_NEON<5, 3>;
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler420_WxH_NEON<5, 4>;
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler420_WxH_NEON<5, 5>;
+
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler444_4xH_NEON<2>;
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler444_4xH_NEON<3>;
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler444_4xH_NEON<4>;
+
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler444_8xH_NEON<2>;
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler444_8xH_NEON<3>;
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler444_8xH_NEON<4>;
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler444_8xH_NEON<5>;
+
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler444_WxH_NEON<4, 2>;
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler444_WxH_NEON<4, 3>;
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler444_WxH_NEON<4, 4>;
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler444_WxH_NEON<4, 5>;
+
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler444_WxH_NEON<5, 3>;
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler444_WxH_NEON<5, 4>;
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler444_WxH_NEON<5, 5>;
+
+  dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
+  dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>;
+
+  dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
+  dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>;
+  dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>;
+
+  dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>;
+  dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor16xN_NEON<16>;
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor16xN_NEON<32>;
+  dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>;
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor32xN_NEON<16>;
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor32xN_NEON<32>;
+  // Max Cfl predictor size is 32x32.
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredCflInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {
 
diff --git a/libgav1/src/dsp/arm/intrapred_cfl_neon.h b/libgav1/src/dsp/arm/intrapred_cfl_neon.h
new file mode 100644
index 0000000..b4f983a
--- /dev/null
+++ b/libgav1/src/dsp/arm/intrapred_cfl_neon.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the
+// defines below for specifics. These functions are not thread-safe.
+void IntraPredCflInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+// 4x4
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// -----------------------------------------------------------------------------
+// 10bpp
+
+// 4x4
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
diff --git a/libgav1/src/dsp/arm/intrapred_directional_neon.cc b/libgav1/src/dsp/arm/intrapred_directional_neon.cc
index 805ba81..3f5edbd 100644
--- a/libgav1/src/dsp/arm/intrapred_directional_neon.cc
+++ b/libgav1/src/dsp/arm/intrapred_directional_neon.cc
@@ -12,18 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_directional.h"
 #include "src/utils/cpu.h"
 
 #if LIBGAV1_ENABLE_NEON
 
 #include <arm_neon.h>
 
-#include <algorithm>  // std::min
+#include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
-#include <cstring>  // memset
+#include <cstring>
 
 #include "src/dsp/arm/common_neon.h"
 #include "src/dsp/constants.h"
@@ -35,14 +35,14 @@
 namespace low_bitdepth {
 namespace {
 
-// Blend two values based on a 32 bit weight.
+// Blend two values based on weights that sum to 32.
 inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
                                const uint8x8_t a_weight,
                                const uint8x8_t b_weight) {
   const uint16x8_t a_product = vmull_u8(a, a_weight);
   const uint16x8_t b_product = vmull_u8(b, b_weight);
 
-  return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5);
+  return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5 /*log2(32)*/);
 }
 
 // For vertical operations the weights are one constant value.
@@ -112,7 +112,7 @@
     // 4 wide subsamples the output. 8 wide subsamples the input.
     if (width == 4) {
       const uint8x8_t left_values = vld1_u8(top + top_base_x);
-      const uint8x8_t right_values = RightShift<8>(left_values);
+      const uint8x8_t right_values = RightShiftVector<8>(left_values);
       const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
 
       // If |upsampled| is true then extract every other value for output.
@@ -910,12 +910,590 @@
 }  // namespace
 }  // namespace low_bitdepth
 
-void IntraPredDirectionalInit_NEON() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Blend two values based on weights that sum to 32.
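+// Computes (a * a_weight + b * b_weight + 16) >> 5 per lane.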
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
+                                const int a_weight, const int b_weight) {
+  const uint16x4_t a_product = vmul_n_u16(a, a_weight);
+  const uint16x4_t sum = vmla_n_u16(a_product, b, b_weight);
+
+  return vrshr_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+                                const uint16_t a_weight,
+                                const uint16_t b_weight) {
+  const uint16x8_t a_product = vmulq_n_u16(a, a_weight);
+  const uint16x8_t sum = vmlaq_n_u16(a_product, b, b_weight);
+
+  return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source,
+                         const bool upsampled) {
+  if (upsampled) {
+    *dest = vld2_u16(source);
+  } else {
+    dest->val[0] = vld1_u16(source);
+    dest->val[1] = vld1_u16(source + 1);
+  }
+}
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source,
+                         const bool upsampled) {
+  if (upsampled) {
+    *dest = vld2q_u16(source);
+  } else {
+    dest->val[0] = vld1q_u16(source);
+    dest->val[1] = vld1q_u16(source + 1);
+  }
+}
+
+template <bool upsampled>
+inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride,
+                                 const int height, const uint16_t* const top,
+                                 const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  const int max_base_x = (4 + height - 1) << upsample_shift;
+  const int16x4_t max_base = vdup_n_s16(max_base_x);
+  const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]);
+  const int16x4_t index_offset = {0, 1, 2, 3};
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  int top_x = xstep;
+  int y = 0;
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    const int top_base_x = top_x >> index_scale_bits;
+
+    // To accommodate reuse of this function in Zone2, permit negative values
+    // for |xstep|.
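+    // The low 6 bits of the scaled position form a 0..63 fraction; halving it
+    // gives a 0..31 weight, so |shift_0| + |shift_1| == 32 as WeightedBlend
+    // expects.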
+    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    // Use signed values to compare |top_base_x| to |max_base_x|.
+    const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset);
+    const uint16x4_t max_base_mask = vclt_s16(base_x, max_base);
+
+    uint16x4x2_t sampled_top_row;
+    LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+    const uint16x4_t combined = WeightedBlend(
+        sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+    // Set values beyond |max_base_x| to the final top value.
+    const uint16x4_t masked_result =
+        vbsl_u16(max_base_mask, combined, final_top_val);
+
+    vst1_u16(dst, masked_result);
+  }
+  for (; y < height; ++y) {
+    Memset(dst, top[max_base_x], 4 /* width */);
+    dst += stride;
+  }
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+template <bool upsampled>
+inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride,
+                                 const int width, const int height,
+                                 const uint16_t* const top, const int xstep) {
+  assert(width % 8 == 0);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  const int max_base_index = (width + height - 1) << upsample_shift;
+  const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+  const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+  const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+  const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+  int top_x = xstep;
+  int y = 0;
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+
+    // To accommodate reuse of this function in Zone2, permit negative values
+    // for |xstep|.
+    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    // Use signed values to compare |top_base_x| to |max_base_x|.
+    int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+    int x = 0;
+    do {
+      const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+      uint16x8x2_t sampled_top_row;
+      LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+      const uint16x8_t combined = WeightedBlend(
+          sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+      const uint16x8_t masked_result =
+          vbslq_u16(max_base_mask, combined, final_top_val);
+      vst1q_u16(dst + x, masked_result);
+
+      base_x = vaddq_s16(base_x, block_step);
+      top_base_x += base_step8;
+      x += 8;
+    } while (x < width);
+  }
+  for (int i = y; i < height; ++i) {
+    Memset(dst, top[max_base_index], width);
+    dst += stride;
+  }
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride,
+                                   const int width, const int height,
+                                   const uint16_t* const top, const int xstep,
+                                   const bool upsampled) {
+  assert(width % 8 == 0);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  const int max_base_index = (width + height - 1) << upsample_shift;
+  const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+  const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+  const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+  const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+  // Rows up to this y-value can be computed without checking for bounds.
+  const int max_no_corner_y = std::min(
+      ((max_base_index - (base_step * width)) << index_scale_bits) / xstep,
+      height);
+  // No need to check for exceeding |max_base_x| in the first loop.
+  int y = 0;
+  int top_x = xstep;
+  for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+    // To accommodate reuse of this function in Zone2, permit negative values
+    // for |xstep|.
+    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    int x = 0;
+    do {
+      uint16x8x2_t sampled_top_row;
+      LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+      const uint16x8_t combined = WeightedBlend(
+          sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+      vst1q_u16(dst + x, combined);
+
+      top_base_x += base_step8;
+      x += 8;
+    } while (x < width);
+  }
+
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+
+    // To accommodate reuse of this function in Zone2, permit negative values
+    // for |xstep|.
+    const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const uint16_t shift_1 = 32 - shift_0;
+
+    // Use signed values to compare |top_base_x| to |max_base_x|.
+    int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+    int x = 0;
+    const int min_corner_only_x =
+        std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) &
+        ~7;
+    for (; x < min_corner_only_x; x += 8, top_base_x += base_step8,
+                                  base_x = vaddq_s16(base_x, block_step)) {
+      const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+      uint16x8x2_t sampled_top_row;
+      LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+      const uint16x8_t combined = WeightedBlend(
+          sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+      const uint16x8_t masked_result =
+          vbslq_u16(max_base_mask, combined, final_top_val);
+      vst1q_u16(dst + x, masked_result);
+    }
+    // Corner-only section of the row.
+    Memset(dst + x, top[max_base_index], width - x);
+  }
+  for (; y < height; ++y) {
+    Memset(dst, top[max_base_index], width);
+    dst += stride;
+  }
+}
+
+void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride,
+                                         const void* const top_row,
+                                         const int width, const int height,
+                                         const int xstep,
+                                         const bool upsampled_top) {
+  const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+  uint16_t* dst = static_cast<uint16_t*>(dest);
+  stride /= sizeof(top[0]);
+
+  assert(xstep > 0);
+
+  if (xstep == 64) {
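+    // |xstep| == 64 corresponds to a 45 degree angle: each row is the top row
+    // shifted along by one more pixel, so a plain memcpy per row suffices.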
+    assert(!upsampled_top);
+    const uint16_t* top_ptr = top + 1;
+    const int width_bytes = width * sizeof(top[0]);
+    int y = height;
+    do {
+      memcpy(dst, top_ptr, width_bytes);
+      memcpy(dst + stride, top_ptr + 1, width_bytes);
+      memcpy(dst + 2 * stride, top_ptr + 2, width_bytes);
+      memcpy(dst + 3 * stride, top_ptr + 3, width_bytes);
+      dst += 4 * stride;
+      top_ptr += 4;
+      y -= 4;
+    } while (y != 0);
+  } else {
+    if (width == 4) {
+      if (upsampled_top) {
+        DirectionalZone1_4xH<true>(dst, stride, height, top, xstep);
+      } else {
+        DirectionalZone1_4xH<false>(dst, stride, height, top, xstep);
+      }
+    } else if (width >= 32) {
+      if (upsampled_top) {
+        DirectionalZone1_Large(dst, stride, width, height, top, xstep, true);
+      } else {
+        DirectionalZone1_Large(dst, stride, width, height, top, xstep, false);
+      }
+    } else if (upsampled_top) {
+      DirectionalZone1_WxH<true>(dst, stride, width, height, top, xstep);
+    } else {
+      DirectionalZone1_WxH<false>(dst, stride, width, height, top, xstep);
+    }
+  }
+}
+
+// -----------------------------------------------------------------------------
+// Zone 3
+// This can be considered "the transpose of Zone 1." In Zone 1, the fractional
+// step applies when moving vertically in the destination block, connected to
+// the change in |y|, whereas in this mode, the step applies when moving
+// horizontally, connected to the change in |x|. This makes vectorization very
+// complicated in row-order, because a given vector may need source pixels that
+// span 16 or 32 pixels in steep angles, requiring multiple expensive table
+// lookups and checked loads. Rather than work in row order, it is simpler to
+// compute |dest| in column order, and then store the transposed results.
+
+// Compute 4x4 sub-blocks.
+// Example of computed sub-blocks of a 4x8 block before and after transpose:
+// 00 10 20 30             00 01 02 03
+// 01 11 21 31             10 11 12 13
+// 02 12 22 32             20 21 22 23
+// 03 13 23 33             30 31 32 33
+// -----------     -->     -----------
+// 40 50 60 70             40 41 42 43
+// 41 51 61 71             50 51 52 53
+// 42 52 62 72             60 61 62 63
+// 43 53 63 73             70 71 72 73
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride,
+                                 const uint16_t* const left, const int ystep,
+                                 const int base_left_y = 0) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  // Compute one column at a time, then transpose for storage.
+  uint16x4_t result[4];
+
+  int left_y = base_left_y + ystep;
+  int left_offset = left_y >> index_scale_bits;
+  int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  int shift_1 = 32 - shift_0;
+  uint16x4x2_t sampled_left_col;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  Transpose4x4(result);
+  Store4(dst, result[0]);
+  dst += stride;
+  Store4(dst, result[1]);
+  dst += stride;
+  Store4(dst, result[2]);
+  dst += stride;
+  Store4(dst, result[3]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride,
+                                 const int height, const uint16_t* const left,
+                                 const int ystep) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  int y = 0;
+  do {
+    DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift),
+                                    ystep);
+    dest += 4 * stride;
+    y += 4;
+  } while (y < height);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride,
+                                 const int width, const uint16_t* const left,
+                                 const int ystep) {
+  int x = 0;
+  int base_left_y = 0;
+  do {
+    // TODO(petersonab): Establish 8x4 transpose to reserve this function for
+    // 8x4 and 16x4.
+    DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep,
+                                    base_left_y);
+    base_left_y += 4 * ystep;
+    x += 4;
+  } while (x < width);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride,
+                                 const uint16_t* const left, const int ystep,
+                                 const int base_left_y = 0) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+
+  // Compute one column at a time, then transpose for storage.
+  uint16x8_t result[8];
+
+  int left_y = base_left_y + ystep;
+  uint16x8x2_t sampled_left_col;
+  int left_offset = left_y >> index_scale_bits;
+  int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  int shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[4] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[5] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[6] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  left_y += ystep;
+  left_offset = left_y >> index_scale_bits;
+  shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+  shift_1 = 32 - shift_0;
+  LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+  result[7] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+                            shift_1, shift_0);
+
+  Transpose8x8(result);
+  Store8(dest, result[0]);
+  dest += stride;
+  Store8(dest, result[1]);
+  dest += stride;
+  Store8(dest, result[2]);
+  dest += stride;
+  Store8(dest, result[3]);
+  dest += stride;
+  Store8(dest, result[4]);
+  dest += stride;
+  Store8(dest, result[5]);
+  dest += stride;
+  Store8(dest, result[6]);
+  dest += stride;
+  Store8(dest, result[7]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
+                                 const int width, const int height,
+                                 const uint16_t* const left, const int ystep) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  // Zone3 never runs out of left_column values.
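+  // The deepest read into |left| is bounded by
+  // (ystep * width) >> (6 - upsample_shift) from the column offset plus
+  // (height - 1) base steps from the row offset, which the assert requires to
+  // stay below the (width + height - 1) << upsample_shift available samples.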
+  assert((width + height - 1) << upsample_shift >  // max_base_y
+         ((ystep * width) >> (6 - upsample_shift)) +
+             (/* base_step */ 1 << upsample_shift) *
+                 (height - 1));  // left_base_y
+  int y = 0;
+  do {
+    int x = 0;
+    uint8_t* dst_x = dest + y * stride;
+    do {
+      const int base_left_y = ystep * x;
+      DirectionalZone3_8x8<upsampled>(
+          dst_x, stride, left + (y << upsample_shift), ystep, base_left_y);
+      dst_x += 8 * sizeof(uint16_t);
+      x += 8;
+    } while (x < width);
+    y += 8;
+  } while (y < height);
+}
+
+void DirectionalIntraPredictorZone3_NEON(void* const dest,
+                                         const ptrdiff_t stride,
+                                         const void* const left_column,
+                                         const int width, const int height,
+                                         const int ystep,
+                                         const bool upsampled_left) {
+  const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+  uint8_t* dst = static_cast<uint8_t*>(dest);
+
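+  // ystep == 64 advances exactly one left sample per row (no upsampling), so
+  // row r of the block is simply left[r + 1 .. r + width] and memcpy suffices.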
+  if (ystep == 64) {
+    assert(!upsampled_left);
+    const int width_bytes = width * sizeof(left[0]);
+    int y = height;
+    const uint16_t* left_ptr = left + 1;
+    do {
+      memcpy(dst, left_ptr, width_bytes);
+      memcpy(dst + stride, left_ptr + 1, width_bytes);
+      memcpy(dst + 2 * stride, left_ptr + 2, width_bytes);
+      memcpy(dst + 3 * stride, left_ptr + 3, width_bytes);
+      dst += 4 * stride;
+      left_ptr += 4;
+      y -= 4;
+    } while (y != 0);
+    return;
+  }
+  if (width == 4) {
+    if (upsampled_left) {
+      DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
+    } else {
+      DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
+    }
+  } else if (height == 4) {
+    if (upsampled_left) {
+      DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
+    } else {
+      DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
+    }
+  } else {
+    if (upsampled_left) {
+      // |upsampled_left| can only be true if |width| + |height| <= 16,
+      // therefore this is 8x8.
+      DirectionalZone3_8x8<true>(dst, stride, left, ystep);
+    } else {
+      DirectionalZone3_WxH<false>(dst, stride, width, height, left, ystep);
+    }
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+  dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {
 
diff --git a/libgav1/src/dsp/arm/intrapred_directional_neon.h b/libgav1/src/dsp/arm/intrapred_directional_neon.h
new file mode 100644
index 0000000..f7d6235
--- /dev/null
+++ b/libgav1/src/dsp/arm/intrapred_directional_neon.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*, see the defines below for
+// specifics. These functions are not thread-safe.
+void IntraPredDirectionalInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+#endif
+
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
diff --git a/libgav1/src/dsp/arm/intrapred_filter_intra_neon.cc b/libgav1/src/dsp/arm/intrapred_filter_neon.cc
similarity index 96%
rename from libgav1/src/dsp/arm/intrapred_filter_intra_neon.cc
rename to libgav1/src/dsp/arm/intrapred_filter_neon.cc
index 411708e..bd9f61d 100644
--- a/libgav1/src/dsp/arm/intrapred_filter_intra_neon.cc
+++ b/libgav1/src/dsp/arm/intrapred_filter_neon.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 The libgav1 Authors
+// Copyright 2021 The libgav1 Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_filter.h"
 #include "src/utils/cpu.h"
 
 #if LIBGAV1_ENABLE_NEON
@@ -160,16 +160,16 @@
 }  // namespace
 }  // namespace low_bitdepth
 
-void IntraPredFilterIntraInit_NEON() { low_bitdepth::Init8bpp(); }
+void IntraPredFilterInit_NEON() { low_bitdepth::Init8bpp(); }
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {
 
-void IntraPredFilterIntraInit_NEON() {}
+void IntraPredFilterInit_NEON() {}
 
 }  // namespace dsp
 }  // namespace libgav1
diff --git a/libgav1/src/dsp/arm/intrapred_filter_neon.h b/libgav1/src/dsp/arm/intrapred_filter_neon.h
new file mode 100644
index 0000000..283c1b1
--- /dev/null
+++ b/libgav1/src/dsp/arm/intrapred_filter_neon.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor, see the defines below for specifics.
+// These functions are not thread-safe.
+void IntraPredFilterInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
diff --git a/libgav1/src/dsp/arm/intrapred_neon.cc b/libgav1/src/dsp/arm/intrapred_neon.cc
index c967d82..c143648 100644
--- a/libgav1/src/dsp/arm/intrapred_neon.cc
+++ b/libgav1/src/dsp/arm/intrapred_neon.cc
@@ -26,6 +26,7 @@
 #include "src/dsp/arm/common_neon.h"
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
 
 namespace libgav1 {
 namespace dsp {
@@ -964,6 +965,200 @@
   using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
 };
 
+// IntraPredFuncs_NEON::Horizontal -- duplicate left column across all rows
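+// |dest| and |stride| are given in bytes, so each row pointer is reinterpreted
+// as uint16_t* before storing the replicated left sample.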
+
+template <int block_height>
+void Horizontal4xH_NEON(void* const dest, ptrdiff_t stride,
+                        const void* /*top_row*/,
+                        const void* const left_column) {
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = 0;
+  do {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint16x4_t row = vld1_dup_u16(left + y);
+    vst1_u16(dst16, row);
+    dst += stride;
+  } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal8xH_NEON(void* const dest, ptrdiff_t stride,
+                        const void* /*top_row*/,
+                        const void* const left_column) {
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = 0;
+  do {
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    const uint16x8_t row = vld1q_dup_u16(left + y);
+    vst1q_u16(dst16, row);
+    dst += stride;
+  } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal16xH_NEON(void* const dest, ptrdiff_t stride,
+                         const void* /*top_row*/,
+                         const void* const left_column) {
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = 0;
+  do {
+    const uint16x8_t row0 = vld1q_dup_u16(left + y);
+    const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    vst1q_u16(dst16, row0);
+    vst1q_u16(dst16 + 8, row0);
+    dst += stride;
+    dst16 = reinterpret_cast<uint16_t*>(dst);
+    vst1q_u16(dst16, row1);
+    vst1q_u16(dst16 + 8, row1);
+    dst += stride;
+    y += 2;
+  } while (y < block_height);
+}
+
+template <int block_height>
+void Horizontal32xH_NEON(void* const dest, ptrdiff_t stride,
+                         const void* /*top_row*/,
+                         const void* const left_column) {
+  const auto* const left = static_cast<const uint16_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = 0;
+  do {
+    const uint16x8_t row0 = vld1q_dup_u16(left + y);
+    const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+    auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+    vst1q_u16(dst16, row0);
+    vst1q_u16(dst16 + 8, row0);
+    vst1q_u16(dst16 + 16, row0);
+    vst1q_u16(dst16 + 24, row0);
+    dst += stride;
+    dst16 = reinterpret_cast<uint16_t*>(dst);
+    vst1q_u16(dst16, row1);
+    vst1q_u16(dst16 + 8, row1);
+    vst1q_u16(dst16 + 16, row1);
+    vst1q_u16(dst16 + 24, row1);
+    dst += stride;
+    y += 2;
+  } while (y < block_height);
+}
+
+// IntraPredFuncs_NEON::Vertical -- copy top row to all rows
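+// Because the rows are copied bit-for-bit, the 10bpp data is handled as raw
+// bytes: each uint8x16_t holds eight uint16_t pixels.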
+
+template <int block_height>
+void Vertical4xH_NEON(void* const dest, ptrdiff_t stride,
+                      const void* const top_row,
+                      const void* const /*left_column*/) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const uint8x8_t row = vld1_u8(top);
+  int y = block_height;
+  do {
+    vst1_u8(dst, row);
+    dst += stride;
+  } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical8xH_NEON(void* const dest, ptrdiff_t stride,
+                      const void* const top_row,
+                      const void* const /*left_column*/) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const uint8x16_t row = vld1q_u8(top);
+  int y = block_height;
+  do {
+    vst1q_u8(dst, row);
+    dst += stride;
+  } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical16xH_NEON(void* const dest, ptrdiff_t stride,
+                       const void* const top_row,
+                       const void* const /*left_column*/) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const uint8x16_t row0 = vld1q_u8(top);
+  const uint8x16_t row1 = vld1q_u8(top + 16);
+  int y = block_height;
+  do {
+    vst1q_u8(dst, row0);
+    vst1q_u8(dst + 16, row1);
+    dst += stride;
+    vst1q_u8(dst, row0);
+    vst1q_u8(dst + 16, row1);
+    dst += stride;
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int block_height>
+void Vertical32xH_NEON(void* const dest, ptrdiff_t stride,
+                       const void* const top_row,
+                       const void* const /*left_column*/) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const uint8x16_t row0 = vld1q_u8(top);
+  const uint8x16_t row1 = vld1q_u8(top + 16);
+  const uint8x16_t row2 = vld1q_u8(top + 32);
+  const uint8x16_t row3 = vld1q_u8(top + 48);
+  int y = block_height;
+  do {
+    vst1q_u8(dst, row0);
+    vst1q_u8(dst + 16, row1);
+    vst1q_u8(dst + 32, row2);
+    vst1q_u8(dst + 48, row3);
+    dst += stride;
+    vst1q_u8(dst, row0);
+    vst1q_u8(dst + 16, row1);
+    vst1q_u8(dst + 32, row2);
+    vst1q_u8(dst + 48, row3);
+    dst += stride;
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int block_height>
+void Vertical64xH_NEON(void* const dest, ptrdiff_t stride,
+                       const void* const top_row,
+                       const void* const /*left_column*/) {
+  const auto* const top = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const uint8x16_t row0 = vld1q_u8(top);
+  const uint8x16_t row1 = vld1q_u8(top + 16);
+  const uint8x16_t row2 = vld1q_u8(top + 32);
+  const uint8x16_t row3 = vld1q_u8(top + 48);
+  const uint8x16_t row4 = vld1q_u8(top + 64);
+  const uint8x16_t row5 = vld1q_u8(top + 80);
+  const uint8x16_t row6 = vld1q_u8(top + 96);
+  const uint8x16_t row7 = vld1q_u8(top + 112);
+  int y = block_height;
+  do {
+    vst1q_u8(dst, row0);
+    vst1q_u8(dst + 16, row1);
+    vst1q_u8(dst + 32, row2);
+    vst1q_u8(dst + 48, row3);
+    vst1q_u8(dst + 64, row4);
+    vst1q_u8(dst + 80, row5);
+    vst1q_u8(dst + 96, row6);
+    vst1q_u8(dst + 112, row7);
+    dst += stride;
+    vst1q_u8(dst, row0);
+    vst1q_u8(dst + 16, row1);
+    vst1q_u8(dst + 32, row2);
+    vst1q_u8(dst + 48, row3);
+    vst1q_u8(dst + 64, row4);
+    vst1q_u8(dst + 80, row5);
+    vst1q_u8(dst + 96, row6);
+    vst1q_u8(dst + 112, row7);
+    dst += stride;
+    y -= 2;
+  } while (y != 0);
+}
+
 void Init10bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
   assert(dsp != nullptr);
@@ -973,6 +1168,8 @@
       DcDefs::_4x4::DcLeft;
   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
       DcDefs::_4x4::Dc;
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+      Vertical4xH_NEON<4>;
 
   // 4x8
   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
@@ -981,6 +1178,10 @@
       DcDefs::_4x8::DcLeft;
   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
       DcDefs::_4x8::Dc;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+      Horizontal4xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+      Vertical4xH_NEON<8>;
 
   // 4x16
   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
@@ -989,6 +1190,10 @@
       DcDefs::_4x16::DcLeft;
   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
       DcDefs::_4x16::Dc;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+      Horizontal4xH_NEON<16>;
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+      Vertical4xH_NEON<16>;
 
   // 8x4
   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
@@ -997,6 +1202,8 @@
       DcDefs::_8x4::DcLeft;
   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
       DcDefs::_8x4::Dc;
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+      Vertical8xH_NEON<4>;
 
   // 8x8
   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
@@ -1005,6 +1212,10 @@
       DcDefs::_8x8::DcLeft;
   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
       DcDefs::_8x8::Dc;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+      Horizontal8xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+      Vertical8xH_NEON<8>;
 
   // 8x16
   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
@@ -1013,6 +1224,8 @@
       DcDefs::_8x16::DcLeft;
   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
       DcDefs::_8x16::Dc;
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+      Vertical8xH_NEON<16>;
 
   // 8x32
   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
@@ -1021,6 +1234,10 @@
       DcDefs::_8x32::DcLeft;
   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
       DcDefs::_8x32::Dc;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+      Horizontal8xH_NEON<32>;
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+      Vertical8xH_NEON<32>;
 
   // 16x4
   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
@@ -1029,6 +1246,8 @@
       DcDefs::_16x4::DcLeft;
   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
       DcDefs::_16x4::Dc;
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+      Vertical16xH_NEON<4>;
 
   // 16x8
   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
@@ -1037,6 +1256,10 @@
       DcDefs::_16x8::DcLeft;
   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
       DcDefs::_16x8::Dc;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+      Horizontal16xH_NEON<8>;
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+      Vertical16xH_NEON<8>;
 
   // 16x16
   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
@@ -1045,6 +1268,8 @@
       DcDefs::_16x16::DcLeft;
   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
       DcDefs::_16x16::Dc;
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+      Vertical16xH_NEON<16>;
 
   // 16x32
   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
@@ -1053,6 +1278,8 @@
       DcDefs::_16x32::DcLeft;
   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
       DcDefs::_16x32::Dc;
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+      Vertical16xH_NEON<32>;
 
   // 16x64
   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
@@ -1061,6 +1288,8 @@
       DcDefs::_16x64::DcLeft;
   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
       DcDefs::_16x64::Dc;
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+      Vertical16xH_NEON<64>;
 
   // 32x8
   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
@@ -1069,6 +1298,8 @@
       DcDefs::_32x8::DcLeft;
   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
       DcDefs::_32x8::Dc;
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+      Vertical32xH_NEON<8>;
 
   // 32x16
   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
@@ -1077,6 +1308,8 @@
       DcDefs::_32x16::DcLeft;
   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
       DcDefs::_32x16::Dc;
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+      Vertical32xH_NEON<16>;
 
   // 32x32
   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
@@ -1085,6 +1318,8 @@
       DcDefs::_32x32::DcLeft;
   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
       DcDefs::_32x32::Dc;
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+      Vertical32xH_NEON<32>;
 
   // 32x64
   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
@@ -1093,6 +1328,10 @@
       DcDefs::_32x64::DcLeft;
   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
       DcDefs::_32x64::Dc;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+      Horizontal32xH_NEON<64>;
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+      Vertical32xH_NEON<64>;
 
   // 64x16
   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
@@ -1101,6 +1340,8 @@
       DcDefs::_64x16::DcLeft;
   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
       DcDefs::_64x16::Dc;
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+      Vertical64xH_NEON<16>;
 
   // 64x32
   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
@@ -1109,6 +1350,8 @@
       DcDefs::_64x32::DcLeft;
   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
       DcDefs::_64x32::Dc;
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+      Vertical64xH_NEON<32>;
 
   // 64x64
   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
@@ -1117,6 +1360,8 @@
       DcDefs::_64x64::DcLeft;
   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
       DcDefs::_64x64::Dc;
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+      Vertical64xH_NEON<64>;
 }
 
 }  // namespace
@@ -1133,7 +1378,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {
 
diff --git a/libgav1/src/dsp/arm/intrapred_neon.h b/libgav1/src/dsp/arm/intrapred_neon.h
index 16f858c..b27f29f 100644
--- a/libgav1/src/dsp/arm/intrapred_neon.h
+++ b/libgav1/src/dsp/arm/intrapred_neon.h
@@ -23,396 +23,282 @@
 namespace libgav1 {
 namespace dsp {
 
-// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
-// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
-// Dsp::filter_intra_predictor, see the defines below for specifics. These
-// functions are not thread-safe.
-void IntraPredCflInit_NEON();
-void IntraPredDirectionalInit_NEON();
-void IntraPredFilterIntraInit_NEON();
+// Initializes Dsp::intra_predictors.
+// See the defines below for specifics. These functions are not thread-safe.
 void IntraPredInit_NEON();
-void IntraPredSmoothInit_NEON();
 
 }  // namespace dsp
 }  // namespace libgav1
 
 #if LIBGAV1_ENABLE_NEON
-// 8 bit
-#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
-
 // 4x4
 #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
 
 // 4x8
 #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
 
 // 4x16
 #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
 
 // 8x4
 #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
 
 // 8x8
 #define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
 
 // 8x16
 #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
 
 // 8x32
 #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
 
 // 16x4
 #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
 
 // 16x8
 #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
 
 // 16x16
 #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
 
 // 16x32
 #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
 
 // 16x64
 #define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
 
 // 32x8
 #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
 
 // 32x16
 #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
 
 // 32x32
 #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
 
 // 32x64
 #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
 
 // 64x16
 #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
 
 // 64x32
 #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
 
 // 64x64
 #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_NEON
 
 // 10 bit
 // 4x4
 #define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 4x8
 #define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 4x16
 #define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 8x4
 #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 8x8
 #define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 8x16
 #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 8x32
 #define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 16x4
 #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 16x8
 #define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 16x16
 #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft \
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 16x32
 #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft \
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 16x64
 #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft \
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 32x8
 #define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 32x16
 #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft \
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 32x32
 #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft \
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 32x64
 #define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft \
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 64x16
 #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft \
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 64x32
 #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft \
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 
 // 64x64
 #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft \
   LIBGAV1_CPU_NEON
 #define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical \
+  LIBGAV1_CPU_NEON
 #endif  // LIBGAV1_ENABLE_NEON
 
 #endif  // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
diff --git a/libgav1/src/dsp/arm/intrapred_smooth_neon.cc b/libgav1/src/dsp/arm/intrapred_smooth_neon.cc
index abc93e8..c33f333 100644
--- a/libgav1/src/dsp/arm/intrapred_smooth_neon.cc
+++ b/libgav1/src/dsp/arm/intrapred_smooth_neon.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_smooth.h"
 #include "src/utils/cpu.h"
 
 #if LIBGAV1_ENABLE_NEON
@@ -26,6 +26,7 @@
 #include "src/dsp/arm/common_neon.h"
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
 
 namespace libgav1 {
 namespace dsp {
@@ -605,7 +606,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {
 
diff --git a/libgav1/src/dsp/arm/intrapred_smooth_neon.h b/libgav1/src/dsp/arm/intrapred_smooth_neon.h
new file mode 100644
index 0000000..edd01be
--- /dev/null
+++ b/libgav1/src/dsp/arm/intrapred_smooth_neon.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
diff --git a/libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc b/libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc
new file mode 100644
index 0000000..ff184a1
--- /dev/null
+++ b/libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc
@@ -0,0 +1,2543 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+//------------------------------------------------------------------------------
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4],
+                                        int32x4_t out[4]) {
+  // in:
+  // 00 01 02 03
+  // 10 11 12 13
+  // 20 21 22 23
+  // 30 31 32 33
+
+  // 00 10 02 12   a.val[0]
+  // 01 11 03 13   a.val[1]
+  // 20 30 22 32   b.val[0]
+  // 21 31 23 33   b.val[1]
+  const int32x4x2_t a = vtrnq_s32(in[0], in[1]);
+  const int32x4x2_t b = vtrnq_s32(in[2], in[3]);
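+  // vextq_s32(x, x, 2) swaps the 64-bit halves of a register, so each pair of
+  // vextq_s32 calls below combines the matching halves of the vtrnq_s32
+  // results, e.g. out[0] = { a.val[0][0..1], b.val[0][0..1] } = 00 10 20 30.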
+  out[0] = vextq_s32(vextq_s32(a.val[0], a.val[0], 2), b.val[0], 2);
+  out[1] = vextq_s32(vextq_s32(a.val[1], a.val[1], 2), b.val[1], 2);
+  out[2] = vextq_s32(a.val[0], vextq_s32(b.val[0], b.val[0], 2), 2);
+  out[3] = vextq_s32(a.val[1], vextq_s32(b.val[1], b.val[1], 2), 2);
+  // out:
+  // 00 10 20 30
+  // 01 11 21 31
+  // 02 12 22 32
+  // 03 13 23 33
+}
+
+//------------------------------------------------------------------------------
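+// StoreDst/LoadSrc write/read |store_count|/|load_count| rows of four 32-bit
+// coefficients at column offset |idx|; |stride| is the pitch between rows.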
+template <int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* dst, int32_t stride, int32_t idx,
+                                    const int32x4_t* const s) {
+  assert(store_count % 4 == 0);
+  for (int i = 0; i < store_count; i += 4) {
+    vst1q_s32(&dst[i * stride + idx], s[i]);
+    vst1q_s32(&dst[(i + 1) * stride + idx], s[i + 1]);
+    vst1q_s32(&dst[(i + 2) * stride + idx], s[i + 2]);
+    vst1q_s32(&dst[(i + 3) * stride + idx], s[i + 3]);
+  }
+}
+
+template <int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* src, int32_t stride,
+                                   int32_t idx, int32x4_t* x) {
+  assert(load_count % 4 == 0);
+  for (int i = 0; i < load_count; i += 4) {
+    x[i] = vld1q_s32(&src[i * stride + idx]);
+    x[i + 1] = vld1q_s32(&src[(i + 1) * stride + idx]);
+    x[i + 2] = vld1q_s32(&src[(i + 2) * stride + idx]);
+    x[i + 3] = vld1q_s32(&src[(i + 3) * stride + idx]);
+  }
+}
+
+// Butterfly rotate 4 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int32x4_t* a, int32x4_t* b,
+                                               const int angle,
+                                               const bool flip) {
+  const int32_t cos128 = Cos128(angle);
+  const int32_t sin128 = Sin128(angle);
+  const int32x4_t acc_x = vmulq_n_s32(*a, cos128);
+  const int32x4_t acc_y = vmulq_n_s32(*a, sin128);
+  // The max range for the input is 18 bits. The cos128/sin128 values are 13
+  // bits, which leaves 1 bit for the add/subtract. For 10bpp, x/y will fit in
+  // a 32-bit lane.
+  const int32x4_t x0 = vmlsq_n_s32(acc_x, *b, sin128);
+  const int32x4_t y0 = vmlaq_n_s32(acc_y, *b, cos128);
+  const int32x4_t x = vrshrq_n_s32(x0, 12);
+  const int32x4_t y = vrshrq_n_s32(y0, 12);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
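+// Butterfly rotation variants for the case where one of the two inputs is
+// known to be zero, so only the nonzero input needs to be multiplied.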
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int32x4_t* a,
+                                                         int32x4_t* b,
+                                                         const int angle,
+                                                         const bool flip) {
+  const int32_t cos128 = Cos128(angle);
+  const int32_t sin128 = Sin128(angle);
+  assert(sin128 <= 0xfff);
+  const int32x4_t x0 = vmulq_n_s32(*b, -sin128);
+  const int32x4_t y0 = vmulq_n_s32(*b, cos128);
+  const int32x4_t x = vrshrq_n_s32(x0, 12);
+  const int32x4_t y = vrshrq_n_s32(y0, 12);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int32x4_t* a,
+                                                          int32x4_t* b,
+                                                          const int angle,
+                                                          const bool flip) {
+  const int32_t cos128 = Cos128(angle);
+  const int32_t sin128 = Sin128(angle);
+  const int32x4_t x0 = vmulq_n_s32(*a, cos128);
+  const int32x4_t y0 = vmulq_n_s32(*a, sin128);
+  const int32x4_t x = vrshrq_n_s32(x0, 12);
+  const int32x4_t y = vrshrq_n_s32(y0, 12);
+  if (flip) {
+    *a = y;
+    *b = x;
+  } else {
+    *a = x;
+    *b = y;
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+                                            bool flip) {
+  int32x4_t x, y;
+  if (flip) {
+    y = vqaddq_s32(*b, *a);
+    x = vqsubq_s32(*b, *a);
+  } else {
+    x = vqaddq_s32(*a, *b);
+    y = vqsubq_s32(*a, *b);
+  }
+  *a = x;
+  *b = y;
+}
+
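+// As above, but clamps the results to [*min, *max] to keep the intermediate
+// values within the valid range.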
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+                                            bool flip, const int32x4_t* min,
+                                            const int32x4_t* max) {
+  int32x4_t x, y;
+  if (flip) {
+    y = vqaddq_s32(*b, *a);
+    x = vqsubq_s32(*b, *a);
+  } else {
+    x = vqaddq_s32(*a, *b);
+    y = vqsubq_s32(*a, *b);
+  }
+  *a = vmaxq_s32(vminq_s32(x, *max), *min);
+  *b = vmaxq_s32(vminq_s32(y, *max), *min);
+}
+
+using ButterflyRotationFunc = void (*)(int32x4_t* a, int32x4_t* b, int angle,
+                                       bool flip);
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
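+// Fast path used when only the DC coefficient is nonzero
+// (|adjusted_tx_height| == 1): the row pass reduces to scaling dst[0] and
+// broadcasting the result across the first row.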
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+                                     bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_src = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
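+  // If rounding is requested, scale by kTransformRowMultiplier using the Q31
+  // rounding-doubling multiply (the Q12 constant is promoted by the left
+  // shift), then pick the rounded or raw value with a branchless bit-select.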
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+  const int32x4_t s0 = vbslq_s32(v_mask, v_src_round, v_src);
+  const int32_t cos128 = Cos128(32);
+  const int32x4_t xy = vqrdmulhq_n_s32(s0, cos128 << (31 - 12));
+  // vqrshlq_s32 will shift right if the shift value is negative.
+  const int32x4_t xy_shifted = vqrshlq_s32(xy, vdupq_n_s32(-row_shift));
+  // Clamp result to signed 16 bits.
+  const int32x4_t result = vmovl_s16(vqmovn_s32(xy_shifted));
+  if (width == 4) {
+    vst1q_s32(dst, result);
+  } else {
+    for (int i = 0; i < width; i += 4) {
+      vst1q_s32(dst, result);
+      dst += 4;
+    }
+  }
+  return true;
+}
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+                                           int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32_t cos128 = Cos128(32);
+
+  // Calculate the DC values for the first row.
+  if (width == 4) {
+    const int32x4_t v_src = vld1q_s32(dst);
+    const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
+    vst1q_s32(dst, xy);
+  } else {
+    int i = 0;
+    do {
+      const int32x4_t v_src = vld1q_s32(&dst[i]);
+      const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
+      vst1q_s32(&dst[i], xy);
+      i += 4;
+    } while (i < width);
+  }
+
+  // Copy first row to the rest of the block.
+  for (int y = 1; y < height; ++y) {
+    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+  }
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t* min,
+                                      const int32x4_t* max,
+                                      const bool is_last_stage) {
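+  // The stage numbers in the Dct*Stages helpers follow the combined flowgraph
+  // shared by all DCT sizes, which is why they are not consecutive within any
+  // single helper.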
+  // stage 12.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+    ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+  } else {
+    butterfly_rotation(&s[0], &s[1], 32, true);
+    butterfly_rotation(&s[2], &s[3], 48, false);
+  }
+
+  // stage 17.
+  if (is_last_stage) {
+    HadamardRotation(&s[0], &s[3], false);
+    HadamardRotation(&s[1], &s[2], false);
+  } else {
+    HadamardRotation(&s[0], &s[3], false, min, max);
+    HadamardRotation(&s[1], &s[2], false, min, max);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
+                                     int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  // When |is_row| is true, set the range to the row range; otherwise, use the
+  // column range.
+  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+  const int32x4_t min = vdupq_n_s32(-(1 << range));
+  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+  int32x4_t s[4], x[4];
+
+  LoadSrc<4>(dst, step, 0, x);
+  if (is_row) {
+    Transpose4x4(x, x);
+  }
+
+  // stage 1.
+  // kBitReverseLookup 0, 2, 1, 3
+  s[0] = x[0];
+  s[1] = x[2];
+  s[2] = x[1];
+  s[3] = x[3];
+
+  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    for (int i = 0; i < 4; ++i) {
+      s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+    }
+    Transpose4x4(s, s);
+  }
+  StoreDst<4>(dst, step, 0, s);
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t* min,
+                                      const int32x4_t* max,
+                                      const bool is_last_stage) {
+  // stage 8.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+    ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+  } else {
+    butterfly_rotation(&s[4], &s[7], 56, false);
+    butterfly_rotation(&s[5], &s[6], 24, false);
+  }
+
+  // stage 13.
+  HadamardRotation(&s[4], &s[5], false, min, max);
+  HadamardRotation(&s[6], &s[7], true, min, max);
+
+  // stage 18.
+  butterfly_rotation(&s[6], &s[5], 32, true);
+
+  // stage 22.
+  if (is_last_stage) {
+    HadamardRotation(&s[0], &s[7], false);
+    HadamardRotation(&s[1], &s[6], false);
+    HadamardRotation(&s[2], &s[5], false);
+    HadamardRotation(&s[3], &s[4], false);
+  } else {
+    HadamardRotation(&s[0], &s[7], false, min, max);
+    HadamardRotation(&s[1], &s[6], false, min, max);
+    HadamardRotation(&s[2], &s[5], false, min, max);
+    HadamardRotation(&s[3], &s[4], false, min, max);
+  }
+}
+
+// Process dct8 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row,
+                                     int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+  const int32x4_t min = vdupq_n_s32(-(1 << range));
+  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+  int32x4_t s[8], x[8];
+
+  if (is_row) {
+    LoadSrc<4>(dst, step, 0, &x[0]);
+    LoadSrc<4>(dst, step, 4, &x[4]);
+    Transpose4x4(&x[0], &x[0]);
+    Transpose4x4(&x[4], &x[4]);
+  } else {
+    LoadSrc<8>(dst, step, 0, &x[0]);
+  }
+
+  // stage 1.
+  // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+  s[0] = x[0];
+  s[1] = x[4];
+  s[2] = x[2];
+  s[3] = x[6];
+  s[4] = x[1];
+  s[5] = x[5];
+  s[6] = x[3];
+  s[7] = x[7];
+
+  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+  Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    for (int i = 0; i < 8; ++i) {
+      s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+    }
+    Transpose4x4(&s[0], &s[0]);
+    Transpose4x4(&s[4], &s[4]);
+    StoreDst<4>(dst, step, 0, &s[0]);
+    StoreDst<4>(dst, step, 4, &s[4]);
+  } else {
+    StoreDst<8>(dst, step, 0, &s[0]);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t* min,
+                                       const int32x4_t* max,
+                                       const bool is_last_stage) {
+  // stage 5.
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+    ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+    ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+    ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+  } else {
+    butterfly_rotation(&s[8], &s[15], 60, false);
+    butterfly_rotation(&s[9], &s[14], 28, false);
+    butterfly_rotation(&s[10], &s[13], 44, false);
+    butterfly_rotation(&s[11], &s[12], 12, false);
+  }
+
+  // stage 9.
+  HadamardRotation(&s[8], &s[9], false, min, max);
+  HadamardRotation(&s[10], &s[11], true, min, max);
+  HadamardRotation(&s[12], &s[13], false, min, max);
+  HadamardRotation(&s[14], &s[15], true, min, max);
+
+  // stage 14.
+  butterfly_rotation(&s[14], &s[9], 48, true);
+  butterfly_rotation(&s[13], &s[10], 112, true);
+
+  // stage 19.
+  HadamardRotation(&s[8], &s[11], false, min, max);
+  HadamardRotation(&s[9], &s[10], false, min, max);
+  HadamardRotation(&s[12], &s[15], true, min, max);
+  HadamardRotation(&s[13], &s[14], true, min, max);
+
+  // stage 23.
+  butterfly_rotation(&s[13], &s[10], 32, true);
+  butterfly_rotation(&s[12], &s[11], 32, true);
+
+  // stage 26.
+  if (is_last_stage) {
+    HadamardRotation(&s[0], &s[15], false);
+    HadamardRotation(&s[1], &s[14], false);
+    HadamardRotation(&s[2], &s[13], false);
+    HadamardRotation(&s[3], &s[12], false);
+    HadamardRotation(&s[4], &s[11], false);
+    HadamardRotation(&s[5], &s[10], false);
+    HadamardRotation(&s[6], &s[9], false);
+    HadamardRotation(&s[7], &s[8], false);
+  } else {
+    HadamardRotation(&s[0], &s[15], false, min, max);
+    HadamardRotation(&s[1], &s[14], false, min, max);
+    HadamardRotation(&s[2], &s[13], false, min, max);
+    HadamardRotation(&s[3], &s[12], false, min, max);
+    HadamardRotation(&s[4], &s[11], false, min, max);
+    HadamardRotation(&s[5], &s[10], false, min, max);
+    HadamardRotation(&s[6], &s[9], false, min, max);
+    HadamardRotation(&s[7], &s[8], false, min, max);
+  }
+}
+
+// Process dct16 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
+                                      int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+  const int32x4_t min = vdupq_n_s32(-(1 << range));
+  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+  int32x4_t s[16], x[16];
+
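+  // The row pass works on a strip of four rows at a time; the coefficients
+  // are transposed in 4x4 blocks so that each int32x4_t holds one column of
+  // the strip.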
+  if (is_row) {
+    for (int idx = 0; idx < 16; idx += 8) {
+      LoadSrc<4>(dst, step, idx, &x[idx]);
+      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+      Transpose4x4(&x[idx], &x[idx]);
+      Transpose4x4(&x[idx + 4], &x[idx + 4]);
+    }
+  } else {
+    LoadSrc<16>(dst, step, 0, &x[0]);
+  }
+
+  // stage 1
+  // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+  s[0] = x[0];
+  s[1] = x[8];
+  s[2] = x[4];
+  s[3] = x[12];
+  s[4] = x[2];
+  s[5] = x[10];
+  s[6] = x[6];
+  s[7] = x[14];
+  s[8] = x[1];
+  s[9] = x[9];
+  s[10] = x[5];
+  s[11] = x[13];
+  s[12] = x[3];
+  s[13] = x[11];
+  s[14] = x[7];
+  s[15] = x[15];
+
+  Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+  Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+  Dct16Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    for (int i = 0; i < 16; ++i) {
+      s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+    }
+    for (int idx = 0; idx < 16; idx += 8) {
+      Transpose4x4(&s[idx], &s[idx]);
+      Transpose4x4(&s[idx + 4], &s[idx + 4]);
+      StoreDst<4>(dst, step, idx, &s[idx]);
+      StoreDst<4>(dst, step, idx + 4, &s[idx + 4]);
+    }
+  } else {
+    StoreDst<16>(dst, step, 0, &s[0]);
+  }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t* min,
+                                       const int32x4_t* max,
+                                       const bool is_last_stage) {
+  // stage 3
+  if (is_fast_butterfly) {
+    ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+    ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+    ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+    ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+    ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+    ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+    ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+    ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+  } else {
+    butterfly_rotation(&s[16], &s[31], 62, false);
+    butterfly_rotation(&s[17], &s[30], 30, false);
+    butterfly_rotation(&s[18], &s[29], 46, false);
+    butterfly_rotation(&s[19], &s[28], 14, false);
+    butterfly_rotation(&s[20], &s[27], 54, false);
+    butterfly_rotation(&s[21], &s[26], 22, false);
+    butterfly_rotation(&s[22], &s[25], 38, false);
+    butterfly_rotation(&s[23], &s[24], 6, false);
+  }
+
+  // stage 6.
+  HadamardRotation(&s[16], &s[17], false, min, max);
+  HadamardRotation(&s[18], &s[19], true, min, max);
+  HadamardRotation(&s[20], &s[21], false, min, max);
+  HadamardRotation(&s[22], &s[23], true, min, max);
+  HadamardRotation(&s[24], &s[25], false, min, max);
+  HadamardRotation(&s[26], &s[27], true, min, max);
+  HadamardRotation(&s[28], &s[29], false, min, max);
+  HadamardRotation(&s[30], &s[31], true, min, max);
+
+  // stage 10.
+  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+  butterfly_rotation(&s[26], &s[21], 24, true);
+  butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+  // stage 15.
+  HadamardRotation(&s[16], &s[19], false, min, max);
+  HadamardRotation(&s[17], &s[18], false, min, max);
+  HadamardRotation(&s[20], &s[23], true, min, max);
+  HadamardRotation(&s[21], &s[22], true, min, max);
+  HadamardRotation(&s[24], &s[27], false, min, max);
+  HadamardRotation(&s[25], &s[26], false, min, max);
+  HadamardRotation(&s[28], &s[31], true, min, max);
+  HadamardRotation(&s[29], &s[30], true, min, max);
+
+  // stage 20.
+  butterfly_rotation(&s[29], &s[18], 48, true);
+  butterfly_rotation(&s[28], &s[19], 48, true);
+  butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+  butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+  // stage 24.
+  HadamardRotation(&s[16], &s[23], false, min, max);
+  HadamardRotation(&s[17], &s[22], false, min, max);
+  HadamardRotation(&s[18], &s[21], false, min, max);
+  HadamardRotation(&s[19], &s[20], false, min, max);
+  HadamardRotation(&s[24], &s[31], true, min, max);
+  HadamardRotation(&s[25], &s[30], true, min, max);
+  HadamardRotation(&s[26], &s[29], true, min, max);
+  HadamardRotation(&s[27], &s[28], true, min, max);
+
+  // stage 27.
+  butterfly_rotation(&s[27], &s[20], 32, true);
+  butterfly_rotation(&s[26], &s[21], 32, true);
+  butterfly_rotation(&s[25], &s[22], 32, true);
+  butterfly_rotation(&s[24], &s[23], 32, true);
+
+  // stage 29.
+  if (is_last_stage) {
+    HadamardRotation(&s[0], &s[31], false);
+    HadamardRotation(&s[1], &s[30], false);
+    HadamardRotation(&s[2], &s[29], false);
+    HadamardRotation(&s[3], &s[28], false);
+    HadamardRotation(&s[4], &s[27], false);
+    HadamardRotation(&s[5], &s[26], false);
+    HadamardRotation(&s[6], &s[25], false);
+    HadamardRotation(&s[7], &s[24], false);
+    HadamardRotation(&s[8], &s[23], false);
+    HadamardRotation(&s[9], &s[22], false);
+    HadamardRotation(&s[10], &s[21], false);
+    HadamardRotation(&s[11], &s[20], false);
+    HadamardRotation(&s[12], &s[19], false);
+    HadamardRotation(&s[13], &s[18], false);
+    HadamardRotation(&s[14], &s[17], false);
+    HadamardRotation(&s[15], &s[16], false);
+  } else {
+    HadamardRotation(&s[0], &s[31], false, min, max);
+    HadamardRotation(&s[1], &s[30], false, min, max);
+    HadamardRotation(&s[2], &s[29], false, min, max);
+    HadamardRotation(&s[3], &s[28], false, min, max);
+    HadamardRotation(&s[4], &s[27], false, min, max);
+    HadamardRotation(&s[5], &s[26], false, min, max);
+    HadamardRotation(&s[6], &s[25], false, min, max);
+    HadamardRotation(&s[7], &s[24], false, min, max);
+    HadamardRotation(&s[8], &s[23], false, min, max);
+    HadamardRotation(&s[9], &s[22], false, min, max);
+    HadamardRotation(&s[10], &s[21], false, min, max);
+    HadamardRotation(&s[11], &s[20], false, min, max);
+    HadamardRotation(&s[12], &s[19], false, min, max);
+    HadamardRotation(&s[13], &s[18], false, min, max);
+    HadamardRotation(&s[14], &s[17], false, min, max);
+    HadamardRotation(&s[15], &s[16], false, min, max);
+  }
+}
+
+// Process dct32 rows or columns, depending on the |is_row| flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
+                                      const bool is_row, int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+  const int32x4_t min = vdupq_n_s32(-(1 << range));
+  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+  int32x4_t s[32], x[32];
+
+  if (is_row) {
+    for (int idx = 0; idx < 32; idx += 8) {
+      LoadSrc<4>(dst, step, idx, &x[idx]);
+      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+      Transpose4x4(&x[idx], &x[idx]);
+      Transpose4x4(&x[idx + 4], &x[idx + 4]);
+    }
+  } else {
+    LoadSrc<32>(dst, step, 0, &x[0]);
+  }
+
+  // stage 1
+  // kBitReverseLookup
+  // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+  s[0] = x[0];
+  s[1] = x[16];
+  s[2] = x[8];
+  s[3] = x[24];
+  s[4] = x[4];
+  s[5] = x[20];
+  s[6] = x[12];
+  s[7] = x[28];
+  s[8] = x[2];
+  s[9] = x[18];
+  s[10] = x[10];
+  s[11] = x[26];
+  s[12] = x[6];
+  s[13] = x[22];
+  s[14] = x[14];
+  s[15] = x[30];
+
+  // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+  s[16] = x[1];
+  s[17] = x[17];
+  s[18] = x[9];
+  s[19] = x[25];
+  s[20] = x[5];
+  s[21] = x[21];
+  s[22] = x[13];
+  s[23] = x[29];
+  s[24] = x[3];
+  s[25] = x[19];
+  s[26] = x[11];
+  s[27] = x[27];
+  s[28] = x[7];
+  s[29] = x[23];
+  s[30] = x[15];
+  s[31] = x[31];
+
+  Dct4Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+  Dct8Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+  Dct16Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+  Dct32Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/true);
+
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    for (int idx = 0; idx < 32; idx += 8) {
+      int32x4_t output[8];
+      Transpose4x4(&s[idx], &output[0]);
+      Transpose4x4(&s[idx + 4], &output[4]);
+      for (int i = 0; i < 8; ++i) {
+        output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
+      }
+      StoreDst<4>(dst, step, idx, &output[0]);
+      StoreDst<4>(dst, step, idx + 4, &output[4]);
+    }
+  } else {
+    StoreDst<32>(dst, step, 0, &s[0]);
+  }
+}
+
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+  const int32x4_t min = vdupq_n_s32(-(1 << range));
+  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+  int32x4_t s[64], x[32];
+
+  if (is_row) {
+    // The last 32 values of every row are always zero if the |tx_width| is
+    // 64.
+    for (int idx = 0; idx < 32; idx += 8) {
+      LoadSrc<4>(dst, step, idx, &x[idx]);
+      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+      Transpose4x4(&x[idx], &x[idx]);
+      Transpose4x4(&x[idx + 4], &x[idx + 4]);
+    }
+  } else {
+    // The last 32 values of every column are always zero if the |tx_height| is
+    // 64.
+    LoadSrc<32>(dst, step, 0, &x[0]);
+  }
+
+  // stage 1
+  // kBitReverseLookup
+  // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
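+  // Only the even-numbered s[] entries are initialized here: after the bit
+  // reversal the odd entries would come from x[32]..x[63], which are known to
+  // be zero and are handled implicitly by the *IsZero fast butterflies.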
+  s[0] = x[0];
+  s[2] = x[16];
+  s[4] = x[8];
+  s[6] = x[24];
+  s[8] = x[4];
+  s[10] = x[20];
+  s[12] = x[12];
+  s[14] = x[28];
+
+  // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+  s[16] = x[2];
+  s[18] = x[18];
+  s[20] = x[10];
+  s[22] = x[26];
+  s[24] = x[6];
+  s[26] = x[22];
+  s[28] = x[14];
+  s[30] = x[30];
+
+  // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+  s[32] = x[1];
+  s[34] = x[17];
+  s[36] = x[9];
+  s[38] = x[25];
+  s[40] = x[5];
+  s[42] = x[21];
+  s[44] = x[13];
+  s[46] = x[29];
+
+  // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+  s[48] = x[3];
+  s[50] = x[19];
+  s[52] = x[11];
+  s[54] = x[27];
+  s[56] = x[7];
+  s[58] = x[23];
+  s[60] = x[15];
+  s[62] = x[31];
+
+  Dct4Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+      s, &min, &max, /*is_last_stage=*/false);
+  Dct8Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+      s, &min, &max, /*is_last_stage=*/false);
+  Dct16Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+      s, &min, &max, /*is_last_stage=*/false);
+  Dct32Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+      s, &min, &max, /*is_last_stage=*/false);
+
+  //-- start dct 64 stages
+  // stage 2.
+  ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+  ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+  ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+  ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+  ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+  ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+  ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+  ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+  ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+  ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+  ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+  ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+  ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+  ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+  ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+  ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+  // stage 4.
+  HadamardRotation(&s[32], &s[33], false, &min, &max);
+  HadamardRotation(&s[34], &s[35], true, &min, &max);
+  HadamardRotation(&s[36], &s[37], false, &min, &max);
+  HadamardRotation(&s[38], &s[39], true, &min, &max);
+  HadamardRotation(&s[40], &s[41], false, &min, &max);
+  HadamardRotation(&s[42], &s[43], true, &min, &max);
+  HadamardRotation(&s[44], &s[45], false, &min, &max);
+  HadamardRotation(&s[46], &s[47], true, &min, &max);
+  HadamardRotation(&s[48], &s[49], false, &min, &max);
+  HadamardRotation(&s[50], &s[51], true, &min, &max);
+  HadamardRotation(&s[52], &s[53], false, &min, &max);
+  HadamardRotation(&s[54], &s[55], true, &min, &max);
+  HadamardRotation(&s[56], &s[57], false, &min, &max);
+  HadamardRotation(&s[58], &s[59], true, &min, &max);
+  HadamardRotation(&s[60], &s[61], false, &min, &max);
+  HadamardRotation(&s[62], &s[63], true, &min, &max);
+
+  // stage 7.
+  ButterflyRotation_4(&s[62], &s[33], 60 - 0, true);
+  ButterflyRotation_4(&s[61], &s[34], 60 - 0 + 64, true);
+  ButterflyRotation_4(&s[58], &s[37], 60 - 32, true);
+  ButterflyRotation_4(&s[57], &s[38], 60 - 32 + 64, true);
+  ButterflyRotation_4(&s[54], &s[41], 60 - 16, true);
+  ButterflyRotation_4(&s[53], &s[42], 60 - 16 + 64, true);
+  ButterflyRotation_4(&s[50], &s[45], 60 - 48, true);
+  ButterflyRotation_4(&s[49], &s[46], 60 - 48 + 64, true);
+
+  // stage 11.
+  HadamardRotation(&s[32], &s[35], false, &min, &max);
+  HadamardRotation(&s[33], &s[34], false, &min, &max);
+  HadamardRotation(&s[36], &s[39], true, &min, &max);
+  HadamardRotation(&s[37], &s[38], true, &min, &max);
+  HadamardRotation(&s[40], &s[43], false, &min, &max);
+  HadamardRotation(&s[41], &s[42], false, &min, &max);
+  HadamardRotation(&s[44], &s[47], true, &min, &max);
+  HadamardRotation(&s[45], &s[46], true, &min, &max);
+  HadamardRotation(&s[48], &s[51], false, &min, &max);
+  HadamardRotation(&s[49], &s[50], false, &min, &max);
+  HadamardRotation(&s[52], &s[55], true, &min, &max);
+  HadamardRotation(&s[53], &s[54], true, &min, &max);
+  HadamardRotation(&s[56], &s[59], false, &min, &max);
+  HadamardRotation(&s[57], &s[58], false, &min, &max);
+  HadamardRotation(&s[60], &s[63], true, &min, &max);
+  HadamardRotation(&s[61], &s[62], true, &min, &max);
+
+  // stage 16.
+  ButterflyRotation_4(&s[61], &s[34], 56, true);
+  ButterflyRotation_4(&s[60], &s[35], 56, true);
+  ButterflyRotation_4(&s[59], &s[36], 56 + 64, true);
+  ButterflyRotation_4(&s[58], &s[37], 56 + 64, true);
+  ButterflyRotation_4(&s[53], &s[42], 56 - 32, true);
+  ButterflyRotation_4(&s[52], &s[43], 56 - 32, true);
+  ButterflyRotation_4(&s[51], &s[44], 56 - 32 + 64, true);
+  ButterflyRotation_4(&s[50], &s[45], 56 - 32 + 64, true);
+
+  // stage 21.
+  HadamardRotation(&s[32], &s[39], false, &min, &max);
+  HadamardRotation(&s[33], &s[38], false, &min, &max);
+  HadamardRotation(&s[34], &s[37], false, &min, &max);
+  HadamardRotation(&s[35], &s[36], false, &min, &max);
+  HadamardRotation(&s[40], &s[47], true, &min, &max);
+  HadamardRotation(&s[41], &s[46], true, &min, &max);
+  HadamardRotation(&s[42], &s[45], true, &min, &max);
+  HadamardRotation(&s[43], &s[44], true, &min, &max);
+  HadamardRotation(&s[48], &s[55], false, &min, &max);
+  HadamardRotation(&s[49], &s[54], false, &min, &max);
+  HadamardRotation(&s[50], &s[53], false, &min, &max);
+  HadamardRotation(&s[51], &s[52], false, &min, &max);
+  HadamardRotation(&s[56], &s[63], true, &min, &max);
+  HadamardRotation(&s[57], &s[62], true, &min, &max);
+  HadamardRotation(&s[58], &s[61], true, &min, &max);
+  HadamardRotation(&s[59], &s[60], true, &min, &max);
+
+  // stage 25.
+  ButterflyRotation_4(&s[59], &s[36], 48, true);
+  ButterflyRotation_4(&s[58], &s[37], 48, true);
+  ButterflyRotation_4(&s[57], &s[38], 48, true);
+  ButterflyRotation_4(&s[56], &s[39], 48, true);
+  ButterflyRotation_4(&s[55], &s[40], 112, true);
+  ButterflyRotation_4(&s[54], &s[41], 112, true);
+  ButterflyRotation_4(&s[53], &s[42], 112, true);
+  ButterflyRotation_4(&s[52], &s[43], 112, true);
+
+  // stage 28.
+  HadamardRotation(&s[32], &s[47], false, &min, &max);
+  HadamardRotation(&s[33], &s[46], false, &min, &max);
+  HadamardRotation(&s[34], &s[45], false, &min, &max);
+  HadamardRotation(&s[35], &s[44], false, &min, &max);
+  HadamardRotation(&s[36], &s[43], false, &min, &max);
+  HadamardRotation(&s[37], &s[42], false, &min, &max);
+  HadamardRotation(&s[38], &s[41], false, &min, &max);
+  HadamardRotation(&s[39], &s[40], false, &min, &max);
+  HadamardRotation(&s[48], &s[63], true, &min, &max);
+  HadamardRotation(&s[49], &s[62], true, &min, &max);
+  HadamardRotation(&s[50], &s[61], true, &min, &max);
+  HadamardRotation(&s[51], &s[60], true, &min, &max);
+  HadamardRotation(&s[52], &s[59], true, &min, &max);
+  HadamardRotation(&s[53], &s[58], true, &min, &max);
+  HadamardRotation(&s[54], &s[57], true, &min, &max);
+  HadamardRotation(&s[55], &s[56], true, &min, &max);
+
+  // stage 30.
+  ButterflyRotation_4(&s[55], &s[40], 32, true);
+  ButterflyRotation_4(&s[54], &s[41], 32, true);
+  ButterflyRotation_4(&s[53], &s[42], 32, true);
+  ButterflyRotation_4(&s[52], &s[43], 32, true);
+  ButterflyRotation_4(&s[51], &s[44], 32, true);
+  ButterflyRotation_4(&s[50], &s[45], 32, true);
+  ButterflyRotation_4(&s[49], &s[46], 32, true);
+  ButterflyRotation_4(&s[48], &s[47], 32, true);
+
+  // stage 31.
+  for (int i = 0; i < 32; i += 4) {
+    HadamardRotation(&s[i], &s[63 - i], false, &min, &max);
+    HadamardRotation(&s[i + 1], &s[63 - i - 1], false, &min, &max);
+    HadamardRotation(&s[i + 2], &s[63 - i - 2], false, &min, &max);
+    HadamardRotation(&s[i + 3], &s[63 - i - 3], false, &min, &max);
+  }
+  //-- end dct 64 stages
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    for (int idx = 0; idx < 64; idx += 8) {
+      int32x4_t output[8];
+      Transpose4x4(&s[idx], &output[0]);
+      Transpose4x4(&s[idx + 4], &output[4]);
+      for (int i = 0; i < 8; ++i) {
+        output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
+      }
+      StoreDst<4>(dst, step, idx, &output[0]);
+      StoreDst<4>(dst, step, idx + 4, &output[4]);
+    }
+  } else {
+    StoreDst<64>(dst, step, 0, &s[0]);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
+                                      int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  int32x4_t s[8];
+  int32x4_t x[4];
+
+  LoadSrc<4>(dst, step, 0, x);
+  if (is_row) {
+    Transpose4x4(x, x);
+  }
+
+  // stage 1.
+  s[5] = vmulq_n_s32(x[3], kAdst4Multiplier[1]);
+  s[6] = vmulq_n_s32(x[3], kAdst4Multiplier[3]);
+
+  // stage 2.
+  const int32x4_t a7 = vsubq_s32(x[0], x[2]);
+  const int32x4_t b7 = vaddq_s32(a7, x[3]);
+
+  // stage 3.
+  s[0] = vmulq_n_s32(x[0], kAdst4Multiplier[0]);
+  s[1] = vmulq_n_s32(x[0], kAdst4Multiplier[1]);
+  // s[0] = s[0] + s[3]
+  s[0] = vmlaq_n_s32(s[0], x[2], kAdst4Multiplier[3]);
+  // s[1] = s[1] - s[4]
+  s[1] = vmlsq_n_s32(s[1], x[2], kAdst4Multiplier[0]);
+
+  s[3] = vmulq_n_s32(x[1], kAdst4Multiplier[2]);
+  s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
+
+  // stage 4.
+  s[0] = vaddq_s32(s[0], s[5]);
+  s[1] = vsubq_s32(s[1], s[6]);
+
+  // stages 5 and 6.
+  const int32x4_t x0 = vaddq_s32(s[0], s[3]);
+  const int32x4_t x1 = vaddq_s32(s[1], s[3]);
+  const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
+  const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
+  x[0] = vrshrq_n_s32(x0, 12);
+  x[1] = vrshrq_n_s32(x1, 12);
+  x[2] = vrshrq_n_s32(s[2], 12);
+  x[3] = vrshrq_n_s32(x3, 12);
+
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    x[0] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[0], v_row_shift)));
+    x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift)));
+    x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift)));
+    x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift)));
+    Transpose4x4(x, x);
+  }
+  StoreDst<4>(dst, step, 0, x);
+}
+
+alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
+                                                           2482};
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+                                       bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  int32x4_t s[2];
+
+  const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+  const int32x4_t v_src0_round =
+      vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+
+  const int32x4_t v_src = vbslq_s32(v_mask, v_src0_round, v_src0);
+  const int32x4_t kAdst4DcOnlyMultipliers = vld1q_s32(kAdst4DcOnlyMultiplier);
+  s[1] = vdupq_n_s32(0);
+
+  // s0*k0 s0*k1 s0*k2 s0*k1
+  s[0] = vmulq_s32(kAdst4DcOnlyMultipliers, v_src);
+  // 0     0     0     s0*k0
+  s[1] = vextq_s32(s[1], s[0], 1);
+
+  const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+  const int32x4_t dst_0 = vrshrq_n_s32(x3, 12);
+
+  // vqrshlq_s32 will shift right if the shift value is negative.
+  vst1q_s32(dst,
+            vmovl_s16(vqmovn_s32(vqrshlq_s32(dst_0, vdupq_n_s32(-row_shift)))));
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+                                             int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  int32x4_t s[4];
+
+  int i = 0;
+  do {
+    const int32x4_t v_src = vld1q_s32(&dst[i]);
+
+    s[0] = vmulq_n_s32(v_src, kAdst4Multiplier[0]);
+    s[1] = vmulq_n_s32(v_src, kAdst4Multiplier[1]);
+    s[2] = vmulq_n_s32(v_src, kAdst4Multiplier[2]);
+
+    const int32x4_t x0 = s[0];
+    const int32x4_t x1 = s[1];
+    const int32x4_t x2 = s[2];
+    const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+    const int32x4_t dst_0 = vrshrq_n_s32(x0, 12);
+    const int32x4_t dst_1 = vrshrq_n_s32(x1, 12);
+    const int32x4_t dst_2 = vrshrq_n_s32(x2, 12);
+    const int32x4_t dst_3 = vrshrq_n_s32(x3, 12);
+
+    vst1q_s32(&dst[i], dst_0);
+    vst1q_s32(&dst[i + width * 1], dst_1);
+    vst1q_s32(&dst[i + width * 2], dst_2);
+    vst1q_s32(&dst[i + width * 3], dst_3);
+
+    i += 4;
+  } while (i < width);
+
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row,
+                                      int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+  const int32x4_t min = vdupq_n_s32(-(1 << range));
+  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+  int32x4_t s[8], x[8];
+
+  if (is_row) {
+    LoadSrc<4>(dst, step, 0, &x[0]);
+    LoadSrc<4>(dst, step, 4, &x[4]);
+    Transpose4x4(&x[0], &x[0]);
+    Transpose4x4(&x[4], &x[4]);
+  } else {
+    LoadSrc<8>(dst, step, 0, &x[0]);
+  }
+
+  // stage 1.
+  s[0] = x[7];
+  s[1] = x[0];
+  s[2] = x[5];
+  s[3] = x[2];
+  s[4] = x[3];
+  s[5] = x[4];
+  s[6] = x[1];
+  s[7] = x[6];
+
+  // stage 2.
+  butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+  butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+  butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+  butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+  // stage 3.
+  HadamardRotation(&s[0], &s[4], false, &min, &max);
+  HadamardRotation(&s[1], &s[5], false, &min, &max);
+  HadamardRotation(&s[2], &s[6], false, &min, &max);
+  HadamardRotation(&s[3], &s[7], false, &min, &max);
+
+  // stage 4.
+  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+  // stage 5.
+  HadamardRotation(&s[0], &s[2], false, &min, &max);
+  HadamardRotation(&s[4], &s[6], false, &min, &max);
+  HadamardRotation(&s[1], &s[3], false, &min, &max);
+  HadamardRotation(&s[5], &s[7], false, &min, &max);
+
+  // stage 6.
+  butterfly_rotation(&s[2], &s[3], 32, true);
+  butterfly_rotation(&s[6], &s[7], 32, true);
+
+  // stage 7.
+  x[0] = s[0];
+  x[1] = vqnegq_s32(s[4]);
+  x[2] = s[6];
+  x[3] = vqnegq_s32(s[2]);
+  x[4] = s[3];
+  x[5] = vqnegq_s32(s[7]);
+  x[6] = s[5];
+  x[7] = vqnegq_s32(s[1]);
+
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    for (int i = 0; i < 8; ++i) {
+      x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
+    }
+    Transpose4x4(&x[0], &x[0]);
+    Transpose4x4(&x[4], &x[4]);
+    StoreDst<4>(dst, step, 0, &x[0]);
+    StoreDst<4>(dst, step, 4, &x[4]);
+  } else {
+    StoreDst<8>(dst, step, 0, &x[0]);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+                                       bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  int32x4_t s[8];
+
+  const int32x4_t v_src = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+  // stage 1.
+  s[1] = vbslq_s32(v_mask, v_src_round, v_src);
+
+  // stage 2.
+  ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+  // stage 3.
+  s[4] = s[0];
+  s[5] = s[1];
+
+  // stage 4.
+  ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+  // stage 5.
+  s[2] = s[0];
+  s[3] = s[1];
+  s[6] = s[4];
+  s[7] = s[5];
+
+  // stage 6.
+  ButterflyRotation_4(&s[2], &s[3], 32, true);
+  ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+  // stage 7.
+  int32x4_t x[8];
+  x[0] = s[0];
+  x[1] = vqnegq_s32(s[4]);
+  x[2] = s[6];
+  x[3] = vqnegq_s32(s[2]);
+  x[4] = s[3];
+  x[5] = vqnegq_s32(s[7]);
+  x[6] = s[5];
+  x[7] = vqnegq_s32(s[1]);
+
+  for (int i = 0; i < 8; ++i) {
+    // vqrshlq_s32 will shift right if the shift value is negative.
+    x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
+    vst1q_lane_s32(&dst[i], x[i], 0);
+  }
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+                                             int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  int32x4_t s[8];
+
+  int i = 0;
+  do {
+    const int32x4_t v_src = vld1q_s32(dst);
+    // stage 1.
+    s[1] = v_src;
+
+    // stage 2.
+    ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+    // stage 3.
+    s[4] = s[0];
+    s[5] = s[1];
+
+    // stage 4.
+    ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+    // stage 5.
+    s[2] = s[0];
+    s[3] = s[1];
+    s[6] = s[4];
+    s[7] = s[5];
+
+    // stage 6.
+    ButterflyRotation_4(&s[2], &s[3], 32, true);
+    ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+    // stage 7.
+    int32x4_t x[8];
+    x[0] = s[0];
+    x[1] = vqnegq_s32(s[4]);
+    x[2] = s[6];
+    x[3] = vqnegq_s32(s[2]);
+    x[4] = s[3];
+    x[5] = vqnegq_s32(s[7]);
+    x[6] = s[5];
+    x[7] = vqnegq_s32(s[1]);
+
+    for (int j = 0; j < 8; ++j) {
+      vst1q_s32(&dst[j * width], x[j]);
+    }
+    i += 4;
+    dst += 4;
+  } while (i < width);
+
+  return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
+                                       int row_shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
+  const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+  const int32x4_t min = vdupq_n_s32(-(1 << range));
+  const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+  int32x4_t s[16], x[16];
+
+  if (is_row) {
+    for (int idx = 0; idx < 16; idx += 8) {
+      LoadSrc<4>(dst, step, idx, &x[idx]);
+      LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+      Transpose4x4(&x[idx], &x[idx]);
+      Transpose4x4(&x[idx + 4], &x[idx + 4]);
+    }
+  } else {
+    LoadSrc<16>(dst, step, 0, &x[0]);
+  }
+
+  // stage 1.
+  s[0] = x[15];
+  s[1] = x[0];
+  s[2] = x[13];
+  s[3] = x[2];
+  s[4] = x[11];
+  s[5] = x[4];
+  s[6] = x[9];
+  s[7] = x[6];
+  s[8] = x[7];
+  s[9] = x[8];
+  s[10] = x[5];
+  s[11] = x[10];
+  s[12] = x[3];
+  s[13] = x[12];
+  s[14] = x[1];
+  s[15] = x[14];
+
+  // stage 2.
+  butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+  butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+  butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+  butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+  butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+  butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+  butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+  butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+  // stage 3.
+  HadamardRotation(&s[0], &s[8], false, &min, &max);
+  HadamardRotation(&s[1], &s[9], false, &min, &max);
+  HadamardRotation(&s[2], &s[10], false, &min, &max);
+  HadamardRotation(&s[3], &s[11], false, &min, &max);
+  HadamardRotation(&s[4], &s[12], false, &min, &max);
+  HadamardRotation(&s[5], &s[13], false, &min, &max);
+  HadamardRotation(&s[6], &s[14], false, &min, &max);
+  HadamardRotation(&s[7], &s[15], false, &min, &max);
+
+  // stage 4.
+  butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+  butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+  butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+  butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+  // stage 5.
+  HadamardRotation(&s[0], &s[4], false, &min, &max);
+  HadamardRotation(&s[8], &s[12], false, &min, &max);
+  HadamardRotation(&s[1], &s[5], false, &min, &max);
+  HadamardRotation(&s[9], &s[13], false, &min, &max);
+  HadamardRotation(&s[2], &s[6], false, &min, &max);
+  HadamardRotation(&s[10], &s[14], false, &min, &max);
+  HadamardRotation(&s[3], &s[7], false, &min, &max);
+  HadamardRotation(&s[11], &s[15], false, &min, &max);
+
+  // stage 6.
+  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+  butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+  butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+  // stage 7.
+  HadamardRotation(&s[0], &s[2], false, &min, &max);
+  HadamardRotation(&s[4], &s[6], false, &min, &max);
+  HadamardRotation(&s[8], &s[10], false, &min, &max);
+  HadamardRotation(&s[12], &s[14], false, &min, &max);
+  HadamardRotation(&s[1], &s[3], false, &min, &max);
+  HadamardRotation(&s[5], &s[7], false, &min, &max);
+  HadamardRotation(&s[9], &s[11], false, &min, &max);
+  HadamardRotation(&s[13], &s[15], false, &min, &max);
+
+  // stage 8.
+  butterfly_rotation(&s[2], &s[3], 32, true);
+  butterfly_rotation(&s[6], &s[7], 32, true);
+  butterfly_rotation(&s[10], &s[11], 32, true);
+  butterfly_rotation(&s[14], &s[15], 32, true);
+
+  // stage 9.
+  x[0] = s[0];
+  x[1] = vqnegq_s32(s[8]);
+  x[2] = s[12];
+  x[3] = vqnegq_s32(s[4]);
+  x[4] = s[6];
+  x[5] = vqnegq_s32(s[14]);
+  x[6] = s[10];
+  x[7] = vqnegq_s32(s[2]);
+  x[8] = s[3];
+  x[9] = vqnegq_s32(s[11]);
+  x[10] = s[15];
+  x[11] = vqnegq_s32(s[7]);
+  x[12] = s[5];
+  x[13] = vqnegq_s32(s[13]);
+  x[14] = s[9];
+  x[15] = vqnegq_s32(s[1]);
+
+  if (is_row) {
+    const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+    for (int i = 0; i < 16; ++i) {
+      x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
+    }
+    for (int idx = 0; idx < 16; idx += 8) {
+      Transpose4x4(&x[idx], &x[idx]);
+      Transpose4x4(&x[idx + 4], &x[idx + 4]);
+      StoreDst<4>(dst, step, idx, &x[idx]);
+      StoreDst<4>(dst, step, idx + 4, &x[idx + 4]);
+    }
+  } else {
+    StoreDst<16>(dst, step, 0, &x[0]);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int32x4_t* s, int32x4_t* x) {
+  // stage 2.
+  ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+  // stage 3.
+  s[8] = s[0];
+  s[9] = s[1];
+
+  // stage 4.
+  ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+  // stage 5.
+  s[4] = s[0];
+  s[12] = s[8];
+  s[5] = s[1];
+  s[13] = s[9];
+
+  // stage 6.
+  ButterflyRotation_4(&s[4], &s[5], 48, true);
+  ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+  // stage 7.
+  s[2] = s[0];
+  s[6] = s[4];
+  s[10] = s[8];
+  s[14] = s[12];
+  s[3] = s[1];
+  s[7] = s[5];
+  s[11] = s[9];
+  s[15] = s[13];
+
+  // stage 8.
+  ButterflyRotation_4(&s[2], &s[3], 32, true);
+  ButterflyRotation_4(&s[6], &s[7], 32, true);
+  ButterflyRotation_4(&s[10], &s[11], 32, true);
+  ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+  // stage 9.
+  x[0] = s[0];
+  x[1] = vqnegq_s32(s[8]);
+  x[2] = s[12];
+  x[3] = vqnegq_s32(s[4]);
+  x[4] = s[6];
+  x[5] = vqnegq_s32(s[14]);
+  x[6] = s[10];
+  x[7] = vqnegq_s32(s[2]);
+  x[8] = s[3];
+  x[9] = vqnegq_s32(s[11]);
+  x[10] = s[15];
+  x[11] = vqnegq_s32(s[7]);
+  x[12] = s[5];
+  x[13] = vqnegq_s32(s[13]);
+  x[14] = s[9];
+  x[15] = vqnegq_s32(s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+                                        bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  int32x4_t s[16];
+  int32x4_t x[16];
+  const int32x4_t v_src = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+  // stage 1.
+  s[1] = vbslq_s32(v_mask, v_src_round, v_src);
+
+  Adst16DcOnlyInternal(s, x);
+
+  for (int i = 0; i < 16; ++i) {
+    // vqrshlq_s32 will shift right if the shift value is negative.
+    x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
+    vst1q_lane_s32(&dst[i], x[i], 0);
+  }
+
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+                                              int adjusted_tx_height,
+                                              int width) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  int i = 0;
+  do {
+    int32x4_t s[16];
+    int32x4_t x[16];
+    const int32x4_t v_src = vld1q_s32(dst);
+    // stage 1.
+    s[1] = v_src;
+
+    Adst16DcOnlyInternal(s, x);
+
+    for (int j = 0; j < 16; ++j) {
+      vst1q_s32(&dst[j * width], x[j]);
+    }
+    i += 4;
+    dst += 4;
+  } while (i < width);
+
+  return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
+LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step, int shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
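+  // |v_dual_round| folds the rounding bias for the >> 12 multiplier shift
+  // together with the bias for the additional row shift so that a single
+  // (non-rounding) right shift of 12 + |shift| can be used below.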
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+  const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
+  const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+  for (int i = 0; i < 4; ++i) {
+    const int32x4_t v_src = vld1q_s32(&dst[i * step]);
+    const int32x4_t v_src_mult_lo =
+        vmlaq_s32(v_dual_round, v_src, v_multiplier);
+    const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+    vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(shift_lo)));
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+                                           bool should_round, int tx_height) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+  const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+  const int shift = tx_height < 16 ? 0 : 1;
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+  const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
+  const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+  const int32x4_t v_src_mult_lo = vmlaq_s32(v_dual_round, v_src, v_multiplier);
+  const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+  vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+  return true;
+}
+
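+// Applies the identity column scaling, the column shift of 4 and the
+// add-to-frame step in one pass; the results are clamped to the 10-bit pixel
+// range.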
+template <int identity_size>
+LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
+    Array2DView<uint16_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height, const int32_t* source) {
+  static_assert(identity_size == 4 || identity_size == 8 || identity_size == 16,
+                "Invalid identity_size.");
+  const int stride = frame.columns();
+  uint16_t* dst = frame[start_y] + start_x;
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11);
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+
+  if (tx_width == 4) {
+    int i = 0;
+    do {
+      int32x4x2_t v_src, v_dst_i, a, b;
+      v_src.val[0] = vld1q_s32(&source[i * 4]);
+      v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]);
+      if (identity_size == 4) {
+        v_dst_i.val[0] =
+            vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+        v_dst_i.val[1] =
+            vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+        a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+        a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+      } else if (identity_size == 8) {
+        v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+        v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+        a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+        a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+      } else {  // identity_size == 16
+        v_dst_i.val[0] =
+            vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+        v_dst_i.val[1] =
+            vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+        a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+        a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+      }
+      uint16x4x2_t frame_data;
+      frame_data.val[0] = vld1_u16(dst);
+      frame_data.val[1] = vld1_u16(dst + stride);
+      b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+      b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+      vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+      vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+      dst += stride << 1;
+      i += 2;
+    } while (i < tx_height);
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        int32x4x2_t v_src, v_dst_i, a, b;
+        v_src.val[0] = vld1q_s32(&source[row + j]);
+        v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+        if (identity_size == 4) {
+          v_dst_i.val[0] =
+              vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+          v_dst_i.val[1] =
+              vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+          a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+          a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+        } else if (identity_size == 8) {
+          v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+          v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+          a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+          a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+        } else {  // identity_size == 16
+          v_dst_i.val[0] =
+              vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+          v_dst_i.val[1] =
+              vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+          a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+          a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+        }
+        uint16x4x2_t frame_data;
+        frame_data.val[0] = vld1_u16(dst + j);
+        frame_data.val[1] = vld1_u16(dst + j + 4);
+        b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+        b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+        vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+        vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+        j += 8;
+      } while (j < tx_width);
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+    Array2DView<uint16_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int tx_height, const int32_t* source) {
+  const int stride = frame.columns();
+  uint16_t* dst = frame[start_y] + start_x;
+  const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11);
+  const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+
+  if (tx_width == 4) {
+    int i = 0;
+    do {
+      const int32x4_t v_src = vld1q_s32(&source[i * 4]);
+      const int32x4_t v_dst_row =
+          vshrq_n_s32(vmlaq_n_s32(v_round, v_src, kIdentity4Multiplier), 12);
+      const int32x4_t v_dst_col =
+          vmlaq_n_s32(v_round, v_dst_row, kIdentity4Multiplier);
+      const uint16x4_t frame_data = vld1_u16(dst);
+      const int32x4_t a = vrshrq_n_s32(v_dst_col, 4 + 12);
+      const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data));
+      vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth));
+      dst += stride;
+    } while (++i < tx_height);
+  } else {
+    int i = 0;
+    do {
+      const int row = i * tx_width;
+      int j = 0;
+      do {
+        int32x4x2_t v_src, v_src_round, v_dst_row, v_dst_col, a, b;
+        v_src.val[0] = vld1q_s32(&source[row + j]);
+        v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+        v_src_round.val[0] = vshrq_n_s32(
+            vmlaq_n_s32(v_round, v_src.val[0], kTransformRowMultiplier), 12);
+        v_src_round.val[1] = vshrq_n_s32(
+            vmlaq_n_s32(v_round, v_src.val[1], kTransformRowMultiplier), 12);
+        v_dst_row.val[0] = vqaddq_s32(v_src_round.val[0], v_src_round.val[0]);
+        v_dst_row.val[1] = vqaddq_s32(v_src_round.val[1], v_src_round.val[1]);
+        v_dst_col.val[0] =
+            vmlaq_n_s32(v_round, v_dst_row.val[0], kIdentity4Multiplier);
+        v_dst_col.val[1] =
+            vmlaq_n_s32(v_round, v_dst_row.val[1], kIdentity4Multiplier);
+        uint16x4x2_t frame_data;
+        frame_data.val[0] = vld1_u16(dst + j);
+        frame_data.val[1] = vld1_u16(dst + j + 4);
+        a.val[0] = vrshrq_n_s32(v_dst_col.val[0], 4 + 12);
+        a.val[1] = vrshrq_n_s32(v_dst_col.val[1], 4 + 12);
+        b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+        b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+        vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+        vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+        j += 8;
+      } while (j < tx_width);
+      dst += stride;
+    } while (++i < tx_height);
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
+  auto* const dst = static_cast<int32_t*>(dest);
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height equal to 32 can be simplified from
+  // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
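+  // For example, with A = 5: ((5 * 2) + 2) >> 2 == 12 >> 2 == 3 and
+  // (5 + 1) >> 1 == 3; the forms agree for every integer A because
+  // 2 * A + 2 == 2 * (A + 1).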
+  for (int i = 0; i < 4; ++i) {
+    const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+    const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+    const int32x4_t a_lo = vrshrq_n_s32(v_src_lo, 1);
+    const int32x4_t a_hi = vrshrq_n_s32(v_src_hi, 1);
+    vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(a_lo)));
+    vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(a_hi)));
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
+  auto* const dst = static_cast<int32_t*>(dest);
+
+  for (int i = 0; i < 4; ++i) {
+    const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+    const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+    const int32x4_t v_srcx2_lo = vqaddq_s32(v_src_lo, v_src_lo);
+    const int32x4_t v_srcx2_hi = vqaddq_s32(v_src_hi, v_src_hi);
+    vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(v_srcx2_lo)));
+    vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(v_srcx2_hi)));
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+                                           bool should_round, int row_shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
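+  // |v_mask| is all ones when |should_round| is set, so the vbslq_s32 below
+  // selects the pre-rounded value without a branch.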
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+  const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+  const int32x4_t v_srcx2 = vaddq_s32(v_src, v_src);
+  const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
+  vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+  return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
+                                              int shift) {
+  auto* const dst = static_cast<int32_t*>(dest);
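+  // |v_dual_round| folds the rounding bias for the >> 12 multiplier shift
+  // together with the bias for the additional row shift, so a single
+  // saturating shift by -(12 + shift) completes the calculation.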
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+  const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      int32x4x2_t v_src;
+      v_src.val[0] = vld1q_s32(&dst[i * step + j * 8]);
+      v_src.val[1] = vld1q_s32(&dst[i * step + j * 8 + 4]);
+      const int32x4_t v_src_mult_lo =
+          vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+      const int32x4_t v_src_mult_hi =
+          vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+      const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+      const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+      vst1q_s32(&dst[i * step + j * 8], vmovl_s16(vqmovn_s32(shift_lo)));
+      vst1q_s32(&dst[i * step + j * 8 + 4], vmovl_s16(vqmovn_s32(shift_hi)));
+    }
+  }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+                                            bool should_round, int shift) {
+  if (adjusted_tx_height > 1) return false;
+
+  auto* dst = static_cast<int32_t*>(dest);
+  const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+  const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+  const int32x4_t v_src_round =
+      vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+  const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+  const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+  const int32x4_t v_src_mult_lo =
+      vmlaq_n_s32(v_dual_round, v_src, kIdentity16Multiplier);
+  const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, vdupq_n_s32(-(12 + shift)));
+  vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+  return true;
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int32_t* source, int tx_width) {
+  if (tx_width >= 16) {
+    int i = 0;
+    do {
+      // 00 01 02 03
+      const int32x4_t a = vld1q_s32(&source[i]);
+      const int32x4_t b = vld1q_s32(&source[i + 4]);
+      const int32x4_t c = vld1q_s32(&source[i + 8]);
+      const int32x4_t d = vld1q_s32(&source[i + 12]);
+      // 01 00 03 02
+      const int32x4_t a_rev = vrev64q_s32(a);
+      const int32x4_t b_rev = vrev64q_s32(b);
+      const int32x4_t c_rev = vrev64q_s32(c);
+      const int32x4_t d_rev = vrev64q_s32(d);
+      // 03 02 01 00
+      vst1q_s32(&source[i], vextq_s32(d_rev, d_rev, 2));
+      vst1q_s32(&source[i + 4], vextq_s32(c_rev, c_rev, 2));
+      vst1q_s32(&source[i + 8], vextq_s32(b_rev, b_rev, 2));
+      vst1q_s32(&source[i + 12], vextq_s32(a_rev, a_rev, 2));
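+      // Writing the internally reversed vectors back in the order d, c, b, a
+      // completes the reversal of these 16 values.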
+      i += 16;
+    } while (i < tx_width * tx_height);
+  } else if (tx_width == 8) {
+    for (int i = 0; i < 8 * tx_height; i += 8) {
+      // 00 01 02 03
+      const int32x4_t a = vld1q_s32(&source[i]);
+      const int32x4_t b = vld1q_s32(&source[i + 4]);
+      // 01 00 03 02
+      const int32x4_t a_rev = vrev64q_s32(a);
+      const int32x4_t b_rev = vrev64q_s32(b);
+      // 03 02 01 00
+      vst1q_s32(&source[i], vextq_s32(b_rev, b_rev, 2));
+      vst1q_s32(&source[i + 4], vextq_s32(a_rev, a_rev, 2));
+    }
+  } else {
+    // Process two rows per iteration.
+    for (int i = 0; i < 4 * tx_height; i += 8) {
+      // 00 01 02 03
+      const int32x4_t a = vld1q_s32(&source[i]);
+      const int32x4_t b = vld1q_s32(&source[i + 4]);
+      // 01 00 03 02
+      const int32x4_t a_rev = vrev64q_s32(a);
+      const int32x4_t b_rev = vrev64q_s32(b);
+      // 03 02 01 00
+      vst1q_s32(&source[i], vextq_s32(a_rev, a_rev, 2));
+      vst1q_s32(&source[i + 4], vextq_s32(b_rev, b_rev, 2));
+    }
+  }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int32_t* source, int num_rows) {
+  // Process two rows per iteration.
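+  // vqrdmulhq_n_s32 computes (2 * a * b + (1 << 31)) >> 32 with rounding, so
+  // pre-shifting the 12-bit row multiplier up by (31 - 12) bits turns each
+  // multiply below into a rounded multiply by kTransformRowMultiplier / 4096.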
+  int i = 0;
+  do {
+    const int32x4_t a_lo = vld1q_s32(&source[i]);
+    const int32x4_t a_hi = vld1q_s32(&source[i + 4]);
+    const int32x4_t b_lo =
+        vqrdmulhq_n_s32(a_lo, kTransformRowMultiplier << (31 - 12));
+    const int32x4_t b_hi =
+        vqrdmulhq_n_s32(a_hi, kTransformRowMultiplier << (31 - 12));
+    vst1q_s32(&source[i], b_lo);
+    vst1q_s32(&source[i + 4], b_hi);
+    i += 8;
+  } while (i < tx_width * num_rows);
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows,
+                                    int row_shift) {
+  // vqrshlq_s32 will shift right if shift value is negative.
+  row_shift = -row_shift;
+
+  // Process two rows per iteration.
+  int i = 0;
+  do {
+    const int32x4_t residual0 = vld1q_s32(&source[i]);
+    const int32x4_t residual1 = vld1q_s32(&source[i + 4]);
+    vst1q_s32(&source[i], vqrshlq_s32(residual0, vdupq_n_s32(row_shift)));
+    vst1q_s32(&source[i + 4], vqrshlq_s32(residual1, vdupq_n_s32(row_shift)));
+    i += 8;
+  } while (i < tx_width * num_rows);
+}
+
+template <int tx_height, bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+    Array2DView<uint16_t> frame, const int start_x, const int start_y,
+    const int tx_width, const int32_t* source, TransformType tx_type) {
+  const bool flip_rows =
+      enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+  const int stride = frame.columns();
+  uint16_t* dst = frame[start_y] + start_x;
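+  // Round-shift each residual by 4 bits, add it to the corresponding frame
+  // pixel and clamp the sum to the 10bpp maximum before storing.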
+
+  if (tx_width == 4) {
+    for (int i = 0; i < tx_height; ++i) {
+      const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+      const int32x4_t residual = vld1q_s32(&source[row]);
+      const uint16x4_t frame_data = vld1_u16(dst);
+      const int32x4_t a = vrshrq_n_s32(residual, 4);
+      const uint32x4_t b = vaddw_u16(vreinterpretq_u32_s32(a), frame_data);
+      const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+      vst1_u16(dst, vmin_u16(d, vdup_n_u16((1 << kBitdepth10) - 1)));
+      dst += stride;
+    }
+  } else {
+    for (int i = 0; i < tx_height; ++i) {
+      const int y = start_y + i;
+      const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+      int j = 0;
+      do {
+        const int x = start_x + j;
+        const int32x4_t residual = vld1q_s32(&source[row + j]);
+        const int32x4_t residual_hi = vld1q_s32(&source[row + j + 4]);
+        const uint16x8_t frame_data = vld1q_u16(frame[y] + x);
+        const int32x4_t a = vrshrq_n_s32(residual, 4);
+        const int32x4_t a_hi = vrshrq_n_s32(residual_hi, 4);
+        const uint32x4_t b =
+            vaddw_u16(vreinterpretq_u32_s32(a), vget_low_u16(frame_data));
+        const uint32x4_t b_hi =
+            vaddw_u16(vreinterpretq_u32_s32(a_hi), vget_high_u16(frame_data));
+        const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+        const uint16x4_t d_hi = vqmovun_s32(vreinterpretq_s32_u32(b_hi));
+        vst1q_u16(frame[y] + x, vminq_u16(vcombine_u16(d, d_hi),
+                                          vdupq_n_u16((1 << kBitdepth10) - 1)));
+        j += 8;
+      } while (j < tx_width);
+    }
+  }
+}
+
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+                               int adjusted_tx_height, void* src_buffer,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+  const int row_shift = (tx_height == 16);
+
+  if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d dct4 rows in parallel per iteration.
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    Dct4_NEON<ButterflyRotation_4>(data, /*step=*/4, /*is_row=*/true,
+                                   row_shift);
+    data += 16;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height, void* src_buffer,
+                                  int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d dct4 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct4_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+                                     /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+                               int adjusted_tx_height, void* src_buffer,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d dct8 rows in parallel per iteration.
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    Dct8_NEON<ButterflyRotation_4>(data, /*step=*/8, /*is_row=*/true,
+                                   row_shift);
+    data += 32;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height, void* src_buffer,
+                                  int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d dct8 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct8_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+                                     /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  assert(adjusted_tx_height % 4 == 0);
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    // Process 4 1d dct16 rows in parallel per iteration.
+    Dct16_NEON<ButterflyRotation_4>(data, 16, /*is_row=*/true, row_shift);
+    data += 64;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height, void* src_buffer,
+                                   int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d dct16 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+                                      /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<32>(src, adjusted_tx_height);
+  }
+
+  assert(adjusted_tx_height % 4 == 0);
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    // Process 4 1d dct32 rows in parallel per iteration.
+    Dct32_NEON(data, 32, /*is_row=*/true, row_shift);
+    data += 128;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height, void* src_buffer,
+                                   int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<32>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d dct32 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<64>(src, adjusted_tx_height);
+  }
+
+  assert(adjusted_tx_height % 4 == 0);
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    // Process 4 1d dct64 rows in parallel per iteration.
+    Dct64_NEON(data, 64, /*is_row=*/true, row_shift);
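+    // Advance past the 4 rows just processed (4 rows * 64 coefficients).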
+    data += 128 * 2;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height, void* src_buffer,
+                                   int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<64>(src, tx_width);
+  }
+
+  if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d dct64 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const int row_shift = static_cast<int>(tx_height == 16);
+  const bool should_round = (tx_height == 8);
+
+  if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d adst4 rows in parallel per iteration.
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    Adst4_NEON(data, /*step=*/4, /*is_row=*/true, row_shift);
+    data += 16;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height, void* src_buffer,
+                                   int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d adst4 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Adst4_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                      tx_width, src, tx_type);
+}
+
+void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d adst8 rows in parallel per iteration.
+  assert(adjusted_tx_height % 4 == 0);
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    Adst8_NEON<ButterflyRotation_4>(data, /*step=*/8,
+                                    /*transpose=*/true, row_shift);
+    data += 32;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height, void* src_buffer,
+                                   int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+
+  if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    // Process 4 1d adst8 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Adst8_NEON<ButterflyRotation_4>(data, tx_width, /*transpose=*/false,
+                                      /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                      tx_width, src, tx_type);
+}
+
+void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                 TransformSize tx_size, int adjusted_tx_height,
+                                 void* src_buffer, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  assert(adjusted_tx_height % 4 == 0);
+  int i = adjusted_tx_height;
+  do {
+    // Process 4 1d adst16 rows in parallel per iteration.
+    Adst16_NEON<ButterflyRotation_4>(src, 16, /*is_row=*/true, row_shift);
+    src += 64;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Adst16TransformLoopColumn_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+
+  if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+    int i = tx_width;
+    auto* data = src;
+    do {
+      // Process 4 1d adst16 columns in parallel per iteration.
+      Adst16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+                                       /*row_shift=*/0);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+                                                       tx_width, src, tx_type);
+}
+
+void Identity4TransformLoopRow_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int /*start_x*/, int /*start_y*/,
+                                    void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize4x4) {
+    return;
+  }
+
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+
+  if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  const int shift = tx_height > 8 ? 1 : 0;
+  int i = adjusted_tx_height;
+  do {
+    Identity4_NEON(src, /*step=*/4, shift);
+    src += 16;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity4TransformLoopColumn_NEON(TransformType tx_type,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height, void* src_buffer,
+                                       int start_x, int start_y,
+                                       void* dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  // Special case: Process row calculations during column transform call.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+    Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                                   adjusted_tx_height, src);
+    return;
+  }
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<4>(src, tx_width);
+  }
+
+  IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
+                                adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int /*start_x*/, int /*start_y*/,
+                                    void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize8x4) {
+    return;
+  }
+
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 16 can be simplified
+  // from (((A * 2) + 1) >> 1) to A. For 10bpp, A must be clamped to a signed
+  // 16-bit value.
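+  // For example, with A = 7: ((7 * 2) + 1) >> 1 == 15 >> 1 == 7, i.e. the
+  // multiply, bias and shift cancel exactly, leaving only the clamp.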
+  if ((tx_height & 0x18) != 0) {
+    for (int i = 0; i < tx_height; ++i) {
+      const int32x4_t v_src_lo = vld1q_s32(&src[i * 8]);
+      const int32x4_t v_src_hi = vld1q_s32(&src[(i * 8) + 4]);
+      vst1q_s32(&src[i * 8], vmovl_s16(vqmovn_s32(v_src_lo)));
+      vst1q_s32(&src[(i * 8) + 4], vmovl_s16(vqmovn_s32(v_src_hi)));
+    }
+    return;
+  }
+  if (tx_height == 32) {
+    int i = adjusted_tx_height;
+    do {
+      Identity8Row32_NEON(src, /*step=*/8);
+      src += 32;
+      i -= 4;
+    } while (i != 0);
+    return;
+  }
+
+  assert(tx_size == kTransformSize8x4);
+  int i = adjusted_tx_height;
+  do {
+    Identity8Row4_NEON(src, /*step=*/8);
+    src += 32;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity8TransformLoopColumn_NEON(TransformType tx_type,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height, void* src_buffer,
+                                       int start_x, int start_y,
+                                       void* dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<8>(src, tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
+                                adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int /*start_x*/, int /*start_y*/,
+                                     void* /*dst_frame*/) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+  int i = adjusted_tx_height;
+  do {
+    Identity16Row_NEON(src, /*step=*/16, row_shift);
+    src += 64;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity16TransformLoopColumn_NEON(TransformType tx_type,
+                                        TransformSize tx_size,
+                                        int adjusted_tx_height,
+                                        void* src_buffer, int start_x,
+                                        int start_y, void* dst_frame) {
+  auto* src = static_cast<int32_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (kTransformFlipColumnsMask.Contains(tx_type)) {
+    FlipColumns<16>(src, tx_width);
+  }
+  auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+  IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
+                                 adjusted_tx_height, src);
+}
+
+//------------------------------------------------------------------------------
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  // Maximum transform size for Dct is 64.
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+      Dct4TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+      Dct4TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+      Dct8TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+      Dct8TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+      Dct16TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+      Dct16TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+      Dct32TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+      Dct32TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+      Dct64TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+      Dct64TransformLoopColumn_NEON;
+
+  // Maximum transform size for Adst is 16.
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+      Adst4TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+      Adst4TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+      Adst8TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+      Adst8TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+      Adst16TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+      Adst16TransformLoopColumn_NEON;
+
+  // Maximum transform size for Identity transform is 32.
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+      Identity4TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+      Identity4TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+      Identity8TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+      Identity8TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+      Identity16TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+      Identity16TransformLoopColumn_NEON;
+}
+
+}  // namespace
+
+void InverseTransformInit10bpp_NEON() { Init10bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_ENABLE_NEON || LIBGAV1_MAX_BITDEPTH < 10
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit10bpp_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/libgav1/src/dsp/arm/inverse_transform_neon.cc b/libgav1/src/dsp/arm/inverse_transform_neon.cc
index 5ad53f6..315d5e9 100644
--- a/libgav1/src/dsp/arm/inverse_transform_neon.cc
+++ b/libgav1/src/dsp/arm/inverse_transform_neon.cc
@@ -85,6 +85,8 @@
   out[3] = vcombine_s16(d3, d3);
 }
 
+// Note this is only used in the final stage of Dct32/64 and Adst16 as the
+// in-place version causes additional stack usage with clang.
 LIBGAV1_ALWAYS_INLINE void Transpose8x8(const int16x8_t in[8],
                                         int16x8_t out[8]) {
   // Swap 16 bit elements. Goes from:
@@ -392,8 +394,8 @@
   // inside 12 bits.  This leaves room for the sign bit and the 3 left shifted
   // bits.
   assert(sin128 <= 0xfff);
-  const int16x8_t x = vqrdmulhq_s16(*b, vdupq_n_s16(-sin128 << 3));
-  const int16x8_t y = vqrdmulhq_s16(*b, vdupq_n_s16(cos128 << 3));
+  const int16x8_t x = vqrdmulhq_n_s16(*b, -sin128 << 3);
+  const int16x8_t y = vqrdmulhq_n_s16(*b, cos128 << 3);
   if (flip) {
     *a = y;
     *b = x;
@@ -409,8 +411,8 @@
                                                           const bool flip) {
   const int16_t cos128 = Cos128(angle);
   const int16_t sin128 = Sin128(angle);
-  const int16x8_t x = vqrdmulhq_s16(*a, vdupq_n_s16(cos128 << 3));
-  const int16x8_t y = vqrdmulhq_s16(*a, vdupq_n_s16(sin128 << 3));
+  const int16x8_t x = vqrdmulhq_n_s16(*a, cos128 << 3);
+  const int16x8_t y = vqrdmulhq_n_s16(*a, sin128 << 3);
   if (flip) {
     *a = y;
     *b = x;
@@ -441,23 +443,18 @@
 // Discrete Cosine Transforms (DCT).
 
 template <int width>
-LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, const void* source,
-                                     int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
                                      bool should_round, int row_shift) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
-
-  const int16x8_t v_src = vdupq_n_s16(src[0]);
+  const int16x8_t v_src = vdupq_n_s16(dst[0]);
   const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
   const int16x8_t v_src_round =
       vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
   const int16x8_t s0 = vbslq_s16(v_mask, v_src_round, v_src);
   const int16_t cos128 = Cos128(32);
-  const int16x8_t xy = vqrdmulhq_s16(s0, vdupq_n_s16(cos128 << 3));
+  const int16x8_t xy = vqrdmulhq_n_s16(s0, cos128 << 3);
   // vqrshlq_s16 will shift right if shift value is negative.
   const int16x8_t xy_shifted = vqrshlq_s16(xy, vdupq_n_s16(-row_shift));
 
@@ -473,27 +470,23 @@
 }
 
 template <int height>
-LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, const void* source,
-                                           int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
                                            int width) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   const int16_t cos128 = Cos128(32);
 
   // Calculate dc values for first row.
   if (width == 4) {
-    const int16x4_t v_src = vld1_s16(src);
-    const int16x4_t xy = vqrdmulh_s16(v_src, vdup_n_s16(cos128 << 3));
+    const int16x4_t v_src = vld1_s16(dst);
+    const int16x4_t xy = vqrdmulh_n_s16(v_src, cos128 << 3);
     vst1_s16(dst, xy);
   } else {
     int i = 0;
     do {
-      const int16x8_t v_src = vld1q_s16(&src[i]);
-      const int16x8_t xy = vqrdmulhq_s16(v_src, vdupq_n_s16(cos128 << 3));
+      const int16x8_t v_src = vld1q_s16(&dst[i]);
+      const int16x8_t xy = vqrdmulhq_n_s16(v_src, cos128 << 3);
       vst1q_s16(&dst[i], xy);
       i += 8;
     } while (i < width);
@@ -501,21 +494,21 @@
 
   // Copy first row to the rest of the block.
   for (int y = 1; y < height; ++y) {
-    memcpy(&dst[y * width], &src[(y - 1) * width], width * sizeof(dst[0]));
+    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
   }
   return true;
 }
 
-template <ButterflyRotationFunc bufferfly_rotation,
-          bool is_fast_bufferfly = false>
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
 LIBGAV1_ALWAYS_INLINE void Dct4Stages(int16x8_t* s) {
   // stage 12.
-  if (is_fast_bufferfly) {
+  if (is_fast_butterfly) {
     ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
     ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
   } else {
-    bufferfly_rotation(&s[0], &s[1], 32, true);
-    bufferfly_rotation(&s[2], &s[3], 48, false);
+    butterfly_rotation(&s[0], &s[1], 32, true);
+    butterfly_rotation(&s[2], &s[3], 48, false);
   }
 
   // stage 17.
@@ -523,23 +516,21 @@
   HadamardRotation(&s[1], &s[2], false);
 }
 
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, const void* source,
-                                     int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   int16x8_t s[4], x[4];
 
   if (stage_is_rectangular) {
     if (transpose) {
       int16x8_t input[8];
-      LoadSrc<8, 8>(src, step, 0, input);
+      LoadSrc<8, 8>(dst, step, 0, input);
       Transpose4x8To8x4(input, x);
     } else {
-      LoadSrc<16, 4>(src, step, 0, x);
+      LoadSrc<16, 4>(dst, step, 0, x);
     }
   } else {
-    LoadSrc<8, 4>(src, step, 0, x);
+    LoadSrc<8, 4>(dst, step, 0, x);
     if (transpose) {
       Transpose4x4(x, x);
     }
@@ -552,7 +543,7 @@
   s[2] = x[1];
   s[3] = x[3];
 
-  Dct4Stages<bufferfly_rotation>(s);
+  Dct4Stages<butterfly_rotation>(s);
 
   if (stage_is_rectangular) {
     if (transpose) {
@@ -570,16 +561,16 @@
   }
 }
 
-template <ButterflyRotationFunc bufferfly_rotation,
-          bool is_fast_bufferfly = false>
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
 LIBGAV1_ALWAYS_INLINE void Dct8Stages(int16x8_t* s) {
   // stage 8.
-  if (is_fast_bufferfly) {
+  if (is_fast_butterfly) {
     ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
     ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
   } else {
-    bufferfly_rotation(&s[4], &s[7], 56, false);
-    bufferfly_rotation(&s[5], &s[6], 24, false);
+    butterfly_rotation(&s[4], &s[7], 56, false);
+    butterfly_rotation(&s[5], &s[6], 24, false);
   }
 
   // stage 13.
@@ -587,7 +578,7 @@
   HadamardRotation(&s[6], &s[7], true);
 
   // stage 18.
-  bufferfly_rotation(&s[6], &s[5], 32, true);
+  butterfly_rotation(&s[6], &s[5], 32, true);
 
   // stage 22.
   HadamardRotation(&s[0], &s[7], false);
@@ -597,27 +588,24 @@
 }
 
 // Process dct8 rows or columns, depending on the transpose flag.
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, const void* source,
-                                     int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool transpose) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   int16x8_t s[8], x[8];
 
   if (stage_is_rectangular) {
     if (transpose) {
       int16x8_t input[4];
-      LoadSrc<16, 4>(src, step, 0, input);
+      LoadSrc<16, 4>(dst, step, 0, input);
       Transpose8x4To4x8(input, x);
     } else {
-      LoadSrc<8, 8>(src, step, 0, x);
+      LoadSrc<8, 8>(dst, step, 0, x);
     }
   } else if (transpose) {
-    int16x8_t input[8];
-    LoadSrc<16, 8>(src, step, 0, input);
-    Transpose8x8(input, x);
+    LoadSrc<16, 8>(dst, step, 0, x);
+    dsp::Transpose8x8(x);
   } else {
-    LoadSrc<16, 8>(src, step, 0, x);
+    LoadSrc<16, 8>(dst, step, 0, x);
   }
 
   // stage 1.
@@ -631,8 +619,8 @@
   s[6] = x[3];
   s[7] = x[7];
 
-  Dct4Stages<bufferfly_rotation>(s);
-  Dct8Stages<bufferfly_rotation>(s);
+  Dct4Stages<butterfly_rotation>(s);
+  Dct8Stages<butterfly_rotation>(s);
 
   if (stage_is_rectangular) {
     if (transpose) {
@@ -643,28 +631,27 @@
       StoreDst<8, 8>(dst, step, 0, s);
     }
   } else if (transpose) {
-    int16x8_t output[8];
-    Transpose8x8(s, output);
-    StoreDst<16, 8>(dst, step, 0, output);
+    dsp::Transpose8x8(s);
+    StoreDst<16, 8>(dst, step, 0, s);
   } else {
     StoreDst<16, 8>(dst, step, 0, s);
   }
 }
 
-template <ButterflyRotationFunc bufferfly_rotation,
-          bool is_fast_bufferfly = false>
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
 LIBGAV1_ALWAYS_INLINE void Dct16Stages(int16x8_t* s) {
   // stage 5.
-  if (is_fast_bufferfly) {
+  if (is_fast_butterfly) {
     ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
     ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
     ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
     ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
   } else {
-    bufferfly_rotation(&s[8], &s[15], 60, false);
-    bufferfly_rotation(&s[9], &s[14], 28, false);
-    bufferfly_rotation(&s[10], &s[13], 44, false);
-    bufferfly_rotation(&s[11], &s[12], 12, false);
+    butterfly_rotation(&s[8], &s[15], 60, false);
+    butterfly_rotation(&s[9], &s[14], 28, false);
+    butterfly_rotation(&s[10], &s[13], 44, false);
+    butterfly_rotation(&s[11], &s[12], 12, false);
   }
 
   // stage 9.
@@ -674,8 +661,8 @@
   HadamardRotation(&s[14], &s[15], true);
 
   // stage 14.
-  bufferfly_rotation(&s[14], &s[9], 48, true);
-  bufferfly_rotation(&s[13], &s[10], 112, true);
+  butterfly_rotation(&s[14], &s[9], 48, true);
+  butterfly_rotation(&s[13], &s[10], 112, true);
 
   // stage 19.
   HadamardRotation(&s[8], &s[11], false);
@@ -684,8 +671,8 @@
   HadamardRotation(&s[13], &s[14], true);
 
   // stage 23.
-  bufferfly_rotation(&s[13], &s[10], 32, true);
-  bufferfly_rotation(&s[12], &s[11], 32, true);
+  butterfly_rotation(&s[13], &s[10], 32, true);
+  butterfly_rotation(&s[12], &s[11], 32, true);
 
   // stage 26.
   HadamardRotation(&s[0], &s[15], false);
@@ -699,32 +686,29 @@
 }
 
 // Process dct16 rows or columns, depending on the transpose flag.
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, const void* source,
-                                      int32_t step, bool is_row,
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
                                       int row_shift) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   int16x8_t s[16], x[16];
 
   if (stage_is_rectangular) {
     if (is_row) {
       int16x8_t input[4];
-      LoadSrc<16, 4>(src, step, 0, input);
+      LoadSrc<16, 4>(dst, step, 0, input);
       Transpose8x4To4x8(input, x);
-      LoadSrc<16, 4>(src, step, 8, input);
+      LoadSrc<16, 4>(dst, step, 8, input);
       Transpose8x4To4x8(input, &x[8]);
     } else {
-      LoadSrc<8, 16>(src, step, 0, x);
+      LoadSrc<8, 16>(dst, step, 0, x);
     }
   } else if (is_row) {
     for (int idx = 0; idx < 16; idx += 8) {
-      int16x8_t input[8];
-      LoadSrc<16, 8>(src, step, idx, input);
-      Transpose8x8(input, &x[idx]);
+      LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+      dsp::Transpose8x8(&x[idx]);
     }
   } else {
-    LoadSrc<16, 16>(src, step, 0, x);
+    LoadSrc<16, 16>(dst, step, 0, x);
   }
 
   // stage 1
@@ -746,9 +730,9 @@
   s[14] = x[7];
   s[15] = x[15];
 
-  Dct4Stages<bufferfly_rotation>(s);
-  Dct8Stages<bufferfly_rotation>(s);
-  Dct16Stages<bufferfly_rotation>(s);
+  Dct4Stages<butterfly_rotation>(s);
+  Dct8Stages<butterfly_rotation>(s);
+  Dct16Stages<butterfly_rotation>(s);
 
   if (is_row) {
     const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
@@ -769,16 +753,15 @@
     }
   } else if (is_row) {
     for (int idx = 0; idx < 16; idx += 8) {
-      int16x8_t output[8];
-      Transpose8x8(&s[idx], output);
-      StoreDst<16, 8>(dst, step, idx, output);
+      dsp::Transpose8x8(&s[idx]);
+      StoreDst<16, 8>(dst, step, idx, &s[idx]);
     }
   } else {
     StoreDst<16, 16>(dst, step, 0, s);
   }
 }
 
-template <ButterflyRotationFunc bufferfly_rotation,
+template <ButterflyRotationFunc butterfly_rotation,
           bool is_fast_butterfly = false>
 LIBGAV1_ALWAYS_INLINE void Dct32Stages(int16x8_t* s) {
   // stage 3
@@ -792,14 +775,14 @@
     ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
     ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
   } else {
-    bufferfly_rotation(&s[16], &s[31], 62, false);
-    bufferfly_rotation(&s[17], &s[30], 30, false);
-    bufferfly_rotation(&s[18], &s[29], 46, false);
-    bufferfly_rotation(&s[19], &s[28], 14, false);
-    bufferfly_rotation(&s[20], &s[27], 54, false);
-    bufferfly_rotation(&s[21], &s[26], 22, false);
-    bufferfly_rotation(&s[22], &s[25], 38, false);
-    bufferfly_rotation(&s[23], &s[24], 6, false);
+    butterfly_rotation(&s[16], &s[31], 62, false);
+    butterfly_rotation(&s[17], &s[30], 30, false);
+    butterfly_rotation(&s[18], &s[29], 46, false);
+    butterfly_rotation(&s[19], &s[28], 14, false);
+    butterfly_rotation(&s[20], &s[27], 54, false);
+    butterfly_rotation(&s[21], &s[26], 22, false);
+    butterfly_rotation(&s[22], &s[25], 38, false);
+    butterfly_rotation(&s[23], &s[24], 6, false);
   }
   // stage 6.
   HadamardRotation(&s[16], &s[17], false);
@@ -812,10 +795,10 @@
   HadamardRotation(&s[30], &s[31], true);
 
   // stage 10.
-  bufferfly_rotation(&s[30], &s[17], 24 + 32, true);
-  bufferfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
-  bufferfly_rotation(&s[26], &s[21], 24, true);
-  bufferfly_rotation(&s[25], &s[22], 24 + 64, true);
+  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+  butterfly_rotation(&s[26], &s[21], 24, true);
+  butterfly_rotation(&s[25], &s[22], 24 + 64, true);
 
   // stage 15.
   HadamardRotation(&s[16], &s[19], false);
@@ -828,10 +811,10 @@
   HadamardRotation(&s[29], &s[30], true);
 
   // stage 20.
-  bufferfly_rotation(&s[29], &s[18], 48, true);
-  bufferfly_rotation(&s[28], &s[19], 48, true);
-  bufferfly_rotation(&s[27], &s[20], 48 + 64, true);
-  bufferfly_rotation(&s[26], &s[21], 48 + 64, true);
+  butterfly_rotation(&s[29], &s[18], 48, true);
+  butterfly_rotation(&s[28], &s[19], 48, true);
+  butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+  butterfly_rotation(&s[26], &s[21], 48 + 64, true);
 
   // stage 24.
   HadamardRotation(&s[16], &s[23], false);
@@ -844,10 +827,10 @@
   HadamardRotation(&s[27], &s[28], true);
 
   // stage 27.
-  bufferfly_rotation(&s[27], &s[20], 32, true);
-  bufferfly_rotation(&s[26], &s[21], 32, true);
-  bufferfly_rotation(&s[25], &s[22], 32, true);
-  bufferfly_rotation(&s[24], &s[23], 32, true);
+  butterfly_rotation(&s[27], &s[20], 32, true);
+  butterfly_rotation(&s[26], &s[21], 32, true);
+  butterfly_rotation(&s[25], &s[22], 32, true);
+  butterfly_rotation(&s[24], &s[23], 32, true);
 
   // stage 29.
   HadamardRotation(&s[0], &s[31], false);
@@ -869,21 +852,18 @@
 }
 
 // Process dct32 rows or columns, depending on the transpose flag.
-LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const void* source,
-                                      const int32_t step, const bool is_row,
-                                      int row_shift) {
+LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
+                                      const bool is_row, int row_shift) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   int16x8_t s[32], x[32];
 
   if (is_row) {
     for (int idx = 0; idx < 32; idx += 8) {
-      int16x8_t input[8];
-      LoadSrc<16, 8>(src, step, idx, input);
-      Transpose8x8(input, &x[idx]);
+      LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+      dsp::Transpose8x8(&x[idx]);
     }
   } else {
-    LoadSrc<16, 32>(src, step, 0, x);
+    LoadSrc<16, 32>(dst, step, 0, x);
   }
 
   // stage 1
@@ -946,24 +926,21 @@
 
 // Allow the compiler to call this function instead of force inlining. Tests
 // show the performance is slightly faster.
-void Dct64_NEON(void* dest, const void* source, int32_t step, bool is_row,
-                int row_shift) {
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   int16x8_t s[64], x[32];
 
   if (is_row) {
     // The last 32 values of every row are always zero if the |tx_width| is
     // 64.
     for (int idx = 0; idx < 32; idx += 8) {
-      int16x8_t input[8];
-      LoadSrc<16, 8>(src, step, idx, input);
-      Transpose8x8(input, &x[idx]);
+      LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+      dsp::Transpose8x8(&x[idx]);
     }
   } else {
     // The last 32 values of every column are always zero if the |tx_height| is
     // 64.
-    LoadSrc<16, 32>(src, step, 0, x);
+    LoadSrc<16, 32>(dst, step, 0, x);
   }
 
   // stage 1
@@ -1171,23 +1148,22 @@
 //------------------------------------------------------------------------------
 // Asymmetric Discrete Sine Transforms (ADST).
 template <bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, const void* source,
-                                      int32_t step, bool transpose) {
+LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step,
+                                      bool transpose) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   int32x4_t s[8];
   int16x8_t x[4];
 
   if (stage_is_rectangular) {
     if (transpose) {
       int16x8_t input[8];
-      LoadSrc<8, 8>(src, step, 0, input);
+      LoadSrc<8, 8>(dst, step, 0, input);
       Transpose4x8To8x4(input, x);
     } else {
-      LoadSrc<16, 4>(src, step, 0, x);
+      LoadSrc<16, 4>(dst, step, 0, x);
     }
   } else {
-    LoadSrc<8, 4>(src, step, 0, x);
+    LoadSrc<8, 4>(dst, step, 0, x);
     if (transpose) {
       Transpose4x4(x, x);
     }
@@ -1250,18 +1226,14 @@
 alignas(8) constexpr int16_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
                                                           2482};
 
-LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, const void* source,
-                                       int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
                                        bool should_round, int row_shift) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   int32x4_t s[2];
 
-  const int16x4_t v_src0 = vdup_n_s16(src[0]);
+  const int16x4_t v_src0 = vdup_n_s16(dst[0]);
   const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
   const int16x4_t v_src_round =
       vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
@@ -1283,21 +1255,16 @@
   return true;
 }
 
-LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, const void* source,
-                                             int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
                                              int width) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
-
   int32x4_t s[4];
 
   int i = 0;
   do {
-    const int16x4_t v_src = vld1_s16(&src[i]);
+    const int16x4_t v_src = vld1_s16(&dst[i]);
 
     s[0] = vmull_n_s16(v_src, kAdst4Multiplier[0]);
     s[1] = vmull_n_s16(v_src, kAdst4Multiplier[1]);
@@ -1323,28 +1290,26 @@
   return true;
 }
 
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, const void* source,
-                                      int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step,
+                                      bool transpose) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   int16x8_t s[8], x[8];
 
   if (stage_is_rectangular) {
     if (transpose) {
       int16x8_t input[4];
-      LoadSrc<16, 4>(src, step, 0, input);
+      LoadSrc<16, 4>(dst, step, 0, input);
       Transpose8x4To4x8(input, x);
     } else {
-      LoadSrc<8, 8>(src, step, 0, x);
+      LoadSrc<8, 8>(dst, step, 0, x);
     }
   } else {
     if (transpose) {
-      int16x8_t input[8];
-      LoadSrc<16, 8>(src, step, 0, input);
-      Transpose8x8(input, x);
+      LoadSrc<16, 8>(dst, step, 0, x);
+      dsp::Transpose8x8(x);
     } else {
-      LoadSrc<16, 8>(src, step, 0, x);
+      LoadSrc<16, 8>(dst, step, 0, x);
     }
   }
 
@@ -1359,10 +1324,10 @@
   s[7] = x[6];
 
   // stage 2.
-  bufferfly_rotation(&s[0], &s[1], 60 - 0, true);
-  bufferfly_rotation(&s[2], &s[3], 60 - 16, true);
-  bufferfly_rotation(&s[4], &s[5], 60 - 32, true);
-  bufferfly_rotation(&s[6], &s[7], 60 - 48, true);
+  butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+  butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+  butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+  butterfly_rotation(&s[6], &s[7], 60 - 48, true);
 
   // stage 3.
   HadamardRotation(&s[0], &s[4], false);
@@ -1371,8 +1336,8 @@
   HadamardRotation(&s[3], &s[7], false);
 
   // stage 4.
-  bufferfly_rotation(&s[4], &s[5], 48 - 0, true);
-  bufferfly_rotation(&s[7], &s[6], 48 - 32, true);
+  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
 
   // stage 5.
   HadamardRotation(&s[0], &s[2], false);
@@ -1381,8 +1346,8 @@
   HadamardRotation(&s[5], &s[7], false);
 
   // stage 6.
-  bufferfly_rotation(&s[2], &s[3], 32, true);
-  bufferfly_rotation(&s[6], &s[7], 32, true);
+  butterfly_rotation(&s[2], &s[3], 32, true);
+  butterfly_rotation(&s[6], &s[7], 32, true);
 
   // stage 7.
   x[0] = s[0];
@@ -1404,27 +1369,22 @@
     }
   } else {
     if (transpose) {
-      int16x8_t output[8];
-      Transpose8x8(x, output);
-      StoreDst<16, 8>(dst, step, 0, output);
+      dsp::Transpose8x8(x);
+      StoreDst<16, 8>(dst, step, 0, x);
     } else {
       StoreDst<16, 8>(dst, step, 0, x);
     }
   }
 }
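The transpose change above removes the temporary input[8] array: the old two-argument Transpose8x8(input, x) wrote the transposed rows into a second register file, while dsp::Transpose8x8(x) transposes its argument in place. A plain C++ sketch of the in-place idea (the real routine permutes int16x8_t registers, typically with vtrn/vzip):

#include <algorithm>
#include <cstdint>

// In-place 8x8 transpose over a plain array, shown only to illustrate the
// one-buffer form the NEON dsp::Transpose8x8 overload provides.
void Transpose8x8InPlace(int16_t m[8][8]) {
  for (int r = 0; r < 8; ++r) {
    for (int c = r + 1; c < 8; ++c) std::swap(m[r][c], m[c][r]);
  }
}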
 
-LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, const void* source,
-                                       int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
                                        bool should_round, int row_shift) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   int16x8_t s[8];
 
-  const int16x8_t v_src = vdupq_n_s16(src[0]);
+  const int16x8_t v_src = vdupq_n_s16(dst[0]);
   const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
   const int16x8_t v_src_round =
       vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
@@ -1471,20 +1431,16 @@
   return true;
 }
 
-LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, const void* source,
-                                             int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
                                              int width) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   int16x8_t s[8];
 
   int i = 0;
   do {
-    const int16x8_t v_src = vld1q_s16(&src[i]);
+    const int16x8_t v_src = vld1q_s16(dst);
     // stage 1.
     s[1] = v_src;
 
@@ -1529,33 +1485,30 @@
   return true;
 }
 
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, const void* source,
-                                       int32_t step, bool is_row,
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
                                        int row_shift) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   int16x8_t s[16], x[16];
 
   if (stage_is_rectangular) {
     if (is_row) {
       int16x8_t input[4];
-      LoadSrc<16, 4>(src, step, 0, input);
+      LoadSrc<16, 4>(dst, step, 0, input);
       Transpose8x4To4x8(input, x);
-      LoadSrc<16, 4>(src, step, 8, input);
+      LoadSrc<16, 4>(dst, step, 8, input);
       Transpose8x4To4x8(input, &x[8]);
     } else {
-      LoadSrc<8, 16>(src, step, 0, x);
+      LoadSrc<8, 16>(dst, step, 0, x);
     }
   } else {
     if (is_row) {
       for (int idx = 0; idx < 16; idx += 8) {
-        int16x8_t input[8];
-        LoadSrc<16, 8>(src, step, idx, input);
-        Transpose8x8(input, &x[idx]);
+        LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+        dsp::Transpose8x8(&x[idx]);
       }
     } else {
-      LoadSrc<16, 16>(src, step, 0, x);
+      LoadSrc<16, 16>(dst, step, 0, x);
     }
   }
 
@@ -1578,14 +1531,14 @@
   s[15] = x[14];
 
   // stage 2.
-  bufferfly_rotation(&s[0], &s[1], 62 - 0, true);
-  bufferfly_rotation(&s[2], &s[3], 62 - 8, true);
-  bufferfly_rotation(&s[4], &s[5], 62 - 16, true);
-  bufferfly_rotation(&s[6], &s[7], 62 - 24, true);
-  bufferfly_rotation(&s[8], &s[9], 62 - 32, true);
-  bufferfly_rotation(&s[10], &s[11], 62 - 40, true);
-  bufferfly_rotation(&s[12], &s[13], 62 - 48, true);
-  bufferfly_rotation(&s[14], &s[15], 62 - 56, true);
+  butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+  butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+  butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+  butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+  butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+  butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+  butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+  butterfly_rotation(&s[14], &s[15], 62 - 56, true);
 
   // stage 3.
   HadamardRotation(&s[0], &s[8], false);
@@ -1598,10 +1551,10 @@
   HadamardRotation(&s[7], &s[15], false);
 
   // stage 4.
-  bufferfly_rotation(&s[8], &s[9], 56 - 0, true);
-  bufferfly_rotation(&s[13], &s[12], 8 + 0, true);
-  bufferfly_rotation(&s[10], &s[11], 56 - 32, true);
-  bufferfly_rotation(&s[15], &s[14], 8 + 32, true);
+  butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+  butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+  butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+  butterfly_rotation(&s[15], &s[14], 8 + 32, true);
 
   // stage 5.
   HadamardRotation(&s[0], &s[4], false);
@@ -1614,10 +1567,10 @@
   HadamardRotation(&s[11], &s[15], false);
 
   // stage 6.
-  bufferfly_rotation(&s[4], &s[5], 48 - 0, true);
-  bufferfly_rotation(&s[12], &s[13], 48 - 0, true);
-  bufferfly_rotation(&s[7], &s[6], 48 - 32, true);
-  bufferfly_rotation(&s[15], &s[14], 48 - 32, true);
+  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+  butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+  butterfly_rotation(&s[15], &s[14], 48 - 32, true);
 
   // stage 7.
   HadamardRotation(&s[0], &s[2], false);
@@ -1630,10 +1583,10 @@
   HadamardRotation(&s[13], &s[15], false);
 
   // stage 8.
-  bufferfly_rotation(&s[2], &s[3], 32, true);
-  bufferfly_rotation(&s[6], &s[7], 32, true);
-  bufferfly_rotation(&s[10], &s[11], 32, true);
-  bufferfly_rotation(&s[14], &s[15], 32, true);
+  butterfly_rotation(&s[2], &s[3], 32, true);
+  butterfly_rotation(&s[6], &s[7], 32, true);
+  butterfly_rotation(&s[10], &s[11], 32, true);
+  butterfly_rotation(&s[14], &s[15], 32, true);
 
   // stage 9.
   x[0] = s[0];
@@ -1743,19 +1696,15 @@
   x[15] = vqnegq_s16(s[1]);
 }
 
-LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, const void* source,
-                                        int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
                                         bool should_round, int row_shift) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   int16x8_t s[16];
   int16x8_t x[16];
 
-  const int16x8_t v_src = vdupq_n_s16(src[0]);
+  const int16x8_t v_src = vdupq_n_s16(dst[0]);
   const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
   const int16x8_t v_src_round =
       vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
@@ -1773,21 +1722,17 @@
   return true;
 }
 
-LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest, const void* source,
-                                              int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+                                              int adjusted_tx_height,
                                               int width) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
-
   int i = 0;
   do {
     int16x8_t s[16];
     int16x8_t x[16];
-    const int16x8_t v_src = vld1q_s16(&src[i]);
+    const int16x8_t v_src = vld1q_s16(dst);
     // stage 1.
     s[1] = v_src;
 
@@ -1807,10 +1752,8 @@
 // Identity Transforms.
 
 template <bool is_row_shift>
-LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, const void* source,
-                                          int32_t step) {
+LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
 
   if (is_row_shift) {
     const int shift = 1;
@@ -1818,7 +1761,7 @@
     const int16x4_t v_multiplier = vdup_n_s16(kIdentity4Multiplier);
     const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
     for (int i = 0; i < 4; i += 2) {
-      const int16x8_t v_src = vld1q_s16(&src[i * step]);
+      const int16x8_t v_src = vld1q_s16(&dst[i * step]);
       const int32x4_t v_src_mult_lo =
           vmlal_s16(v_dual_round, vget_low_s16(v_src), v_multiplier);
       const int32x4_t v_src_mult_hi =
@@ -1830,7 +1773,7 @@
     }
   } else {
     for (int i = 0; i < 4; i += 2) {
-      const int16x8_t v_src = vld1q_s16(&src[i * step]);
+      const int16x8_t v_src = vld1q_s16(&dst[i * step]);
       const int16x8_t a =
           vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 3);
       const int16x8_t b = vqaddq_s16(v_src, a);
@@ -1839,17 +1782,12 @@
   }
 }
 
-LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, const void* source,
-                                           int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
                                            bool should_round, int tx_height) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
-
-  const int16x4_t v_src0 = vdup_n_s16(src[0]);
+  const int16x4_t v_src0 = vdup_n_s16(dst[0]);
   const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
   const int16x4_t v_src_round =
       vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
@@ -1860,7 +1798,7 @@
   const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
   const int32x4_t v_src_mult_lo = vmlal_s16(v_dual_round, v_src, v_multiplier);
   const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
-  vst1_lane_s16(&dst[0], vqmovn_s32(dst_0), 0);
+  vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
   return true;
 }
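Identity4DcOnly above follows the contract shared by every *DcOnly helper in this file: return false unless adjusted_tx_height is 1 (only the DC coefficient survived quantization), otherwise rewrite dst[0] in place and return true so the caller can skip the full SIMD row loop. A hypothetical scalar outline, with the scaling constant only as an assumed stand-in for the real per-transform arithmetic:

#include <cstdint>

// Hypothetical DcOnly outline; the 5793/Q12 step approximates the row
// rounding and is not libgav1's exact per-transform math.
bool DcOnlyRowSketch(int16_t* dst, int adjusted_tx_height, bool should_round) {
  if (adjusted_tx_height > 1) return false;  // more than the DC coefficient present
  int32_t dc = dst[0];
  if (should_round) dc = (dc * 5793 + 2048) >> 12;  // assumed ~sqrt(2) row rounding
  dst[0] = static_cast<int16_t>(dc);  // transform-specific scaling/shift omitted
  return true;
}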
 
@@ -2001,28 +1939,24 @@
   }
 }
 
-LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, const void* source,
-                                               int32_t step) {
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
 
   // When combining the identity8 multiplier with the row shift, the
   // calculations for tx_height equal to 32 can be simplified from
   // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
   for (int i = 0; i < 4; ++i) {
-    const int16x8_t v_src = vld1q_s16(&src[i * step]);
+    const int16x8_t v_src = vld1q_s16(&dst[i * step]);
     const int16x8_t a = vrshrq_n_s16(v_src, 1);
     vst1q_s16(&dst[i * step], a);
   }
 }
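The shift simplifications quoted in the identity8 and identity32 row comments here and below reduce a multiply-add-shift to a single rounding shift (or to nothing). They are plain integer identities over the whole int16_t range, assuming arithmetic right shift; a standalone sanity check:

#include <cassert>
#include <cstdint>

// Exhaustive check of the identities the identity8/identity32 row comments
// rely on. vrshrq_n_s16(v, 1) is the (A + 1) >> 1 rounding shift used above.
int main() {
  for (int a = INT16_MIN; a <= INT16_MAX; ++a) {
    assert(((a * 2 + 2) >> 2) == ((a + 1) >> 1));  // identity8, tx_height == 32
    assert(((a * 2 + 1) >> 1) == a);               // identity8, tx_height == 8 or 16
    assert(((a * 4 + 1) >> 1) == a * 2);           // identity32, tx_height == 16
    assert(((a * 4 + 2) >> 2) == a);               // identity32, tx_height == 8 or 32
  }
  return 0;
}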
 
-LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, const void* source,
-                                              int32_t step) {
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
 
   for (int i = 0; i < 4; ++i) {
-    const int16x8_t v_src = vld1q_s16(&src[i * step]);
+    const int16x8_t v_src = vld1q_s16(&dst[i * step]);
     // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
     // saturating add here is ok.
     const int16x8_t v_srcx2 = vqaddq_s16(v_src, v_src);
@@ -2030,36 +1964,31 @@
   }
 }
 
-LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, const void* source,
-                                           int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
                                            bool should_round, int row_shift) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
-  const int16x4_t v_src0 = vdup_n_s16(src[0]);
+  const int16x4_t v_src0 = vdup_n_s16(dst[0]);
   const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
   const int16x4_t v_src_round =
       vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
   const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
   const int32x4_t v_srcx2 = vaddl_s16(v_src, v_src);
   const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
-  vst1_lane_s16(&dst[0], vqmovn_s32(dst_0), 0);
+  vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
   return true;
 }
 
-LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, const void* source,
-                                              int32_t step, int shift) {
+LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
+                                              int shift) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
   const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
 
   for (int i = 0; i < 4; ++i) {
     for (int j = 0; j < 2; ++j) {
-      const int16x8_t v_src = vld1q_s16(&src[i * step + j * 8]);
+      const int16x8_t v_src = vld1q_s16(&dst[i * step + j * 8]);
       const int32x4_t v_src_mult_lo =
           vmlal_n_s16(v_dual_round, vget_low_s16(v_src), kIdentity16Multiplier);
       const int32x4_t v_src_mult_hi = vmlal_n_s16(
@@ -2072,17 +2001,12 @@
   }
 }
 
-LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, const void* source,
-                                            int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
                                             bool should_round, int shift) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
-
-  const int16x4_t v_src0 = vdup_n_s16(src[0]);
+  const int16x4_t v_src0 = vdup_n_s16(dst[0]);
   const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
   const int16x4_t v_src_round =
       vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
@@ -2093,21 +2017,20 @@
   const int32x4_t v_src_mult_lo =
       vmlal_s16(v_dual_round, (v_src), v_multiplier);
   const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
-  vst1_lane_s16(&dst[0], vqmovn_s32(dst_0), 0);
+  vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
   return true;
 }
 
-LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest, const void* source,
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest,
                                                 const int32_t step) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
 
   // When combining the identity32 multiplier with the row shift, the
   // calculation for tx_height equal to 16 can be simplified from
   // (((A * 4) + 1) >> 1) to (A * 2).
   for (int i = 0; i < 4; ++i) {
     for (int j = 0; j < 32; j += 8) {
-      const int16x8_t v_src = vld1q_s16(&src[i * step + j]);
+      const int16x8_t v_src = vld1q_s16(&dst[i * step + j]);
       // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
       // saturating add here is ok.
       const int16x8_t v_dst_i = vqaddq_s16(v_src, v_src);
@@ -2116,21 +2039,18 @@
   }
 }
 
-LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest, const void* source,
-                                            int non_zero_coeff_count) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+                                            int adjusted_tx_height) {
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
-  const int16x4_t v_src0 = vdup_n_s16(src[0]);
+  const int16x4_t v_src0 = vdup_n_s16(dst[0]);
   const int16x4_t v_src = vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
   // When combining the identity32 multiplier with the row shift, the
   // calculation for tx_height equal to 16 can be simplified from
   // (((A * 4) + 1) >> 1) to (A * 2).
   const int16x4_t v_dst_0 = vqadd_s16(v_src, v_src);
-  vst1_lane_s16(&dst[0], v_dst_0, 0);
+  vst1_lane_s16(dst, v_dst_0, 0);
   return true;
 }
 
@@ -2188,11 +2108,11 @@
 // Process 4 wht4 rows and columns.
 LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint8_t* dst, const int dst_stride,
                                      const void* source,
-                                     const int non_zero_coeff_count) {
+                                     const int adjusted_tx_height) {
   const auto* const src = static_cast<const int16_t*>(source);
   int16x4_t s[4];
 
-  if (non_zero_coeff_count == 1) {
+  if (adjusted_tx_height == 1) {
     // Special case: only src[0] is nonzero.
     //   src[0]  0   0   0
     //       0   0   0   0
@@ -2411,533 +2331,531 @@
   }
 }
 
-void Dct4TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
-                            void* src_buffer, int start_x, int start_y,
-                            void* dst_frame, bool is_row,
-                            int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+                               int adjusted_tx_height, void* src_buffer,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
   const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+  const int row_shift = (tx_height == 16);
 
-  if (is_row) {
-    const bool should_round = (tx_height == 8);
-    const int row_shift = (tx_height == 16);
-
-    if (DctDcOnly<4>(&src[0], &src[0], non_zero_coeff_count, should_round,
-                     row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<4>(src, num_rows);
-    }
-
-    if (num_rows <= 4) {
-      // Process 4 1d dct4 rows in parallel.
-      Dct4_NEON<ButterflyRotation_4, false>(&src[0], &src[0], /*step=*/4,
-                                            /*transpose=*/true);
-    } else {
-      // Process 8 1d dct4 rows in parallel per iteration.
-      int i = 0;
-      do {
-        Dct4_NEON<ButterflyRotation_8, true>(&src[i * 4], &src[i * 4],
-                                             /*step=*/4, /*transpose=*/true);
-        i += 8;
-      } while (i < num_rows);
-    }
-    if (tx_height == 16) {
-      RowShift<4>(src, num_rows, 1);
-    }
+  if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height == 4) {
+    // Process 4 1d dct4 rows in parallel.
+    Dct4_NEON<ButterflyRotation_4, false>(src, /*step=*/4, /*transpose=*/true);
+  } else {
+    // Process 8 1d dct4 rows in parallel per iteration.
+    int i = adjusted_tx_height;
+    auto* data = src;
+    do {
+      Dct4_NEON<ButterflyRotation_8, true>(data, /*step=*/4,
+                                           /*transpose=*/true);
+      data += 32;
+      i -= 8;
+    } while (i != 0);
+  }
+  if (tx_height == 16) {
+    RowShift<4>(src, adjusted_tx_height, 1);
+  }
+}
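The rewritten row loop above trades the old index form (&src[i * 4] with i += 8) for a pointer that advances by one batch per iteration: 8 rows of a width-4 block are 32 int16_t coefficients, hence data += 32. A standalone sketch of the equivalence:

#include <cassert>
#include <cstdint>

// The pointer-advance loops in the new *TransformLoopRow_NEON functions visit
// the same rows as the old index-based loops: after each 8-row batch the
// pointer has moved 8 * tx_width coefficients (32 for the dct4 rows above).
void CheckBatchAddressing(int16_t* src, int adjusted_tx_height, int tx_width) {
  int16_t* data = src;
  for (int i = 0; i < adjusted_tx_height; i += 8) {
    assert(data == &src[i * tx_width]);
    data += 8 * tx_width;
  }
}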
+
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height, void* src_buffer,
+                                  int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<4>(src, tx_width);
   }
 
-  if (!DctDcOnlyColumn<4>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+  if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
     if (tx_width == 4) {
       // Process 4 1d dct4 columns in parallel.
-      Dct4_NEON<ButterflyRotation_4, false>(&src[0], &src[0], tx_width,
-                                            /*transpose=*/false);
+      Dct4_NEON<ButterflyRotation_4, false>(src, tx_width, /*transpose=*/false);
     } else {
       // Process 8 1d dct4 columns in parallel per iteration.
-      int i = 0;
+      int i = tx_width;
+      auto* data = src;
       do {
-        Dct4_NEON<ButterflyRotation_8, true>(&src[i], &src[i], tx_width,
+        Dct4_NEON<ButterflyRotation_8, true>(data, tx_width,
                                              /*transpose=*/false);
-        i += 8;
-      } while (i < tx_width);
+        data += 8;
+        i -= 8;
+      } while (i != 0);
     }
   }
+
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
 }
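Taken together, the Row/Column pair above replaces the old single Dct4TransformLoop_NEON with its is_row flag: the row pass works purely on src_buffer (the frame pointer and block position are ignored), and the column pass finishes the 2-D transform and stores the block into the frame. A hypothetical call sequence, relying on the types declared in this file; in the decoder these functions are reached through the dsp inverse-transform function tables rather than being called directly:

// Hypothetical usage sketch only; not how the decoder actually dispatches.
void InverseDct4Sketch(TransformType tx_type, TransformSize tx_size,
                       int adjusted_tx_height, void* src_buffer, int start_x,
                       int start_y, void* dst_frame) {
  // Row pass: in-place on src_buffer; start_x/start_y/dst_frame are unused.
  Dct4TransformLoopRow_NEON(tx_type, tx_size, adjusted_tx_height, src_buffer,
                            start_x, start_y, dst_frame);
  // Column pass: completes the transform and writes the block to dst_frame.
  Dct4TransformLoopColumn_NEON(tx_type, tx_size, adjusted_tx_height, src_buffer,
                               start_x, start_y, dst_frame);
}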
 
-void Dct8TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
-                            void* src_buffer, int start_x, int start_y,
-                            void* dst_frame, bool is_row,
-                            int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+                               int adjusted_tx_height, void* src_buffer,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
 
-  if (is_row) {
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-
-    if (DctDcOnly<8>(&src[0], &src[0], non_zero_coeff_count, should_round,
-                     row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<8>(src, num_rows);
-    }
-
-    if (num_rows <= 4) {
-      // Process 4 1d dct8 rows in parallel.
-      Dct8_NEON<ButterflyRotation_4, true>(&src[0], &src[0], /*step=*/8,
-                                           /*transpose=*/true);
-    } else {
-      // Process 8 1d dct8 rows in parallel per iteration.
-      int i = 0;
-      do {
-        Dct8_NEON<ButterflyRotation_8, false>(&src[i * 8], &src[i * 8],
-                                              /*step=*/8, /*transpose=*/true);
-        i += 8;
-      } while (i < num_rows);
-    }
-    if (row_shift > 0) {
-      RowShift<8>(src, num_rows, row_shift);
-    }
+  if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height == 4) {
+    // Process 4 1d dct8 rows in parallel.
+    Dct8_NEON<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+  } else {
+    // Process 8 1d dct8 rows in parallel per iteration.
+    assert(adjusted_tx_height % 8 == 0);
+    int i = adjusted_tx_height;
+    auto* data = src;
+    do {
+      Dct8_NEON<ButterflyRotation_8, false>(data, /*step=*/8,
+                                            /*transpose=*/true);
+      data += 64;
+      i -= 8;
+    } while (i != 0);
+  }
+  if (row_shift > 0) {
+    RowShift<8>(src, adjusted_tx_height, row_shift);
+  }
+}
+
+void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height, void* src_buffer,
+                                  int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<8>(src, tx_width);
   }
 
-  if (!DctDcOnlyColumn<8>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+  if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
     if (tx_width == 4) {
       // Process 4 1d dct8 columns in parallel.
-      Dct8_NEON<ButterflyRotation_4, true>(&src[0], &src[0], 4,
-                                           /*transpose=*/false);
+      Dct8_NEON<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
     } else {
       // Process 8 1d dct8 columns in parallel per iteration.
-      int i = 0;
+      int i = tx_width;
+      auto* data = src;
       do {
-        Dct8_NEON<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
+        Dct8_NEON<ButterflyRotation_8, false>(data, tx_width,
                                               /*transpose=*/false);
-        i += 8;
-      } while (i < tx_width);
+        data += 8;
+        i -= 8;
+      } while (i != 0);
     }
   }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
 }
 
-void Dct16TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
-                             void* src_buffer, int start_x, int start_y,
-                             void* dst_frame, bool is_row,
-                             int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
 
-  if (is_row) {
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-
-    if (DctDcOnly<16>(&src[0], &src[0], non_zero_coeff_count, should_round,
-                      row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<16>(src, num_rows);
-    }
-
-    if (num_rows <= 4) {
-      // Process 4 1d dct16 rows in parallel.
-      Dct16_NEON<ButterflyRotation_4, true>(&src[0], &src[0], 16,
-                                            /*is_row=*/true, row_shift);
-    } else {
-      int i = 0;
-      do {
-        // Process 8 1d dct16 rows in parallel per iteration.
-        Dct16_NEON<ButterflyRotation_8, false>(&src[i * 16], &src[i * 16], 16,
-                                               /*is_row=*/true, row_shift);
-        i += 8;
-      } while (i < num_rows);
-    }
-
+  if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height == 4) {
+    // Process 4 1d dct16 rows in parallel.
+    Dct16_NEON<ButterflyRotation_4, true>(src, 16, /*is_row=*/true, row_shift);
+  } else {
+    assert(adjusted_tx_height % 8 == 0);
+    int i = adjusted_tx_height;
+    do {
+      // Process 8 1d dct16 rows in parallel per iteration.
+      Dct16_NEON<ButterflyRotation_8, false>(src, 16, /*is_row=*/true,
+                                             row_shift);
+      src += 128;
+      i -= 8;
+    } while (i != 0);
+  }
+}
+
+void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height, void* src_buffer,
+                                   int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<16>(src, tx_width);
   }
 
-  if (!DctDcOnlyColumn<16>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+  if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
     if (tx_width == 4) {
       // Process 4 1d dct16 columns in parallel.
-      Dct16_NEON<ButterflyRotation_4, true>(&src[0], &src[0], 4,
-                                            /*is_row=*/false, /*row_shift=*/0);
+      Dct16_NEON<ButterflyRotation_4, true>(src, 4, /*is_row=*/false,
+                                            /*row_shift=*/0);
     } else {
-      int i = 0;
+      int i = tx_width;
+      auto* data = src;
       do {
         // Process 8 1d dct16 columns in parallel per iteration.
-        Dct16_NEON<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
-                                               /*is_row=*/false,
+        Dct16_NEON<ButterflyRotation_8, false>(data, tx_width, /*is_row=*/false,
                                                /*row_shift=*/0);
-        i += 8;
-      } while (i < tx_width);
+        data += 8;
+        i -= 8;
+      } while (i != 0);
     }
   }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
 }
 
-void Dct32TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
-                             void* src_buffer, int start_x, int start_y,
-                             void* dst_frame, bool is_row,
-                             int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
 
-  if (is_row) {
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-
-    if (DctDcOnly<32>(&src[0], &src[0], non_zero_coeff_count, should_round,
-                      row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<32>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<32>(src, num_rows);
-    }
-    // Process 8 1d dct32 rows in parallel per iteration.
-    int i = 0;
-    do {
-      Dct32_NEON(&src[i * 32], &src[i * 32], 32, /*is_row=*/true, row_shift);
-      i += 8;
-    } while (i < num_rows);
-
+  if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
-  if (!DctDcOnlyColumn<32>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
-    // Process 8 1d dct32 columns in parallel per iteration.
-    int i = 0;
-    do {
-      Dct32_NEON(&src[i], &src[i], tx_width, /*is_row=*/false, /*row_shift=*/0);
-      i += 8;
-    } while (i < tx_width);
+  if (should_round) {
+    ApplyRounding<32>(src, adjusted_tx_height);
   }
+  // Process 8 1d dct32 rows in parallel per iteration.
+  int i = 0;
+  do {
+    Dct32_NEON(&src[i * 32], 32, /*is_row=*/true, row_shift);
+    i += 8;
+  } while (i < adjusted_tx_height);
+}
+
+void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height, void* src_buffer,
+                                   int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+    // Process 8 1d dct32 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+      data += 8;
+      i -= 8;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
 }
 
-void Dct64TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
-                             void* src_buffer, int start_x, int start_y,
-                             void* dst_frame, bool is_row,
-                             int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
 
-  if (is_row) {
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-
-    if (DctDcOnly<64>(&src[0], &src[0], non_zero_coeff_count, should_round,
-                      row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<32>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<64>(src, num_rows);
-    }
-    // Process 8 1d dct64 rows in parallel per iteration.
-    int i = 0;
-    do {
-      Dct64_NEON(&src[i * 64], &src[i * 64], 64, /*is_row=*/true, row_shift);
-      i += 8;
-    } while (i < num_rows);
-
+  if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
-  if (!DctDcOnlyColumn<64>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
-    // Process 8 1d dct64 columns in parallel per iteration.
-    int i = 0;
-    do {
-      Dct64_NEON(&src[i], &src[i], tx_width, /*is_row=*/false, /*row_shift=*/0);
-      i += 8;
-    } while (i < tx_width);
+  if (should_round) {
+    ApplyRounding<64>(src, adjusted_tx_height);
   }
+  // Process 8 1d dct64 rows in parallel per iteration.
+  int i = 0;
+  do {
+    Dct64_NEON(&src[i * 64], 64, /*is_row=*/true, row_shift);
+    i += 8;
+  } while (i < adjusted_tx_height);
+}
+
+void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height, void* src_buffer,
+                                   int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+    // Process 8 1d dct64 columns in parallel per iteration.
+    int i = tx_width;
+    auto* data = src;
+    do {
+      Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+      data += 8;
+      i -= 8;
+    } while (i != 0);
+  }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
 }
 
-void Adst4TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
-                             void* src_buffer, int start_x, int start_y,
-                             void* dst_frame, bool is_row,
-                             int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
   const int tx_height = kTransformHeight[tx_size];
+  const int row_shift = static_cast<int>(tx_height == 16);
+  const bool should_round = (tx_height == 8);
 
-  if (is_row) {
-    const uint8_t row_shift = static_cast<uint8_t>(tx_height == 16);
-    const bool should_round = (tx_height == 8);
-
-    if (Adst4DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
-                    row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<4>(src, num_rows);
-    }
-
-    // Process 4 1d adst4 rows in parallel per iteration.
-    int i = 0;
-    do {
-      Adst4_NEON<false>(&src[i * 4], &src[i * 4], /*step=*/4,
-                        /*transpose=*/true);
-      i += 4;
-    } while (i < num_rows);
-
-    if (tx_height == 16) {
-      RowShift<4>(src, num_rows, 1);
-    }
+  if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d adst4 rows in parallel per iteration.
+  int i = adjusted_tx_height;
+  auto* data = src;
+  do {
+    Adst4_NEON<false>(data, /*step=*/4, /*transpose=*/true);
+    data += 16;
+    i -= 4;
+  } while (i != 0);
+
+  if (tx_height == 16) {
+    RowShift<4>(src, adjusted_tx_height, 1);
+  }
+}
+
+void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height, void* src_buffer,
+                                   int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<4>(src, tx_width);
   }
 
-  if (!Adst4DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+  if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
     // Process 4 1d adst4 columns in parallel per iteration.
-    int i = 0;
+    int i = tx_width;
+    auto* data = src;
     do {
-      Adst4_NEON<false>(&src[i], &src[i], tx_width, /*transpose=*/false);
-      i += 4;
-    } while (i < tx_width);
+      Adst4_NEON<false>(data, tx_width, /*transpose=*/false);
+      data += 4;
+      i -= 4;
+    } while (i != 0);
   }
 
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
                                                       tx_width, src, tx_type);
 }
 
-void Adst8TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
-                             void* src_buffer, int start_x, int start_y,
-                             void* dst_frame, bool is_row,
-                             int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                TransformSize tx_size, int adjusted_tx_height,
+                                void* src_buffer, int /*start_x*/,
+                                int /*start_y*/, void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
 
-  if (is_row) {
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-
-    if (Adst8DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
-                    row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<8>(src, num_rows);
-    }
-
-    if (num_rows <= 4) {
-      // Process 4 1d adst8 rows in parallel.
-      Adst8_NEON<ButterflyRotation_4, true>(&src[0], &src[0], /*step=*/8,
-                                            /*transpose=*/true);
-    } else {
-      // Process 8 1d adst8 rows in parallel per iteration.
-      int i = 0;
-      do {
-        Adst8_NEON<ButterflyRotation_8, false>(&src[i * 8], &src[i * 8],
-                                               /*step=*/8,
-                                               /*transpose=*/true);
-        i += 8;
-      } while (i < num_rows);
-    }
-    if (row_shift > 0) {
-      RowShift<8>(src, num_rows, row_shift);
-    }
+  if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height == 4) {
+    // Process 4 1d adst8 rows in parallel.
+    Adst8_NEON<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+  } else {
+    // Process 8 1d adst8 rows in parallel per iteration.
+    assert(adjusted_tx_height % 8 == 0);
+    int i = adjusted_tx_height;
+    auto* data = src;
+    do {
+      Adst8_NEON<ButterflyRotation_8, false>(data, /*step=*/8,
+                                             /*transpose=*/true);
+      data += 64;
+      i -= 8;
+    } while (i != 0);
+  }
+  if (row_shift > 0) {
+    RowShift<8>(src, adjusted_tx_height, row_shift);
+  }
+}
+
+void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                   int adjusted_tx_height, void* src_buffer,
+                                   int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<8>(src, tx_width);
   }
 
-  if (!Adst8DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+  if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
     if (tx_width == 4) {
       // Process 4 1d adst8 columns in parallel.
-      Adst8_NEON<ButterflyRotation_4, true>(&src[0], &src[0], 4,
-                                            /*transpose=*/false);
+      Adst8_NEON<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
     } else {
       // Process 8 1d adst8 columns in parallel per iteration.
-      int i = 0;
+      int i = tx_width;
+      auto* data = src;
       do {
-        Adst8_NEON<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
+        Adst8_NEON<ButterflyRotation_8, false>(data, tx_width,
                                                /*transpose=*/false);
-        i += 8;
-      } while (i < tx_width);
+        data += 8;
+        i -= 8;
+      } while (i != 0);
     }
   }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
                                                       tx_width, src, tx_type);
 }
 
-void Adst16TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
-                              void* src_buffer, int start_x, int start_y,
-                              void* dst_frame, bool is_row,
-                              int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                 TransformSize tx_size, int adjusted_tx_height,
+                                 void* src_buffer, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
 
-  if (is_row) {
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-
-    if (Adst16DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
-                     row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<16>(src, num_rows);
-    }
-
-    if (num_rows <= 4) {
-      // Process 4 1d adst16 rows in parallel.
-      Adst16_NEON<ButterflyRotation_4, true>(&src[0], &src[0], 16,
-                                             /*is_row=*/true, row_shift);
-    } else {
-      int i = 0;
-      do {
-        // Process 8 1d adst16 rows in parallel per iteration.
-        Adst16_NEON<ButterflyRotation_8, false>(&src[i * 16], &src[i * 16], 16,
-                                                /*is_row=*/true, row_shift);
-        i += 8;
-      } while (i < num_rows);
-    }
+  if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height == 4) {
+    // Process 4 1d adst16 rows in parallel.
+    Adst16_NEON<ButterflyRotation_4, true>(src, 16, /*is_row=*/true, row_shift);
+  } else {
+    assert(adjusted_tx_height % 8 == 0);
+    int i = adjusted_tx_height;
+    do {
+      // Process 8 1d adst16 rows in parallel per iteration.
+      Adst16_NEON<ButterflyRotation_8, false>(src, 16, /*is_row=*/true,
+                                              row_shift);
+      src += 128;
+      i -= 8;
+    } while (i != 0);
+  }
+}
+
+void Adst16TransformLoopColumn_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<16>(src, tx_width);
   }
 
-  if (!Adst16DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+  if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
     if (tx_width == 4) {
       // Process 4 1d adst16 columns in parallel.
-      Adst16_NEON<ButterflyRotation_4, true>(&src[0], &src[0], 4,
-                                             /*is_row=*/false, /*row_shift=*/0);
+      Adst16_NEON<ButterflyRotation_4, true>(src, 4, /*is_row=*/false,
+                                             /*row_shift=*/0);
     } else {
-      int i = 0;
+      int i = tx_width;
+      auto* data = src;
       do {
         // Process 8 1d adst16 columns in parallel per iteration.
-        Adst16_NEON<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
-                                                /*is_row=*/false,
-                                                /*row_shift=*/0);
-        i += 8;
-      } while (i < tx_width);
+        Adst16_NEON<ButterflyRotation_8, false>(
+            data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+        data += 8;
+        i -= 8;
+      } while (i != 0);
     }
   }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
                                                        tx_width, src, tx_type);
 }
 
-void Identity4TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
-                                 void* src_buffer, int start_x, int start_y,
-                                 void* dst_frame, bool is_row,
-                                 int non_zero_coeff_count) {
+void Identity4TransformLoopRow_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int /*start_x*/, int /*start_y*/,
+                                    void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize4x4) {
+    return;
+  }
+
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+
+  if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+  if (tx_height < 16) {
+    int i = adjusted_tx_height;
+    do {
+      Identity4_NEON<false>(src, /*step=*/4);
+      src += 16;
+      i -= 4;
+    } while (i != 0);
+  } else {
+    int i = adjusted_tx_height;
+    do {
+      Identity4_NEON<true>(src, /*step=*/4);
+      src += 16;
+      i -= 4;
+    } while (i != 0);
+  }
+}
+
+void Identity4TransformLoopColumn_NEON(TransformType tx_type,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height, void* src_buffer,
+                                       int start_x, int start_y,
+                                       void* dst_frame) {
   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   auto* src = static_cast<int16_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
 
-  if (is_row) {
-    // Special case: Process row calculations during column transform call.
-    // Improves performance.
-    if (tx_type == kTransformTypeIdentityIdentity &&
-        tx_size == kTransformSize4x4) {
-      return;
-    }
-
-    const bool should_round = (tx_height == 8);
-
-    if (Identity4DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
-                        tx_height)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<4>(src, num_rows);
-    }
-    if (tx_height < 16) {
-      int i = 0;
-      do {
-        Identity4_NEON<false>(&src[i * 4], &src[i * 4], /*step=*/4);
-        i += 4;
-      } while (i < num_rows);
-    } else {
-      int i = 0;
-      do {
-        Identity4_NEON<true>(&src[i * 4], &src[i * 4], /*step=*/4);
-        i += 4;
-      } while (i < num_rows);
-    }
-    return;
-  }
-  assert(!is_row);
-  const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
   // Special case: Process row calculations during column transform call.
   if (tx_type == kTransformTypeIdentityIdentity &&
       (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
-    Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width, height,
-                                   src);
+    Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                                   adjusted_tx_height, src);
     return;
   }
 
@@ -2945,168 +2863,185 @@
     FlipColumns<4>(src, tx_width);
   }
 
-  IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width, height, src);
+  IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
+                                adjusted_tx_height, src);
 }
 
-void Identity8TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
-                                 void* src_buffer, int start_x, int start_y,
-                                 void* dst_frame, bool is_row,
-                                 int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
-  auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
-
-  if (is_row) {
-    // Special case: Process row calculations during column transform call.
-    // Improves performance.
-    if (tx_type == kTransformTypeIdentityIdentity &&
-        tx_size == kTransformSize8x4) {
-      return;
-    }
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-
-    if (Identity8DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
-                        row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<8>(src, num_rows);
-    }
-
-    // When combining the identity8 multiplier with the row shift, the
-    // calculations for tx_height == 8 and tx_height == 16 can be simplified
-    // from ((A * 2) + 1) >> 1) to A.
-    if ((tx_height & 0x18) != 0) {
-      return;
-    }
-    if (tx_height == 32) {
-      for (int i = 0; i < num_rows; i += 4) {
-        Identity8Row32_NEON(&src[i * 8], &src[i * 8], /*step=*/8);
-      }
-      return;
-    }
-
-    // Process kTransformSize8x4
-    assert(tx_size == kTransformSize8x4);
-    for (int i = 0; i < num_rows; i += 4) {
-      Identity8Row4_NEON(&src[i * 8], &src[i * 8], /*step=*/8);
-    }
+void Identity8TransformLoopRow_NEON(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int /*start_x*/, int /*start_y*/,
+                                    void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize8x4) {
     return;
   }
 
-  assert(!is_row);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+
+  if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 16 can be simplified
+  // from (((A * 2) + 1) >> 1) to A.
+  if ((tx_height & 0x18) != 0) {
+    return;
+  }
+  if (tx_height == 32) {
+    int i = adjusted_tx_height;
+    do {
+      Identity8Row32_NEON(src, /*step=*/8);
+      src += 32;
+      i -= 4;
+    } while (i != 0);
+    return;
+  }
+
+  assert(tx_size == kTransformSize8x4);
+  int i = adjusted_tx_height;
+  do {
+    Identity8Row4_NEON(src, /*step=*/8);
+    src += 32;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity8TransformLoopColumn_NEON(TransformType tx_type,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height, void* src_buffer,
+                                       int start_x, int start_y,
+                                       void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<8>(src, tx_width);
   }
-  const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
-  IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width, height, src);
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
+                                adjusted_tx_height, src);
 }
 
-void Identity16TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
-                                  void* src_buffer, int start_x, int start_y,
-                                  void* dst_frame, bool is_row,
-                                  int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int /*start_x*/, int /*start_y*/,
+                                     void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
 
-  if (is_row) {
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-
-    if (Identity16DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
-                         row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<16>(src, num_rows);
-    }
-    for (int i = 0; i < num_rows; i += 4) {
-      Identity16Row_NEON(&src[i * 16], &src[i * 16], /*step=*/16,
-                         kTransformRowShift[tx_size]);
-    }
+  if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+  int i = adjusted_tx_height;
+  do {
+    Identity16Row_NEON(src, /*step=*/16, kTransformRowShift[tx_size]);
+    src += 64;
+    i -= 4;
+  } while (i != 0);
+}
+
+void Identity16TransformLoopColumn_NEON(TransformType tx_type,
+                                        TransformSize tx_size,
+                                        int adjusted_tx_height,
+                                        void* src_buffer, int start_x,
+                                        int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<16>(src, tx_width);
   }
-  const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
-  IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width, height,
-                                 src);
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
+                                 adjusted_tx_height, src);
 }
 
-void Identity32TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
-                                  void* src_buffer, int start_x, int start_y,
-                                  void* dst_frame, bool is_row,
-                                  int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
-  auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
+void Identity32TransformLoopRow_NEON(TransformType /*tx_type*/,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int /*start_x*/, int /*start_y*/,
+                                     void* /*dst_frame*/) {
   const int tx_height = kTransformHeight[tx_size];
 
-  if (is_row) {
-    // When combining the identity32 multiplier with the row shift, the
-    // calculations for tx_height == 8 and tx_height == 32 can be simplified
-    // from ((A * 4) + 2) >> 2) to A.
-    if ((tx_height & 0x28) != 0) {
-      return;
-    }
-
-    // Process kTransformSize32x16.  The src is always rounded before the
-    // identity transform and shifted by 1 afterwards.
-
-    if (Identity32DcOnly(&src[0], &src[0], non_zero_coeff_count)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<32>(tx_type, tx_height, non_zero_coeff_count);
-
-    assert(tx_size == kTransformSize32x16);
-    ApplyRounding<32>(src, num_rows);
-    for (int i = 0; i < num_rows; i += 4) {
-      Identity32Row16_NEON(&src[i * 32], &src[i * 32], /*step=*/32);
-    }
+  // When combining the identity32 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 32 can be simplified
+  // from ((A * 4) + 2) >> 2 to A.
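+  // For example, A = 7: ((7 * 4) + 2) >> 2 == 7, so the value is unchanged.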
+  if ((tx_height & 0x28) != 0) {
     return;
   }
 
-  assert(!is_row);
-  const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
-  IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width, height,
-                                 src);
+  // Process kTransformSize32x16.  The src is always rounded before the
+  // identity transform and shifted by 1 afterwards.
+  auto* src = static_cast<int16_t*>(src_buffer);
+  if (Identity32DcOnly(src, adjusted_tx_height)) {
+    return;
+  }
+
+  assert(tx_size == kTransformSize32x16);
+  ApplyRounding<32>(src, adjusted_tx_height);
+  int i = adjusted_tx_height;
+  do {
+    Identity32Row16_NEON(src, /*step=*/32);
+    src += 128;
+    i -= 4;
+  } while (i != 0);
 }
 
-void Wht4TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
-                            void* src_buffer, int start_x, int start_y,
-                            void* dst_frame, bool is_row,
-                            int non_zero_coeff_count) {
+void Identity32TransformLoopColumn_NEON(TransformType /*tx_type*/,
+                                        TransformSize tx_size,
+                                        int adjusted_tx_height,
+                                        void* src_buffer, int start_x,
+                                        int start_y, void* dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width,
+                                 adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_NEON(TransformType tx_type, TransformSize tx_size,
+                               int /*adjusted_tx_height*/, void* /*src_buffer*/,
+                               int /*start_x*/, int /*start_y*/,
+                               void* /*dst_frame*/) {
   assert(tx_type == kTransformTypeDctDct);
   assert(tx_size == kTransformSize4x4);
   static_cast<void>(tx_type);
   static_cast<void>(tx_size);
-  if (is_row) {
-    // Do both row and column transforms in the column-transform pass.
-    return;
-  }
+  // Do both row and column transforms in the column-transform pass.
+}
 
-  assert(!is_row);
+void Wht4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+                                  int adjusted_tx_height, void* src_buffer,
+                                  int start_x, int start_y, void* dst_frame) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+
   // Process 4 1d wht4 rows and columns in parallel.
-  auto* src = static_cast<int16_t*>(src_buffer);
+  const auto* src = static_cast<int16_t*>(src_buffer);
   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   uint8_t* dst = frame[start_y] + start_x;
   const int dst_stride = frame.columns();
-  Wht4_NEON(dst, dst_stride, src, non_zero_coeff_count);
+  Wht4_NEON(dst, dst_stride, src, adjusted_tx_height);
 }
 
 //------------------------------------------------------------------------------
@@ -3115,38 +3050,64 @@
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);
   // Maximum transform size for Dct is 64.
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
-      Dct4TransformLoop_NEON;
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
-      Dct8TransformLoop_NEON;
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
-      Dct16TransformLoop_NEON;
-  dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
-      Dct32TransformLoop_NEON;
-  dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
-      Dct64TransformLoop_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+      Dct4TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+      Dct4TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+      Dct8TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+      Dct8TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+      Dct16TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+      Dct16TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+      Dct32TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+      Dct32TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+      Dct64TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+      Dct64TransformLoopColumn_NEON;
 
   // Maximum transform size for Adst is 16.
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
-      Adst4TransformLoop_NEON;
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
-      Adst8TransformLoop_NEON;
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
-      Adst16TransformLoop_NEON;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+      Adst4TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+      Adst4TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+      Adst8TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+      Adst8TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+      Adst16TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+      Adst16TransformLoopColumn_NEON;
 
   // Maximum transform size for Identity transform is 32.
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
-      Identity4TransformLoop_NEON;
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
-      Identity8TransformLoop_NEON;
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
-      Identity16TransformLoop_NEON;
-  dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
-      Identity32TransformLoop_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+      Identity4TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+      Identity4TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+      Identity8TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+      Identity8TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+      Identity16TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+      Identity16TransformLoopColumn_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+      Identity32TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+      Identity32TransformLoopColumn_NEON;
 
   // Maximum transform size for Wht is 4.
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
-      Wht4TransformLoop_NEON;
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+      Wht4TransformLoopRow_NEON;
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+      Wht4TransformLoopColumn_NEON;
 }
 
 }  // namespace
@@ -3156,7 +3117,7 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {
 
diff --git a/libgav1/src/dsp/arm/inverse_transform_neon.h b/libgav1/src/dsp/arm/inverse_transform_neon.h
index af647e8..91e0e83 100644
--- a/libgav1/src/dsp/arm/inverse_transform_neon.h
+++ b/libgav1/src/dsp/arm/inverse_transform_neon.h
@@ -26,6 +26,7 @@
 // Initializes Dsp::inverse_transforms, see the defines below for specifics.
 // This function is not thread-safe.
 void InverseTransformInit_NEON();
+void InverseTransformInit10bpp_NEON();
 
 }  // namespace dsp
 }  // namespace libgav1
@@ -47,6 +48,21 @@
 #define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_NEON
 
 #define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_NEON
+
 #endif  // LIBGAV1_ENABLE_NEON
 
 #endif  // LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
diff --git a/libgav1/src/dsp/arm/loop_filter_neon.cc b/libgav1/src/dsp/arm/loop_filter_neon.cc
index 146c983..8d72892 100644
--- a/libgav1/src/dsp/arm/loop_filter_neon.cc
+++ b/libgav1/src/dsp/arm/loop_filter_neon.cc
@@ -35,7 +35,7 @@
 // (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
 inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) {
   const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh));
-  return vorr_u8(a, RightShift<32>(a));
+  return vorr_u8(a, RightShiftVector<32>(a));
 }
 
 // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
@@ -44,7 +44,7 @@
   const uint8x8x2_t a = Interleave32(p0q0, p1q1);
   const uint8x8_t b = vabd_u8(a.val[0], a.val[1]);
   const uint8x8_t p0q0_double = vqadd_u8(b, b);
-  const uint8x8_t p1q1_half = RightShift<32>(vshr_n_u8(b, 1));
+  const uint8x8_t p1q1_half = RightShiftVector<32>(vshr_n_u8(b, 1));
   const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half);
   return vcle_u8(c, vdup_n_u8(outer_thresh));
 }
@@ -56,7 +56,7 @@
                               const uint8_t inner_thresh,
                               const uint8_t outer_thresh) {
   const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh));
-  const uint8x8_t inner_mask = vand_u8(a, RightShift<32>(a));
+  const uint8x8_t inner_mask = vand_u8(a, RightShiftVector<32>(a));
   const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
   return vand_u8(inner_mask, outer_mask);
 }
@@ -121,7 +121,7 @@
       vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l));
   // Need to shift the second term or we end up with a2_ma2.
   const int8x8_t a2_ma1 =
-      InterleaveLow32(a2_a1, RightShift<32>(vneg_s8(a2_a1)));
+      InterleaveLow32(a2_a1, RightShiftVector<32>(vneg_s8(a2_a1)));
   const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1);
 
   *p1q1_result = vqmovun_s16(p1q1_a3);
@@ -251,7 +251,7 @@
                          const uint8x8_t abd_p0p2_q0q2) {
   const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2);
   const uint8x8_t b = vcle_u8(a, vdup_n_u8(1));
-  return vand_u8(b, RightShift<32>(b));
+  return vand_u8(b, RightShiftVector<32>(b));
 }
 
 // abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
@@ -264,7 +264,7 @@
                               const uint8_t outer_thresh) {
   const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
   const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh));
-  const uint8x8_t inner_mask = vand_u8(b, RightShift<32>(b));
+  const uint8x8_t inner_mask = vand_u8(b, RightShiftVector<32>(b));
   const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
   return vand_u8(inner_mask, outer_mask);
 }
@@ -482,7 +482,7 @@
   const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1);
   const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2);
   const uint8x8_t c = vcle_u8(b, vdup_n_u8(1));
-  return vand_u8(c, RightShift<32>(c));
+  return vand_u8(c, RightShiftVector<32>(c));
 }
 
 // abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
@@ -498,7 +498,7 @@
   const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
   const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3);
   const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh));
-  const uint8x8_t inner_mask = vand_u8(c, RightShift<32>(c));
+  const uint8x8_t inner_mask = vand_u8(c, RightShiftVector<32>(c));
   const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
   return vand_u8(inner_mask, outer_mask);
 }
@@ -1179,7 +1179,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {
 
diff --git a/libgav1/src/dsp/arm/loop_restoration_neon.cc b/libgav1/src/dsp/arm/loop_restoration_neon.cc
index 1e8dfb2..e6ceb66 100644
--- a/libgav1/src/dsp/arm/loop_restoration_neon.cc
+++ b/libgav1/src/dsp/arm/loop_restoration_neon.cc
@@ -18,6 +18,7 @@
 #if LIBGAV1_ENABLE_NEON
 #include <arm_neon.h>
 
+#include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
@@ -40,10 +41,25 @@
 }
 
 template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8_t src[2]) {
+  return vext_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint8x16_t VshrU128(const uint8x16_t src[2]) {
+  return vextq_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
 inline uint16x8_t VshrU128(const uint16x8x2_t src) {
   return vextq_u16(src.val[0], src.val[1], bytes / 2);
 }
 
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8_t src[2]) {
+  return vextq_u16(src[0], src[1], bytes / 2);
+}
+
 // Wiener
 
 // Must make a local copy of coefficients to help compiler know that they have
@@ -115,8 +131,7 @@
                                  const ptrdiff_t width, const int height,
                                  const int16_t filter[4],
                                  int16_t** const wiener_buffer) {
-  int y = height;
-  do {
+  for (int y = height; y != 0; --y) {
     const uint8_t* src_ptr = src;
     uint8x16_t s[8];
     s[0] = vld1q_u8(src_ptr);
@@ -140,15 +155,14 @@
       x -= 16;
     } while (x != 0);
     src += src_stride;
-  } while (--y != 0);
+  }
 }
 
 inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
                                  const ptrdiff_t width, const int height,
                                  const int16_t filter[4],
                                  int16_t** const wiener_buffer) {
-  int y = height;
-  do {
+  for (int y = height; y != 0; --y) {
     const uint8_t* src_ptr = src;
     uint8x16_t s[6];
     s[0] = vld1q_u8(src_ptr);
@@ -169,40 +183,37 @@
       x -= 16;
     } while (x != 0);
     src += src_stride;
-  } while (--y != 0);
+  }
 }
 
 inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
                                  const ptrdiff_t width, const int height,
                                  const int16_t filter[4],
                                  int16_t** const wiener_buffer) {
-  int y = height;
-  do {
+  for (int y = height; y != 0; --y) {
     const uint8_t* src_ptr = src;
-    uint8x16_t s[4];
-    s[0] = vld1q_u8(src_ptr);
+    uint8x16_t s[3];
     ptrdiff_t x = width;
     do {
-      src_ptr += 16;
-      s[3] = vld1q_u8(src_ptr);
-      s[1] = vextq_u8(s[0], s[3], 1);
-      s[2] = vextq_u8(s[0], s[3], 2);
+      // Slightly faster than using vextq_u8().
+      s[0] = vld1q_u8(src_ptr);
+      s[1] = vld1q_u8(src_ptr + 1);
+      s[2] = vld1q_u8(src_ptr + 2);
       int16x8x2_t sum;
       sum.val[0] = sum.val[1] = vdupq_n_s16(0);
       WienerHorizontalSum(s, filter, sum, *wiener_buffer);
-      s[0] = s[3];
+      src_ptr += 16;
       *wiener_buffer += 16;
       x -= 16;
     } while (x != 0);
     src += src_stride;
-  } while (--y != 0);
+  }
 }
 
 inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
                                  const ptrdiff_t width, const int height,
                                  int16_t** const wiener_buffer) {
-  int y = height;
-  do {
+  for (int y = height; y != 0; --y) {
     const uint8_t* src_ptr = src;
     ptrdiff_t x = width;
     do {
@@ -218,7 +229,7 @@
       x -= 16;
     } while (x != 0);
     src += src_stride;
-  } while (--y != 0);
+  }
 }
 
 inline int32x4x2_t WienerVertical2(const int16x8_t a0, const int16x8_t a1,
@@ -479,19 +490,19 @@
 // For width 16 and up, store the horizontal results, and then do the vertical
 // filter row by row. This is faster than doing it column by column when
 // considering cache issues.
-void WienerFilter_NEON(const void* const source, void* const dest,
-                       const RestorationUnitInfo& restoration_info,
-                       const ptrdiff_t source_stride,
-                       const ptrdiff_t dest_stride, const int width,
-                       const int height, RestorationBuffer* const buffer) {
-  constexpr int kCenterTap = kWienerFilterTaps / 2;
+void WienerFilter_NEON(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const ptrdiff_t stride, const void* const top_border,
+    const ptrdiff_t top_border_stride, const void* const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
   const int16_t* const number_leading_zero_coefficients =
       restoration_info.wiener_info.number_leading_zero_coefficients;
   const int number_rows_to_skip = std::max(
       static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
       1);
   const ptrdiff_t wiener_stride = Align(width, 16);
-  int16_t* const wiener_buffer_vertical = buffer->wiener_buffer;
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
   // The values are saturated to 13 bits before storing.
   int16_t* wiener_buffer_horizontal =
       wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
@@ -506,25 +517,48 @@
   // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
   const int height_horizontal =
       height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
-  const auto* const src = static_cast<const uint8_t*>(source) -
-                          (kCenterTap - number_rows_to_skip) * source_stride;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* const top = static_cast<const uint8_t*>(top_border);
+  const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
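+  // The horizontal pass is split into three calls below: |height_extra| rows
+  // from the top border, the |height| rows of |src|, and |height_extra| rows
+  // from the bottom border.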
   if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
-    WienerHorizontalTap7(src - 3, source_stride, wiener_stride,
-                         height_horizontal, filter_horizontal,
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal,
                          &wiener_buffer_horizontal);
   } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
-    WienerHorizontalTap5(src - 2, source_stride, wiener_stride,
-                         height_horizontal, filter_horizontal,
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal,
                          &wiener_buffer_horizontal);
   } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
     // The maximum over-reads happen here.
-    WienerHorizontalTap3(src - 1, source_stride, wiener_stride,
-                         height_horizontal, filter_horizontal,
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         filter_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal,
                          &wiener_buffer_horizontal);
   } else {
     assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
-    WienerHorizontalTap1(src, source_stride, wiener_stride, height_horizontal,
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
                          &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
   }
 
   // vertical filtering.
@@ -536,52 +570,41 @@
     // the top and bottom row of |wiener_buffer| accordingly.
     memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
            sizeof(*wiener_buffer_horizontal) * wiener_stride);
-    memcpy(buffer->wiener_buffer, buffer->wiener_buffer + wiener_stride,
-           sizeof(*buffer->wiener_buffer) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
     WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
-                       filter_vertical, dst, dest_stride);
+                       filter_vertical, dst, stride);
   } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
     WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
-                       height, filter_vertical, dst, dest_stride);
+                       height, filter_vertical, dst, stride);
   } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
     WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
-                       wiener_stride, height, filter_vertical, dst,
-                       dest_stride);
+                       wiener_stride, height, filter_vertical, dst, stride);
   } else {
     assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
     WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
-                       wiener_stride, height, dst, dest_stride);
+                       wiener_stride, height, dst, stride);
   }
 }
 
 //------------------------------------------------------------------------------
 // SGR
 
-template <int n>
-inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
-                              const uint32_t scale) {
-  // a = |sum_sq|
-  // d = |sum|
-  // p = (a * n < d * d) ? 0 : a * n - d * d;
-  const uint32x4_t dxd = vmull_u16(sum, sum);
-  const uint32x4_t axn = vmulq_n_u32(sum_sq, n);
-  // Ensure |p| does not underflow by using saturating subtraction.
-  const uint32x4_t p = vqsubq_u32(axn, dxd);
-  // z = RightShiftWithRounding(p * scale, kSgrProjScaleBits);
-  const uint32x4_t pxs = vmulq_n_u32(p, scale);
-  // vrshrn_n_u32() (narrowing shift) can only shift by 16 and kSgrProjScaleBits
-  // is 20.
-  const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits);
-  return vmovn_u32(shifted);
-}
-
-inline void Prepare3_8(const uint8x8x2_t src, uint8x8_t dst[3]) {
+inline void Prepare3_8(const uint8x8_t src[2], uint8x8_t dst[3]) {
   dst[0] = VshrU128<0>(src);
   dst[1] = VshrU128<1>(src);
   dst[2] = VshrU128<2>(src);
 }
 
-inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3],
+template <int offset>
+inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) {
+  dst[0] = VshrU128<offset + 0>(src);
+  dst[1] = VshrU128<offset + 1>(src);
+  dst[2] = VshrU128<offset + 2>(src);
+}
+
+inline void Prepare3_16(const uint16x8_t src[2], uint16x4_t low[3],
                         uint16x4_t high[3]) {
   uint16x8_t s[3];
   s[0] = VshrU128<0>(src);
@@ -595,7 +618,7 @@
   high[2] = vget_high_u16(s[2]);
 }
 
-inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) {
+inline void Prepare5_8(const uint8x8_t src[2], uint8x8_t dst[5]) {
   dst[0] = VshrU128<0>(src);
   dst[1] = VshrU128<1>(src);
   dst[2] = VshrU128<2>(src);
@@ -603,7 +626,16 @@
   dst[4] = VshrU128<4>(src);
 }
 
-inline void Prepare5_16(const uint16x8x2_t src, uint16x4_t low[5],
+template <int offset>
+inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) {
+  dst[0] = VshrU128<offset + 0>(src);
+  dst[1] = VshrU128<offset + 1>(src);
+  dst[2] = VshrU128<offset + 2>(src);
+  dst[3] = VshrU128<offset + 3>(src);
+  dst[4] = VshrU128<offset + 4>(src);
+}
+
+inline void Prepare5_16(const uint16x8_t src[2], uint16x4_t low[5],
                         uint16x4_t high[5]) {
   Prepare3_16(src, low, high);
   const uint16x8_t s3 = VshrU128<6>(src);
@@ -642,6 +674,30 @@
   return vaddw_u8(sum, src[2]);
 }
 
+inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) {
+  const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+  return vaddw_u8(sum, vget_low_u8(src[2]));
+}
+
+inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) {
+  const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+  return vaddw_u8(sum, vget_high_u8(src[2]));
+}
+
+inline uint16x8_t Sum5WLo16(const uint8x16_t src[5]) {
+  const uint16x8_t sum01 = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+  const uint16x8_t sum23 = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[3]));
+  const uint16x8_t sum = vaddq_u16(sum01, sum23);
+  return vaddw_u8(sum, vget_low_u8(src[4]));
+}
+
+inline uint16x8_t Sum5WHi16(const uint8x16_t src[5]) {
+  const uint16x8_t sum01 = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+  const uint16x8_t sum23 = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[3]));
+  const uint16x8_t sum = vaddq_u16(sum01, sum23);
+  return vaddw_u8(sum, vget_high_u8(src[4]));
+}
+
 inline uint32x4_t Sum3W_32(const uint16x4_t src[3]) {
   const uint32x4_t sum = vaddl_u16(src[0], src[1]);
   return vaddw_u16(sum, src[2]);
@@ -679,13 +735,28 @@
   return vaddw_u16(sum0123, src[4]);
 }
 
-inline uint16x8_t Sum3Horizontal(const uint8x8x2_t src) {
+inline uint16x8_t Sum3Horizontal(const uint8x8_t src[2]) {
   uint8x8_t s[3];
   Prepare3_8(src, s);
   return Sum3W_16(s);
 }
 
-inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) {
+inline uint16x8_t Sum3Horizontal(const uint8x16_t src) {
+  uint8x8_t s[2];
+  s[0] = vget_low_u8(src);
+  s[1] = vget_high_u8(src);
+  return Sum3Horizontal(s);
+}
+
+template <int offset>
+inline void Sum3Horizontal(const uint8x16_t src[2], uint16x8_t dst[2]) {
+  uint8x16_t s[3];
+  Prepare3_8<offset>(src, s);
+  dst[0] = Sum3WLo16(s);
+  dst[1] = Sum3WHi16(s);
+}
+
+inline uint32x4x2_t Sum3WHorizontal(const uint16x8_t src[2]) {
   uint16x4_t low[3], high[3];
   uint32x4x2_t sum;
   Prepare3_16(src, low, high);
@@ -694,7 +765,7 @@
   return sum;
 }
 
-inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) {
+inline uint16x8_t Sum5Horizontal(const uint8x8_t src[2]) {
   uint8x8_t s[5];
   Prepare5_8(src, s);
   const uint16x8_t sum01 = vaddl_u8(s[0], s[1]);
@@ -703,7 +774,23 @@
   return vaddw_u8(sum0123, s[4]);
 }
 
-inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) {
+inline uint16x8_t Sum5Horizontal(const uint8x16_t src) {
+  uint8x8_t s[2];
+  s[0] = vget_low_u8(src);
+  s[1] = vget_high_u8(src);
+  return Sum5Horizontal(s);
+}
+
+template <int offset>
+inline void Sum5Horizontal(const uint8x16_t src[2], uint16x8_t* const dst0,
+                           uint16x8_t* const dst1) {
+  uint8x16_t s[5];
+  Prepare5_8<offset>(src, s);
+  *dst0 = Sum5WLo16(s);
+  *dst1 = Sum5WHi16(s);
+}
+
+inline uint32x4x2_t Sum5WHorizontal(const uint16x8_t src[2]) {
   uint16x4_t low[5], high[5];
   Prepare5_16(src, low, high);
   uint32x4x2_t sum;
@@ -712,6 +799,30 @@
   return sum;
 }
 
+template <int offset>
+void SumHorizontal(const uint8x16_t src[2], uint16x8_t* const row3_0,
+                   uint16x8_t* const row3_1, uint16x8_t* const row5_0,
+                   uint16x8_t* const row5_1) {
+  uint8x16_t s[5];
+  Prepare5_8<offset>(src, s);
+  const uint16x8_t sum04_lo = vaddl_u8(vget_low_u8(s[0]), vget_low_u8(s[4]));
+  const uint16x8_t sum04_hi = vaddl_u8(vget_high_u8(s[0]), vget_high_u8(s[4]));
+  *row3_0 = Sum3WLo16(s + 1);
+  *row3_1 = Sum3WHi16(s + 1);
+  *row5_0 = vaddq_u16(sum04_lo, *row3_0);
+  *row5_1 = vaddq_u16(sum04_hi, *row3_1);
+}
+
+void SumHorizontal(const uint8x8_t src[2], uint16x8_t* const row3,
+                   uint16x8_t* const row5) {
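+  // The 5-tap sum reuses the 3-tap sum: *row5 = s[0] + *row3 + s[4].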
+  uint8x8_t s[5];
+  Prepare5_8(src, s);
+  const uint16x8_t sum04 = vaddl_u8(s[0], s[4]);
+  const uint16x8_t sum12 = vaddl_u8(s[1], s[2]);
+  *row3 = vaddw_u8(sum12, s[3]);
+  *row5 = vaddq_u16(sum04, *row3);
+}
+
 void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3,
                    uint32x4_t* const row_sq5) {
   const uint32x4_t sum04 = vaddl_u16(src[0], src[4]);
@@ -720,27 +831,36 @@
   *row_sq5 = vaddq_u32(sum04, *row_sq3);
 }
 
-void SumHorizontal(const uint8x8x2_t src, const uint16x8x2_t sq,
-                   uint16x8_t* const row3, uint16x8_t* const row5,
-                   uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
-  uint8x8_t s[5];
-  Prepare5_8(src, s);
-  const uint16x8_t sum04 = vaddl_u8(s[0], s[4]);
-  const uint16x8_t sum12 = vaddl_u8(s[1], s[2]);
-  *row3 = vaddw_u8(sum12, s[3]);
-  *row5 = vaddq_u16(sum04, *row3);
+void SumHorizontal(const uint16x8_t sq[2], uint32x4x2_t* const row_sq3,
+                   uint32x4x2_t* const row_sq5) {
   uint16x4_t low[5], high[5];
   Prepare5_16(sq, low, high);
   SumHorizontal(low, &row_sq3->val[0], &row_sq5->val[0]);
   SumHorizontal(high, &row_sq3->val[1], &row_sq5->val[1]);
 }
 
-inline uint16x8_t Sum343(const uint8x8x2_t src) {
-  uint8x8_t s[3];
-  Prepare3_8(src, s);
-  const uint16x8_t sum = Sum3W_16(s);
+void SumHorizontal(const uint8x8_t src[2], const uint16x8_t sq[2],
+                   uint16x8_t* const row3, uint16x8_t* const row5,
+                   uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+  SumHorizontal(src, row3, row5);
+  SumHorizontal(sq, row_sq3, row_sq5);
+}
+
+void SumHorizontal(const uint8x16_t src, const uint16x8_t sq[2],
+                   uint16x8_t* const row3, uint16x8_t* const row5,
+                   uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+  uint8x8_t s[2];
+  s[0] = vget_low_u8(src);
+  s[1] = vget_high_u8(src);
+  return SumHorizontal(s, sq, row3, row5, row_sq3, row_sq5);
+}
+
+template <int offset>
+inline uint16x8_t Sum343(const uint8x16_t ma3[2]) {
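+  // Computes 3 * (a + b + c) + b, i.e. the 3-4-3 weighting of the three
+  // columns.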
+  const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3);
   const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
-  return vaddw_u8(sum3, s[1]);
+  return vaddw_u8(sum3,
+                  (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1]));
 }
 
 inline uint32x4_t Sum343W(const uint16x4_t src[3]) {
@@ -749,7 +869,7 @@
   return vaddw_u16(sum3, src[1]);
 }
 
-inline uint32x4x2_t Sum343W(const uint16x8x2_t src) {
+inline uint32x4x2_t Sum343W(const uint16x8_t src[2]) {
   uint16x4_t low[3], high[3];
   uint32x4x2_t d;
   Prepare3_16(src, low, high);
@@ -758,13 +878,13 @@
   return d;
 }
 
-inline uint16x8_t Sum565(const uint8x8x2_t src) {
-  uint8x8_t s[3];
-  Prepare3_8(src, s);
-  const uint16x8_t sum = Sum3W_16(s);
+template <int offset>
+inline uint16x8_t Sum565(const uint8x16_t ma5[2]) {
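+  // Computes 5 * (a + b + c) + b, i.e. the 5-6-5 weighting of the three
+  // columns.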
+  const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma5) : Sum3WHi16(ma5);
   const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
   const uint16x8_t sum5 = vaddq_u16(sum4, sum);
-  return vaddw_u8(sum5, s[1]);
+  return vaddw_u8(sum5,
+                  (offset == 0) ? vget_low_u8(ma5[1]) : vget_high_u8(ma5[1]));
 }
 
 inline uint32x4_t Sum565W(const uint16x4_t src[3]) {
@@ -774,7 +894,7 @@
   return vaddw_u16(sum5, src[1]);
 }
 
-inline uint32x4x2_t Sum565W(const uint16x8x2_t src) {
+inline uint32x4x2_t Sum565W(const uint16x8_t src[2]) {
   uint16x4_t low[3], high[3];
   uint32x4x2_t d;
   Prepare3_16(src, low, high);
@@ -783,19 +903,203 @@
   return d;
 }
 
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t sum_stride, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  int y = 2;
+  // Don't change loop width to 16, which is even slower.
+  do {
+    uint8x8_t s[2];
+    uint16x8_t sq[2];
+    s[0] = vld1_u8(src);
+    sq[0] = vmull_u8(s[0], s[0]);
+    ptrdiff_t x = 0;
+    do {
+      uint16x8_t row3, row5;
+      uint32x4x2_t row_sq3, row_sq5;
+      s[1] = vld1_u8(src + x + 8);
+      sq[1] = vmull_u8(s[1], s[1]);
+      SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5);
+      vst1q_u16(sum3, row3);
+      vst1q_u16(sum5, row5);
+      vst1q_u32(square_sum3 + 0, row_sq3.val[0]);
+      vst1q_u32(square_sum3 + 4, row_sq3.val[1]);
+      vst1q_u32(square_sum5 + 0, row_sq5.val[0]);
+      vst1q_u32(square_sum5 + 4, row_sq5.val[1]);
+      s[0] = s[1];
+      sq[0] = sq[1];
+      sum3 += 8;
+      sum5 += 8;
+      square_sum3 += 8;
+      square_sum5 += 8;
+      x += 8;
+    } while (x < sum_stride);
+    src += src_stride;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t sum_stride, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  int y = 2;
+  // Don't change loop width to 16, which is even slower.
+  do {
+    uint8x8_t s[2];
+    uint16x8_t sq[2];
+    s[0] = vld1_u8(src);
+    sq[0] = vmull_u8(s[0], s[0]);
+    ptrdiff_t x = 0;
+    do {
+      uint16x8_t row;
+      uint32x4x2_t row_sq;
+      s[1] = vld1_u8(src + x + 8);
+      sq[1] = vmull_u8(s[1], s[1]);
+      if (size == 3) {
+        row = Sum3Horizontal(s);
+        row_sq = Sum3WHorizontal(sq);
+      } else {
+        row = Sum5Horizontal(s);
+        row_sq = Sum5WHorizontal(sq);
+      }
+      vst1q_u16(sums, row);
+      vst1q_u32(square_sums + 0, row_sq.val[0]);
+      vst1q_u32(square_sums + 4, row_sq.val[1]);
+      s[0] = s[1];
+      sq[0] = sq[1];
+      sums += 8;
+      square_sums += 8;
+      x += 8;
+    } while (x < sum_stride);
+    src += src_stride;
+  } while (--y != 0);
+}
+
+template <int n>
+inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
+                              const uint32_t scale) {
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const uint32x4_t dxd = vmull_u16(sum, sum);
+  const uint32x4_t axn = vmulq_n_u32(sum_sq, n);
+  // Ensure |p| does not underflow by using saturating subtraction.
+  const uint32x4_t p = vqsubq_u32(axn, dxd);
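+  // z = RightShiftWithRounding(p * scale, kSgrProjScaleBits);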
+  const uint32x4_t pxs = vmulq_n_u32(p, scale);
+  // vrshrn_n_u32() (narrowing shift) can only shift by 16 and kSgrProjScaleBits
+  // is 20.
+  const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits);
+  return vmovn_u32(shifted);
+}
+
+inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index,
+                             const int threshold) {
+  const uint8x8_t thresholds = vdup_n_u8(threshold);
+  const uint8x8_t offset = vcgt_u8(index, thresholds);
+  // Adding 255 is equivalent to subtracting 1 for 8-bit data.
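+  // e.g. 5 + 255 == 260, which wraps around to 4 in uint8_t arithmetic.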
+  return vadd_u8(value, offset);
+}
+
+template <int n, int offset>
+inline void CalculateIntermediate(const uint16x8_t sum,
+                                  const uint32x4x2_t sum_sq,
+                                  const uint32_t scale, uint8x16_t* const ma,
+                                  uint16x8_t* const b) {
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+  const uint16x4_t z0 = CalculateMa<n>(vget_low_u16(sum), sum_sq.val[0], scale);
+  const uint16x4_t z1 =
+      CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale);
+  const uint16x8_t z01 = vcombine_u16(z0, z1);
+  const uint8x8_t idx = vqmovn_u16(z01);
+  // Use table lookup to read elements whose indices are less than 48.
+  // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than
+  // using two uint8x8x3_t vectors.
+  uint8x8x4_t table0;
+  uint8x8x2_t table1;
+  table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8);
+  table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8);
+  table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8);
+  table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8);
+  table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8);
+  table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8);
+  // All elements whose indices are out of range [0, 47] are set to 0.
+  uint8x8_t val = vtbl4_u8(table0, idx);  // Range [0, 31].
+  // Subtract 8 to shuffle the next index range.
+  const uint8x8_t index = vsub_u8(idx, vdup_n_u8(32));
+  const uint8x8_t res = vtbl2_u8(table1, index);  // Range [32, 47].
+  // Use OR instruction to combine shuffle results together.
+  val = vorr_u8(val, res);
+
+  // For elements whose indices are larger than 47, the values change only
+  // rarely as the index increases, so they are computed with comparisons and
+  // arithmetic operations rather than further table lookups.
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  val = vmax_u8(val, vdup_n_u8(5));
+  val = AdjustValue(val, idx, 55);   // 55 is the last index whose value is 5.
+  val = AdjustValue(val, idx, 72);   // 72 is the last index whose value is 4.
+  val = AdjustValue(val, idx, 101);  // 101 is the last index whose value is 3.
+  val = AdjustValue(val, idx, 169);  // 169 is the last index whose value is 2.
+  val = AdjustValue(val, idx, 254);  // 254 is the last index whose value is 1.
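+  // The resulting mapping for indices larger than 47 is: [48, 55] -> 5,
+  // [56, 72] -> 4, [73, 101] -> 3, [102, 169] -> 2, [170, 254] -> 1 and
+  // 255 -> 0.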
+  *ma = (offset == 0) ? vcombine_u8(val, vget_high_u8(*ma))
+                      : vcombine_u8(vget_low_u8(*ma), val);
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const uint16x8_t maq =
+      vmovl_u8((offset == 0) ? vget_low_u8(*ma) : vget_high_u8(*ma));
+  const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum));
+  const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum));
+  const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
+  const uint32x4_t m3 = vmulq_n_u32(m1, one_over_n);
+  const uint16x4_t b_lo = vrshrn_n_u32(m2, kSgrProjReciprocalBits);
+  const uint16x4_t b_hi = vrshrn_n_u32(m3, kSgrProjReciprocalBits);
+  *b = vcombine_u16(b_lo, b_hi);
+}
+
+template <int offset>
+inline void CalculateIntermediate5(const uint16x8_t s5[5],
+                                   const uint32x4x2_t sq5[5],
+                                   const uint32_t scale, uint8x16_t* const ma,
+                                   uint16x8_t* const b) {
+  const uint16x8_t sum = Sum5_16(s5);
+  const uint32x4x2_t sum_sq = Sum5_32(sq5);
+  CalculateIntermediate<25, offset>(sum, sum_sq, scale, ma, b);
+}
+
+template <int offset>
+inline void CalculateIntermediate3(const uint16x8_t s3[3],
+                                   const uint32x4x2_t sq3[3],
+                                   const uint32_t scale, uint8x16_t* const ma,
+                                   uint16x8_t* const b) {
+  const uint16x8_t sum = Sum3_16(s3);
+  const uint32x4x2_t sum_sq = Sum3_32(sq3);
+  CalculateIntermediate<9, offset>(sum, sum_sq, scale, ma, b);
+}
+
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
                          const ptrdiff_t x, uint16x8_t* const sum_ma343,
                          uint16x8_t* const sum_ma444,
                          uint32x4x2_t* const sum_b343,
                          uint32x4x2_t* const sum_b444, uint16_t* const ma343,
                          uint16_t* const ma444, uint32_t* const b343,
                          uint32_t* const b444) {
-  uint8x8_t s[3];
-  Prepare3_8(ma3, s);
-  const uint16x8_t sum_ma111 = Sum3W_16(s);
+  const uint16x8_t sum_ma111 = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3);
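+  // |sum_ma444| weights the three neighboring columns as 4-4-4, while
+  // |sum_ma343| weights them as 3-4-3 (3 * sum plus the middle column).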
   *sum_ma444 = vshlq_n_u16(sum_ma111, 2);
   const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
-  *sum_ma343 = vaddw_u8(sum333, s[1]);
+  *sum_ma343 = vaddw_u8(
+      sum333, (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1]));
   uint16x4_t low[3], high[3];
   uint32x4x2_t sum_b111;
   Prepare3_16(b3, low, high);
@@ -809,30 +1113,608 @@
   sum_b343->val[1] = vaddw_u16(sum_b343->val[1], high[1]);
   vst1q_u16(ma343 + x, *sum_ma343);
   vst1q_u16(ma444 + x, *sum_ma444);
-  vst1q_u32(b343 + x + 0, (*sum_b343).val[0]);
-  vst1q_u32(b343 + x + 4, (*sum_b343).val[1]);
-  vst1q_u32(b444 + x + 0, (*sum_b444).val[0]);
-  vst1q_u32(b444 + x + 4, (*sum_b444).val[1]);
+  vst1q_u32(b343 + x + 0, sum_b343->val[0]);
+  vst1q_u32(b343 + x + 4, sum_b343->val[1]);
+  vst1q_u32(b444 + x + 0, sum_b444->val[0]);
+  vst1q_u32(b444 + x + 4, sum_b444->val[1]);
 }
 
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
                          const ptrdiff_t x, uint16x8_t* const sum_ma343,
                          uint32x4x2_t* const sum_b343, uint16_t* const ma343,
                          uint16_t* const ma444, uint32_t* const b343,
                          uint32_t* const b444) {
   uint16x8_t sum_ma444;
   uint32x4x2_t sum_b444;
-  Store343_444(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444, ma343,
-               ma444, b343, b444);
+  Store343_444<offset>(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444,
+                       ma343, ma444, b343, b444);
 }
 
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
                          const ptrdiff_t x, uint16_t* const ma343,
                          uint16_t* const ma444, uint32_t* const b343,
                          uint32_t* const b444) {
   uint16x8_t sum_ma343;
   uint32x4x2_t sum_b343;
-  Store343_444(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343, b444);
+  Store343_444<offset>(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343,
+                       b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    const uint8_t* const src0, const uint8_t* const src1, const uint32_t scale,
+    uint8x16_t s[2][2], uint16_t* const sum5[5], uint32_t* const square_sum5[5],
+    uint16x8_t sq[2][4], uint8x16_t* const ma, uint16x8_t* const b) {
+  uint16x8_t s5[5];
+  uint32x4x2_t sq5[5];
+  s[0][0] = vld1q_u8(src0);
+  s[1][0] = vld1q_u8(src1);
+  sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0]));
+  sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0]));
+  sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0]));
+  sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0]));
+  s5[3] = Sum5Horizontal(s[0][0]);
+  s5[4] = Sum5Horizontal(s[1][0]);
+  sq5[3] = Sum5WHorizontal(sq[0]);
+  sq5[4] = Sum5WHorizontal(sq[1]);
+  vst1q_u16(sum5[3], s5[3]);
+  vst1q_u16(sum5[4], s5[4]);
+  vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]);
+  vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]);
+  vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]);
+  vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]);
+  s5[0] = vld1q_u16(sum5[0]);
+  s5[1] = vld1q_u16(sum5[1]);
+  s5[2] = vld1q_u16(sum5[2]);
+  sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+  sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+  sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+  sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+  sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+  sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+  CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
+    const uint32_t scale, uint8x16_t s[2][2], uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma[2],
+    uint16x8_t b[2]) {
+  uint16x8_t s5[2][5];
+  uint32x4x2_t sq5[5];
+  s[0][1] = vld1q_u8(src0 + x + 8);
+  s[1][1] = vld1q_u8(src1 + x + 8);
+  sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1]));
+  sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1]));
+  Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+  Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+  sq5[3] = Sum5WHorizontal(sq[0] + 1);
+  sq5[4] = Sum5WHorizontal(sq[1] + 1);
+  vst1q_u16(sum5[3] + x, s5[0][3]);
+  vst1q_u16(sum5[4] + x, s5[0][4]);
+  vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
+  vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
+  vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
+  vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
+  s5[0][0] = vld1q_u16(sum5[0] + x);
+  s5[0][1] = vld1q_u16(sum5[1] + x);
+  s5[0][2] = vld1q_u16(sum5[2] + x);
+  sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+  sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+  sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+  sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+  sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+  sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+  sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1]));
+  sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1]));
+  sq5[3] = Sum5WHorizontal(sq[0] + 2);
+  sq5[4] = Sum5WHorizontal(sq[1] + 2);
+  vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+  vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+  vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]);
+  vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]);
+  vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]);
+  vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]);
+  s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+  s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+  s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+  sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+  sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+  sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+  sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+  sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+  sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+    const uint8_t* const src, const uint32_t scale, uint8x16_t* const s,
+    const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+    uint16x8_t sq[2], uint8x16_t* const ma, uint16x8_t* const b) {
+  uint16x8_t s5[5];
+  uint32x4x2_t sq5[5];
+  *s = vld1q_u8(src);
+  sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+  sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+  s5[3] = s5[4] = Sum5Horizontal(*s);
+  sq5[3] = sq5[4] = Sum5WHorizontal(sq);
+  s5[0] = vld1q_u16(sum5[0]);
+  s5[1] = vld1q_u16(sum5[1]);
+  s5[2] = vld1q_u16(sum5[2]);
+  sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+  sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+  sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+  sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+  sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+  sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+  CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+    const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
+    uint8x16_t s[2], const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], uint16x8_t sq[3], uint8x16_t ma[2],
+    uint16x8_t b[2]) {
+  uint16x8_t s5[2][5];
+  uint32x4x2_t sq5[5];
+  s[1] = vld1q_u8(src + x + 8);
+  sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+  Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
+  sq5[3] = sq5[4] = Sum5WHorizontal(sq);
+  s5[0][0] = vld1q_u16(sum5[0] + x);
+  s5[0][1] = vld1q_u16(sum5[1] + x);
+  s5[0][2] = vld1q_u16(sum5[2] + x);
+  s5[0][4] = s5[0][3];
+  sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+  sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+  sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+  sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+  sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+  sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+  sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+  sq5[3] = sq5[4] = Sum5WHorizontal(sq + 1);
+  s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+  s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+  s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+  s5[1][4] = s5[1][3];
+  sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+  sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+  sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+  sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+  sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+  sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+    const uint8_t* const src, const uint32_t scale, uint8x16_t* const s,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16x8_t sq[2],
+    uint8x16_t* const ma, uint16x8_t* const b) {
+  uint16x8_t s3[3];
+  uint32x4x2_t sq3[3];
+  *s = vld1q_u8(src);
+  sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+  sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+  s3[2] = Sum3Horizontal(*s);
+  sq3[2] = Sum3WHorizontal(sq);
+  vst1q_u16(sum3[2], s3[2]);
+  vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]);
+  vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]);
+  s3[0] = vld1q_u16(sum3[0]);
+  s3[1] = vld1q_u16(sum3[1]);
+  sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+  sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+  sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+  sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+  CalculateIntermediate3<0>(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint8x16_t s[2],
+    uint16x8_t sq[3], uint8x16_t ma[2], uint16x8_t b[2]) {
+  uint16x8_t s3[4];
+  uint32x4x2_t sq3[3];
+  s[1] = vld1q_u8(src + x + 8);
+  sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+  Sum3Horizontal<8>(s, s3 + 2);
+  sq3[2] = Sum3WHorizontal(sq);
+  vst1q_u16(sum3[2] + x, s3[2]);
+  vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
+  vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
+  s3[0] = vld1q_u16(sum3[0] + x);
+  s3[1] = vld1q_u16(sum3[1] + x);
+  sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
+  sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
+  sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
+  sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+  CalculateIntermediate3<8>(s3, sq3, scale, &ma[0], &b[0]);
+
+  sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+  sq3[2] = Sum3WHorizontal(sq + 1);
+  vst1q_u16(sum3[2] + x + 8, s3[3]);
+  vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]);
+  vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]);
+  s3[1] = vld1q_u16(sum3[0] + x + 8);
+  s3[2] = vld1q_u16(sum3[1] + x + 8);
+  sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+  sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+  sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+  sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+  CalculateIntermediate3<0>(s3 + 1, sq3, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+    const uint8_t* const src0, const uint8_t* const src1,
+    const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2],
+    uint16x8_t b3[2][3], uint8x16_t* const ma5, uint16x8_t* const b5) {
+  uint16x8_t s3[4], s5[5];
+  uint32x4x2_t sq3[4], sq5[5];
+  s[0][0] = vld1q_u8(src0);
+  s[1][0] = vld1q_u8(src1);
+  sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0]));
+  sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0]));
+  sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0]));
+  sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0]));
+  SumHorizontal(s[0][0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
+  SumHorizontal(s[1][0], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
+  vst1q_u16(sum3[2], s3[2]);
+  vst1q_u16(sum3[3], s3[3]);
+  vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]);
+  vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]);
+  vst1q_u32(square_sum3[3] + 0, sq3[3].val[0]);
+  vst1q_u32(square_sum3[3] + 4, sq3[3].val[1]);
+  vst1q_u16(sum5[3], s5[3]);
+  vst1q_u16(sum5[4], s5[4]);
+  vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]);
+  vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]);
+  vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]);
+  vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]);
+  s3[0] = vld1q_u16(sum3[0]);
+  s3[1] = vld1q_u16(sum3[1]);
+  sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+  sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+  sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+  sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+  s5[0] = vld1q_u16(sum5[0]);
+  s5[1] = vld1q_u16(sum5[1]);
+  s5[2] = vld1q_u16(sum5[2]);
+  sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+  sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+  sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+  sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+  sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+  sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+  CalculateIntermediate3<0>(s3, sq3, scales[1], ma3[0], b3[0]);
+  CalculateIntermediate3<0>(s3 + 1, sq3 + 1, scales[1], ma3[1], b3[1]);
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
+    const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2],
+    uint16x8_t b3[2][3], uint8x16_t ma5[2], uint16x8_t b5[2]) {
+  uint16x8_t s3[2][4], s5[2][5];
+  uint32x4x2_t sq3[4], sq5[5];
+  s[0][1] = vld1q_u8(src0 + x + 8);
+  s[1][1] = vld1q_u8(src1 + x + 8);
+  sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1]));
+  sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1]));
+  SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+  SumHorizontal(sq[0] + 1, &sq3[2], &sq5[3]);
+  SumHorizontal(sq[1] + 1, &sq3[3], &sq5[4]);
+  vst1q_u16(sum3[2] + x, s3[0][2]);
+  vst1q_u16(sum3[3] + x, s3[0][3]);
+  vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
+  vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
+  vst1q_u32(square_sum3[3] + x + 0, sq3[3].val[0]);
+  vst1q_u32(square_sum3[3] + x + 4, sq3[3].val[1]);
+  vst1q_u16(sum5[3] + x, s5[0][3]);
+  vst1q_u16(sum5[4] + x, s5[0][4]);
+  vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
+  vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
+  vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
+  vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
+  s3[0][0] = vld1q_u16(sum3[0] + x);
+  s3[0][1] = vld1q_u16(sum3[1] + x);
+  sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
+  sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
+  sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
+  sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+  s5[0][0] = vld1q_u16(sum5[0] + x);
+  s5[0][1] = vld1q_u16(sum5[1] + x);
+  s5[0][2] = vld1q_u16(sum5[2] + x);
+  sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+  sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+  sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+  sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+  sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+  sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+  CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0][0], &b3[0][1]);
+  CalculateIntermediate3<8>(s3[0] + 1, sq3 + 1, scales[1], &ma3[1][0],
+                            &b3[1][1]);
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
+
+  sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1]));
+  sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1]));
+  SumHorizontal(sq[0] + 2, &sq3[2], &sq5[3]);
+  SumHorizontal(sq[1] + 2, &sq3[3], &sq5[4]);
+  vst1q_u16(sum3[2] + x + 8, s3[1][2]);
+  vst1q_u16(sum3[3] + x + 8, s3[1][3]);
+  vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]);
+  vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]);
+  vst1q_u32(square_sum3[3] + x + 8, sq3[3].val[0]);
+  vst1q_u32(square_sum3[3] + x + 12, sq3[3].val[1]);
+  vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+  vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+  vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]);
+  vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]);
+  vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]);
+  vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]);
+  s3[1][0] = vld1q_u16(sum3[0] + x + 8);
+  s3[1][1] = vld1q_u16(sum3[1] + x + 8);
+  sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+  sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+  sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+  sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+  s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+  s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+  s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+  sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+  sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+  sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+  sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+  sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+  sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+  CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[0][1], &b3[0][2]);
+  CalculateIntermediate3<0>(s3[1] + 1, sq3 + 1, scales[1], &ma3[1][1],
+                            &b3[1][2]);
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
+}
+
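+// Bottom-row handling: with no rows available below, the newest row's sums are
+// duplicated (s5[4] = s5[3], sq5[4] = sq5[3]) before the pass-1 and pass-2
+// intermediates are computed.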
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+    const uint8_t* const src, const uint16_t scales[2],
+    const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+    const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+    uint8x16_t* const s, uint16x8_t sq[2], uint8x16_t* const ma3,
+    uint8x16_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) {
+  uint16x8_t s3[3], s5[5];
+  uint32x4x2_t sq3[3], sq5[5];
+  *s = vld1q_u8(src);
+  sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+  sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+  SumHorizontal(*s, sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
+  s5[0] = vld1q_u16(sum5[0]);
+  s5[1] = vld1q_u16(sum5[1]);
+  s5[2] = vld1q_u16(sum5[2]);
+  s5[4] = s5[3];
+  sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+  sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+  sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+  sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+  sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+  sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+  sq5[4] = sq5[3];
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+  s3[0] = vld1q_u16(sum3[0]);
+  s3[1] = vld1q_u16(sum3[1]);
+  sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+  sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+  sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+  sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+  CalculateIntermediate3<0>(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    const uint8_t* const src, const ptrdiff_t x, const uint16_t scales[2],
+    const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+    const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+    uint8x16_t s[2], uint16x8_t sq[3], uint8x16_t ma3[2], uint8x16_t ma5[2],
+    uint16x8_t b3[2], uint16x8_t b5[2]) {
+  uint16x8_t s3[2][3], s5[2][5];
+  uint32x4x2_t sq3[3], sq5[5];
+  s[1] = vld1q_u8(src + x + 8);
+  sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+  SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  SumHorizontal(sq, &sq3[2], &sq5[3]);
+  s5[0][0] = vld1q_u16(sum5[0] + x);
+  s5[0][1] = vld1q_u16(sum5[1] + x);
+  s5[0][2] = vld1q_u16(sum5[2] + x);
+  s5[0][4] = s5[0][3];
+  sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+  sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+  sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+  sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+  sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+  sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+  sq5[4] = sq5[3];
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
+  s3[0][0] = vld1q_u16(sum3[0] + x);
+  s3[0][1] = vld1q_u16(sum3[1] + x);
+  sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
+  sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
+  sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
+  sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+  CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0], &b3[0]);
+
+  sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+  SumHorizontal(sq + 1, &sq3[2], &sq5[3]);
+  s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+  s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+  s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+  s5[1][4] = s5[1][3];
+  sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+  sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+  sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+  sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+  sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+  sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+  sq5[4] = sq5[3];
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
+  s3[1][0] = vld1q_u16(sum3[0] + x + 8);
+  s3[1][1] = vld1q_u16(sum3[1] + x + 8);
+  sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+  sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+  sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+  sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+  CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[1], &b3[1]);
+}
+
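+// Fills the first row of ma565/b565 (the Sum565-weighted pass-1 intermediates)
+// ahead of the main filtering loop.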
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+                                    const uint8_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    uint16_t* ma565, uint32_t* b565) {
+  uint8x16_t s[2][2], mas[2];
+  uint16x8_t sq[2][4], bs[3];
+  BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0],
+                         &bs[0]);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint8x16_t masx[3];
+    uint32x4x2_t b[2];
+    BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq,
+                         mas, bs + 1);
+    Prepare3_8<0>(mas, masx);
+    ma[0] = Sum565<0>(masx);
+    b[0] = Sum565W(bs);
+    vst1q_u16(ma565, ma[0]);
+    vst1q_u32(b565 + 0, b[0].val[0]);
+    vst1q_u32(b565 + 4, b[0].val[1]);
+
+    ma[1] = Sum565<8>(masx);
+    b[1] = Sum565W(bs + 1);
+    vst1q_u16(ma565 + 8, ma[1]);
+    vst1q_u32(b565 + 8, b[1].val[0]);
+    vst1q_u32(b565 + 12, b[1].val[1]);
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
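+// Pass-2 counterpart: fills the first ma343/b343 row and, when calculate444 is
+// set, the ma444/b444 row as well.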
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint8_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343,
+    uint16_t* ma444, uint32_t* b343, uint32_t* b444) {
+  uint8x16_t s[2], mas[2];
+  uint16x8_t sq[4], bs[3];
+  BoxFilterPreProcess3Lo(src, scale, &s[0], sum3, square_sum3, sq, &mas[0],
+                         &bs[0]);
+
+  int x = 0;
+  do {
+    uint8x16_t ma3x[3];
+    BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, s, sq + 1, mas,
+                         bs + 1);
+    Prepare3_8<0>(mas, ma3x);
+    if (calculate444) {
+      Store343_444<0>(ma3x, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444<8>(ma3x, bs + 1, 0, ma343 + 8, ma444 + 8, b343 + 8,
+                      b444 + 8);
+      ma444 += 16;
+      b444 += 16;
+    } else {
+      uint16x8_t ma[2];
+      uint32x4x2_t b[2];
+      ma[0] = Sum343<0>(ma3x);
+      b[0] = Sum343W(bs);
+      vst1q_u16(ma343, ma[0]);
+      vst1q_u32(b343 + 0, b[0].val[0]);
+      vst1q_u32(b343 + 4, b[0].val[1]);
+      ma[1] = Sum343<8>(ma3x);
+      b[1] = Sum343W(bs + 1);
+      vst1q_u16(ma343 + 8, ma[1]);
+      vst1q_u32(b343 + 8, b[1].val[0]);
+      vst1q_u32(b343 + 12, b[1].val[1]);
+    }
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma343 += 16;
+    b343 += 16;
+    x += 16;
+  } while (x < width);
+}
+
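+// Setup for the combined (both passes) path: one trip across the row fills
+// ma343[0]/b343[0], ma343[1]/ma444/b343[1]/b444 and ma565/b565.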
+inline void BoxSumFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343[4], uint16_t* const ma444, uint16_t* ma565,
+    uint32_t* const b343[4], uint32_t* const b444, uint32_t* b565) {
+  uint8x16_t s[2][2], ma3[2][2], ma5[2];
+  uint16x8_t sq[2][4], b3[2][3], b5[3];
+  BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3,
+                        square_sum5, sq, ma3, b3, &ma5[0], &b5[0]);
+
+  int x = 0;
+  do {
+    uint16x8_t ma[2];
+    uint8x16_t ma3x[3], ma5x[3];
+    uint32x4x2_t b[2];
+    BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3,
+                        square_sum5, sq, ma3, b3, ma5, b5 + 1);
+    Prepare3_8<0>(ma3[0], ma3x);
+    ma[0] = Sum343<0>(ma3x);
+    ma[1] = Sum343<8>(ma3x);
+    b[0] = Sum343W(b3[0] + 0);
+    b[1] = Sum343W(b3[0] + 1);
+    vst1q_u16(ma343[0] + x, ma[0]);
+    vst1q_u16(ma343[0] + x + 8, ma[1]);
+    vst1q_u32(b343[0] + x, b[0].val[0]);
+    vst1q_u32(b343[0] + x + 4, b[0].val[1]);
+    vst1q_u32(b343[0] + x + 8, b[1].val[0]);
+    vst1q_u32(b343[0] + x + 12, b[1].val[1]);
+    Prepare3_8<0>(ma3[1], ma3x);
+    Store343_444<0>(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444<8>(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0] = Sum565<0>(ma5x);
+    ma[1] = Sum565<8>(ma5x);
+    b[0] = Sum565W(b5);
+    b[1] = Sum565W(b5 + 1);
+    vst1q_u16(ma565, ma[0]);
+    vst1q_u16(ma565 + 8, ma[1]);
+    vst1q_u32(b565 + 0, b[0].val[0]);
+    vst1q_u32(b565 + 4, b[0].val[1]);
+    vst1q_u32(b565 + 8, b[1].val[0]);
+    vst1q_u32(b565 + 12, b[1].val[1]);
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    ma5[0] = ma5[1];
+    b5[0] = b5[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
 }
 
 template <int shift>
@@ -879,734 +1761,382 @@
   return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
 }
 
-inline void SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2],
-                            uint8_t* const dst) {
+inline uint8x8_t SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2]) {
   const int16x4_t v_lo =
       vrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
   const int16x4_t v_hi =
       vrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
   const int16x8_t vv = vcombine_s16(v_lo, v_hi);
-  const int16x8_t s = ZeroExtend(src);
-  const int16x8_t d = vaddq_s16(s, vv);
-  vst1_u8(dst, vqmovun_s16(d));
+  const int16x8_t d =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vv), src));
+  return vqmovun_s16(d);
 }
 
-inline void SelfGuidedDoubleMultiplier(const uint8x8_t src,
-                                       const int16x8_t filter[2], const int w0,
-                                       const int w2, uint8_t* const dst) {
+inline uint8x8_t SelfGuidedDoubleMultiplier(const uint8x8_t src,
+                                            const int16x8_t filter[2],
+                                            const int w0, const int w2) {
   int32x4_t v[2];
   v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
   v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
   v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2);
   v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2);
-  SelfGuidedFinal(src, v, dst);
+  return SelfGuidedFinal(src, v);
 }
 
-inline void SelfGuidedSingleMultiplier(const uint8x8_t src,
-                                       const int16x8_t filter, const int w0,
-                                       uint8_t* const dst) {
+inline uint8x8_t SelfGuidedSingleMultiplier(const uint8x8_t src,
+                                            const int16x8_t filter,
+                                            const int w0) {
   // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
   int32x4_t v[2];
   v[0] = vmull_n_s16(vget_low_s16(filter), w0);
   v[1] = vmull_n_s16(vget_high_s16(filter), w0);
-  SelfGuidedFinal(src, v, dst);
+  return SelfGuidedFinal(src, v);
 }
 
-inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
-                   const int height, const ptrdiff_t width, uint16_t* sum3,
-                   uint16_t* sum5, uint32_t* square_sum3,
-                   uint32_t* square_sum5) {
-  int y = height;
-  do {
-    uint8x8x2_t s;
-    uint16x8x2_t sq;
-    s.val[0] = vld1_u8(src);
-    sq.val[0] = vmull_u8(s.val[0], s.val[0]);
-    ptrdiff_t x = 0;
-    do {
-      uint16x8_t row3, row5;
-      uint32x4x2_t row_sq3, row_sq5;
-      s.val[1] = vld1_u8(src + x + 8);
-      sq.val[1] = vmull_u8(s.val[1], s.val[1]);
-      SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5);
-      vst1q_u16(sum3, row3);
-      vst1q_u16(sum5, row5);
-      vst1q_u32(square_sum3 + 0, row_sq3.val[0]);
-      vst1q_u32(square_sum3 + 4, row_sq3.val[1]);
-      vst1q_u32(square_sum5 + 0, row_sq5.val[0]);
-      vst1q_u32(square_sum5 + 4, row_sq5.val[1]);
-      s.val[0] = s.val[1];
-      sq.val[0] = sq.val[1];
-      sum3 += 8;
-      sum5 += 8;
-      square_sum3 += 8;
-      square_sum5 += 8;
-      x += 8;
-    } while (x < width);
-    src += src_stride;
-  } while (--y != 0);
-}
-
-template <int size>
-inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
-                   const int height, const ptrdiff_t width, uint16_t* sums,
-                   uint32_t* square_sums) {
-  static_assert(size == 3 || size == 5, "");
-  int y = height;
-  do {
-    uint8x8x2_t s;
-    uint16x8x2_t sq;
-    s.val[0] = vld1_u8(src);
-    sq.val[0] = vmull_u8(s.val[0], s.val[0]);
-    ptrdiff_t x = 0;
-    do {
-      uint16x8_t row;
-      uint32x4x2_t row_sq;
-      s.val[1] = vld1_u8(src + x + 8);
-      sq.val[1] = vmull_u8(s.val[1], s.val[1]);
-      if (size == 3) {
-        row = Sum3Horizontal(s);
-        row_sq = Sum3WHorizontal(sq);
-      } else {
-        row = Sum5Horizontal(s);
-        row_sq = Sum5WHorizontal(sq);
-      }
-      vst1q_u16(sums, row);
-      vst1q_u32(square_sums + 0, row_sq.val[0]);
-      vst1q_u32(square_sums + 4, row_sq.val[1]);
-      s.val[0] = s.val[1];
-      sq.val[0] = sq.val[1];
-      sums += 8;
-      square_sums += 8;
-      x += 8;
-    } while (x < width);
-    src += src_stride;
-  } while (--y != 0);
-}
-
-template <int n>
-inline void CalculateIntermediate(const uint16x8_t sum,
-                                  const uint32x4x2_t sum_sq,
-                                  const uint32_t scale, uint8x8_t* const ma,
-                                  uint16x8_t* const b) {
-  constexpr uint32_t one_over_n =
-      ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
-  const uint16x4_t z0 = CalculateMa<n>(vget_low_u16(sum), sum_sq.val[0], scale);
-  const uint16x4_t z1 =
-      CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale);
-  const uint16x8_t z01 = vcombine_u16(z0, z1);
-  // Using vqmovn_u16() needs an extra sign extension instruction.
-  const uint16x8_t z = vminq_u16(z01, vdupq_n_u16(255));
-  // Using vgetq_lane_s16() can save the sign extension instruction.
-  const uint8_t lookup[8] = {
-      kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 0)],
-      kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 1)],
-      kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 2)],
-      kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 3)],
-      kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 4)],
-      kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 5)],
-      kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 6)],
-      kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 7)]};
-  *ma = vld1_u8(lookup);
-  // b = ma * b * one_over_n
-  // |ma| = [0, 255]
-  // |sum| is a box sum with radius 1 or 2.
-  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
-  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
-  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
-  // When radius is 2 |n| is 25. |one_over_n| is 164.
-  // When radius is 1 |n| is 9. |one_over_n| is 455.
-  // |kSgrProjReciprocalBits| is 12.
-  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
-  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
-  const uint16x8_t maq = vmovl_u8(*ma);
-  const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum));
-  const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum));
-  const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
-  const uint32x4_t m3 = vmulq_n_u32(m1, one_over_n);
-  const uint16x4_t b_lo = vrshrn_n_u32(m2, kSgrProjReciprocalBits);
-  const uint16x4_t b_hi = vrshrn_n_u32(m3, kSgrProjReciprocalBits);
-  *b = vcombine_u16(b_lo, b_hi);
-}
-
-inline void CalculateIntermediate5(const uint16x8_t s5[5],
-                                   const uint32x4x2_t sq5[5],
-                                   const uint32_t scale, uint8x8_t* const ma,
-                                   uint16x8_t* const b) {
-  const uint16x8_t sum = Sum5_16(s5);
-  const uint32x4x2_t sum_sq = Sum5_32(sq5);
-  CalculateIntermediate<25>(sum, sum_sq, scale, ma, b);
-}
-
-inline void CalculateIntermediate3(const uint16x8_t s3[3],
-                                   const uint32x4x2_t sq3[3],
-                                   const uint32_t scale, uint8x8_t* const ma,
-                                   uint16x8_t* const b) {
-  const uint16x8_t sum = Sum3_16(s3);
-  const uint32x4x2_t sum_sq = Sum3_32(sq3);
-  CalculateIntermediate<9>(sum, sum_sq, scale, ma, b);
-}
-
-LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
-    const uint8_t* const src, const ptrdiff_t src_stride, const ptrdiff_t x,
-    const uint32_t scale, uint8x8x2_t s[2], uint16x8x2_t sq[2],
-    uint16_t* const sum5[5], uint32_t* const square_sum5[5],
-    uint8x8_t* const ma, uint16x8_t* const b) {
-  uint16x8_t s5[5];
-  uint32x4x2_t sq5[5];
-  s[0].val[1] = vld1_u8(src + x + 8);
-  s[1].val[1] = vld1_u8(src + src_stride + x + 8);
-  sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]);
-  sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]);
-  s5[3] = Sum5Horizontal(s[0]);
-  s5[4] = Sum5Horizontal(s[1]);
-  sq5[3] = Sum5WHorizontal(sq[0]);
-  sq5[4] = Sum5WHorizontal(sq[1]);
-  vst1q_u16(sum5[3] + x, s5[3]);
-  vst1q_u16(sum5[4] + x, s5[4]);
-  vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
-  vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
-  vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
-  vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
-  s5[0] = vld1q_u16(sum5[0] + x);
-  s5[1] = vld1q_u16(sum5[1] + x);
-  s5[2] = vld1q_u16(sum5[2] + x);
-  sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
-  sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
-  sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
-  sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
-  sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
-  sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
-  CalculateIntermediate5(s5, sq5, scale, ma, b);
-}
-
-LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
-    const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
-    uint8x8x2_t* const s, uint16x8x2_t* const sq, uint16_t* const sum5[5],
-    uint32_t* const square_sum5[5], uint8x8_t* const ma, uint16x8_t* const b) {
-  uint16x8_t s5[5];
-  uint32x4x2_t sq5[5];
-  s->val[1] = vld1_u8(src + x + 8);
-  sq->val[1] = vmull_u8(s->val[1], s->val[1]);
-  s5[3] = s5[4] = Sum5Horizontal(*s);
-  sq5[3] = sq5[4] = Sum5WHorizontal(*sq);
-  s5[0] = vld1q_u16(sum5[0] + x);
-  s5[1] = vld1q_u16(sum5[1] + x);
-  s5[2] = vld1q_u16(sum5[2] + x);
-  sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
-  sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
-  sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
-  sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
-  sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
-  sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
-  CalculateIntermediate5(s5, sq5, scale, ma, b);
-}
-
-LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
-    const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
-    uint8x8x2_t* const s, uint16x8x2_t* const sq, uint16_t* const sum3[3],
-    uint32_t* const square_sum3[3], uint8x8_t* const ma, uint16x8_t* const b) {
-  uint16x8_t s3[3];
-  uint32x4x2_t sq3[3];
-  s->val[1] = vld1_u8(src + x + 8);
-  sq->val[1] = vmull_u8(s->val[1], s->val[1]);
-  s3[2] = Sum3Horizontal(*s);
-  sq3[2] = Sum3WHorizontal(*sq);
-  vst1q_u16(sum3[2] + x, s3[2]);
-  vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
-  vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
-  s3[0] = vld1q_u16(sum3[0] + x);
-  s3[1] = vld1q_u16(sum3[1] + x);
-  sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
-  sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
-  sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
-  sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
-  CalculateIntermediate3(s3, sq3, scale, ma, b);
-}
-
-LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
-    const uint8_t* const src, const ptrdiff_t src_stride, const ptrdiff_t x,
-    const uint16_t scales[2], uint8x8x2_t s[2], uint16x8x2_t sq[2],
-    uint16_t* const sum3[4], uint16_t* const sum5[5],
-    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
-    uint8x8_t* const ma3_0, uint8x8_t* const ma3_1, uint16x8_t* const b3_0,
-    uint16x8_t* const b3_1, uint8x8_t* const ma5, uint16x8_t* const b5) {
-  uint16x8_t s3[4], s5[5];
-  uint32x4x2_t sq3[4], sq5[5];
-  s[0].val[1] = vld1_u8(src + x + 8);
-  s[1].val[1] = vld1_u8(src + src_stride + x + 8);
-  sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]);
-  sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]);
-  SumHorizontal(s[0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
-  SumHorizontal(s[1], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
-  vst1q_u16(sum3[2] + x, s3[2]);
-  vst1q_u16(sum3[3] + x, s3[3]);
-  vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
-  vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
-  vst1q_u32(square_sum3[3] + x + 0, sq3[3].val[0]);
-  vst1q_u32(square_sum3[3] + x + 4, sq3[3].val[1]);
-  vst1q_u16(sum5[3] + x, s5[3]);
-  vst1q_u16(sum5[4] + x, s5[4]);
-  vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
-  vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
-  vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
-  vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
-  s3[0] = vld1q_u16(sum3[0] + x);
-  s3[1] = vld1q_u16(sum3[1] + x);
-  sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
-  sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
-  sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
-  sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
-  s5[0] = vld1q_u16(sum5[0] + x);
-  s5[1] = vld1q_u16(sum5[1] + x);
-  s5[2] = vld1q_u16(sum5[2] + x);
-  sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
-  sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
-  sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
-  sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
-  sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
-  sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
-  CalculateIntermediate3(s3, sq3, scales[1], ma3_0, b3_0);
-  CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], ma3_1, b3_1);
-  CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
-}
-
-LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
-    const uint8_t* const src, const ptrdiff_t x, const uint16_t scales[2],
-    const uint16_t* const sum3[4], const uint16_t* const sum5[5],
-    const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
-    uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma3,
-    uint8x8_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) {
-  uint16x8_t s3[3], s5[5];
-  uint32x4x2_t sq3[3], sq5[5];
-  s->val[1] = vld1_u8(src + x + 8);
-  sq->val[1] = vmull_u8(s->val[1], s->val[1]);
-  SumHorizontal(*s, *sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
-  s5[0] = vld1q_u16(sum5[0] + x);
-  s5[1] = vld1q_u16(sum5[1] + x);
-  s5[2] = vld1q_u16(sum5[2] + x);
-  s5[4] = s5[3];
-  sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
-  sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
-  sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
-  sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
-  sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
-  sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
-  sq5[4] = sq5[3];
-  CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
-  s3[0] = vld1q_u16(sum3[0] + x);
-  s3[1] = vld1q_u16(sum3[1] + x);
-  sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
-  sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
-  sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
-  sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
-  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
-}
-
-inline void BoxSumFilterPreProcess5(const uint8_t* const src,
-                                    const ptrdiff_t src_stride, const int width,
-                                    const uint32_t scale,
-                                    uint16_t* const sum5[5],
-                                    uint32_t* const square_sum5[5],
-                                    uint16_t* ma565, uint32_t* b565) {
-  uint8x8x2_t s[2], mas;
-  uint16x8x2_t sq[2], bs;
-  s[0].val[0] = vld1_u8(src);
-  sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
-  s[1].val[0] = vld1_u8(src + src_stride);
-  sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
-  BoxFilterPreProcess5(src, src_stride, 0, scale, s, sq, sum5, square_sum5,
-                       &mas.val[0], &bs.val[0]);
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const uint32_t scale,
+    const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2],
+    uint8_t* const dst) {
+  uint8x16_t s[2][2], mas[2];
+  uint16x8_t sq[2][4], bs[3];
+  BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0],
+                         &bs[0]);
 
   int x = 0;
   do {
-    s[0].val[0] = s[0].val[1];
-    s[1].val[0] = s[1].val[1];
-    sq[0].val[0] = sq[0].val[1];
-    sq[1].val[0] = sq[1].val[1];
-    BoxFilterPreProcess5(src, src_stride, x + 8, scale, s, sq, sum5,
-                         square_sum5, &mas.val[1], &bs.val[1]);
-    const uint16x8_t ma = Sum565(mas);
-    const uint32x4x2_t b = Sum565W(bs);
-    vst1q_u16(ma565, ma);
-    vst1q_u32(b565 + 0, b.val[0]);
-    vst1q_u32(b565 + 4, b.val[1]);
-    mas.val[0] = mas.val[1];
-    bs.val[0] = bs.val[1];
-    ma565 += 8;
-    b565 += 8;
-    x += 8;
-  } while (x < width);
-}
-
-template <bool calculate444>
-LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
-    const uint8_t* const src, const int width, const uint32_t scale,
-    uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343,
-    uint16_t* ma444, uint32_t* b343, uint32_t* b444) {
-  uint8x8x2_t s, mas;
-  uint16x8x2_t sq, bs;
-  s.val[0] = vld1_u8(src);
-  sq.val[0] = vmull_u8(s.val[0], s.val[0]);
-  BoxFilterPreProcess3(src, 0, scale, &s, &sq, sum3, square_sum3, &mas.val[0],
-                       &bs.val[0]);
-
-  int x = 0;
-  do {
-    s.val[0] = s.val[1];
-    sq.val[0] = sq.val[1];
-    BoxFilterPreProcess3(src, x + 8, scale, &s, &sq, sum3, square_sum3,
-                         &mas.val[1], &bs.val[1]);
-    if (calculate444) {
-      Store343_444(mas, bs, 0, ma343, ma444, b343, b444);
-      ma444 += 8;
-      b444 += 8;
-    } else {
-      const uint16x8_t ma = Sum343(mas);
-      const uint32x4x2_t b = Sum343W(bs);
-      vst1q_u16(ma343, ma);
-      vst1q_u32(b343 + 0, b.val[0]);
-      vst1q_u32(b343 + 4, b.val[1]);
-    }
-    mas.val[0] = mas.val[1];
-    bs.val[0] = bs.val[1];
-    ma343 += 8;
-    b343 += 8;
-    x += 8;
-  } while (x < width);
-}
-
-inline void BoxSumFilterPreProcess(
-    const uint8_t* const src, const ptrdiff_t src_stride, const int width,
-    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
-    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
-    uint16_t* const ma343[4], uint16_t* const ma444[2], uint16_t* ma565,
-    uint32_t* const b343[4], uint32_t* const b444[2], uint32_t* b565) {
-  uint8x8x2_t s[2];
-  uint8x8x2_t ma3[2], ma5;
-  uint16x8x2_t sq[2], b3[2], b5;
-  s[0].val[0] = vld1_u8(src + 0);
-  s[1].val[0] = vld1_u8(src + src_stride + 0);
-  sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
-  sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
-  BoxFilterPreProcess(src, src_stride, 0, scales, s, sq, sum3, sum5,
-                      square_sum3, square_sum5, &ma3[0].val[0], &ma3[1].val[0],
-                      &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
-
-  int x = 0;
-  do {
-    s[0].val[0] = s[0].val[1];
-    s[1].val[0] = s[1].val[1];
-    sq[0].val[0] = sq[0].val[1];
-    sq[1].val[0] = sq[1].val[1];
-    BoxFilterPreProcess(src, src_stride, x + 8, scales, s, sq, sum3, sum5,
-                        square_sum3, square_sum5, &ma3[0].val[1],
-                        &ma3[1].val[1], &b3[0].val[1], &b3[1].val[1],
-                        &ma5.val[1], &b5.val[1]);
-    uint16x8_t ma = Sum343(ma3[0]);
-    uint32x4x2_t b = Sum343W(b3[0]);
-    vst1q_u16(ma343[0] + x, ma);
-    vst1q_u32(b343[0] + x, b.val[0]);
-    vst1q_u32(b343[0] + x + 4, b.val[1]);
-    Store343_444(ma3[1], b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
-    ma = Sum565(ma5);
-    b = Sum565W(b5);
-    vst1q_u16(ma565, ma);
-    vst1q_u32(b565 + 0, b.val[0]);
-    vst1q_u32(b565 + 4, b.val[1]);
-    ma3[0].val[0] = ma3[0].val[1];
-    ma3[1].val[0] = ma3[1].val[1];
-    b3[0].val[0] = b3[0].val[1];
-    b3[1].val[0] = b3[1].val[1];
-    ma5.val[0] = ma5.val[1];
-    b5.val[0] = b5.val[1];
-    ma565 += 8;
-    b565 += 8;
-    x += 8;
-  } while (x < width);
-}
-
-inline void BoxFilterPass1(const uint8_t* const src0, const uint8_t* const src,
-                           const ptrdiff_t src_stride, uint16_t* const sum5[5],
-                           uint32_t* const square_sum5[5], const int width,
-                           const uint32_t scale, const int16_t w0,
-                           uint16_t* const ma565[2], uint32_t* const b565[2],
-                           uint8_t* const dst, const ptrdiff_t dst_stride) {
-  uint8x8x2_t s[2], mas;
-  uint16x8x2_t sq[2], bs;
-  s[0].val[0] = vld1_u8(src);
-  s[1].val[0] = vld1_u8(src + src_stride);
-  sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
-  sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
-  BoxFilterPreProcess5(src, src_stride, 0, scale, s, sq, sum5, square_sum5,
-                       &mas.val[0], &bs.val[0]);
-
-  int x = 0;
-  do {
-    s[0].val[0] = s[0].val[1];
-    s[1].val[0] = s[1].val[1];
-    sq[0].val[0] = sq[0].val[1];
-    sq[1].val[0] = sq[1].val[1];
-    BoxFilterPreProcess5(src, src_stride, x + 8, scale, s, sq, sum5,
-                         square_sum5, &mas.val[1], &bs.val[1]);
     uint16x8_t ma[2];
+    uint8x16_t masx[3];
     uint32x4x2_t b[2];
-    ma[1] = Sum565(mas);
+    int16x8_t p0, p1;
+    BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq,
+                         mas, bs + 1);
+    Prepare3_8<0>(mas, masx);
+    ma[1] = Sum565<0>(masx);
     b[1] = Sum565W(bs);
     vst1q_u16(ma565[1] + x, ma[1]);
     vst1q_u32(b565[1] + x + 0, b[1].val[0]);
     vst1q_u32(b565[1] + x + 4, b[1].val[1]);
-    const uint8x8_t s0 = vld1_u8(src0 + x);
-    const uint8x8_t s1 = vld1_u8(src0 + src_stride + x);
-    int16x8_t p0, p1;
+    const uint8x16_t sr0 = vld1q_u8(src + x);
+    const uint8x16_t sr1 = vld1q_u8(src + stride + x);
+    const uint8x8_t sr00 = vget_low_u8(sr0);
+    const uint8x8_t sr10 = vget_low_u8(sr1);
     ma[0] = vld1q_u16(ma565[0] + x);
     b[0].val[0] = vld1q_u32(b565[0] + x + 0);
     b[0].val[1] = vld1q_u32(b565[0] + x + 4);
-    p0 = CalculateFilteredOutputPass1(s0, ma, b);
-    p1 = CalculateFilteredOutput<4>(s1, ma[1], b[1]);
-    SelfGuidedSingleMultiplier(s0, p0, w0, dst + x);
-    SelfGuidedSingleMultiplier(s1, p1, w0, dst + dst_stride + x);
-    mas.val[0] = mas.val[1];
-    bs.val[0] = bs.val[1];
-    x += 8;
+    p0 = CalculateFilteredOutputPass1(sr00, ma, b);
+    p1 = CalculateFilteredOutput<4>(sr10, ma[1], b[1]);
+    const uint8x8_t d00 = SelfGuidedSingleMultiplier(sr00, p0, w0);
+    const uint8x8_t d10 = SelfGuidedSingleMultiplier(sr10, p1, w0);
+
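+    // Repeat for the high half of the 16-pixel block.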
+    ma[1] = Sum565<8>(masx);
+    b[1] = Sum565W(bs + 1);
+    vst1q_u16(ma565[1] + x + 8, ma[1]);
+    vst1q_u32(b565[1] + x + 8, b[1].val[0]);
+    vst1q_u32(b565[1] + x + 12, b[1].val[1]);
+    const uint8x8_t sr01 = vget_high_u8(sr0);
+    const uint8x8_t sr11 = vget_high_u8(sr1);
+    ma[0] = vld1q_u16(ma565[0] + x + 8);
+    b[0].val[0] = vld1q_u32(b565[0] + x + 8);
+    b[0].val[1] = vld1q_u32(b565[0] + x + 12);
+    p0 = CalculateFilteredOutputPass1(sr01, ma, b);
+    p1 = CalculateFilteredOutput<4>(sr11, ma[1], b[1]);
+    const uint8x8_t d01 = SelfGuidedSingleMultiplier(sr01, p0, w0);
+    const uint8x8_t d11 = SelfGuidedSingleMultiplier(sr11, p1, w0);
+    vst1q_u8(dst + x, vcombine_u8(d00, d01));
+    vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    x += 16;
   } while (x < width);
 }
 
-inline void BoxFilterPass1LastRow(const uint8_t* const src0,
-                                  const uint8_t* const src, const int width,
+inline void BoxFilterPass1LastRow(const uint8_t* const src,
+                                  const uint8_t* const src0, const int width,
                                   const uint32_t scale, const int16_t w0,
                                   uint16_t* const sum5[5],
                                   uint32_t* const square_sum5[5],
                                   uint16_t* ma565, uint32_t* b565,
                                   uint8_t* const dst) {
-  uint8x8x2_t s, mas;
-  uint16x8x2_t sq, bs;
-  s.val[0] = vld1_u8(src);
-  sq.val[0] = vmull_u8(s.val[0], s.val[0]);
-  BoxFilterPreProcess5LastRow(src, 0, scale, &s, &sq, sum5, square_sum5,
-                              &mas.val[0], &bs.val[0]);
+  uint8x16_t s[2], mas[2];
+  uint16x8_t sq[4], bs[4];
+  BoxFilterPreProcess5LastRowLo(src0, scale, s, sum5, square_sum5, sq, &mas[0],
+                                &bs[0]);
 
   int x = 0;
   do {
-    s.val[0] = s.val[1];
-    sq.val[0] = sq.val[1];
-    BoxFilterPreProcess5LastRow(src, x + 8, scale, &s, &sq, sum5, square_sum5,
-                                &mas.val[1], &bs.val[1]);
     uint16x8_t ma[2];
+    uint8x16_t masx[3];
     uint32x4x2_t b[2];
-    ma[1] = Sum565(mas);
+    BoxFilterPreProcess5LastRow(src0, x + 8, scale, s, sum5, square_sum5,
+                                sq + 1, mas, bs + 1);
+    Prepare3_8<0>(mas, masx);
+    ma[1] = Sum565<0>(masx);
     b[1] = Sum565W(bs);
-    mas.val[0] = mas.val[1];
-    bs.val[0] = bs.val[1];
     ma[0] = vld1q_u16(ma565);
     b[0].val[0] = vld1q_u32(b565 + 0);
     b[0].val[1] = vld1q_u32(b565 + 4);
-    const uint8x8_t s = vld1_u8(src0 + x);
-    const int16x8_t p = CalculateFilteredOutputPass1(s, ma, b);
-    SelfGuidedSingleMultiplier(s, p, w0, dst + x);
-    ma565 += 8;
-    b565 += 8;
-    x += 8;
+    const uint8x16_t sr = vld1q_u8(src + x);
+    const uint8x8_t sr0 = vget_low_u8(sr);
+    const int16x8_t p0 = CalculateFilteredOutputPass1(sr0, ma, b);
+    const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0);
+
+    ma[1] = Sum565<8>(masx);
+    b[1] = Sum565W(bs + 1);
+    bs[0] = bs[2];
+    const uint8x8_t sr1 = vget_high_u8(sr);
+    ma[0] = vld1q_u16(ma565 + 8);
+    b[0].val[0] = vld1q_u32(b565 + 8);
+    b[0].val[1] = vld1q_u32(b565 + 12);
+    const int16x8_t p1 = CalculateFilteredOutputPass1(sr1, ma, b);
+    const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0);
+    vst1q_u8(dst + x, vcombine_u8(d0, d1));
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
   } while (x < width);
 }
 
-inline void BoxFilterPass2(const uint8_t* const src0, const uint8_t* const src,
-                           const int width, const uint32_t scale,
-                           const int16_t w0, uint16_t* const sum3[3],
-                           uint32_t* const square_sum3[3],
-                           uint16_t* const ma343[3], uint16_t* const ma444[2],
-                           uint32_t* const b343[3], uint32_t* const b444[2],
-                           uint8_t* const dst) {
-  uint8x8x2_t s, mas;
-  uint16x8x2_t sq, bs;
-  s.val[0] = vld1_u8(src);
-  sq.val[0] = vmull_u8(s.val[0], s.val[0]);
-  BoxFilterPreProcess3(src, 0, scale, &s, &sq, sum3, square_sum3, &mas.val[0],
-                       &bs.val[0]);
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const uint32_t scale, const int16_t w0, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], uint16_t* const ma343[3],
+    uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2],
+    uint8_t* const dst) {
+  uint8x16_t s[2], mas[2];
+  uint16x8_t sq[4], bs[3];
+  BoxFilterPreProcess3Lo(src0, scale, &s[0], sum3, square_sum3, sq, &mas[0],
+                         &bs[0]);
 
   int x = 0;
   do {
-    s.val[0] = s.val[1];
-    sq.val[0] = sq.val[1];
-    BoxFilterPreProcess3(src, x + 8, scale, &s, &sq, sum3, square_sum3,
-                         &mas.val[1], &bs.val[1]);
     uint16x8_t ma[3];
+    uint8x16_t ma3x[3];
     uint32x4x2_t b[3];
-    Store343_444(mas, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
-                 b444[1]);
-    const uint8x8_t s0 = vld1_u8(src0 + x);
+    BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, s, sq + 1, mas,
+                         bs + 1);
+    Prepare3_8<0>(mas, ma3x);
+    Store343_444<0>(ma3x, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
+                    b444[1]);
+    const uint8x16_t sr = vld1q_u8(src + x);
+    const uint8x8_t sr0 = vget_low_u8(sr);
     ma[0] = vld1q_u16(ma343[0] + x);
     ma[1] = vld1q_u16(ma444[0] + x);
     b[0].val[0] = vld1q_u32(b343[0] + x + 0);
     b[0].val[1] = vld1q_u32(b343[0] + x + 4);
     b[1].val[0] = vld1q_u32(b444[0] + x + 0);
     b[1].val[1] = vld1q_u32(b444[0] + x + 4);
-    const int16x8_t p = CalculateFilteredOutputPass2(s0, ma, b);
-    SelfGuidedSingleMultiplier(s0, p, w0, dst + x);
-    mas.val[0] = mas.val[1];
-    bs.val[0] = bs.val[1];
-    x += 8;
+    const int16x8_t p0 = CalculateFilteredOutputPass2(sr0, ma, b);
+    const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0);
+
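+    // High half of the 16-pixel block.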
+    Store343_444<8>(ma3x, bs + 1, x + 8, &ma[2], &b[2], ma343[2], ma444[1],
+                    b343[2], b444[1]);
+    const uint8x8_t sr1 = vget_high_u8(sr);
+    ma[0] = vld1q_u16(ma343[0] + x + 8);
+    ma[1] = vld1q_u16(ma444[0] + x + 8);
+    b[0].val[0] = vld1q_u32(b343[0] + x + 8);
+    b[0].val[1] = vld1q_u32(b343[0] + x + 12);
+    b[1].val[0] = vld1q_u32(b444[0] + x + 8);
+    b[1].val[1] = vld1q_u32(b444[0] + x + 12);
+    const int16x8_t p1 = CalculateFilteredOutputPass2(sr1, ma, b);
+    const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0);
+    vst1q_u8(dst + x, vcombine_u8(d0, d1));
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    x += 16;
   } while (x < width);
 }
 
-inline void BoxFilter(const uint8_t* const src0, const uint8_t* const src,
-                      const ptrdiff_t src_stride, const int width,
-                      const uint16_t scales[2], const int16_t w0,
-                      const int16_t w2, uint16_t* const sum3[4],
-                      uint16_t* const sum5[5], uint32_t* const square_sum3[4],
-                      uint32_t* const square_sum5[5], uint16_t* const ma343[4],
-                      uint16_t* const ma444[3], uint16_t* const ma565[2],
-                      uint32_t* const b343[4], uint32_t* const b444[3],
-                      uint32_t* const b565[2], uint8_t* const dst,
-                      const ptrdiff_t dst_stride) {
-  uint8x8x2_t s[2], ma3[2], ma5;
-  uint16x8x2_t sq[2], b3[2], b5;
-  s[0].val[0] = vld1_u8(src);
-  s[1].val[0] = vld1_u8(src + src_stride);
-  sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
-  sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
-  BoxFilterPreProcess(src, src_stride, 0, scales, s, sq, sum3, sum5,
-                      square_sum3, square_sum5, &ma3[0].val[0], &ma3[1].val[0],
-                      &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
-
-  int x = 0;
-  do {
-    s[0].val[0] = s[0].val[1];
-    s[1].val[0] = s[1].val[1];
-    sq[0].val[0] = sq[0].val[1];
-    sq[1].val[0] = sq[1].val[1];
-    BoxFilterPreProcess(src, src_stride, x + 8, scales, s, sq, sum3, sum5,
-                        square_sum3, square_sum5, &ma3[0].val[1],
-                        &ma3[1].val[1], &b3[0].val[1], &b3[1].val[1],
-                        &ma5.val[1], &b5.val[1]);
-    uint16x8_t ma[3][3];
-    uint32x4x2_t b[3][3];
-    Store343_444(ma3[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
-                 ma343[2], ma444[1], b343[2], b444[1]);
-    Store343_444(ma3[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2],
-                 b343[3], b444[2]);
-    ma[0][1] = Sum565(ma5);
-    b[0][1] = Sum565W(b5);
-    vst1q_u16(ma565[1] + x, ma[0][1]);
-    vst1q_u32(b565[1] + x, b[0][1].val[0]);
-    vst1q_u32(b565[1] + x + 4, b[0][1].val[1]);
-    s[0].val[0] = s[0].val[1];
-    s[1].val[0] = s[1].val[1];
-    sq[0].val[0] = sq[0].val[1];
-    sq[1].val[0] = sq[1].val[1];
-    ma3[0].val[0] = ma3[0].val[1];
-    ma3[1].val[0] = ma3[1].val[1];
-    b3[0].val[0] = b3[0].val[1];
-    b3[1].val[0] = b3[1].val[1];
-    ma5.val[0] = ma5.val[1];
-    b5.val[0] = b5.val[1];
-    int16x8_t p[2][2];
-    const uint8x8_t s0 = vld1_u8(src0 + x);
-    const uint8x8_t s1 = vld1_u8(src0 + src_stride + x);
-    ma[0][0] = vld1q_u16(ma565[0] + x);
-    b[0][0].val[0] = vld1q_u32(b565[0] + x);
-    b[0][0].val[1] = vld1q_u32(b565[0] + x + 4);
-    p[0][0] = CalculateFilteredOutputPass1(s0, ma[0], b[0]);
-    p[1][0] = CalculateFilteredOutput<4>(s1, ma[0][1], b[0][1]);
-    ma[1][0] = vld1q_u16(ma343[0] + x);
-    ma[1][1] = vld1q_u16(ma444[0] + x);
-    b[1][0].val[0] = vld1q_u32(b343[0] + x);
-    b[1][0].val[1] = vld1q_u32(b343[0] + x + 4);
-    b[1][1].val[0] = vld1q_u32(b444[0] + x);
-    b[1][1].val[1] = vld1q_u32(b444[0] + x + 4);
-    p[0][1] = CalculateFilteredOutputPass2(s0, ma[1], b[1]);
-    ma[2][0] = vld1q_u16(ma343[1] + x);
-    b[2][0].val[0] = vld1q_u32(b343[1] + x);
-    b[2][0].val[1] = vld1q_u32(b343[1] + x + 4);
-    p[1][1] = CalculateFilteredOutputPass2(s1, ma[2], b[2]);
-    SelfGuidedDoubleMultiplier(s0, p[0], w0, w2, dst + x);
-    SelfGuidedDoubleMultiplier(s1, p[1], w0, w2, dst + dst_stride + x);
-    x += 8;
-  } while (x < width);
-}
-
-inline void BoxFilterLastRow(
-    const uint8_t* const src0, const uint8_t* const src, const int width,
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, const int width,
     const uint16_t scales[2], const int16_t w0, const int16_t w2,
     uint16_t* const sum3[4], uint16_t* const sum5[5],
     uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
     uint16_t* const ma343[4], uint16_t* const ma444[3],
     uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
     uint32_t* const b565[2], uint8_t* const dst) {
-  uint8x8x2_t s, ma3, ma5;
-  uint16x8x2_t sq, b3, b5;
-  uint16x8_t ma[3];
-  uint32x4x2_t b[3];
-  s.val[0] = vld1_u8(src);
-  sq.val[0] = vmull_u8(s.val[0], s.val[0]);
-  BoxFilterPreProcessLastRow(src, 0, scales, sum3, sum5, square_sum3,
-                             square_sum5, &s, &sq, &ma3.val[0], &ma5.val[0],
-                             &b3.val[0], &b5.val[0]);
+  uint8x16_t s[2][2], ma3[2][2], ma5[2];
+  uint16x8_t sq[2][4], b3[2][3], b5[3];
+  BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3,
+                        square_sum5, sq, ma3, b3, &ma5[0], &b5[0]);
 
   int x = 0;
   do {
-    s.val[0] = s.val[1];
-    sq.val[0] = sq.val[1];
-    BoxFilterPreProcessLastRow(src, x + 8, scales, sum3, sum5, square_sum3,
-                               square_sum5, &s, &sq, &ma3.val[1], &ma5.val[1],
-                               &b3.val[1], &b5.val[1]);
-    ma[1] = Sum565(ma5);
-    b[1] = Sum565W(b5);
-    ma5.val[0] = ma5.val[1];
-    b5.val[0] = b5.val[1];
-    ma[2] = Sum343(ma3);
-    b[2] = Sum343W(b3);
-    ma3.val[0] = ma3.val[1];
-    b3.val[0] = b3.val[1];
-    const uint8x8_t s0 = vld1_u8(src0 + x);
-    int16x8_t p[2];
-    ma[0] = vld1q_u16(ma565[0] + x);
-    b[0].val[0] = vld1q_u32(b565[0] + x + 0);
-    b[0].val[1] = vld1q_u32(b565[0] + x + 4);
-    p[0] = CalculateFilteredOutputPass1(s0, ma, b);
-    ma[0] = vld1q_u16(ma343[0] + x);
-    ma[1] = vld1q_u16(ma444[0] + x);
-    b[0].val[0] = vld1q_u32(b343[0] + x + 0);
-    b[0].val[1] = vld1q_u32(b343[0] + x + 4);
-    b[1].val[0] = vld1q_u32(b444[0] + x + 0);
-    b[1].val[1] = vld1q_u32(b444[0] + x + 4);
-    p[1] = CalculateFilteredOutputPass2(s0, ma, b);
-    SelfGuidedDoubleMultiplier(s0, p, w0, w2, dst + x);
-    x += 8;
+    uint16x8_t ma[3][3];
+    uint8x16_t ma3x[2][3], ma5x[3];
+    uint32x4x2_t b[3][3];
+    int16x8_t p[2][2];
+    BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3,
+                        square_sum5, sq, ma3, b3, ma5, b5 + 1);
+    Prepare3_8<0>(ma3[0], ma3x[0]);
+    Prepare3_8<0>(ma3[1], ma3x[1]);
+    Store343_444<0>(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
+                    ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444<0>(ma3x[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2],
+                    b343[3], b444[2]);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0][1] = Sum565<0>(ma5x);
+    b[0][1] = Sum565W(b5);
+    vst1q_u16(ma565[1] + x, ma[0][1]);
+    vst1q_u32(b565[1] + x, b[0][1].val[0]);
+    vst1q_u32(b565[1] + x + 4, b[0][1].val[1]);
+    const uint8x16_t sr0 = vld1q_u8(src + x);
+    const uint8x16_t sr1 = vld1q_u8(src + stride + x);
+    const uint8x8_t sr00 = vget_low_u8(sr0);
+    const uint8x8_t sr10 = vget_low_u8(sr1);
+    ma[0][0] = vld1q_u16(ma565[0] + x);
+    b[0][0].val[0] = vld1q_u32(b565[0] + x);
+    b[0][0].val[1] = vld1q_u32(b565[0] + x + 4);
+    p[0][0] = CalculateFilteredOutputPass1(sr00, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr10, ma[0][1], b[0][1]);
+    ma[1][0] = vld1q_u16(ma343[0] + x);
+    ma[1][1] = vld1q_u16(ma444[0] + x);
+    b[1][0].val[0] = vld1q_u32(b343[0] + x);
+    b[1][0].val[1] = vld1q_u32(b343[0] + x + 4);
+    b[1][1].val[0] = vld1q_u32(b444[0] + x);
+    b[1][1].val[1] = vld1q_u32(b444[0] + x + 4);
+    p[0][1] = CalculateFilteredOutputPass2(sr00, ma[1], b[1]);
+    ma[2][0] = vld1q_u16(ma343[1] + x);
+    b[2][0].val[0] = vld1q_u32(b343[1] + x);
+    b[2][0].val[1] = vld1q_u32(b343[1] + x + 4);
+    p[1][1] = CalculateFilteredOutputPass2(sr10, ma[2], b[2]);
+    const uint8x8_t d00 = SelfGuidedDoubleMultiplier(sr00, p[0], w0, w2);
+    const uint8x8_t d10 = SelfGuidedDoubleMultiplier(sr10, p[1], w0, w2);
+
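+    // Both passes again for the high half (columns x + 8 .. x + 15).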
+    Store343_444<8>(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], &b[1][2],
+                    &b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444<8>(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], &b[2][2], ma343[3],
+                    ma444[2], b343[3], b444[2]);
+    ma[0][1] = Sum565<8>(ma5x);
+    b[0][1] = Sum565W(b5 + 1);
+    vst1q_u16(ma565[1] + x + 8, ma[0][1]);
+    vst1q_u32(b565[1] + x + 8, b[0][1].val[0]);
+    vst1q_u32(b565[1] + x + 12, b[0][1].val[1]);
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    const uint8x8_t sr01 = vget_high_u8(sr0);
+    const uint8x8_t sr11 = vget_high_u8(sr1);
+    ma[0][0] = vld1q_u16(ma565[0] + x + 8);
+    b[0][0].val[0] = vld1q_u32(b565[0] + x + 8);
+    b[0][0].val[1] = vld1q_u32(b565[0] + x + 12);
+    p[0][0] = CalculateFilteredOutputPass1(sr01, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr11, ma[0][1], b[0][1]);
+    ma[1][0] = vld1q_u16(ma343[0] + x + 8);
+    ma[1][1] = vld1q_u16(ma444[0] + x + 8);
+    b[1][0].val[0] = vld1q_u32(b343[0] + x + 8);
+    b[1][0].val[1] = vld1q_u32(b343[0] + x + 12);
+    b[1][1].val[0] = vld1q_u32(b444[0] + x + 8);
+    b[1][1].val[1] = vld1q_u32(b444[0] + x + 12);
+    p[0][1] = CalculateFilteredOutputPass2(sr01, ma[1], b[1]);
+    ma[2][0] = vld1q_u16(ma343[1] + x + 8);
+    b[2][0].val[0] = vld1q_u32(b343[1] + x + 8);
+    b[2][0].val[1] = vld1q_u32(b343[1] + x + 12);
+    p[1][1] = CalculateFilteredOutputPass2(sr11, ma[2], b[2]);
+    const uint8x8_t d01 = SelfGuidedDoubleMultiplier(sr01, p[0], w0, w2);
+    const uint8x8_t d11 = SelfGuidedDoubleMultiplier(sr11, p[1], w0, w2);
+    vst1q_u8(dst + x, vcombine_u8(d00, d01));
+    vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    x += 16;
   } while (x < width);
 }
 
-template <typename T>
-void Circulate3PointersBy1(T* p[3]) {
-  T* const p0 = p[0];
-  p[0] = p[1];
-  p[1] = p[2];
-  p[2] = p0;
+inline void BoxFilterLastRow(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+    uint8_t* const dst) {
+  uint8x16_t s[2], ma3[2], ma5[2];
+  uint16x8_t sq[4], ma[3], b3[3], b5[3];
+  uint32x4x2_t b[3];
+  BoxFilterPreProcessLastRowLo(src0, scales, sum3, sum5, square_sum3,
+                               square_sum5, &s[0], sq, &ma3[0], &ma5[0], &b3[0],
+                               &b5[0]);
+
+  int x = 0;
+  do {
+    uint8x16_t ma3x[3], ma5x[3];
+    int16x8_t p[2];
+    BoxFilterPreProcessLastRow(src0, x + 8, scales, sum3, sum5, square_sum3,
+                               square_sum5, s, sq + 1, ma3, ma5, &b3[1],
+                               &b5[1]);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[1] = Sum565<0>(ma5x);
+    b[1] = Sum565W(b5);
+    Prepare3_8<0>(ma3, ma3x);
+    ma[2] = Sum343<0>(ma3x);
+    b[2] = Sum343W(b3);
+    const uint8x16_t sr = vld1q_u8(src + x);
+    const uint8x8_t sr0 = vget_low_u8(sr);
+    ma[0] = vld1q_u16(ma565 + x);
+    b[0].val[0] = vld1q_u32(b565 + x + 0);
+    b[0].val[1] = vld1q_u32(b565 + x + 4);
+    p[0] = CalculateFilteredOutputPass1(sr0, ma, b);
+    ma[0] = vld1q_u16(ma343 + x);
+    ma[1] = vld1q_u16(ma444 + x);
+    b[0].val[0] = vld1q_u32(b343 + x + 0);
+    b[0].val[1] = vld1q_u32(b343 + x + 4);
+    b[1].val[0] = vld1q_u32(b444 + x + 0);
+    b[1].val[1] = vld1q_u32(b444 + x + 4);
+    p[1] = CalculateFilteredOutputPass2(sr0, ma, b);
+    const uint8x8_t d0 = SelfGuidedDoubleMultiplier(sr0, p, w0, w2);
+
+    ma[1] = Sum565<8>(ma5x);
+    b[1] = Sum565W(b5 + 1);
+    b5[0] = b5[2];
+    ma[2] = Sum343<8>(ma3x);
+    b[2] = Sum343W(b3 + 1);
+    b3[0] = b3[2];
+    const uint8x8_t sr1 = vget_high_u8(sr);
+    ma[0] = vld1q_u16(ma565 + x + 8);
+    b[0].val[0] = vld1q_u32(b565 + x + 8);
+    b[0].val[1] = vld1q_u32(b565 + x + 12);
+    p[0] = CalculateFilteredOutputPass1(sr1, ma, b);
+    ma[0] = vld1q_u16(ma343 + x + 8);
+    ma[1] = vld1q_u16(ma444 + x + 8);
+    b[0].val[0] = vld1q_u32(b343 + x + 8);
+    b[0].val[1] = vld1q_u32(b343 + x + 12);
+    b[1].val[0] = vld1q_u32(b444 + x + 8);
+    b[1].val[1] = vld1q_u32(b444 + x + 12);
+    p[1] = CalculateFilteredOutputPass2(sr1, ma, b);
+    const uint8x8_t d1 = SelfGuidedDoubleMultiplier(sr1, p, w0, w2);
+    vst1q_u8(dst + x, vcombine_u8(d0, d1));
+    s[0] = s[1];
+    sq[1] = sq[3];
+    ma3[0] = ma3[1];
+    ma5[0] = ma5[1];
+    x += 16;
+  } while (x < width);
 }
 
-template <typename T>
-void Circulate4PointersBy2(T* p[4]) {
-  std::swap(p[0], p[2]);
-  std::swap(p[1], p[3]);
-}
-
-template <typename T>
-void Circulate5PointersBy2(T* p[5]) {
-  T* const p0 = p[0];
-  T* const p1 = p[1];
-  p[0] = p[2];
-  p[1] = p[3];
-  p[2] = p[4];
-  p[3] = p0;
-  p[4] = p1;
-}
-
-inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
-                             const uint8_t* src, const ptrdiff_t src_stride,
-                             const int width, const int height,
-                             SgrBuffer* const sgr_buffer, uint8_t* dst,
-                             const ptrdiff_t dst_stride) {
-  const auto temp_stride = Align<ptrdiff_t>(width, 8);
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint8_t* src,
+    const ptrdiff_t stride, const uint8_t* const top_border,
+    const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
   const ptrdiff_t sum_stride = temp_stride + 8;
   const int sgr_proj_index = restoration_info.sgr_proj_info.index;
   const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
@@ -1643,25 +2173,27 @@
   b565[1] = b565[0] + temp_stride;
   assert(scales[0] != 0);
   assert(scales[1] != 0);
-  BoxSum(src - 2 * src_stride - 3, src_stride, 2, sum_stride, sum3[0], sum5[1],
+  BoxSum(top_border, top_border_stride, sum_stride, sum3[0], sum5[1],
          square_sum3[0], square_sum5[1]);
   sum5[0] = sum5[1];
   square_sum5[0] = square_sum5[1];
-  BoxSumFilterPreProcess(src - 3, src_stride, width, scales, sum3, sum5,
-                         square_sum3, square_sum5, ma343, ma444, ma565[0], b343,
-                         b444, b565[0]);
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, ma343, ma444[0], ma565[0], b343, b444[0],
+                         b565[0]);
   sum5[0] = sgr_buffer->sum5;
   square_sum5[0] = sgr_buffer->square_sum5;
-  for (int y = height >> 1; y != 0; --y) {
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
     Circulate4PointersBy2<uint16_t>(sum3);
     Circulate4PointersBy2<uint32_t>(square_sum3);
     Circulate5PointersBy2<uint16_t>(sum5);
     Circulate5PointersBy2<uint32_t>(square_sum5);
-    BoxFilter(src, src + 2 * src_stride - 3, src_stride, width, scales, w0, w2,
-              sum3, sum5, square_sum3, square_sum5, ma343, ma444, ma565, b343,
-              b444, b565, dst, dst_stride);
-    src += 2 * src_stride;
-    dst += 2 * dst_stride;
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, ma343,
+              ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
     Circulate4PointersBy2<uint16_t>(ma343);
     Circulate4PointersBy2<uint32_t>(b343);
     std::swap(ma444[0], ma444[2]);
@@ -1669,23 +2201,55 @@
     std::swap(ma565[0], ma565[1]);
     std::swap(b565[0], b565[1]);
   }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, ma343, ma444, ma565, b343, b444, b565,
+              dst);
+  }
   if ((height & 1) != 0) {
-    Circulate4PointersBy2<uint16_t>(sum3);
-    Circulate4PointersBy2<uint32_t>(square_sum3);
-    Circulate5PointersBy2<uint16_t>(sum5);
-    Circulate5PointersBy2<uint32_t>(square_sum5);
-    BoxFilterLastRow(src, src + 2 * src_stride - 3, width, scales, w0, w2, sum3,
-                     sum5, square_sum3, square_sum5, ma343, ma444, ma565, b343,
-                     b444, b565, dst);
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+                     scales, w0, w2, sum3, sum5, square_sum3, square_sum5,
+                     ma343[0], ma444[0], ma565[0], b343[0], b444[0], b565[0],
+                     dst);
   }
 }
 
 inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
-                                  const uint8_t* src,
-                                  const ptrdiff_t src_stride, const int width,
-                                  const int height, SgrBuffer* const sgr_buffer,
-                                  uint8_t* dst, const ptrdiff_t dst_stride) {
-  const auto temp_stride = Align<ptrdiff_t>(width, 8);
+                                  const uint8_t* src, const ptrdiff_t stride,
+                                  const uint8_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
   const ptrdiff_t sum_stride = temp_stride + 8;
   const int sgr_proj_index = restoration_info.sgr_proj_info.index;
   const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
@@ -1703,39 +2267,64 @@
   b565[0] = sgr_buffer->b565;
   b565[1] = b565[0] + temp_stride;
   assert(scale != 0);
-  BoxSum<5>(src - 2 * src_stride - 3, src_stride, 2, sum_stride, sum5[1],
-            square_sum5[1]);
+  BoxSum<5>(top_border, top_border_stride, sum_stride, sum5[1], square_sum5[1]);
   sum5[0] = sum5[1];
   square_sum5[0] = square_sum5[1];
-  BoxSumFilterPreProcess5(src - 3, src_stride, width, scale, sum5, square_sum5,
-                          ma565[0], b565[0]);
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, ma565[0],
+                          b565[0]);
   sum5[0] = sgr_buffer->sum5;
   square_sum5[0] = sgr_buffer->square_sum5;
-  for (int y = height >> 1; y != 0; --y) {
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
     Circulate5PointersBy2<uint16_t>(sum5);
     Circulate5PointersBy2<uint32_t>(square_sum5);
-    BoxFilterPass1(src, src + 2 * src_stride - 3, src_stride, sum5, square_sum5,
-                   width, scale, w0, ma565, b565, dst, dst_stride);
-    src += 2 * src_stride;
-    dst += 2 * dst_stride;
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
     std::swap(ma565[0], ma565[1]);
     std::swap(b565[0], b565[1]);
   }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   scale, w0, ma565, b565, dst);
+  }
   if ((height & 1) != 0) {
-    Circulate5PointersBy2<uint16_t>(sum5);
-    Circulate5PointersBy2<uint32_t>(square_sum5);
-    BoxFilterPass1LastRow(src, src + 2 * src_stride - 3, width, scale, w0, sum5,
-                          square_sum5, ma565[0], b565[0], dst);
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxFilterPass1LastRow(src + 3, bottom_border + bottom_border_stride, width,
+                          scale, w0, sum5, square_sum5, ma565[0], b565[0], dst);
   }
 }
 
 inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
-                                  const uint8_t* src,
-                                  const ptrdiff_t src_stride, const int width,
-                                  const int height, SgrBuffer* const sgr_buffer,
-                                  uint8_t* dst, const ptrdiff_t dst_stride) {
+                                  const uint8_t* src, const ptrdiff_t stride,
+                                  const uint8_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
   assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
-  const auto temp_stride = Align<ptrdiff_t>(width, 8);
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
   const ptrdiff_t sum_stride = temp_stride + 8;
   const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
   const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
@@ -1758,24 +2347,44 @@
   b444[0] = sgr_buffer->b444;
   b444[1] = b444[0] + temp_stride;
   assert(scale != 0);
-  BoxSum<3>(src - 2 * src_stride - 2, src_stride, 2, sum_stride, sum3[0],
-            square_sum3[0]);
-  BoxSumFilterPreProcess3<false>(src - 2, width, scale, sum3, square_sum3,
-                                 ma343[0], nullptr, b343[0], nullptr);
+  BoxSum<3>(top_border, top_border_stride, sum_stride, sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, ma343[0],
+                                 nullptr, b343[0], nullptr);
   Circulate3PointersBy1<uint16_t>(sum3);
   Circulate3PointersBy1<uint32_t>(square_sum3);
-  BoxSumFilterPreProcess3<true>(src + src_stride - 2, width, scale, sum3,
-                                square_sum3, ma343[1], ma444[0], b343[1],
-                                b444[0]);
+  const uint8_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, ma343[1],
+                                ma444[0], b343[1], b444[0]);
 
-  int y = height;
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  src += 2;
+  int y = std::min(height, 2);
   do {
     Circulate3PointersBy1<uint16_t>(sum3);
     Circulate3PointersBy1<uint32_t>(square_sum3);
-    BoxFilterPass2(src, src + 2 * src_stride - 2, width, scale, w0, sum3,
-                   square_sum3, ma343, ma444, b343, b444, dst);
-    src += src_stride;
-    dst += dst_stride;
+    BoxFilterPass2(src, bottom_border, width, scale, w0, sum3, square_sum3,
+                   ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
     Circulate3PointersBy1<uint16_t>(ma343);
     Circulate3PointersBy1<uint32_t>(b343);
     std::swap(ma444[0], ma444[1]);
@@ -1786,30 +2395,35 @@
 // If |width| is not a multiple of 8, up to 7 more pixels are written to |dest|
 // at the end of each row. It is safe to overwrite the output as it will not be
 // part of the visible frame.
-void SelfGuidedFilter_NEON(const void* const source, void* const dest,
-                           const RestorationUnitInfo& restoration_info,
-                           const ptrdiff_t source_stride,
-                           const ptrdiff_t dest_stride, const int width,
-                           const int height,
-                           RestorationBuffer* const restoration_buffer) {
+void SelfGuidedFilter_NEON(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const ptrdiff_t stride, const void* const top_border,
+    const ptrdiff_t top_border_stride, const void* const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
   const int index = restoration_info.sgr_proj_info.index;
   const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
   const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
   const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* top = static_cast<const uint8_t*>(top_border);
+  const auto* bottom = static_cast<const uint8_t*>(bottom_border);
   auto* const dst = static_cast<uint8_t*>(dest);
   SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
   if (radius_pass_1 == 0) {
     // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
     // following assertion.
     assert(radius_pass_0 != 0);
-    BoxFilterProcessPass1(restoration_info, src, source_stride, width, height,
-                          sgr_buffer, dst, dest_stride);
+    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+                          top_border_stride, bottom - 3, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
   } else if (radius_pass_0 == 0) {
-    BoxFilterProcessPass2(restoration_info, src, source_stride, width, height,
-                          sgr_buffer, dst, dest_stride);
+    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+                          top_border_stride, bottom - 2, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
   } else {
-    BoxFilterProcess(restoration_info, src, source_stride, width, height,
-                     sgr_buffer, dst, dest_stride);
+    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+                     top_border_stride, bottom - 3, bottom_border_stride, width,
+                     height, sgr_buffer, dst);
   }
 }
 
@@ -1828,7 +2442,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {
 
diff --git a/libgav1/src/dsp/arm/mask_blend_neon.cc b/libgav1/src/dsp/arm/mask_blend_neon.cc
index 21f3fb1..ee50923 100644
--- a/libgav1/src/dsp/arm/mask_blend_neon.cc
+++ b/libgav1/src/dsp/arm/mask_blend_neon.cc
@@ -84,20 +84,19 @@
                                   const int16x8_t pred_mask_0,
                                   const int16x8_t pred_mask_1, uint8_t* dst,
                                   const ptrdiff_t dst_stride) {
-  const int16x4_t pred_val_0_lo = vld1_s16(pred_0);
-  const int16x4_t pred_val_0_hi = vld1_s16(pred_0 + 4);
-  const int16x4_t pred_val_1_lo = vld1_s16(pred_1);
-  const int16x4_t pred_val_1_hi = vld1_s16(pred_1 + 4);
+  const int16x8_t pred_val_0 = vld1q_s16(pred_0);
+  const int16x8_t pred_val_1 = vld1q_s16(pred_1);
   // int res = (mask_value * prediction_0[x] +
   //      (64 - mask_value) * prediction_1[x]) >> 6;
   const int32x4_t weighted_pred_0_lo =
-      vmull_s16(vget_low_s16(pred_mask_0), pred_val_0_lo);
+      vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
   const int32x4_t weighted_pred_0_hi =
-      vmull_s16(vget_high_s16(pred_mask_0), pred_val_0_hi);
-  const int32x4_t weighted_combo_lo =
-      vmlal_s16(weighted_pred_0_lo, vget_low_s16(pred_mask_1), pred_val_1_lo);
+      vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+  const int32x4_t weighted_combo_lo = vmlal_s16(
+      weighted_pred_0_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1));
   const int32x4_t weighted_combo_hi =
-      vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1), pred_val_1_hi);
+      vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1),
+                vget_high_s16(pred_val_1));
   // dst[x] = static_cast<Pixel>(
   //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
   //         (1 << kBitdepth8) - 1));
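
The comments in the hunk above spell out the arithmetic the NEON intrinsics compute. For readers following along, a minimal scalar sketch of that per-pixel blend is given below; MaskBlendPixel and its post_round_bits parameter are illustrative stand-ins (post_round_bits plays the role of inter_post_round_bits), not libgav1 API.

```c++
#include <algorithm>
#include <cstdint>

// Scalar sketch of the blend described in the comments above; assumes
// post_round_bits >= 1 and 8-bit output. Not the library's implementation.
uint8_t MaskBlendPixel(int16_t pred_0, int16_t pred_1, int mask_value,
                       int post_round_bits) {
  // res = (mask * pred_0 + (64 - mask) * pred_1) >> 6, with mask in [0, 64].
  const int res = (mask_value * pred_0 + (64 - mask_value) * pred_1) >> 6;
  // RightShiftWithRounding(res, post_round_bits), then clip to [0, 255].
  const int rounded = (res + (1 << (post_round_bits - 1))) >> post_round_bits;
  return static_cast<uint8_t>(std::min(std::max(rounded, 0), 255));
}
```
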
@@ -433,7 +432,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 
 namespace libgav1 {
 namespace dsp {
diff --git a/libgav1/src/dsp/arm/motion_field_projection_neon.cc b/libgav1/src/dsp/arm/motion_field_projection_neon.cc
index 8caba7d..3e731b2 100644
--- a/libgav1/src/dsp/arm/motion_field_projection_neon.cc
+++ b/libgav1/src/dsp/arm/motion_field_projection_neon.cc
@@ -382,7 +382,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {
 
diff --git a/libgav1/src/dsp/arm/motion_vector_search_neon.cc b/libgav1/src/dsp/arm/motion_vector_search_neon.cc
index 8a403a6..da3ba17 100644
--- a/libgav1/src/dsp/arm/motion_vector_search_neon.cc
+++ b/libgav1/src/dsp/arm/motion_vector_search_neon.cc
@@ -256,7 +256,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {
 
diff --git a/libgav1/src/dsp/arm/obmc_neon.cc b/libgav1/src/dsp/arm/obmc_neon.cc
index 66ad663..1111a90 100644
--- a/libgav1/src/dsp/arm/obmc_neon.cc
+++ b/libgav1/src/dsp/arm/obmc_neon.cc
@@ -380,7 +380,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 
 namespace libgav1 {
 namespace dsp {
diff --git a/libgav1/src/dsp/arm/super_res_neon.cc b/libgav1/src/dsp/arm/super_res_neon.cc
index d77b9c7..91537c4 100644
--- a/libgav1/src/dsp/arm/super_res_neon.cc
+++ b/libgav1/src/dsp/arm/super_res_neon.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "src/dsp/arm/common_neon.h"
 #include "src/dsp/super_res.h"
 #include "src/utils/cpu.h"
 
@@ -20,8 +19,10 @@
 
 #include <arm_neon.h>
 
+#include "src/dsp/arm/common_neon.h"
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
+#include "src/utils/common.h"
 #include "src/utils/constants.h"
 
 namespace libgav1 {
@@ -30,57 +31,265 @@
 namespace low_bitdepth {
 namespace {
 
-void ComputeSuperRes_NEON(const void* source, const int upscaled_width,
-                          const int initial_subpixel_x, const int step,
-                          void* const dest) {
-  const auto* src = static_cast<const uint8_t*>(source);
-  auto* dst = static_cast<uint8_t*>(dest);
-  src -= kSuperResFilterTaps >> 1;
-
-  int p = initial_subpixel_x;
-  uint16x8_t weighted_src[8];
-  for (int x = 0; x < upscaled_width; x += 8) {
-    for (int i = 0; i < kSuperResFilterTaps; ++i, p += step) {
-      const uint8x8_t src_x = vld1_u8(&src[p >> kSuperResScaleBits]);
-      const int remainder = p & kSuperResScaleMask;
-      const uint8x8_t filter =
-          vld1_u8(kUpscaleFilterUnsigned[remainder >> kSuperResExtraBits]);
-      weighted_src[i] = vmull_u8(src_x, filter);
+void SuperResCoefficients_NEON(const int upscaled_width,
+                               const int initial_subpixel_x, const int step,
+                               void* const coefficients) {
+  auto* dst = static_cast<uint8_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 3);
+  do {
+    uint8x8_t filter[8];
+    uint8x16_t d[kSuperResFilterTaps / 2];
+    for (int i = 0; i < 8; ++i, subpixel_x += step) {
+      filter[i] =
+          vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+                                         kSuperResExtraBits]);
     }
-    Transpose8x8(weighted_src);
+    Transpose8x8(filter, d);
+    vst1q_u8(dst, d[0]);
+    dst += 16;
+    vst1q_u8(dst, d[1]);
+    dst += 16;
+    vst1q_u8(dst, d[2]);
+    dst += 16;
+    vst1q_u8(dst, d[3]);
+    dst += 16;
+  } while (--x != 0);
+}
 
-    // Maximum sum of positive taps: 171 = 7 + 86 + 71 + 7
-    // Maximum sum: 255*171 == 0xAA55
-    // The sum is clipped to [0, 255], so adding all positive and then
-    // subtracting all negative with saturation is sufficient.
-    //           0 1 2 3 4 5 6 7
-    // tap sign: - + - + + - + -
-    uint16x8_t res = weighted_src[1];
-    res = vaddq_u16(res, weighted_src[3]);
-    res = vaddq_u16(res, weighted_src[4]);
-    res = vaddq_u16(res, weighted_src[6]);
-    res = vqsubq_u16(res, weighted_src[0]);
-    res = vqsubq_u16(res, weighted_src[2]);
-    res = vqsubq_u16(res, weighted_src[5]);
-    res = vqsubq_u16(res, weighted_src[7]);
-    vst1_u8(&dst[x], vqrshrn_n_u16(res, kFilterBits));
+// Maximum sum of positive taps: 171 = 7 + 86 + 71 + 7
+// Maximum sum: 255*171 == 0xAA55
+// The sum is clipped to [0, 255], so adding all positive and then
+// subtracting all negative with saturation is sufficient.
+//           0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
+                          const uint8_t** coefficients) {
+  uint8x16_t f[kSuperResFilterTaps / 2];
+  for (int i = 0; i < kSuperResFilterTaps / 2; ++i, *coefficients += 16) {
+    f[i] = vld1q_u8(*coefficients);
   }
+  uint16x8_t res = vmull_u8(src[1], vget_high_u8(f[0]));
+  res = vmlal_u8(res, src[3], vget_high_u8(f[1]));
+  res = vmlal_u8(res, src[4], vget_low_u8(f[2]));
+  res = vmlal_u8(res, src[6], vget_low_u8(f[3]));
+  uint16x8_t temp = vmull_u8(src[0], vget_low_u8(f[0]));
+  temp = vmlal_u8(temp, src[2], vget_low_u8(f[1]));
+  temp = vmlal_u8(temp, src[5], vget_high_u8(f[2]));
+  temp = vmlal_u8(temp, src[7], vget_high_u8(f[3]));
+  res = vqsubq_u16(res, temp);
+  return vqrshrn_n_u16(res, kFilterBits);
+}
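
The comment above SuperRes() explains why saturating arithmetic alone is enough to clip the 8bpp result. Below is a minimal scalar sketch of the same computation, assuming kFilterBits == 7 and the unsigned tap magnitudes from kUpscaleFilterUnsigned; SuperResScalar is an illustrative helper, not a libgav1 function.

```c++
#include <algorithm>
#include <cstdint>

// Scalar sketch of SuperRes() above: accumulate the positive-tap products,
// remove the negative-tap products with a saturating (clamped-at-zero)
// subtraction, then apply a rounding shift and clip to 255.
uint8_t SuperResScalar(const uint8_t src[8], const uint8_t taps[8]) {
  //           0 1 2 3 4 5 6 7
  // tap sign: - + - + + - + -
  const uint32_t positive =
      src[1] * taps[1] + src[3] * taps[3] + src[4] * taps[4] + src[6] * taps[6];
  const uint32_t negative =
      src[0] * taps[0] + src[2] * taps[2] + src[5] * taps[5] + src[7] * taps[7];
  const uint32_t clamped = (positive > negative) ? positive - negative : 0;
  const uint32_t rounded = (clamped + 64) >> 7;  // kFilterBits == 7 assumed.
  return static_cast<uint8_t>(std::min<uint32_t>(rounded, 255));
}
```
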
+
+void SuperRes_NEON(const void* const coefficients, void* const source,
+                   const ptrdiff_t source_stride, const int height,
+                   const int downscaled_width, const int upscaled_width,
+                   const int initial_subpixel_x, const int step,
+                   void* const dest, const ptrdiff_t dest_stride) {
+  auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint8_t*>(coefficients);
+    uint8_t* dst_ptr = dst;
+    ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                        kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+    int subpixel_x = initial_subpixel_x;
+    uint8x8_t sr[8];
+    uint8x16_t s[8];
+    int x = RightShiftWithCeiling(upscaled_width, 4);
+    // The code below calculates up to 15 extra upscaled pixels, which will
+    // over-read up to 15 downscaled pixels at the end of each row.
+    // kSuperResHorizontalPadding accounts for this.
+    do {
+      for (int i = 0; i < 8; ++i, subpixel_x += step) {
+        sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
+      }
+      for (int i = 0; i < 8; ++i, subpixel_x += step) {
+        const uint8x8_t s_hi = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
+        s[i] = vcombine_u8(sr[i], s_hi);
+      }
+      Transpose8x16(s);
+      // Do not use a loop for the following 8 instructions, since the
+      // compiler will generate redundant code.
+      sr[0] = vget_low_u8(s[0]);
+      sr[1] = vget_low_u8(s[1]);
+      sr[2] = vget_low_u8(s[2]);
+      sr[3] = vget_low_u8(s[3]);
+      sr[4] = vget_low_u8(s[4]);
+      sr[5] = vget_low_u8(s[5]);
+      sr[6] = vget_low_u8(s[6]);
+      sr[7] = vget_low_u8(s[7]);
+      const uint8x8_t d0 = SuperRes(sr, &filter);
+      // Do not use a loop for the following 8 instructions, since the
+      // compiler will generate redundant code.
+      sr[0] = vget_high_u8(s[0]);
+      sr[1] = vget_high_u8(s[1]);
+      sr[2] = vget_high_u8(s[2]);
+      sr[3] = vget_high_u8(s[3]);
+      sr[4] = vget_high_u8(s[4]);
+      sr[5] = vget_high_u8(s[5]);
+      sr[6] = vget_high_u8(s[6]);
+      sr[7] = vget_high_u8(s[7]);
+      const uint8x8_t d1 = SuperRes(sr, &filter);
+      vst1q_u8(dst_ptr, vcombine_u8(d0, d1));
+      dst_ptr += 16;
+    } while (--x != 0);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
 }
 
 void Init8bpp() {
   Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
-  dsp->super_res_row = ComputeSuperRes_NEON;
+  dsp->super_res_coefficients = SuperResCoefficients_NEON;
+  dsp->super_res = SuperRes_NEON;
 }
 
 }  // namespace
 }  // namespace low_bitdepth
 
-void SuperResInit_NEON() { low_bitdepth::Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
 
+void SuperResCoefficients_NEON(const int upscaled_width,
+                               const int initial_subpixel_x, const int step,
+                               void* const coefficients) {
+  auto* dst = static_cast<uint16_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 3);
+  do {
+    uint16x8_t filter[8];
+    for (int i = 0; i < 8; ++i, subpixel_x += step) {
+      const uint8x8_t filter_8 =
+          vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+                                         kSuperResExtraBits]);
+      // uint8_t -> uint16_t
+      filter[i] = vmovl_u8(filter_8);
+    }
+
+    Transpose8x8(filter);
+
+    vst1q_u16(dst, filter[0]);
+    dst += 8;
+    vst1q_u16(dst, filter[1]);
+    dst += 8;
+    vst1q_u16(dst, filter[2]);
+    dst += 8;
+    vst1q_u16(dst, filter[3]);
+    dst += 8;
+    vst1q_u16(dst, filter[4]);
+    dst += 8;
+    vst1q_u16(dst, filter[5]);
+    dst += 8;
+    vst1q_u16(dst, filter[6]);
+    dst += 8;
+    vst1q_u16(dst, filter[7]);
+    dst += 8;
+  } while (--x != 0);
+}
+
+// The sum is clipped to [0, (1 << bitdepth) - 1]. Adding all positive taps and
+// then subtracting all negative taps with saturation clips the result at zero.
+//           0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps],
+                           const uint16_t** coefficients, int bitdepth) {
+  uint16x8_t f[kSuperResFilterTaps];
+  for (int i = 0; i < kSuperResFilterTaps; ++i, *coefficients += 8) {
+    f[i] = vld1q_u16(*coefficients);
+  }
+
+  uint32x4_t res_lo = vmull_u16(vget_low_u16(src[1]), vget_low_u16(f[1]));
+  res_lo = vmlal_u16(res_lo, vget_low_u16(src[3]), vget_low_u16(f[3]));
+  res_lo = vmlal_u16(res_lo, vget_low_u16(src[4]), vget_low_u16(f[4]));
+  res_lo = vmlal_u16(res_lo, vget_low_u16(src[6]), vget_low_u16(f[6]));
+
+  uint32x4_t temp_lo = vmull_u16(vget_low_u16(src[0]), vget_low_u16(f[0]));
+  temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[2]), vget_low_u16(f[2]));
+  temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[5]), vget_low_u16(f[5]));
+  temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[7]), vget_low_u16(f[7]));
+
+  res_lo = vqsubq_u32(res_lo, temp_lo);
+
+  uint32x4_t res_hi = vmull_u16(vget_high_u16(src[1]), vget_high_u16(f[1]));
+  res_hi = vmlal_u16(res_hi, vget_high_u16(src[3]), vget_high_u16(f[3]));
+  res_hi = vmlal_u16(res_hi, vget_high_u16(src[4]), vget_high_u16(f[4]));
+  res_hi = vmlal_u16(res_hi, vget_high_u16(src[6]), vget_high_u16(f[6]));
+
+  uint32x4_t temp_hi = vmull_u16(vget_high_u16(src[0]), vget_high_u16(f[0]));
+  temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[2]), vget_high_u16(f[2]));
+  temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[5]), vget_high_u16(f[5]));
+  temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[7]), vget_high_u16(f[7]));
+
+  res_hi = vqsubq_u32(res_hi, temp_hi);
+
+  const uint16x8_t res = vcombine_u16(vqrshrn_n_u32(res_lo, kFilterBits),
+                                      vqrshrn_n_u32(res_hi, kFilterBits));
+
+  // Clip the result at (1 << bd) - 1.
+  return vminq_u16(res, vdupq_n_u16((1 << bitdepth) - 1));
+}
+
+template <int bitdepth>
+void SuperRes_NEON(const void* const coefficients, void* const source,
+                   const ptrdiff_t source_stride, const int height,
+                   const int downscaled_width, const int upscaled_width,
+                   const int initial_subpixel_x, const int step,
+                   void* const dest, const ptrdiff_t dest_stride) {
+  auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint16_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint16_t*>(coefficients);
+    uint16_t* dst_ptr = dst;
+    ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                         kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+    int subpixel_x = initial_subpixel_x;
+    uint16x8_t sr[8];
+    int x = RightShiftWithCeiling(upscaled_width, 3);
+    // The code below calculates up to 7 extra upscaled pixels, which will
+    // over-read up to 7 downscaled pixels at the end of each row.
+    // kSuperResHorizontalBorder accounts for this.
+    do {
+      for (int i = 0; i < 8; ++i, subpixel_x += step) {
+        sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]);
+      }
+
+      Transpose8x8(sr);
+
+      const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth);
+      vst1q_u16(dst_ptr, d0);
+      dst_ptr += 8;
+    } while (--x != 0);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
+}
+
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  dsp->super_res_coefficients = SuperResCoefficients_NEON;
+  dsp->super_res = SuperRes_NEON<10>;
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void SuperResInit_NEON() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 
 namespace libgav1 {
 namespace dsp {
diff --git a/libgav1/src/dsp/arm/super_res_neon.h b/libgav1/src/dsp/arm/super_res_neon.h
index f51785d..65e48c5 100644
--- a/libgav1/src/dsp/arm/super_res_neon.h
+++ b/libgav1/src/dsp/arm/super_res_neon.h
@@ -31,7 +31,10 @@
 
 #if LIBGAV1_ENABLE_NEON
 #define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_SuperResClip LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_NEON
 #endif  // LIBGAV1_ENABLE_NEON
 
 #endif  // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
diff --git a/libgav1/src/dsp/arm/warp_neon.cc b/libgav1/src/dsp/arm/warp_neon.cc
index 7a41998..c7fb739 100644
--- a/libgav1/src/dsp/arm/warp_neon.cc
+++ b/libgav1/src/dsp/arm/warp_neon.cc
@@ -289,7 +289,7 @@
             const int16x8_t sum = vld1q_s16(tmp);
             vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
           }
-#else  // !defined(__aarch64__)
+#else   // !defined(__aarch64__)
           int16x8_t filter[8];
           for (int x = 0; x < 8; ++x) {
             const int offset =
@@ -442,7 +442,7 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 namespace libgav1 {
 namespace dsp {
 
diff --git a/libgav1/src/dsp/arm/weight_mask_neon.cc b/libgav1/src/dsp/arm/weight_mask_neon.cc
index 49d3be0..7e5bff0 100644
--- a/libgav1/src/dsp/arm/weight_mask_neon.cc
+++ b/libgav1/src/dsp/arm/weight_mask_neon.cc
@@ -451,7 +451,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_NEON
+#else   // !LIBGAV1_ENABLE_NEON
 
 namespace libgav1 {
 namespace dsp {
diff --git a/libgav1/src/dsp/average_blend.cc b/libgav1/src/dsp/average_blend.cc
index a59abb0..d3ec21f 100644
--- a/libgav1/src/dsp/average_blend.cc
+++ b/libgav1/src/dsp/average_blend.cc
@@ -76,9 +76,7 @@
   Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
   assert(dsp != nullptr);
 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
-#ifndef LIBGAV1_Dsp10bpp_AverageBlend
   dsp->average_blend = AverageBlend_C<10, uint16_t>;
-#endif
 #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
   static_cast<void>(dsp);
 #ifndef LIBGAV1_Dsp10bpp_AverageBlend
diff --git a/libgav1/src/dsp/cdef.cc b/libgav1/src/dsp/cdef.cc
index 95e5a4a..0b50517 100644
--- a/libgav1/src/dsp/cdef.cc
+++ b/libgav1/src/dsp/cdef.cc
@@ -41,7 +41,7 @@
 
 template <int bitdepth, typename Pixel>
 void CdefDirection_C(const void* const source, ptrdiff_t stride,
-                     int* const direction, int* const variance) {
+                     uint8_t* const direction, int* const variance) {
   assert(direction != nullptr);
   assert(variance != nullptr);
   const auto* src = static_cast<const Pixel*>(source);
diff --git a/libgav1/src/dsp/cdef.h b/libgav1/src/dsp/cdef.h
index 2d70d2c..b820b77 100644
--- a/libgav1/src/dsp/cdef.h
+++ b/libgav1/src/dsp/cdef.h
@@ -30,6 +30,7 @@
 // The order of includes is important as each tests for a superior version
 // before setting the base.
 // clang-format off
+#include "src/dsp/x86/cdef_avx2.h"
 #include "src/dsp/x86/cdef_sse4.h"
 // clang-format on
 // IWYU pragma: end_exports
diff --git a/libgav1/src/dsp/common.h b/libgav1/src/dsp/common.h
index 8ce3211..d614a81 100644
--- a/libgav1/src/dsp/common.h
+++ b/libgav1/src/dsp/common.h
@@ -25,7 +25,7 @@
 
 namespace libgav1 {
 
-enum { kSgrStride = kRestorationUnitWidth + 8 };  // anonymous enum
+enum { kSgrStride = kRestorationUnitWidth + 32 };  // anonymous enum
 
 // Self guided projection filter.
 struct SgrProjInfo {
@@ -57,8 +57,9 @@
   alignas(kMaxAlignment) uint32_t b343[4 * kRestorationUnitWidth];
   alignas(kMaxAlignment) uint32_t b444[3 * kRestorationUnitWidth];
   alignas(kMaxAlignment) uint32_t b565[2 * kRestorationUnitWidth];
-  alignas(kMaxAlignment) uint16_t
-      temp_buffer[12 * (kRestorationUnitHeight + 2)];
+  // The following 2 buffers are only used by the C functions. Since SgrBuffer
+  // is smaller than |wiener_buffer| in RestorationBuffer, which is a union,
+  // it is OK to always keep the following 2 buffers.
   alignas(kMaxAlignment) uint8_t ma[kSgrStride];  // [0, 255]
   // b is less than 2^16 for 8-bit. However, making it a template slows down the
   // C function by 5%. So b is fixed to 32-bit.
diff --git a/libgav1/src/dsp/constants.cc b/libgav1/src/dsp/constants.cc
index 0099ca3..1b85795 100644
--- a/libgav1/src/dsp/constants.cc
+++ b/libgav1/src/dsp/constants.cc
@@ -20,7 +20,7 @@
 
 // Each set of 7 taps is padded with a 0 to easily align and pack into the high
 // and low 8 bytes. This way, we can load 16 at a time to fit mulhi and mullo.
-const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = {
+alignas(16) const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = {
     {{-6, 10, 0, 0, 0, 12, 0, 0},
      {-5, 2, 10, 0, 0, 9, 0, 0},
      {-3, 1, 1, 10, 0, 7, 0, 0},
diff --git a/libgav1/src/dsp/convolve.cc b/libgav1/src/dsp/convolve.cc
index c8df357..727b4af 100644
--- a/libgav1/src/dsp/convolve.cc
+++ b/libgav1/src/dsp/convolve.cc
@@ -226,8 +226,9 @@
 void ConvolveCompound2D_C(const void* const reference,
                           const ptrdiff_t reference_stride,
                           const int horizontal_filter_index,
-                          const int vertical_filter_index, const int subpixel_x,
-                          const int subpixel_y, const int width,
+                          const int vertical_filter_index,
+                          const int horizontal_filter_id,
+                          const int vertical_filter_id, const int width,
                           const int height, void* prediction,
                           const ptrdiff_t pred_stride) {
   // All compound functions output to the predictor buffer with |pred_stride|
@@ -257,16 +258,17 @@
   const auto* src = static_cast<const Pixel*>(reference) -
                     kVerticalOffset * src_stride - kHorizontalOffset;
   auto* dest = static_cast<uint16_t*>(prediction);
-  int filter_id = (subpixel_x >> 6) & kSubPixelMask;
-  // If |filter_id| == 0 then ConvolveVertical() should be called.
-  assert(filter_id != 0);
+
+  // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+  assert(horizontal_filter_id != 0);
   int y = 0;
   do {
     int x = 0;
     do {
       int sum = 0;
       for (int k = 0; k < kSubPixelTaps; ++k) {
-        sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src[x + k];
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
       }
       intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
     } while (++x < width);
@@ -278,16 +280,15 @@
   // Vertical filter.
   filter_index = GetFilterIndex(vertical_filter_index, height);
   intermediate = intermediate_result;
-  filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
-  // If |filter_id| == 0 then ConvolveHorizontal() should be called.
-  assert(filter_id != 0);
+  // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+  assert(vertical_filter_id != 0);
   y = 0;
   do {
     int x = 0;
     do {
       int sum = 0;
       for (int k = 0; k < kSubPixelTaps; ++k) {
-        sum += kHalfSubPixelFilters[filter_index][filter_id][k] *
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
                intermediate[k * intermediate_stride + x];
       }
       sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
@@ -308,9 +309,10 @@
 template <int bitdepth, typename Pixel>
 void Convolve2D_C(const void* const reference, const ptrdiff_t reference_stride,
                   const int horizontal_filter_index,
-                  const int vertical_filter_index, const int subpixel_x,
-                  const int subpixel_y, const int width, const int height,
-                  void* prediction, const ptrdiff_t pred_stride) {
+                  const int vertical_filter_index,
+                  const int horizontal_filter_id, const int vertical_filter_id,
+                  const int width, const int height, void* prediction,
+                  const ptrdiff_t pred_stride) {
   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
                                            ? kInterRoundBitsHorizontal12bpp
                                            : kInterRoundBitsHorizontal;
@@ -336,16 +338,16 @@
                     kVerticalOffset * src_stride - kHorizontalOffset;
   auto* dest = static_cast<Pixel*>(prediction);
   const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
-  int filter_id = (subpixel_x >> 6) & kSubPixelMask;
-  // If |filter_id| == 0 then ConvolveVertical() should be called.
-  assert(filter_id != 0);
+  // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+  assert(horizontal_filter_id != 0);
   int y = 0;
   do {
     int x = 0;
     do {
       int sum = 0;
       for (int k = 0; k < kSubPixelTaps; ++k) {
-        sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src[x + k];
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
       }
       intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
     } while (++x < width);
@@ -357,16 +359,15 @@
   // Vertical filter.
   filter_index = GetFilterIndex(vertical_filter_index, height);
   intermediate = intermediate_result;
-  filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
-  // If |filter_id| == 0 then ConvolveHorizontal() should be called.
-  assert(filter_id != 0);
+  // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+  assert(vertical_filter_id != 0);
   y = 0;
   do {
     int x = 0;
     do {
       int sum = 0;
       for (int k = 0; k < kSubPixelTaps; ++k) {
-        sum += kHalfSubPixelFilters[filter_index][filter_id][k] *
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
                intermediate[k * intermediate_stride + x];
       }
       dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
@@ -388,8 +389,9 @@
                           const ptrdiff_t reference_stride,
                           const int horizontal_filter_index,
                           const int /*vertical_filter_index*/,
-                          const int subpixel_x, const int /*subpixel_y*/,
-                          const int width, const int height, void* prediction,
+                          const int horizontal_filter_id,
+                          const int /*vertical_filter_id*/, const int width,
+                          const int height, void* prediction,
                           const ptrdiff_t pred_stride) {
   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
                                            ? kInterRoundBitsHorizontal12bpp
@@ -400,7 +402,6 @@
   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
   auto* dest = static_cast<Pixel*>(prediction);
   const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
-  const int filter_id = (subpixel_x >> 6) & kSubPixelMask;
   const int max_pixel_value = (1 << bitdepth) - 1;
   int y = 0;
   do {
@@ -408,7 +409,8 @@
     do {
       int sum = 0;
       for (int k = 0; k < kSubPixelTaps; ++k) {
-        sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src[x + k];
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
       }
       sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
       dest[x] = Clip3(RightShiftWithRounding(sum, bits), 0, max_pixel_value);
@@ -429,8 +431,9 @@
                         const ptrdiff_t reference_stride,
                         const int /*horizontal_filter_index*/,
                         const int vertical_filter_index,
-                        const int /*subpixel_x*/, const int subpixel_y,
-                        const int width, const int height, void* prediction,
+                        const int /*horizontal_filter_id*/,
+                        const int vertical_filter_id, const int width,
+                        const int height, void* prediction,
                         const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
@@ -438,9 +441,8 @@
       static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
   auto* dest = static_cast<Pixel*>(prediction);
   const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
-  const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
   // Copy filters must call ConvolveCopy().
-  assert(filter_id != 0);
+  assert(vertical_filter_id != 0);
 
   const int max_pixel_value = (1 << bitdepth) - 1;
   int y = 0;
@@ -449,7 +451,7 @@
     do {
       int sum = 0;
       for (int k = 0; k < kSubPixelTaps; ++k) {
-        sum += kHalfSubPixelFilters[filter_index][filter_id][k] *
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
                src[k * src_stride + x];
       }
       dest[x] = Clip3(RightShiftWithRounding(sum, kFilterBits - 1), 0,
@@ -466,8 +468,9 @@
                     const ptrdiff_t reference_stride,
                     const int /*horizontal_filter_index*/,
                     const int /*vertical_filter_index*/,
-                    const int /*subpixel_x*/, const int /*subpixel_y*/,
-                    const int width, const int height, void* prediction,
+                    const int /*horizontal_filter_id*/,
+                    const int /*vertical_filter_id*/, const int width,
+                    const int height, void* prediction,
                     const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   auto* dest = static_cast<uint8_t*>(prediction);
@@ -484,8 +487,9 @@
                             const ptrdiff_t reference_stride,
                             const int /*horizontal_filter_index*/,
                             const int /*vertical_filter_index*/,
-                            const int /*subpixel_x*/, const int /*subpixel_y*/,
-                            const int width, const int height, void* prediction,
+                            const int /*horizontal_filter_id*/,
+                            const int /*vertical_filter_id*/, const int width,
+                            const int height, void* prediction,
                             const ptrdiff_t pred_stride) {
   // All compound functions output to the predictor buffer with |pred_stride|
   // equal to |width|.
@@ -521,8 +525,9 @@
 void ConvolveCompoundHorizontal_C(
     const void* const reference, const ptrdiff_t reference_stride,
     const int horizontal_filter_index, const int /*vertical_filter_index*/,
-    const int subpixel_x, const int /*subpixel_y*/, const int width,
-    const int height, void* prediction, const ptrdiff_t pred_stride) {
+    const int horizontal_filter_id, const int /*vertical_filter_id*/,
+    const int width, const int height, void* prediction,
+    const ptrdiff_t pred_stride) {
   // All compound functions output to the predictor buffer with |pred_stride|
   // equal to |width|.
   assert(pred_stride == width);
@@ -535,16 +540,16 @@
   const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
   auto* dest = static_cast<uint16_t*>(prediction);
-  const int filter_id = (subpixel_x >> 6) & kSubPixelMask;
   // Copy filters must call ConvolveCopy().
-  assert(filter_id != 0);
+  assert(horizontal_filter_id != 0);
   int y = 0;
   do {
     int x = 0;
     do {
       int sum = 0;
       for (int k = 0; k < kSubPixelTaps; ++k) {
-        sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src[x + k];
+        sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+               src[x + k];
       }
       sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
       sum += (bitdepth == 8) ? 0 : kCompoundOffset;
@@ -566,9 +571,10 @@
                                 const ptrdiff_t reference_stride,
                                 const int /*horizontal_filter_index*/,
                                 const int vertical_filter_index,
-                                const int /*subpixel_x*/, const int subpixel_y,
-                                const int width, const int height,
-                                void* prediction, const ptrdiff_t pred_stride) {
+                                const int /*horizontal_filter_id*/,
+                                const int vertical_filter_id, const int width,
+                                const int height, void* prediction,
+                                const ptrdiff_t pred_stride) {
   // All compound functions output to the predictor buffer with |pred_stride|
   // equal to |width|.
   assert(pred_stride == width);
@@ -582,16 +588,15 @@
   const auto* src =
       static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
   auto* dest = static_cast<uint16_t*>(prediction);
-  const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
   // Copy filters must call ConvolveCopy().
-  assert(filter_id != 0);
+  assert(vertical_filter_id != 0);
   int y = 0;
   do {
     int x = 0;
     do {
       int sum = 0;
       for (int k = 0; k < kSubPixelTaps; ++k) {
-        sum += kHalfSubPixelFilters[filter_index][filter_id][k] *
+        sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
                src[k * src_stride + x];
       }
       sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
@@ -610,11 +615,16 @@
 // The output is the single prediction of the block, clipped to valid pixel
 // range.
 template <int bitdepth, typename Pixel>
-void ConvolveIntraBlockCopy2D_C(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
-    const int height, void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveIntraBlockCopy2D_C(const void* const reference,
+                                const ptrdiff_t reference_stride,
+                                const int /*horizontal_filter_index*/,
+                                const int /*vertical_filter_index*/,
+                                const int /*horizontal_filter_id*/,
+                                const int /*vertical_filter_id*/,
+                                const int width, const int height,
+                                void* prediction, const ptrdiff_t pred_stride) {
+  assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+  assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
   const auto* src = static_cast<const Pixel*>(reference);
   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
   auto* dest = static_cast<Pixel*>(prediction);
@@ -660,11 +670,16 @@
 // The filtering of intra block copy is simply the average of current and
 // the next pixel.
 template <int bitdepth, typename Pixel, bool is_horizontal>
-void ConvolveIntraBlockCopy1D_C(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
-    const int height, void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveIntraBlockCopy1D_C(const void* const reference,
+                                const ptrdiff_t reference_stride,
+                                const int /*horizontal_filter_index*/,
+                                const int /*vertical_filter_index*/,
+                                const int /*horizontal_filter_id*/,
+                                const int /*vertical_filter_id*/,
+                                const int width, const int height,
+                                void* prediction, const ptrdiff_t pred_stride) {
+  assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+  assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
   const auto* src = static_cast<const Pixel*>(reference);
   const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
   auto* dest = static_cast<Pixel*>(prediction);
diff --git a/libgav1/src/dsp/convolve.h b/libgav1/src/dsp/convolve.h
index 95019e2..5bc0bad 100644
--- a/libgav1/src/dsp/convolve.h
+++ b/libgav1/src/dsp/convolve.h
@@ -30,6 +30,7 @@
 // The order of includes is important as each tests for a superior version
 // before setting the base.
 // clang-format off
+#include "src/dsp/x86/convolve_avx2.h"
 #include "src/dsp/x86/convolve_sse4.h"
 // clang-format on
 
diff --git a/libgav1/src/dsp/convolve.inc b/libgav1/src/dsp/convolve.inc
new file mode 100644
index 0000000..140648b
--- /dev/null
+++ b/libgav1/src/dsp/convolve.inc
@@ -0,0 +1,50 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for convolve implementations.
+// This will be included inside an anonymous namespace in files where these are
+// necessary.
+
+int GetNumTapsInFilter(const int filter_index) {
+  if (filter_index < 2) {
+    // Despite the names these only use 6 taps.
+    // kInterpolationFilterEightTap
+    // kInterpolationFilterEightTapSmooth
+    return 6;
+  }
+
+  if (filter_index == 2) {
+    // kInterpolationFilterEightTapSharp
+    return 8;
+  }
+
+  if (filter_index == 3) {
+    // kInterpolationFilterBilinear
+    return 2;
+  }
+
+  assert(filter_index > 3);
+  // For small sizes (width/height <= 4) the large filters are replaced with 4
+  // tap options.
+  // If the original filters were |kInterpolationFilterEightTap| or
+  // |kInterpolationFilterEightTapSharp| then it becomes
+  // |kInterpolationFilterSwitchable|.
+  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
+  // tap filter.
+  return 4;
+}
+
+constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
+constexpr int kHorizontalOffset = 3;
+constexpr int kFilterIndexShift = 6;
diff --git a/libgav1/src/dsp/dsp.cc b/libgav1/src/dsp/dsp.cc
index c1df276..a3d7701 100644
--- a/libgav1/src/dsp/dsp.cc
+++ b/libgav1/src/dsp/dsp.cc
@@ -16,7 +16,6 @@
 
 #include <mutex>  // NOLINT (unapproved c++11 header)
 
-#include "src/dsp/arm/weight_mask_neon.h"
 #include "src/dsp/average_blend.h"
 #include "src/dsp/cdef.h"
 #include "src/dsp/convolve.h"
@@ -24,6 +23,10 @@
 #include "src/dsp/film_grain.h"
 #include "src/dsp/intra_edge.h"
 #include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
+#include "src/dsp/intrapred_directional.h"
+#include "src/dsp/intrapred_filter.h"
+#include "src/dsp/intrapred_smooth.h"
 #include "src/dsp/inverse_transform.h"
 #include "src/dsp/loop_filter.h"
 #include "src/dsp/loop_restoration.h"
@@ -39,6 +42,30 @@
 namespace libgav1 {
 namespace dsp_internal {
 
+void DspInit_C() {
+  dsp::AverageBlendInit_C();
+  dsp::CdefInit_C();
+  dsp::ConvolveInit_C();
+  dsp::DistanceWeightedBlendInit_C();
+  dsp::FilmGrainInit_C();
+  dsp::IntraEdgeInit_C();
+  dsp::IntraPredCflInit_C();
+  dsp::IntraPredDirectionalInit_C();
+  dsp::IntraPredFilterInit_C();
+  dsp::IntraPredInit_C();
+  dsp::IntraPredSmoothInit_C();
+  dsp::InverseTransformInit_C();
+  dsp::LoopFilterInit_C();
+  dsp::LoopRestorationInit_C();
+  dsp::MaskBlendInit_C();
+  dsp::MotionFieldProjectionInit_C();
+  dsp::MotionVectorSearchInit_C();
+  dsp::ObmcInit_C();
+  dsp::SuperResInit_C();
+  dsp::WarpInit_C();
+  dsp::WeightMaskInit_C();
+}
+
 dsp::Dsp* GetWritableDspTable(int bitdepth) {
   switch (bitdepth) {
     case 8: {
@@ -62,31 +89,20 @@
 void DspInit() {
   static std::once_flag once;
   std::call_once(once, []() {
-    AverageBlendInit_C();
-    CdefInit_C();
-    ConvolveInit_C();
-    DistanceWeightedBlendInit_C();
-    FilmGrainInit_C();
-    IntraEdgeInit_C();
-    IntraPredInit_C();
-    InverseTransformInit_C();
-    LoopFilterInit_C();
-    LoopRestorationInit_C();
-    MaskBlendInit_C();
-    MotionFieldProjectionInit_C();
-    MotionVectorSearchInit_C();
-    ObmcInit_C();
-    SuperResInit_C();
-    WarpInit_C();
-    WeightMaskInit_C();
-#if LIBGAV1_ENABLE_SSE4_1
+    dsp_internal::DspInit_C();
+#if LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
     const uint32_t cpu_features = GetCpuInfo();
+#if LIBGAV1_ENABLE_SSE4_1
     if ((cpu_features & kSSE4_1) != 0) {
       AverageBlendInit_SSE4_1();
       CdefInit_SSE4_1();
       ConvolveInit_SSE4_1();
       DistanceWeightedBlendInit_SSE4_1();
+      FilmGrainInit_SSE4_1();
       IntraEdgeInit_SSE4_1();
+      IntraPredCflInit_SSE4_1();
+      IntraPredDirectionalInit_SSE4_1();
+      IntraPredFilterInit_SSE4_1();
       IntraPredInit_SSE4_1();
-      IntraPredCflInit_SSE4_1();
       IntraPredSmoothInit_SSE4_1();
@@ -100,8 +116,22 @@
       SuperResInit_SSE4_1();
       WarpInit_SSE4_1();
       WeightMaskInit_SSE4_1();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_SSE4_1();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
     }
 #endif  // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_AVX2
+    if ((cpu_features & kAVX2) != 0) {
+      CdefInit_AVX2();
+      ConvolveInit_AVX2();
+      LoopRestorationInit_AVX2();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      LoopRestorationInit10bpp_AVX2();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+    }
+#endif  // LIBGAV1_ENABLE_AVX2
+#endif  // LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
 #if LIBGAV1_ENABLE_NEON
     AverageBlendInit_NEON();
     CdefInit_NEON();
@@ -111,7 +141,7 @@
     IntraEdgeInit_NEON();
     IntraPredCflInit_NEON();
     IntraPredDirectionalInit_NEON();
-    IntraPredFilterIntraInit_NEON();
+    IntraPredFilterInit_NEON();
     IntraPredInit_NEON();
     IntraPredSmoothInit_NEON();
     InverseTransformInit_NEON();
@@ -124,6 +154,9 @@
     SuperResInit_NEON();
     WarpInit_NEON();
     WeightMaskInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+    InverseTransformInit10bpp_NEON();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
 #endif  // LIBGAV1_ENABLE_NEON
   });
 }
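
The dispatch above can be read in isolation: install C fallbacks for every entry, then let each detected instruction set overwrite the entries it implements. A compilable sketch, with stand-in names for GetCpuInfo() and the per-ISA init functions:

#include <cstdint>
#include <mutex>

namespace {
// Stand-in feature bits and init functions; only the ordering matters here.
constexpr uint32_t kSse41Bit = 1 << 0;
constexpr uint32_t kAvx2Bit = 1 << 1;

uint32_t DetectCpu() { return kSse41Bit | kAvx2Bit; }

void InitC() { /* every function pointer gets a C implementation */ }
void InitSse41() { /* overwrite the entries that have SSE4.1 versions */ }
void InitAvx2() { /* overwrite the smaller set that has AVX2 versions */ }

void InitOnce() {
  static std::once_flag once;
  std::call_once(once, []() {
    InitC();  // guarantees a fully populated table even with no SIMD support
    const uint32_t cpu = DetectCpu();
    if ((cpu & kSse41Bit) != 0) InitSse41();
    if ((cpu & kAvx2Bit) != 0) InitAvx2();  // runs last, so AVX2 wins where present
  });
}
}  // namespace

int main() { InitOnce(); }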
diff --git a/libgav1/src/dsp/dsp.h b/libgav1/src/dsp/dsp.h
index 1fa1560..153db7f 100644
--- a/libgav1/src/dsp/dsp.h
+++ b/libgav1/src/dsp/dsp.h
@@ -17,7 +17,7 @@
 #ifndef LIBGAV1_SRC_DSP_DSP_H_
 #define LIBGAV1_SRC_DSP_DSP_H_
 
-#include <cstddef>  // ptrdiff_t
+#include <cstddef>
 #include <cstdint>
 #include <cstdlib>
 
@@ -79,6 +79,11 @@
   kNumLoopFilterSizes
 };
 
+enum : uint8_t {
+  kRow = 0,
+  kColumn = 1,
+};
+
 //------------------------------------------------------------------------------
 // ToString()
 //
@@ -298,16 +303,20 @@
 // 7.13.3).
 // Apply the inverse transforms and add the residual to the destination frame
 // for the transform type and block size |tx_size| starting at position
-// |start_x| and |start_y|.  |dst_frame| is a pointer to an Array2D. |is_row|
-// signals the direction of the transform loop. |non_zero_coeff_count| is the
-// number of non zero coefficients in the block.
+// |start_x| and |start_y|. |dst_frame| is a pointer to an Array2D.
+// |adjusted_tx_height| is the number of rows to process based on the non-zero
+// coefficient count in the block. It will be 1 (non-zero coefficient count ==
+// 1), 4 or a multiple of 8 up to 32 or the original transform height,
+// whichever is less.
 using InverseTransformAddFunc = void (*)(TransformType tx_type,
                                          TransformSize tx_size,
+                                         int adjusted_tx_height,
                                          void* src_buffer, int start_x,
-                                         int start_y, void* dst_frame,
-                                         bool is_row, int non_zero_coeff_count);
+                                         int start_y, void* dst_frame);
+// The final dimension holds row and column transforms indexed with kRow and
+// kColumn.
 using InverseTransformAddFuncs =
-    InverseTransformAddFunc[kNum1DTransformSizes][kNum1DTransforms];
+    InverseTransformAddFunc[kNum1DTransforms][kNum1DTransformSizes][2];
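
A short, compilable sketch (stand-in names and an abbreviated signature) of how the reshaped table is indexed: the final dimension selects the pass, so each block runs a kRow call followed by a kColumn call.

#include <cstdio>

namespace {
enum : int { kRowPass = 0, kColumnPass = 1 };
using PassFn = void (*)(int adjusted_tx_height);

void Dct4Row(int rows) { std::printf("row pass over %d rows\n", rows); }
void Dct4Column(int rows) { std::printf("column pass over %d rows\n", rows); }

// [1D transform type][1D transform size][row/column]; a single entry here.
PassFn table[1][1][2] = {{{Dct4Row, Dct4Column}}};
}  // namespace

int main() {
  const int adjusted_tx_height = 4;  // e.g. only the top rows carry coefficients
  table[0][0][kRowPass](adjusted_tx_height);
  table[0][0][kColumnPass](adjusted_tx_height);
  return 0;
}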
 
 //------------------------------------------------------------------------------
 // Post processing.
@@ -325,7 +334,7 @@
 // with |stride| given in bytes. |direction| and |variance| are output
 // parameters and must not be nullptr.
 using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride,
-                                   int* direction, int* variance);
+                                   uint8_t* direction, int* variance);
 
 // Cdef filtering function signature. Section 7.15.3.
 // |source| is a pointer to the input block padded with kCdefLargeValue if at a
@@ -346,30 +355,53 @@
 // |primary_strength| only, [2]: |secondary_strength| only.
 using CdefFilteringFuncs = CdefFilteringFunc[2][3];
 
-// Upscaling process function signature. Section 7.16.
-// Operates on a single row.
-// |source| is the input frame buffer at the given row.
-// |dest| is the output row.
+// Upscaling coefficients function signature. Section 7.16.
+// This is an auxiliary function for SIMD optimizations and has no corresponding
+// C function. Different SIMD versions may have different outputs. So it must
+// pair with the corresponding version of SuperResFunc.
 // |upscaled_width| is the width of the output frame.
 // |step| is the number of subpixels to move the kernel for the next destination
 // pixel.
 // |initial_subpixel_x| is a base offset from which |step| increments.
-using SuperResRowFunc = void (*)(const void* source, const int upscaled_width,
-                                 const int initial_subpixel_x, const int step,
-                                 void* const dest);
+// |coefficients| is the upscale filter used by each pixel in a row.
+using SuperResCoefficientsFunc = void (*)(int upscaled_width,
+                                          int initial_subpixel_x, int step,
+                                          void* coefficients);
+
+// Upscaling process function signature. Section 7.16.
+// |coefficients| is the upscale filter used by each pixel in a row. It is not
+// used by the C function.
+// |source| is the input frame buffer. It will be line extended.
+// |source_stride| is given in pixels.
+// |dest| is the output buffer.
+// |dest_stride| is given in pixels.
+// |height| is the height of the block to be processed.
+// |downscaled_width| is the width of the input frame.
+// |upscaled_width| is the width of the output frame.
+// |step| is the number of subpixels to move the kernel for the next destination
+// pixel.
+// |initial_subpixel_x| is a base offset from which |step| increments.
+using SuperResFunc = void (*)(const void* coefficients, void* source,
+                              ptrdiff_t source_stride, int height,
+                              int downscaled_width, int upscaled_width,
+                              int initial_subpixel_x, int step, void* dest,
+                              ptrdiff_t dest_stride);
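
To make the |initial_subpixel_x|/|step| arithmetic concrete, a self-contained sketch follows. The 14-bit scale and 6-bit filter phase match the AV1 upscaling process but are restated here as assumptions rather than pulled from libgav1 headers, and the starting offset is deliberately simplified.

#include <cstdio>

namespace {
constexpr int kScaleBits = 14;       // subpixel precision of |step| (assumed)
constexpr int kFilterPhaseBits = 6;  // 64 filter phases (assumed)
constexpr int kScaleMask = (1 << kScaleBits) - 1;
}  // namespace

int main() {
  const int downscaled_width = 320;
  const int upscaled_width = 640;
  // |step|: how far the kernel moves, in 1/2^14 units, per destination pixel.
  const int step = ((downscaled_width << kScaleBits) + upscaled_width / 2) /
                   upscaled_width;
  int subpixel_x = step / 2;  // simplified stand-in for |initial_subpixel_x|
  for (int x = 0; x < 4; ++x, subpixel_x += step) {
    const int source_x = subpixel_x >> kScaleBits;  // input pixel under the kernel
    const int phase =
        (subpixel_x & kScaleMask) >> (kScaleBits - kFilterPhaseBits);
    std::printf("dest %d <- source %d, filter phase %d\n", x, source_x, phase);
  }
  return 0;
}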
 
 // Loop restoration function signature. Sections 7.16, 7.17.
-// |source| is the input frame buffer, which is deblocked and cdef filtered.
-// |dest| is the output.
 // |restoration_info| contains loop restoration information, such as filter
 // type, strength.
-// |source_stride| and |dest_stride| are given in pixels.
-// |buffer| contains buffers required for self guided filter and wiener filter.
-// They must be initialized before calling.
+// |source| is the input frame buffer, which is deblocked and cdef filtered.
+// |top_border| and |bottom_border| are the top and bottom borders.
+// |dest| is the output.
+// |stride| is given in pixels, and shared by |source| and |dest|.
+// |top_border_stride| and |bottom_border_stride| are given in pixels.
+// |restoration_buffer| contains buffers required for self guided filter and
+// wiener filter. They must be initialized before calling.
 using LoopRestorationFunc = void (*)(
-    const void* source, void* dest, const RestorationUnitInfo& restoration_info,
-    ptrdiff_t source_stride, ptrdiff_t dest_stride, int width, int height,
-    RestorationBuffer* buffer);
+    const RestorationUnitInfo& restoration_info, const void* source,
+    ptrdiff_t stride, const void* top_border, ptrdiff_t top_border_stride,
+    const void* bottom_border, ptrdiff_t bottom_border_stride, int width,
+    int height, RestorationBuffer* restoration_buffer, void* dest);
 
 // Index 0 is Wiener Filter.
 // Index 1 is Self Guided Restoration Filter.
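
A trivial, compilable sketch of that two-entry table (stand-in names): the restoration unit's type selects Wiener at index 0 or the self-guided filter at index 1.

#include <cstdio>

namespace {
enum RestorationType : int { kWienerFilter = 0, kSelfGuidedFilter = 1 };
using RestoreFn = void (*)();
void Wiener() { std::puts("wiener restoration"); }
void SelfGuided() { std::puts("self-guided restoration"); }
RestoreFn loop_restorations[2] = {Wiener, SelfGuided};
}  // namespace

int main() {
  const RestorationType type = kSelfGuidedFilter;  // would come from the unit info
  loop_restorations[type]();
  return 0;
}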
@@ -383,7 +415,7 @@
 // |vertical_filter_index|/|horizontal_filter_index| is the index to
 // retrieve the type of filter to be applied for vertical/horizontal direction
 // from the filter lookup table 'kSubPixelFilters'.
-// |subpixel_x| and |subpixel_y| are starting positions in units of 1/1024.
+// |horizontal_filter_id| and |vertical_filter_id| are the filter ids.
 // |width| and |height| are width and height of the block to be filtered.
 // |ref_last_x| and |ref_last_y| are the last pixel of the reference frame in
 // x/y direction.
@@ -395,9 +427,10 @@
 // be used.
 using ConvolveFunc = void (*)(const void* reference, ptrdiff_t reference_stride,
                               int horizontal_filter_index,
-                              int vertical_filter_index, int subpixel_x,
-                              int subpixel_y, int width, int height,
-                              void* prediction, ptrdiff_t pred_stride);
+                              int vertical_filter_index,
+                              int horizontal_filter_id, int vertical_filter_id,
+                              int width, int height, void* prediction,
+                              ptrdiff_t pred_stride);
 
 // Convolve functions signature. Each points to one convolve function with
 // a specific setting:
@@ -815,7 +848,8 @@
   MvProjectionCompoundFunc mv_projection_compound[3];
   MvProjectionSingleFunc mv_projection_single[3];
   ObmcBlendFuncs obmc_blend;
-  SuperResRowFunc super_res_row;
+  SuperResCoefficientsFunc super_res_coefficients;
+  SuperResFunc super_res;
   WarpCompoundFunc warp_compound;
   WarpFunc warp;
   WeightMaskFuncs weight_mask;
@@ -834,6 +868,14 @@
 
 namespace dsp_internal {
 
+// Visual Studio builds don't have a way to detect SSE4_1. Only exclude the C
+// functions if /arch:AVX2 is used across all sources.
+#if !LIBGAV1_TARGETING_AVX2 && \
+    (defined(_MSC_VER) || (defined(_M_IX86) || defined(_M_X64)))
+#undef LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 1
+#endif
+
 // Returns true if a more highly optimized version of |func| is not defined for
 // the associated bitdepth or if it is forcibly enabled with
 // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS. The define checked for |func| corresponds
@@ -848,6 +890,12 @@
 //  NEON support is the only extension available for ARM and it is always
 //  required. Because of this restriction DSP_ENABLED_8BPP_NEON(func) is always
 //  true and can be omitted.
+#define DSP_ENABLED_8BPP_AVX2(func)    \
+  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+   LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_AVX2)
+#define DSP_ENABLED_10BPP_AVX2(func)   \
+  (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+   LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_AVX2)
 #define DSP_ENABLED_8BPP_SSE4_1(func)  \
   (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
    LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_SSE4_1)
@@ -855,6 +903,11 @@
   (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
    LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_SSE4_1)
 
+// Initializes C-only function pointers. Note some entries may be set to
+// nullptr if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is not defined. This is meant
+// for use in tests only, it is not thread-safe.
+void DspInit_C();
+
 // Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
 // exist. This version is meant for use by test or dsp/*Init() functions only.
 dsp::Dsp* GetWritableDspTable(int bitdepth);
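
A toy, self-contained restatement (all names are stand-ins) of how the DSP_ENABLED_* guards above are intended to work: a SIMD init function only installs a pointer when all C functions are kept, or when the table entry was claimed for that instruction set.

#include <cassert>

// Stand-in macros mirroring the guard pattern; none of these names are the
// library's.
#define ENABLE_ALL 0
#define ENTRY_FooBar 2  // plays the role of LIBGAV1_Dsp8bpp_<func>
#define CPU_AVX2 2      // plays the role of LIBGAV1_CPU_AVX2
#define ENABLED_AVX2(func) (ENABLE_ALL || ENTRY_##func == CPU_AVX2)

namespace {
using Fn = int (*)();
int FooBar_C() { return 0; }
int FooBar_AVX2() { return 1; }
Fn foo_bar = FooBar_C;  // the C init installed the fallback first
}  // namespace

int main() {
#if ENABLED_AVX2(FooBar)
  foo_bar = FooBar_AVX2;  // only compiled in when the entry names AVX2
#endif
  assert(foo_bar() == 1);
  return 0;
}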
diff --git a/libgav1/src/dsp/film_grain.cc b/libgav1/src/dsp/film_grain.cc
index 2ee290b..41d1dd0 100644
--- a/libgav1/src/dsp/film_grain.cc
+++ b/libgav1/src/dsp/film_grain.cc
@@ -209,7 +209,7 @@
             luma += luma_grain[(luma_y + i) * kLumaWidth + (luma_x + j)];
           } while (++j <= subsampling_x);
         } while (++i <= subsampling_y);
-        luma = RightShiftWithRounding(luma, subsampling_x + subsampling_y);
+        luma = SubsampledValue(luma, subsampling_x + subsampling_y);
         const int coeff_u = params.auto_regression_coeff_u[pos];
         const int coeff_v = params.auto_regression_coeff_v[pos];
         sum_u += luma * coeff_u;
diff --git a/libgav1/src/dsp/film_grain.h b/libgav1/src/dsp/film_grain.h
index fe93270..f75a354 100644
--- a/libgav1/src/dsp/film_grain.h
+++ b/libgav1/src/dsp/film_grain.h
@@ -25,6 +25,14 @@
 // ARM:
 #include "src/dsp/arm/film_grain_neon.h"
 
+// x86:
+// Note includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/film_grain_sse4.h"
+// clang-format on
+
 // IWYU pragma: end_exports
 
 namespace libgav1 {
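
The ordering rule in the new comment can be seen with a toy preprocessor sketch (stand-in macro names): because each per-ISA header only claims an entry that is still unclaimed, including the best ISA first lets it win.

// Values mimic the LIBGAV1_CPU_* ranking; names are illustrative only.
#define CPU_SSE4_1 1
#define CPU_AVX2 2

// As if film_grain_avx2.h were included first and claimed the entry.
#ifndef Dsp8bpp_FilmGrainStub
#define Dsp8bpp_FilmGrainStub CPU_AVX2
#endif

// film_grain_sse4.h, included afterwards, finds the entry taken and backs off.
#ifndef Dsp8bpp_FilmGrainStub
#define Dsp8bpp_FilmGrainStub CPU_SSE4_1
#endif

static_assert(Dsp8bpp_FilmGrainStub == CPU_AVX2,
              "the first (highest) include wins; swapping the order would "
              "leave the entry claimed by SSE4_1");

int main() { return 0; }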
diff --git a/libgav1/src/dsp/intrapred.cc b/libgav1/src/dsp/intrapred.cc
index 4bcb580..4520c2c 100644
--- a/libgav1/src/dsp/intrapred.cc
+++ b/libgav1/src/dsp/intrapred.cc
@@ -19,21 +19,18 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
-#include <cstring>  // memset
+#include <cstring>
 
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
 #include "src/utils/common.h"
+#include "src/utils/constants.h"
 #include "src/utils/memory.h"
 
 namespace libgav1 {
 namespace dsp {
 namespace {
 
-constexpr TransformSize kTransformSizesLargerThan32x32[] = {
-    kTransformSize16x64, kTransformSize32x64, kTransformSize64x16,
-    kTransformSize64x32, kTransformSize64x64};
-
 template <int block_width, int block_height, typename Pixel>
 struct IntraPredFuncs_C {
   IntraPredFuncs_C() = delete;
@@ -50,12 +47,6 @@
                          const void* left_column);
   static void Paeth(void* dest, ptrdiff_t stride, const void* top_row,
                     const void* left_column);
-  static void Smooth(void* dest, ptrdiff_t stride, const void* top_row,
-                     const void* left_column);
-  static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row,
-                             const void* left_column);
-  static void SmoothHorizontal(void* dest, ptrdiff_t stride,
-                               const void* top_row, const void* left_column);
 };
 
 // Intra-predictors that require bitdepth.
@@ -190,16 +181,6 @@
   }
 }
 
-template <typename Pixel>
-inline Pixel Average(Pixel a, Pixel b) {
-  return static_cast<Pixel>((a + b + 1) >> 1);
-}
-
-template <typename Pixel>
-inline Pixel Average(Pixel a, Pixel b, Pixel c) {
-  return static_cast<Pixel>((a + 2 * b + c + 2) >> 2);
-}
-
 // IntraPredFuncs_C::Paeth
 template <int block_width, int block_height, typename Pixel>
 void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth(
@@ -238,110 +219,6 @@
   }
 }
 
-constexpr uint8_t kSmoothWeights[] = {
-    // block dimension = 4
-    255, 149, 85, 64,
-    // block dimension = 8
-    255, 197, 146, 105, 73, 50, 37, 32,
-    // block dimension = 16
-    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
-    // block dimension = 32
-    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
-    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
-    // block dimension = 64
-    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
-    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
-    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
-    15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
-
-// IntraPredFuncs_C::Smooth
-template <int block_width, int block_height, typename Pixel>
-void IntraPredFuncs_C<block_width, block_height, Pixel>::Smooth(
-    void* const dest, ptrdiff_t stride, const void* const top_row,
-    const void* const left_column) {
-  const auto* const top = static_cast<const Pixel*>(top_row);
-  const auto* const left = static_cast<const Pixel*>(left_column);
-  const Pixel top_right = top[block_width - 1];
-  const Pixel bottom_left = left[block_height - 1];
-  static_assert(
-      block_width >= 4 && block_height >= 4,
-      "Weights for smooth predictor undefined for block width/height < 4");
-  const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
-  const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
-  const uint16_t scale_value = (1 << kSmoothWeightScale);
-  auto* dst = static_cast<Pixel*>(dest);
-  stride /= sizeof(Pixel);
-
-  for (int y = 0; y < block_height; ++y) {
-    for (int x = 0; x < block_width; ++x) {
-      assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]);
-      uint32_t pred = weights_y[y] * top[x];
-      pred += weights_x[x] * left[y];
-      pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
-      pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
-      // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1)
-      // + 256. With the descale there's no need for saturation.
-      dst[x] = static_cast<Pixel>(
-          RightShiftWithRounding(pred, kSmoothWeightScale + 1));
-    }
-    dst += stride;
-  }
-}
-
-// IntraPredFuncs_C::SmoothVertical
-template <int block_width, int block_height, typename Pixel>
-void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothVertical(
-    void* const dest, ptrdiff_t stride, const void* const top_row,
-    const void* const left_column) {
-  const auto* const top = static_cast<const Pixel*>(top_row);
-  const auto* const left = static_cast<const Pixel*>(left_column);
-  const Pixel bottom_left = left[block_height - 1];
-  static_assert(block_height >= 4,
-                "Weights for smooth predictor undefined for block height < 4");
-  const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
-  const uint16_t scale_value = (1 << kSmoothWeightScale);
-  auto* dst = static_cast<Pixel*>(dest);
-  stride /= sizeof(Pixel);
-
-  for (int y = 0; y < block_height; ++y) {
-    for (int x = 0; x < block_width; ++x) {
-      assert(scale_value >= weights_y[y]);
-      uint32_t pred = weights_y[y] * top[x];
-      pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
-      dst[x] =
-          static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
-    }
-    dst += stride;
-  }
-}
-
-// IntraPredFuncs_C::SmoothHorizontal
-template <int block_width, int block_height, typename Pixel>
-void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal(
-    void* const dest, ptrdiff_t stride, const void* const top_row,
-    const void* const left_column) {
-  const auto* const top = static_cast<const Pixel*>(top_row);
-  const auto* const left = static_cast<const Pixel*>(left_column);
-  const Pixel top_right = top[block_width - 1];
-  static_assert(block_width >= 4,
-                "Weights for smooth predictor undefined for block width < 4");
-  const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
-  const uint16_t scale_value = (1 << kSmoothWeightScale);
-  auto* dst = static_cast<Pixel*>(dest);
-  stride /= sizeof(Pixel);
-
-  for (int y = 0; y < block_height; ++y) {
-    for (int x = 0; x < block_width; ++x) {
-      assert(scale_value >= weights_x[x]);
-      uint32_t pred = weights_x[x] * left[y];
-      pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
-      dst[x] =
-          static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
-    }
-    dst += stride;
-  }
-}
-
 //------------------------------------------------------------------------------
 // IntraPredBppFuncs_C
 template <int fill, typename Pixel>
@@ -366,288 +243,7 @@
                                           block_height);
 }
 
-//------------------------------------------------------------------------------
-// FilterIntraPredictor_C
-
-template <int bitdepth, typename Pixel>
-void FilterIntraPredictor_C(void* const dest, ptrdiff_t stride,
-                            const void* const top_row,
-                            const void* const left_column,
-                            const FilterIntraPredictor pred, const int width,
-                            const int height) {
-  const int kMaxPixel = (1 << bitdepth) - 1;
-  const auto* const top = static_cast<const Pixel*>(top_row);
-  const auto* const left = static_cast<const Pixel*>(left_column);
-
-  assert(width <= 32 && height <= 32);
-
-  Pixel buffer[3][33];  // cache 2 rows + top & left boundaries
-  memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0]));
-
-  auto* dst = static_cast<Pixel*>(dest);
-  stride /= sizeof(Pixel);
-  int row0 = 0, row2 = 2;
-  int ystep = 1;
-  int y = 0;
-  do {
-    buffer[1][0] = left[y];
-    buffer[row2][0] = left[y + 1];
-    int x = 1;
-    do {
-      const Pixel p0 = buffer[row0][x - 1];  // top-left
-      const Pixel p1 = buffer[row0][x + 0];  // top 0
-      const Pixel p2 = buffer[row0][x + 1];  // top 1
-      const Pixel p3 = buffer[row0][x + 2];  // top 2
-      const Pixel p4 = buffer[row0][x + 3];  // top 3
-      const Pixel p5 = buffer[1][x - 1];     // left 0
-      const Pixel p6 = buffer[row2][x - 1];  // left 1
-      for (int i = 0; i < 8; ++i) {
-        const int xoffset = i & 0x03;
-        const int yoffset = (i >> 2) * ystep;
-        const int value = kFilterIntraTaps[pred][i][0] * p0 +
-                          kFilterIntraTaps[pred][i][1] * p1 +
-                          kFilterIntraTaps[pred][i][2] * p2 +
-                          kFilterIntraTaps[pred][i][3] * p3 +
-                          kFilterIntraTaps[pred][i][4] * p4 +
-                          kFilterIntraTaps[pred][i][5] * p5 +
-                          kFilterIntraTaps[pred][i][6] * p6;
-        buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>(
-            Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel));
-      }
-      x += 4;
-    } while (x < width);
-    memcpy(dst, &buffer[1][1], width * sizeof(dst[0]));
-    dst += stride;
-    memcpy(dst, &buffer[row2][1], width * sizeof(dst[0]));
-    dst += stride;
-
-    // The final row becomes the top for the next pass.
-    row0 ^= 2;
-    row2 ^= 2;
-    ystep = -ystep;
-    y += 2;
-  } while (y < height);
-}
-
-//------------------------------------------------------------------------------
-// CflIntraPredictor_C
-
-// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
-// |alpha| can be -16 to 16 (inclusive).
-template <int block_width, int block_height, int bitdepth, typename Pixel>
-void CflIntraPredictor_C(
-    void* const dest, ptrdiff_t stride,
-    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
-    const int alpha) {
-  auto* dst = static_cast<Pixel*>(dest);
-  const int dc = dst[0];
-  stride /= sizeof(Pixel);
-  const int max_value = (1 << bitdepth) - 1;
-  for (int y = 0; y < block_height; ++y) {
-    for (int x = 0; x < block_width; ++x) {
-      assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3));
-      assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3);
-      dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6),
-                     0, max_value);
-    }
-    dst += stride;
-  }
-}
-
-//------------------------------------------------------------------------------
-// CflSubsampler_C
-
-template <int block_width, int block_height, int bitdepth, typename Pixel,
-          int subsampling_x, int subsampling_y>
-void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
-                     const int max_luma_width, const int max_luma_height,
-                     const void* const source, ptrdiff_t stride) {
-  assert(max_luma_width >= 4);
-  assert(max_luma_height >= 4);
-  const auto* src = static_cast<const Pixel*>(source);
-  stride /= sizeof(Pixel);
-  int sum = 0;
-  for (int y = 0; y < block_height; ++y) {
-    for (int x = 0; x < block_width; ++x) {
-      const ptrdiff_t luma_x =
-          std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x));
-      const ptrdiff_t luma_x_next = luma_x + stride;
-      luma[y][x] =
-          (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) +
-           ((subsampling_y != 0) ? (src[luma_x_next] + src[luma_x_next + 1])
-                                 : 0))
-          << (3 - subsampling_x - subsampling_y);
-      sum += luma[y][x];
-    }
-    if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) {
-      src += stride << subsampling_y;
-    }
-  }
-  const int average = RightShiftWithRounding(
-      sum, FloorLog2(block_width) + FloorLog2(block_height));
-  for (int y = 0; y < block_height; ++y) {
-    for (int x = 0; x < block_width; ++x) {
-      luma[y][x] -= average;
-    }
-  }
-}
-
-//------------------------------------------------------------------------------
-// 7.11.2.4. Directional intra prediction process
-
-template <typename Pixel>
-void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride,
-                                      const void* const top_row,
-                                      const int width, const int height,
-                                      const int xstep,
-                                      const bool upsampled_top) {
-  const auto* const top = static_cast<const Pixel*>(top_row);
-  auto* dst = static_cast<Pixel*>(dest);
-  stride /= sizeof(Pixel);
-
-  assert(xstep > 0);
-
-  // If xstep == 64 then |shift| always evaluates to 0 which sets |val| to
-  // |top[top_base_x]|. This corresponds to a 45 degree prediction.
-  if (xstep == 64) {
-    // 7.11.2.10. Intra edge upsample selection process
-    // if ( d <= 0 || d >= 40 ) useUpsample = 0
-    // For |upsampled_top| the delta is |predictor_angle - 90|. Since the
-    // |predictor_angle| is 45 the delta is also 45.
-    assert(!upsampled_top);
-    const Pixel* top_ptr = top + 1;
-    for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) {
-      memcpy(dst, top_ptr, sizeof(*top_ptr) * width);
-    }
-    return;
-  }
-
-  const int upsample_shift = static_cast<int>(upsampled_top);
-  const int max_base_x = ((width + height) - 1) << upsample_shift;
-  const int scale_bits = 6 - upsample_shift;
-  const int base_step = 1 << upsample_shift;
-  int top_x = xstep;
-  int y = 0;
-  do {
-    int top_base_x = top_x >> scale_bits;
-
-    if (top_base_x >= max_base_x) {
-      for (int i = y; i < height; ++i) {
-        Memset(dst, top[max_base_x], width);
-        dst += stride;
-      }
-      return;
-    }
-
-    const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
-    int x = 0;
-    do {
-      if (top_base_x >= max_base_x) {
-        Memset(dst + x, top[max_base_x], width - x);
-        break;
-      }
-
-      const int val =
-          top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
-      dst[x] = RightShiftWithRounding(val, 5);
-      top_base_x += base_step;
-    } while (++x < width);
-
-    dst += stride;
-    top_x += xstep;
-  } while (++y < height);
-}
-
-template <typename Pixel>
-void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride,
-                                      const void* const top_row,
-                                      const void* const left_column,
-                                      const int width, const int height,
-                                      const int xstep, const int ystep,
-                                      const bool upsampled_top,
-                                      const bool upsampled_left) {
-  const auto* const top = static_cast<const Pixel*>(top_row);
-  const auto* const left = static_cast<const Pixel*>(left_column);
-  auto* dst = static_cast<Pixel*>(dest);
-  stride /= sizeof(Pixel);
-
-  assert(xstep > 0);
-  assert(ystep > 0);
-
-  const int upsample_top_shift = static_cast<int>(upsampled_top);
-  const int upsample_left_shift = static_cast<int>(upsampled_left);
-  const int scale_bits_x = 6 - upsample_top_shift;
-  const int scale_bits_y = 6 - upsample_left_shift;
-  const int min_base_x = -(1 << upsample_top_shift);
-  const int base_step_x = 1 << upsample_top_shift;
-  int y = 0;
-  int top_x = -xstep;
-  do {
-    int top_base_x = top_x >> scale_bits_x;
-    int left_y = (y << 6) - ystep;
-    int x = 0;
-    do {
-      int val;
-      if (top_base_x >= min_base_x) {
-        const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1;
-        val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
-      } else {
-        // Note this assumes an arithmetic shift to handle negative values.
-        const int left_base_y = left_y >> scale_bits_y;
-        const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1;
-        assert(left_base_y >= -(1 << upsample_left_shift));
-        val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
-      }
-      dst[x] = RightShiftWithRounding(val, 5);
-      top_base_x += base_step_x;
-      left_y -= ystep;
-    } while (++x < width);
-
-    top_x -= xstep;
-    dst += stride;
-  } while (++y < height);
-}
-
-template <typename Pixel>
-void DirectionalIntraPredictorZone3_C(void* const dest, ptrdiff_t stride,
-                                      const void* const left_column,
-                                      const int width, const int height,
-                                      const int ystep,
-                                      const bool upsampled_left) {
-  const auto* const left = static_cast<const Pixel*>(left_column);
-  stride /= sizeof(Pixel);
-
-  assert(ystep > 0);
-
-  const int upsample_shift = static_cast<int>(upsampled_left);
-  const int scale_bits = 6 - upsample_shift;
-  const int base_step = 1 << upsample_shift;
-  // Zone3 never runs out of left_column values.
-  assert((width + height - 1) << upsample_shift >  // max_base_y
-         ((ystep * width) >> scale_bits) +
-             base_step * (height - 1));  // left_base_y
-
-  int left_y = ystep;
-  int x = 0;
-  do {
-    auto* dst = static_cast<Pixel*>(dest);
-
-    int left_base_y = left_y >> scale_bits;
-    int y = 0;
-    do {
-      const int shift = ((left_y << upsample_shift) & 0x3F) >> 1;
-      const int val =
-          left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
-      dst[x] = RightShiftWithRounding(val, 5);
-      dst += stride;
-      left_base_y += base_step;
-    } while (++y < height);
-
-    left_y += ystep;
-  } while (++x < width);
-}
-
-//------------------------------------------------------------------------------
+// -----------------------------------------------------------------------------
 
 template <typename Pixel>
 struct IntraPredDefs {
@@ -718,15 +314,7 @@
   dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorHorizontal] = \
       DEFS::_##W##x##H::Horizontal;                                           \
   dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorPaeth] =      \
-      DEFS::_##W##x##H::Paeth;                                                \
-  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] =     \
-      DEFS::_##W##x##H::Smooth;                                               \
-  dsp->intra_predictors[kTransformSize##W##x##H]                              \
-                       [kIntraPredictorSmoothVertical] =                      \
-      DEFS::_##W##x##H::SmoothVertical;                                       \
-  dsp->intra_predictors[kTransformSize##W##x##H]                              \
-                       [kIntraPredictorSmoothHorizontal] =                    \
-      DEFS::_##W##x##H::SmoothHorizontal
+      DEFS::_##W##x##H::Paeth
 
 #define INIT_INTRAPREDICTORS(DEFS, DEFSBPP)        \
   INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 4);   \
@@ -749,45 +337,11 @@
   INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 32); \
   INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 64)
 
-#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL)             \
-  dsp->cfl_intra_predictors[kTransformSize##W##x##H] =                 \
-      CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>;                      \
-  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \
-      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>;                    \
-  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \
-      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>;                    \
-  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \
-      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1>
-
-#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL)       \
-  INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL);   \
-  INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL);   \
-  INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL);  \
-  INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL);   \
-  INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL);   \
-  INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL);  \
-  INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL);  \
-  INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL);  \
-  INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL);  \
-  INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \
-  INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \
-  INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL);  \
-  INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \
-  INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL)
-
 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
   assert(dsp != nullptr);
 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
   INIT_INTRAPREDICTORS(Defs, Defs8bpp);
-  dsp->directional_intra_predictor_zone1 =
-      DirectionalIntraPredictorZone1_C<uint8_t>;
-  dsp->directional_intra_predictor_zone2 =
-      DirectionalIntraPredictorZone2_C<uint8_t>;
-  dsp->directional_intra_predictor_zone3 =
-      DirectionalIntraPredictorZone3_C<uint8_t>;
-  dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
-  INIT_CFL_INTRAPREDICTORS(8, uint8_t);
 #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
@@ -816,19 +370,6 @@
   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
       Defs::_4x4::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
-      Defs::_4x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
-      Defs::_4x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
-      Defs::_4x4::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
       Defs8bpp::_4x8::DcFill;
@@ -856,19 +397,6 @@
   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
       Defs::_4x8::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
-      Defs::_4x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
-      Defs::_4x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
-      Defs::_4x8::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
       Defs8bpp::_4x16::DcFill;
@@ -897,19 +425,6 @@
   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
       Defs::_4x16::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
-      Defs::_4x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
-      Defs::_4x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
-      Defs::_4x16::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
       Defs8bpp::_8x4::DcFill;
@@ -937,19 +452,6 @@
   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
       Defs::_8x4::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
-      Defs::_8x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
-      Defs::_8x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
-      Defs::_8x4::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
       Defs8bpp::_8x8::DcFill;
@@ -977,19 +479,6 @@
   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
       Defs::_8x8::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
-      Defs::_8x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
-      Defs::_8x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
-      Defs::_8x8::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
       Defs8bpp::_8x16::DcFill;
@@ -1018,19 +507,6 @@
   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
       Defs::_8x16::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
-      Defs::_8x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
-      Defs::_8x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
-      Defs::_8x16::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
       Defs8bpp::_8x32::DcFill;
@@ -1059,19 +535,6 @@
   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
       Defs::_8x32::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
-      Defs::_8x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
-      Defs::_8x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
-      Defs::_8x32::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
       Defs8bpp::_16x4::DcFill;
@@ -1100,19 +563,6 @@
   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
       Defs::_16x4::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
-      Defs::_16x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
-      Defs::_16x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
-      Defs::_16x4::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
       Defs8bpp::_16x8::DcFill;
@@ -1141,19 +591,6 @@
   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
       Defs::_16x8::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
-      Defs::_16x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
-      Defs::_16x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
-      Defs::_16x8::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
       Defs8bpp::_16x16::DcFill;
@@ -1182,19 +619,6 @@
   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
       Defs::_16x16::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
-      Defs::_16x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
-      Defs::_16x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
-      Defs::_16x16::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
       Defs8bpp::_16x32::DcFill;
@@ -1223,19 +647,6 @@
   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
       Defs::_16x32::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
-      Defs::_16x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
-      Defs::_16x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
-      Defs::_16x32::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
       Defs8bpp::_16x64::DcFill;
@@ -1264,19 +675,6 @@
   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
       Defs::_16x64::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
-      Defs::_16x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
-      Defs::_16x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
-      Defs::_16x64::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
       Defs8bpp::_32x8::DcFill;
@@ -1305,19 +703,6 @@
   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
       Defs::_32x8::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
-      Defs::_32x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
-      Defs::_32x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
-      Defs::_32x8::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
       Defs8bpp::_32x16::DcFill;
@@ -1346,19 +731,6 @@
   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
       Defs::_32x16::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
-      Defs::_32x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
-      Defs::_32x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
-      Defs::_32x16::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
       Defs8bpp::_32x32::DcFill;
@@ -1387,19 +759,6 @@
   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
       Defs::_32x32::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
-      Defs::_32x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
-      Defs::_32x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
-      Defs::_32x32::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
       Defs8bpp::_32x64::DcFill;
@@ -1428,19 +787,6 @@
   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
       Defs::_32x64::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
-      Defs::_32x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
-      Defs::_32x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
-      Defs::_32x64::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
       Defs8bpp::_64x16::DcFill;
@@ -1469,19 +815,6 @@
   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
       Defs::_64x16::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
-      Defs::_64x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
-      Defs::_64x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
-      Defs::_64x16::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
       Defs8bpp::_64x32::DcFill;
@@ -1510,19 +843,6 @@
   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
       Defs::_64x32::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
-      Defs::_64x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
-      Defs::_64x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
-      Defs::_64x32::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
       Defs8bpp::_64x64::DcFill;
@@ -1551,282 +871,7 @@
   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
       Defs::_64x64::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
-      Defs::_64x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
-      Defs::_64x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
-      Defs::_64x64::SmoothHorizontal;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
-  dsp->directional_intra_predictor_zone1 =
-      DirectionalIntraPredictorZone1_C<uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
-  dsp->directional_intra_predictor_zone2 =
-      DirectionalIntraPredictorZone2_C<uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
-  dsp->directional_intra_predictor_zone3 =
-      DirectionalIntraPredictorZone3_C<uint8_t>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
-  dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize4x4] =
-      CflIntraPredictor_C<4, 4, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
-      CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
-      CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
-      CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize4x8] =
-      CflIntraPredictor_C<4, 8, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
-      CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
-      CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
-      CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize4x16] =
-      CflIntraPredictor_C<4, 16, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
-      CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
-      CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
-      CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize8x4] =
-      CflIntraPredictor_C<8, 4, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
-      CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
-      CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
-      CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize8x8] =
-      CflIntraPredictor_C<8, 8, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
-      CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
-      CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
-      CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize8x16] =
-      CflIntraPredictor_C<8, 16, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
-      CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
-      CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
-      CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize8x32] =
-      CflIntraPredictor_C<8, 32, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
-      CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
-      CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
-      CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize16x4] =
-      CflIntraPredictor_C<16, 4, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
-      CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
-      CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
-      CflSubsampler_C<16, 4, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize16x8] =
-      CflIntraPredictor_C<16, 8, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
-      CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
-      CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
-      CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize16x16] =
-      CflIntraPredictor_C<16, 16, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
-      CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
-      CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
-      CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize16x32] =
-      CflIntraPredictor_C<16, 32, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
-      CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
-      CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
-      CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize32x8] =
-      CflIntraPredictor_C<32, 8, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
-      CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
-      CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
-      CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize32x16] =
-      CflIntraPredictor_C<32, 16, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
-      CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
-      CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
-      CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize32x32] =
-      CflIntraPredictor_C<32, 32, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
-      CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
-      CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
-      CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>;
-#endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
-  // Cfl predictors are available only for transform sizes with max(width,
-  // height) <= 32. Set all others to nullptr.
-  for (const auto i : kTransformSizesLargerThan32x32) {
-    dsp->cfl_intra_predictors[i] = nullptr;
-    for (int j = 0; j < kNumSubsamplingTypes; ++j) {
-      dsp->cfl_subsamplers[i][j] = nullptr;
-    }
-  }
 }  // NOLINT(readability/fn_size)
 
 #if LIBGAV1_MAX_BITDEPTH >= 10
@@ -1838,14 +883,6 @@
   assert(dsp != nullptr);
 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
   INIT_INTRAPREDICTORS(DefsHbd, Defs10bpp);
-  dsp->directional_intra_predictor_zone1 =
-      DirectionalIntraPredictorZone1_C<uint16_t>;
-  dsp->directional_intra_predictor_zone2 =
-      DirectionalIntraPredictorZone2_C<uint16_t>;
-  dsp->directional_intra_predictor_zone3 =
-      DirectionalIntraPredictorZone3_C<uint16_t>;
-  dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
-  INIT_CFL_INTRAPREDICTORS(10, uint16_t);
 #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 #ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
@@ -1875,19 +912,6 @@
   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
       DefsHbd::_4x4::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
-      DefsHbd::_4x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
-      DefsHbd::_4x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_4x4::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
       Defs10bpp::_4x8::DcFill;
@@ -1916,19 +940,6 @@
   dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
       DefsHbd::_4x8::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
-      DefsHbd::_4x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
-      DefsHbd::_4x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_4x8::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
       Defs10bpp::_4x16::DcFill;
@@ -1957,19 +968,6 @@
   dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
       DefsHbd::_4x16::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
-      DefsHbd::_4x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
-      DefsHbd::_4x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_4x16::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
       Defs10bpp::_8x4::DcFill;
@@ -1998,19 +996,6 @@
   dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
       DefsHbd::_8x4::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
-      DefsHbd::_8x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
-      DefsHbd::_8x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_8x4::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
       Defs10bpp::_8x8::DcFill;
@@ -2039,19 +1024,6 @@
   dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
       DefsHbd::_8x8::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
-      DefsHbd::_8x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
-      DefsHbd::_8x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_8x8::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
       Defs10bpp::_8x16::DcFill;
@@ -2080,19 +1052,6 @@
   dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
       DefsHbd::_8x16::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
-      DefsHbd::_8x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
-      DefsHbd::_8x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_8x16::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
       Defs10bpp::_8x32::DcFill;
@@ -2121,19 +1080,6 @@
   dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
       DefsHbd::_8x32::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
-      DefsHbd::_8x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
-      DefsHbd::_8x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_8x32::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
       Defs10bpp::_16x4::DcFill;
@@ -2162,19 +1108,6 @@
   dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
       DefsHbd::_16x4::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
-      DefsHbd::_16x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
-      DefsHbd::_16x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_16x4::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
       Defs10bpp::_16x8::DcFill;
@@ -2203,19 +1136,6 @@
   dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
       DefsHbd::_16x8::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
-      DefsHbd::_16x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
-      DefsHbd::_16x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_16x8::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
       Defs10bpp::_16x16::DcFill;
@@ -2244,19 +1164,6 @@
   dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
       DefsHbd::_16x16::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
-      DefsHbd::_16x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
-      DefsHbd::_16x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_16x16::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
       Defs10bpp::_16x32::DcFill;
@@ -2285,19 +1192,6 @@
   dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
       DefsHbd::_16x32::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
-      DefsHbd::_16x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
-      DefsHbd::_16x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_16x32::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
       Defs10bpp::_16x64::DcFill;
@@ -2326,19 +1220,6 @@
   dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
       DefsHbd::_16x64::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
-      DefsHbd::_16x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
-      DefsHbd::_16x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_16x64::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
       Defs10bpp::_32x8::DcFill;
@@ -2367,19 +1248,6 @@
   dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
       DefsHbd::_32x8::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
-      DefsHbd::_32x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
-      DefsHbd::_32x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_32x8::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
       Defs10bpp::_32x16::DcFill;
@@ -2408,19 +1276,6 @@
   dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
       DefsHbd::_32x16::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
-      DefsHbd::_32x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
-      DefsHbd::_32x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_32x16::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
       Defs10bpp::_32x32::DcFill;
@@ -2449,19 +1304,6 @@
   dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
       DefsHbd::_32x32::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
-      DefsHbd::_32x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
-      DefsHbd::_32x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_32x32::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
       Defs10bpp::_32x64::DcFill;
@@ -2490,19 +1332,6 @@
   dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
       DefsHbd::_32x64::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
-      DefsHbd::_32x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
-      DefsHbd::_32x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_32x64::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
       Defs10bpp::_64x16::DcFill;
@@ -2531,19 +1360,6 @@
   dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
       DefsHbd::_64x16::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
-      DefsHbd::_64x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
-      DefsHbd::_64x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_64x16::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
       Defs10bpp::_64x32::DcFill;
@@ -2572,19 +1388,6 @@
   dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
       DefsHbd::_64x32::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
-      DefsHbd::_64x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
-      DefsHbd::_64x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_64x32::SmoothHorizontal;
-#endif
-
 #ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcFill
   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
       Defs10bpp::_64x64::DcFill;
@@ -2613,291 +1416,12 @@
   dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
       DefsHbd::_64x64::Paeth;
 #endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth
-  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
-      DefsHbd::_64x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical
-  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
-      DefsHbd::_64x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
-  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
-      DefsHbd::_64x64::SmoothHorizontal;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
-  dsp->directional_intra_predictor_zone1 =
-      DirectionalIntraPredictorZone1_C<uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2
-  dsp->directional_intra_predictor_zone2 =
-      DirectionalIntraPredictorZone2_C<uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
-  dsp->directional_intra_predictor_zone3 =
-      DirectionalIntraPredictorZone3_C<uint16_t>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor
-  dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize4x4] =
-      CflIntraPredictor_C<4, 4, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
-      CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
-      CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
-      CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize4x8] =
-      CflIntraPredictor_C<4, 8, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
-      CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
-      CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
-      CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize4x16] =
-      CflIntraPredictor_C<4, 16, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
-      CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
-      CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
-      CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize8x4] =
-      CflIntraPredictor_C<8, 4, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
-      CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
-      CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
-      CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize8x8] =
-      CflIntraPredictor_C<8, 8, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
-      CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
-      CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
-      CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize8x16] =
-      CflIntraPredictor_C<8, 16, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
-      CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
-      CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
-      CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize8x32] =
-      CflIntraPredictor_C<8, 32, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
-      CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
-      CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
-      CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize16x4] =
-      CflIntraPredictor_C<16, 4, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
-      CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
-      CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
-      CflSubsampler_C<16, 4, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize16x8] =
-      CflIntraPredictor_C<16, 8, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
-      CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
-      CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
-      CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize16x16] =
-      CflIntraPredictor_C<16, 16, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
-      CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
-      CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
-      CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize16x32] =
-      CflIntraPredictor_C<16, 32, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
-      CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
-      CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
-      CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize32x8] =
-      CflIntraPredictor_C<32, 8, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
-      CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
-      CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
-      CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize32x16] =
-      CflIntraPredictor_C<32, 16, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
-      CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
-      CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
-      CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
-  dsp->cfl_intra_predictors[kTransformSize32x32] =
-      CflIntraPredictor_C<32, 32, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
-  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
-      CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422
-  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
-      CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
-  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
-      CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>;
-#endif
-
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
-  // Cfl predictors are available only for transform sizes with max(width,
-  // height) <= 32. Set all others to nullptr.
-  for (const auto i : kTransformSizesLargerThan32x32) {
-    dsp->cfl_intra_predictors[i] = nullptr;
-    for (int j = 0; j < kNumSubsamplingTypes; ++j) {
-      dsp->cfl_subsamplers[i][j] = nullptr;
-    }
-  }
 }  // NOLINT(readability/fn_size)
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
-#undef INIT_CFL_INTRAPREDICTOR_WxH
-#undef INIT_CFL_INTRAPREDICTORS
 #undef INIT_INTRAPREDICTORS_WxH
 #undef INIT_INTRAPREDICTORS
-
 }  // namespace
 
 void IntraPredInit_C() {
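
The hunks above remove the smooth, directional, filter and CfL setup from intrapred.cc, so IntraPredInit_C now fills only the plain intra-predictor entries. The pattern is the same everywhere in these Init functions: each slot of the writable Dsp table gets a C fallback unless a per-entry LIBGAV1_Dsp{8,10}bpp_* macro indicates a specialized version will provide it. A minimal, self-contained sketch of that pattern follows; every name in it is illustrative, not a libgav1 symbol.

#include <cassert>
#include <cstddef>  // std::ptrdiff_t
#include <cstdint>

using PredictorFn = void (*)(void* dest, std::ptrdiff_t stride);

struct DspTable {
  PredictorFn intra_predictors[2];  // e.g. {Dc, Paeth}
};

template <int bitdepth>
void PredictorDc_C(void* /*dest*/, std::ptrdiff_t /*stride*/) {}
template <int bitdepth>
void PredictorPaeth_C(void* /*dest*/, std::ptrdiff_t /*stride*/) {}

DspTable* GetWritableTable(int bitdepth) {
  static DspTable tables[2];  // one table per supported bitdepth
  return &tables[(bitdepth == 8) ? 0 : 1];
}

void Init8bpp_Sketch() {
  DspTable* const dsp = GetWritableTable(8);
  assert(dsp != nullptr);
  // When a specialized (e.g. SIMD) version exists, its header defines the
  // per-entry macro and the C init skips that slot; the specialized init
  // fills it instead.
#ifndef EXAMPLE_Dsp8bpp_IntraPredictorDc
  dsp->intra_predictors[0] = PredictorDc_C<8>;
#endif
#ifndef EXAMPLE_Dsp8bpp_IntraPredictorPaeth
  dsp->intra_predictors[1] = PredictorPaeth_C<8>;
#endif
}
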
diff --git a/libgav1/src/dsp/intrapred.h b/libgav1/src/dsp/intrapred.h
index c5286ef..2cb625d 100644
--- a/libgav1/src/dsp/intrapred.h
+++ b/libgav1/src/dsp/intrapred.h
@@ -38,9 +38,7 @@
 namespace libgav1 {
 namespace dsp {
 
-// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
-// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
-// Dsp::filter_intra_predictor. This function is not thread-safe.
+// Initializes Dsp::intra_predictors. This function is not thread-safe.
 void IntraPredInit_C();
 
 }  // namespace dsp
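
The new intrapred_cfl.cc below carries the chroma-from-luma path that was split out: CflSubsampler_C accumulates luma at the chosen subsampling, scales it by << (3 - subsampling_x - subsampling_y), and subtracts the block average, and CflIntraPredictor_C then adds alpha-weighted AC luma to the DC value with rounding and clipping. The toy worked example below traces that last step with illustrative values and a stand-in for RightShiftWithRoundingSigned; none of it is libgav1 code.

#include <algorithm>
#include <cstdio>

// Stand-in for the rounded signed right shift used by the predictor.
int RoundShiftSigned(int value, int bits) {
  const int rounding = 1 << (bits - 1);
  return (value >= 0) ? (value + rounding) >> bits
                      : -((-value + rounding) >> bits);
}

int main() {
  const int bitdepth = 8;
  const int max_value = (1 << bitdepth) - 1;
  const int dc = 128;      // DC prediction already written to dst[0]
  const int alpha = 8;     // signed CfL alpha, in [-16, 16]
  const int luma_ac = 40;  // mean-removed, <<3-scaled luma sample
  const int chroma =
      std::clamp(dc + RoundShiftSigned(alpha * luma_ac, 6), 0, max_value);
  std::printf("predicted chroma = %d\n", chroma);  // 128 + 5 = 133
  return 0;
}
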
diff --git a/libgav1/src/dsp/intrapred_cfl.cc b/libgav1/src/dsp/intrapred_cfl.cc
new file mode 100644
index 0000000..948c0c0
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_cfl.cc
@@ -0,0 +1,654 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_cfl.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr TransformSize kTransformSizesLargerThan32x32[] = {
+    kTransformSize16x64, kTransformSize32x64, kTransformSize64x16,
+    kTransformSize64x32, kTransformSize64x64};
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_C
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void CflIntraPredictor_C(
+    void* const dest, ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  auto* dst = static_cast<Pixel*>(dest);
+  const int dc = dst[0];
+  stride /= sizeof(Pixel);
+  const int max_value = (1 << bitdepth) - 1;
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3));
+      assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3);
+      dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6),
+                     0, max_value);
+    }
+    dst += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// CflSubsampler_C
+
+template <int block_width, int block_height, int bitdepth, typename Pixel,
+          int subsampling_x, int subsampling_y>
+void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+                     const int max_luma_width, const int max_luma_height,
+                     const void* const source, ptrdiff_t stride) {
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  const auto* src = static_cast<const Pixel*>(source);
+  stride /= sizeof(Pixel);
+  int sum = 0;
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      const ptrdiff_t luma_x =
+          std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x));
+      const ptrdiff_t luma_x_next = luma_x + stride;
+      luma[y][x] =
+          (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) +
+           ((subsampling_y != 0) ? (src[luma_x_next] + src[luma_x_next + 1])
+                                 : 0))
+          << (3 - subsampling_x - subsampling_y);
+      sum += luma[y][x];
+    }
+    if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) {
+      src += stride << subsampling_y;
+    }
+  }
+  const int average = RightShiftWithRounding(
+      sum, FloorLog2(block_width) + FloorLog2(block_height));
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      luma[y][x] -= average;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+
+// Initializes dsp entries for kTransformSize|W|x|H|.
+#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL)             \
+  dsp->cfl_intra_predictors[kTransformSize##W##x##H] =                 \
+      CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>;                      \
+  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \
+      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>;                    \
+  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \
+      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>;                    \
+  dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \
+      CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1>
+
+#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL)       \
+  INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL);   \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \
+  INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \
+  INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL);  \
+  INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \
+  INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL)
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_CFL_INTRAPREDICTORS(8, uint8_t);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x4] =
+      CflIntraPredictor_C<4, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+      CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x8] =
+      CflIntraPredictor_C<4, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+      CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x16] =
+      CflIntraPredictor_C<4, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+      CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x4] =
+      CflIntraPredictor_C<8, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+      CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x8] =
+      CflIntraPredictor_C<8, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+      CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x16] =
+      CflIntraPredictor_C<8, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+      CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x32] =
+      CflIntraPredictor_C<8, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+      CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x4] =
+      CflIntraPredictor_C<16, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+      CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler_C<16, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x8] =
+      CflIntraPredictor_C<16, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+      CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor_C<16, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+      CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor_C<16, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+      CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x8] =
+      CflIntraPredictor_C<32, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+      CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor_C<32, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+      CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor_C<32, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+      CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  // Cfl predictors are available only for transform sizes with max(width,
+  // height) <= 32. Set all others to nullptr.
+  for (const auto i : kTransformSizesLargerThan32x32) {
+    dsp->cfl_intra_predictors[i] = nullptr;
+    for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+      dsp->cfl_subsamplers[i][j] = nullptr;
+    }
+  }
+}  // NOLINT(readability/fn_size)
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_CFL_INTRAPREDICTORS(10, uint16_t);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x4] =
+      CflIntraPredictor_C<4, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+      CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x8] =
+      CflIntraPredictor_C<4, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+      CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize4x16] =
+      CflIntraPredictor_C<4, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+      CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x4] =
+      CflIntraPredictor_C<8, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+      CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x8] =
+      CflIntraPredictor_C<8, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+      CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x16] =
+      CflIntraPredictor_C<8, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+      CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize8x32] =
+      CflIntraPredictor_C<8, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+      CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x4] =
+      CflIntraPredictor_C<16, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+      CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler_C<16, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x8] =
+      CflIntraPredictor_C<16, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+      CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor_C<16, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+      CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor_C<16, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+      CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x8] =
+      CflIntraPredictor_C<32, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+      CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor_C<32, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+      CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor_C<32, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+      CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  // Cfl predictors are available only for transform sizes with max(width,
+  // height) <= 32. Set all others to nullptr.
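+  // (In practice these are the sizes with a 64-pixel dimension, e.g. 64x64,
+  // for which CfL prediction is not used.)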
+  for (const auto i : kTransformSizesLargerThan32x32) {
+    dsp->cfl_intra_predictors[i] = nullptr;
+    for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+      dsp->cfl_subsamplers[i][j] = nullptr;
+    }
+  }
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#undef INIT_CFL_INTRAPREDICTOR_WxH
+#undef INIT_CFL_INTRAPREDICTORS
+
+}  // namespace
+
+void IntraPredCflInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/libgav1/src/dsp/intrapred_cfl.h b/libgav1/src/dsp/intrapred_cfl.h
new file mode 100644
index 0000000..4e8a11f
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_cfl.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
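+// For example, if one of the headers below defines
+// LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor, the matching #ifndef
+// block in intrapred_cfl.cc is skipped and the C fallback is not registered
+// for that entry; the specialized initializer installs its version instead.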
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_cfl_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order, avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_cfl_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers.
+// This function is not thread-safe.
+void IntraPredCflInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
diff --git a/libgav1/src/dsp/intrapred_directional.cc b/libgav1/src/dsp/intrapred_directional.cc
new file mode 100644
index 0000000..e670769
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_directional.cc
@@ -0,0 +1,252 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
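+//
+// Zone 1 (prediction angle < 90 degrees) predicts from the top row only,
+// zone 2 (angles between 90 and 180 degrees) from both the top row and the
+// left column, and zone 3 (angles above 180 degrees) from the left column
+// only; the three functions below correspond to these zones.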
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride,
+                                      const void* const top_row,
+                                      const int width, const int height,
+                                      const int xstep,
+                                      const bool upsampled_top) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  assert(xstep > 0);
+
+  // If xstep == 64 then |shift| always evaluates to 0 which sets |val| to
+  // |top[top_base_x]|. This corresponds to a 45 degree prediction.
+  if (xstep == 64) {
+    // 7.11.2.10. Intra edge upsample selection process
+    // if ( d <= 0 || d >= 40 ) useUpsample = 0
+    // For |upsampled_top| the delta is |predictor_angle - 90|. Since the
+    // |predictor_angle| is 45 the delta is also 45.
+    assert(!upsampled_top);
+    const Pixel* top_ptr = top + 1;
+    for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) {
+      memcpy(dst, top_ptr, sizeof(*top_ptr) * width);
+    }
+    return;
+  }
+
+  const int upsample_shift = static_cast<int>(upsampled_top);
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+  const int scale_bits = 6 - upsample_shift;
+  const int base_step = 1 << upsample_shift;
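+  // The loop below performs the spec's fixed-point interpolation: each output
+  // pixel blends two adjacent top neighbors with weights that sum to 32. For
+  // illustration (example values only): with xstep = 48 and no upsampling,
+  // row y = 0 has top_x = 48, top_base_x = 48 >> 6 = 0 and
+  // shift = (48 & 0x3F) >> 1 = 24, giving
+  //   val = top[0] * (32 - 24) + top[1] * 24
+  // and dst[0] = RightShiftWithRounding(val, 5), an 8:24 blend of the two
+  // neighbors.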
+  int top_x = xstep;
+  int y = 0;
+  do {
+    int top_base_x = top_x >> scale_bits;
+
+    if (top_base_x >= max_base_x) {
+      for (int i = y; i < height; ++i) {
+        Memset(dst, top[max_base_x], width);
+        dst += stride;
+      }
+      return;
+    }
+
+    const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+    int x = 0;
+    do {
+      if (top_base_x >= max_base_x) {
+        Memset(dst + x, top[max_base_x], width - x);
+        break;
+      }
+
+      const int val =
+          top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+      dst[x] = RightShiftWithRounding(val, 5 /*log2(32)*/);
+      top_base_x += base_step;
+    } while (++x < width);
+
+    dst += stride;
+    top_x += xstep;
+  } while (++y < height);
+}
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride,
+                                      const void* const top_row,
+                                      const void* const left_column,
+                                      const int width, const int height,
+                                      const int xstep, const int ystep,
+                                      const bool upsampled_top,
+                                      const bool upsampled_left) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  assert(xstep > 0);
+  assert(ystep > 0);
+
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int scale_bits_x = 6 - upsample_top_shift;
+  const int scale_bits_y = 6 - upsample_left_shift;
+  const int min_base_x = -(1 << upsample_top_shift);
+  const int base_step_x = 1 << upsample_top_shift;
+  int y = 0;
+  int top_x = -xstep;
+  do {
+    int top_base_x = top_x >> scale_bits_x;
+    int left_y = (y << 6) - ystep;
+    int x = 0;
+    do {
+      int val;
+      if (top_base_x >= min_base_x) {
+        const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1;
+        val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+      } else {
+        // Note this assumes an arithmetic shift to handle negative values.
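+        // For illustration (example value only): with |scale_bits_y| == 6,
+        // left_y = -32 yields left_base_y = -32 >> 6 = -1 (floor), whereas a
+        // truncating division would give 0 and select the wrong neighbor.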
+        const int left_base_y = left_y >> scale_bits_y;
+        const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1;
+        assert(left_base_y >= -(1 << upsample_left_shift));
+        val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+      }
+      dst[x] = RightShiftWithRounding(val, 5);
+      top_base_x += base_step_x;
+      left_y -= ystep;
+    } while (++x < width);
+
+    top_x -= xstep;
+    dst += stride;
+  } while (++y < height);
+}
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone3_C(void* const dest, ptrdiff_t stride,
+                                      const void* const left_column,
+                                      const int width, const int height,
+                                      const int ystep,
+                                      const bool upsampled_left) {
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  stride /= sizeof(Pixel);
+
+  assert(ystep > 0);
+
+  const int upsample_shift = static_cast<int>(upsampled_left);
+  const int scale_bits = 6 - upsample_shift;
+  const int base_step = 1 << upsample_shift;
+  // Zone3 never runs out of left_column values.
+  assert((width + height - 1) << upsample_shift >  // max_base_y
+         ((ystep * width) >> scale_bits) +
+             base_step * (height - 1));  // left_base_y
+
+  int left_y = ystep;
+  int x = 0;
+  do {
+    auto* dst = static_cast<Pixel*>(dest);
+
+    int left_base_y = left_y >> scale_bits;
+    int y = 0;
+    do {
+      const int shift = ((left_y << upsample_shift) & 0x3F) >> 1;
+      const int val =
+          left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+      dst[x] = RightShiftWithRounding(val, 5);
+      dst += stride;
+      left_base_y += base_step;
+    } while (++y < height);
+
+    left_y += ystep;
+  } while (++x < width);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint8_t>;
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint8_t>;
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint16_t>;
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint16_t>;
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+
+void IntraPredDirectionalInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/libgav1/src/dsp/intrapred_directional.h b/libgav1/src/dsp/intrapred_directional.h
new file mode 100644
index 0000000..bcd1bc1
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_directional.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_directional_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order, avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_directional_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*. This function is not
+// thread-safe.
+void IntraPredDirectionalInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
diff --git a/libgav1/src/dsp/intrapred_filter.cc b/libgav1/src/dsp/intrapred_filter.cc
new file mode 100644
index 0000000..f4bd296
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_filter.cc
@@ -0,0 +1,144 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_C
+
+// The recursive filter applies a different filter to the top 4 and 2 left
+// pixels to produce each pixel in a 4x2 sub-block. Each successive 4x2 uses the
+// prediction output of the blocks above and to the left, unless they are
+// adjacent to the |top_row| or |left_column|. The set of 8 filters is selected
+// according to |pred|.
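+//
+// As an illustrative layout (symbolic values, not from the source), the seven
+// inputs for one 4x2 sub-block are
+//   p0 p1 p2 p3 p4
+//   p5  .  .  .  .
+//   p6  .  .  .  .
+// where p0 is the top-left corner, p1..p4 are the row above, and p5/p6 are
+// the two pixels to the left; each of the eight '.' outputs is a separate
+// 7-tap combination taken from kFilterIntraTaps[pred].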
+template <int bitdepth, typename Pixel>
+void FilterIntraPredictor_C(void* const dest, ptrdiff_t stride,
+                            const void* const top_row,
+                            const void* const left_column,
+                            const FilterIntraPredictor pred, const int width,
+                            const int height) {
+  const int kMaxPixel = (1 << bitdepth) - 1;
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+
+  assert(width <= 32 && height <= 32);
+
+  Pixel buffer[3][33];  // cache 2 rows + top & left boundaries
+  memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0]));
+
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+  int row0 = 0, row2 = 2;
+  int ystep = 1;
+  int y = 0;
+  do {
+    buffer[1][0] = left[y];
+    buffer[row2][0] = left[y + 1];
+    int x = 1;
+    do {
+      const Pixel p0 = buffer[row0][x - 1];  // top-left
+      const Pixel p1 = buffer[row0][x + 0];  // top 0
+      const Pixel p2 = buffer[row0][x + 1];  // top 1
+      const Pixel p3 = buffer[row0][x + 2];  // top 2
+      const Pixel p4 = buffer[row0][x + 3];  // top 3
+      const Pixel p5 = buffer[1][x - 1];     // left 0
+      const Pixel p6 = buffer[row2][x - 1];  // left 1
+      for (int i = 0; i < 8; ++i) {
+        const int xoffset = i & 0x03;
+        const int yoffset = (i >> 2) * ystep;
+        const int value = kFilterIntraTaps[pred][i][0] * p0 +
+                          kFilterIntraTaps[pred][i][1] * p1 +
+                          kFilterIntraTaps[pred][i][2] * p2 +
+                          kFilterIntraTaps[pred][i][3] * p3 +
+                          kFilterIntraTaps[pred][i][4] * p4 +
+                          kFilterIntraTaps[pred][i][5] * p5 +
+                          kFilterIntraTaps[pred][i][6] * p6;
+        // Section 7.11.2.3 specifies the right-hand side of the assignment as
+        //   Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ).
+        // Since Clip1() clips a negative value to 0, it is safe to replace
+        // Round2Signed() with Round2().
+        buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>(
+            Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel));
+      }
+      x += 4;
+    } while (x < width);
+    memcpy(dst, &buffer[1][1], width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, &buffer[row2][1], width * sizeof(dst[0]));
+    dst += stride;
+
+    // The final row becomes the top for the next pass.
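+    // buffer[1] always holds the first of the two rows just predicted, while
+    // buffer[0] and buffer[2] alternate between "previous bottom row (the new
+    // top)" and "second new row"; negating |ystep| steers the i >= 4 filter
+    // outputs into the correct row after the swap.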
+    row0 ^= 2;
+    row2 ^= 2;
+    ystep = -ystep;
+    y += 2;
+  } while (y < height);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor
+  dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace
+
+void IntraPredFilterInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/libgav1/src/dsp/intrapred_filter.h b/libgav1/src/dsp/intrapred_filter.h
new file mode 100644
index 0000000..8146b82
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_filter.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_filter_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order, avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_filter_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor.
+// This function is not thread-safe.
+void IntraPredFilterInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
diff --git a/libgav1/src/dsp/intrapred_smooth.cc b/libgav1/src/dsp/intrapred_smooth.cc
new file mode 100644
index 0000000..83c005e
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_smooth.cc
@@ -0,0 +1,738 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_smooth.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int block_width, int block_height, typename Pixel>
+struct SmoothFuncs_C {
+  SmoothFuncs_C() = delete;
+
+  static void Smooth(void* dest, ptrdiff_t stride, const void* top_row,
+                     const void* left_column);
+  static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row,
+                             const void* left_column);
+  static void SmoothHorizontal(void* dest, ptrdiff_t stride,
+                               const void* top_row, const void* left_column);
+};
+
+constexpr uint8_t kSmoothWeights[] = {
+    // block dimension = 4
+    255, 149, 85, 64,
+    // block dimension = 8
+    255, 197, 146, 105, 73, 50, 37, 32,
+    // block dimension = 16
+    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+    // block dimension = 32
+    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+    // block dimension = 64
+    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+    15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
+
+// SmoothFuncs_C::Smooth
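+//
+// Each output pixel blends top[x], left[y] and the two far corners
+// (bottom_left, top_right) using weights from kSmoothWeights, scaled by
+// 1 << kSmoothWeightScale. For illustration (example position only): at
+// x = y = 0 in a 4x4 block, weights_x[0] = weights_y[0] = 255, so
+//   pred = 255 * top[0] + 255 * left[0] + 1 * bottom_left + 1 * top_right
+// and RightShiftWithRounding(pred, kSmoothWeightScale + 1) is roughly the
+// average of top[0] and left[0].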
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::Smooth(
+    void* const dest, ptrdiff_t stride, const void* const top_row,
+    const void* const left_column) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  const Pixel top_right = top[block_width - 1];
+  const Pixel bottom_left = left[block_height - 1];
+  static_assert(
+      block_width >= 4 && block_height >= 4,
+      "Weights for smooth predictor undefined for block width/height < 4");
+  const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+  const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+  const uint16_t scale_value = (1 << kSmoothWeightScale);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]);
+      uint32_t pred = weights_y[y] * top[x];
+      pred += weights_x[x] * left[y];
+      pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+      pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+      // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1)
+      // + 256. With the descale there's no need for saturation.
+      dst[x] = static_cast<Pixel>(
+          RightShiftWithRounding(pred, kSmoothWeightScale + 1));
+    }
+    dst += stride;
+  }
+}
+
+// SmoothFuncs_C::SmoothVertical
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothVertical(
+    void* const dest, ptrdiff_t stride, const void* const top_row,
+    const void* const left_column) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  const Pixel bottom_left = left[block_height - 1];
+  static_assert(block_height >= 4,
+                "Weights for smooth predictor undefined for block height < 4");
+  const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+  const uint16_t scale_value = (1 << kSmoothWeightScale);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      assert(scale_value >= weights_y[y]);
+      uint32_t pred = weights_y[y] * top[x];
+      pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+      dst[x] =
+          static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+    }
+    dst += stride;
+  }
+}
+
+// SmoothFuncs_C::SmoothHorizontal
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal(
+    void* const dest, ptrdiff_t stride, const void* const top_row,
+    const void* const left_column) {
+  const auto* const top = static_cast<const Pixel*>(top_row);
+  const auto* const left = static_cast<const Pixel*>(left_column);
+  const Pixel top_right = top[block_width - 1];
+  static_assert(block_width >= 4,
+                "Weights for smooth predictor undefined for block width < 4");
+  const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+  const uint16_t scale_value = (1 << kSmoothWeightScale);
+  auto* dst = static_cast<Pixel*>(dest);
+  stride /= sizeof(Pixel);
+
+  for (int y = 0; y < block_height; ++y) {
+    for (int x = 0; x < block_width; ++x) {
+      assert(scale_value >= weights_x[x]);
+      uint32_t pred = weights_x[x] * left[y];
+      pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+      dst[x] =
+          static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+    }
+    dst += stride;
+  }
+}
+
+// -----------------------------------------------------------------------------
+
+template <typename Pixel>
+struct SmoothDefs {
+  SmoothDefs() = delete;
+
+  using _4x4 = SmoothFuncs_C<4, 4, Pixel>;
+  using _4x8 = SmoothFuncs_C<4, 8, Pixel>;
+  using _4x16 = SmoothFuncs_C<4, 16, Pixel>;
+  using _8x4 = SmoothFuncs_C<8, 4, Pixel>;
+  using _8x8 = SmoothFuncs_C<8, 8, Pixel>;
+  using _8x16 = SmoothFuncs_C<8, 16, Pixel>;
+  using _8x32 = SmoothFuncs_C<8, 32, Pixel>;
+  using _16x4 = SmoothFuncs_C<16, 4, Pixel>;
+  using _16x8 = SmoothFuncs_C<16, 8, Pixel>;
+  using _16x16 = SmoothFuncs_C<16, 16, Pixel>;
+  using _16x32 = SmoothFuncs_C<16, 32, Pixel>;
+  using _16x64 = SmoothFuncs_C<16, 64, Pixel>;
+  using _32x8 = SmoothFuncs_C<32, 8, Pixel>;
+  using _32x16 = SmoothFuncs_C<32, 16, Pixel>;
+  using _32x32 = SmoothFuncs_C<32, 32, Pixel>;
+  using _32x64 = SmoothFuncs_C<32, 64, Pixel>;
+  using _64x16 = SmoothFuncs_C<64, 16, Pixel>;
+  using _64x32 = SmoothFuncs_C<64, 32, Pixel>;
+  using _64x64 = SmoothFuncs_C<64, 64, Pixel>;
+};
+
+using Defs = SmoothDefs<uint8_t>;
+
+// Initializes dsp entries for kTransformSize|W|x|H| from |DEFS| of
+// the same size.
+#define INIT_SMOOTH_WxH(DEFS, W, H)                                       \
+  dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] = \
+      DEFS::_##W##x##H::Smooth;                                           \
+  dsp->intra_predictors[kTransformSize##W##x##H]                          \
+                       [kIntraPredictorSmoothVertical] =                  \
+      DEFS::_##W##x##H::SmoothVertical;                                   \
+  dsp->intra_predictors[kTransformSize##W##x##H]                          \
+                       [kIntraPredictorSmoothHorizontal] =                \
+      DEFS::_##W##x##H::SmoothHorizontal
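+// For illustration, INIT_SMOOTH_WxH(Defs, 4, 4) expands to assignments of the
+// form
+//   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+//       Defs::_4x4::Smooth;
+// with matching entries for kIntraPredictorSmoothVertical and
+// kIntraPredictorSmoothHorizontal.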
+
+#define INIT_SMOOTH(DEFS)        \
+  INIT_SMOOTH_WxH(DEFS, 4, 4);   \
+  INIT_SMOOTH_WxH(DEFS, 4, 8);   \
+  INIT_SMOOTH_WxH(DEFS, 4, 16);  \
+  INIT_SMOOTH_WxH(DEFS, 8, 4);   \
+  INIT_SMOOTH_WxH(DEFS, 8, 8);   \
+  INIT_SMOOTH_WxH(DEFS, 8, 16);  \
+  INIT_SMOOTH_WxH(DEFS, 8, 32);  \
+  INIT_SMOOTH_WxH(DEFS, 16, 4);  \
+  INIT_SMOOTH_WxH(DEFS, 16, 8);  \
+  INIT_SMOOTH_WxH(DEFS, 16, 16); \
+  INIT_SMOOTH_WxH(DEFS, 16, 32); \
+  INIT_SMOOTH_WxH(DEFS, 16, 64); \
+  INIT_SMOOTH_WxH(DEFS, 32, 8);  \
+  INIT_SMOOTH_WxH(DEFS, 32, 16); \
+  INIT_SMOOTH_WxH(DEFS, 32, 32); \
+  INIT_SMOOTH_WxH(DEFS, 32, 64); \
+  INIT_SMOOTH_WxH(DEFS, 64, 16); \
+  INIT_SMOOTH_WxH(DEFS, 64, 32); \
+  INIT_SMOOTH_WxH(DEFS, 64, 64)
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_SMOOTH(Defs);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      Defs::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      Defs::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+      Defs::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+      Defs::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+      Defs::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+      Defs::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+      Defs::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+      Defs::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+      Defs::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+      Defs::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+      Defs::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+      Defs::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+      Defs::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+      Defs::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+      Defs::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+      Defs::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+      Defs::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+      Defs::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+      Defs::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+      Defs::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+      Defs::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+      Defs::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+      Defs::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+      Defs::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+      Defs::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+      Defs::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+      Defs::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+      Defs::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+      Defs::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+      Defs::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+      Defs::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+      Defs::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+      Defs::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+      Defs::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+      Defs::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+      Defs::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+      Defs::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+      Defs::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+      Defs::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+      Defs::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+      Defs::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+      Defs::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+      Defs::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+      Defs::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+      Defs::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+      Defs::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+      Defs::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+      Defs::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+      Defs::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+      Defs::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      Defs::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      Defs::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+      Defs::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      Defs::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      Defs::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+      Defs::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      Defs::_64x64::SmoothHorizontal;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}  // NOLINT(readability/fn_size)
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using DefsHbd = SmoothDefs<uint16_t>;
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  INIT_SMOOTH(DefsHbd);
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+      DefsHbd::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+      DefsHbd::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+      DefsHbd::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+      DefsHbd::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+      DefsHbd::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+      DefsHbd::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+      DefsHbd::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+      DefsHbd::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+      DefsHbd::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+      DefsHbd::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+      DefsHbd::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+      DefsHbd::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+      DefsHbd::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+      DefsHbd::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+      DefsHbd::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+      DefsHbd::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+      DefsHbd::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+      DefsHbd::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+      DefsHbd::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+      DefsHbd::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+  dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+      DefsHbd::_64x64::SmoothHorizontal;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}  // NOLINT(readability/fn_size)
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+#undef INIT_SMOOTH_WxH
+#undef INIT_SMOOTH
+}  // namespace
+
+void IntraPredSmoothInit_C() {
+  Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
diff --git a/libgav1/src/dsp/intrapred_smooth.h b/libgav1/src/dsp/intrapred_smooth.h
new file mode 100644
index 0000000..6802003
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_smooth.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_smooth_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order avx2/avx/sse4, etc.
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_smooth_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_C();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
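The header comment above describes the gating convention: a SIMD header claims a slot by defining the matching LIBGAV1_Dsp* macro, and the C initializer only installs its fallback when that macro is absent. A minimal sketch of both sides, assuming the NEON header follows the same macro naming (the LIBGAV1_CPU_NEON value and the exact NEON coverage are assumptions, not taken from this change):

// Hypothetical excerpt of src/dsp/arm/intrapred_smooth_neon.h: claim the
// 10bpp 32x32 Smooth slot so the C fallback is skipped.
#if LIBGAV1_ENABLE_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
#endif

// In intrapred_smooth.cc (the pattern visible earlier in this diff), the C
// implementation is registered only when no specialization claimed the slot:
#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth
  dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
      DefsHbd::_32x32::Smooth;
#endif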
diff --git a/libgav1/src/dsp/inverse_transform.cc b/libgav1/src/dsp/inverse_transform.cc
index 1c5a4a6..ed984d8 100644
--- a/libgav1/src/dsp/inverse_transform.cc
+++ b/libgav1/src/dsp/inverse_transform.cc
@@ -161,16 +161,13 @@
      3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63}};
 
 template <typename Residual, int size_log2>
-void Dct_C(void* dest, const void* source, int8_t range) {
+void Dct_C(void* dest, int8_t range) {
   static_assert(size_log2 >= 2 && size_log2 <= 6, "");
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
   // stage 1.
   const int size = 1 << size_log2;
-  // The copy is necessary because |dst| and |src| could be pointing to the same
-  // buffer.
   Residual temp[size];
-  memcpy(temp, src, sizeof(temp));
+  memcpy(temp, dst, sizeof(temp));
   for (int i = 0; i < size; ++i) {
     dst[i] = temp[kBitReverseLookup[size_log2 - 2][i]];
   }
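With the |source| parameter dropped, the 1-D transforms now read and write their |dest| buffer in place; the copy into |temp| above remains only to feed the bit-reverse permutation. A minimal usage sketch (the buffer contents and the |range| value are illustrative, not taken from the decoder):

  int16_t row[8] = {/* dequantized coefficients for one 8-point row */};
  // Runs the 8-point inverse DCT stages in place on |row|.
  Dct_C<int16_t, 3>(row, /*range=*/8);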
@@ -266,7 +263,7 @@
   if (size_log2 >= 3) {
     for (int i = 0; i < 2; ++i) {
       HadamardRotation_C(dst, MultiplyBy2(i) + 4, MultiplyBy2(i) + 5,
-                         static_cast<bool>(i), range);
+                         /*flip=*/i != 0, range);
     }
   }
   // stage 14.
@@ -308,7 +305,7 @@
     for (int i = 0; i < 2; ++i) {
       for (int j = 0; j < 2; ++j) {
         HadamardRotation_C(dst, MultiplyBy4(i) + j + 8, MultiplyBy4(i) - j + 11,
-                           static_cast<bool>(i), range);
+                           /*flip=*/i != 0, range);
       }
     }
   }
@@ -396,12 +393,10 @@
 }
 
 template <int bitdepth, typename Residual, int size_log2>
-void DctDcOnly_C(void* dest, const void* source, int8_t range,
-                 bool should_round, int row_shift, bool is_row) {
+void DctDcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+                 bool is_row) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
 
-  dst[0] = src[0];
   if (is_row && should_round) {
     dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
   }
@@ -428,11 +423,9 @@
  * Column transform max range in bits for bitdepths 8/10/12: 28/28/30.
  */
 template <typename Residual>
-void Adst4_C(void* dest, const void* source, int8_t range) {
+void Adst4_C(void* dest, int8_t range) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
-  if ((src[0] | src[1] | src[2] | src[3]) == 0) {
-    memset(dst, 0, 4 * sizeof(dst[0]));
+  if ((dst[0] | dst[1] | dst[2] | dst[3]) == 0) {
     return;
   }
 
@@ -441,22 +434,22 @@
   // values stored in the s and x arrays by this process are representable by
   // a signed integer using range + 12 bits of precision.
   int32_t s[7];
-  s[0] = RangeCheckValue(kAdst4Multiplier[0] * src[0], range + 12);
-  s[1] = RangeCheckValue(kAdst4Multiplier[1] * src[0], range + 12);
-  s[2] = RangeCheckValue(kAdst4Multiplier[2] * src[1], range + 12);
-  s[3] = RangeCheckValue(kAdst4Multiplier[3] * src[2], range + 12);
-  s[4] = RangeCheckValue(kAdst4Multiplier[0] * src[2], range + 12);
-  s[5] = RangeCheckValue(kAdst4Multiplier[1] * src[3], range + 12);
-  s[6] = RangeCheckValue(kAdst4Multiplier[3] * src[3], range + 12);
+  s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12);
+  s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12);
+  s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[1], range + 12);
+  s[3] = RangeCheckValue(kAdst4Multiplier[3] * dst[2], range + 12);
+  s[4] = RangeCheckValue(kAdst4Multiplier[0] * dst[2], range + 12);
+  s[5] = RangeCheckValue(kAdst4Multiplier[1] * dst[3], range + 12);
+  s[6] = RangeCheckValue(kAdst4Multiplier[3] * dst[3], range + 12);
   // stage 2.
   // Section 7.13.2.6: It is a requirement of bitstream conformance that
   // values stored in the variable a7 by this process are representable by a
   // signed integer using range + 1 bits of precision.
-  const int32_t a7 = RangeCheckValue(src[0] - src[2], range + 1);
+  const int32_t a7 = RangeCheckValue(dst[0] - dst[2], range + 1);
   // Section 7.13.2.6: It is a requirement of bitstream conformance that
   // values stored in the variable b7 by this process are representable by a
   // signed integer using |range| bits of precision.
-  const int32_t b7 = RangeCheckValue(a7 + src[3], range);
+  const int32_t b7 = RangeCheckValue(a7 + dst[3], range);
   // stage 3.
   s[0] = RangeCheckValue(s[0] + s[3], range + 12);
   s[1] = RangeCheckValue(s[1] - s[4], range + 12);
@@ -490,14 +483,12 @@
 }
 
 template <int bitdepth, typename Residual>
-void Adst4DcOnly_C(void* dest, const void* source, int8_t range,
-                   bool should_round, int row_shift, bool is_row) {
+void Adst4DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+                   bool is_row) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
 
-  dst[0] = src[0];
   if (is_row && should_round) {
-    dst[0] = RightShiftWithRounding(src[0] * kTransformRowMultiplier, 12);
+    dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
   }
 
   // stage 1.
@@ -570,12 +561,11 @@
 }
 
 template <typename Residual>
-void Adst8_C(void* dest, const void* source, int8_t range) {
+void Adst8_C(void* dest, int8_t range) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
   // stage 1.
   int32_t temp[8];
-  AdstInputPermutation(temp, src, 8);
+  AdstInputPermutation(temp, dst, 8);
   // stage 2.
   for (int i = 0; i < 4; ++i) {
     ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 60 - 16 * i,
@@ -606,15 +596,14 @@
 }
 
 template <int bitdepth, typename Residual>
-void Adst8DcOnly_C(void* dest, const void* source, int8_t range,
-                   bool should_round, int row_shift, bool is_row) {
+void Adst8DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+                   bool is_row) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
 
   // stage 1.
   int32_t temp[8];
   // After the permutation, the dc value is in temp[1]. The remaining values are zero.
-  AdstInputPermutation(temp, src, 8);
+  AdstInputPermutation(temp, dst, 8);
 
   if (is_row && should_round) {
     temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12);
@@ -654,12 +643,11 @@
 }
 
 template <typename Residual>
-void Adst16_C(void* dest, const void* source, int8_t range) {
+void Adst16_C(void* dest, int8_t range) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
   // stage 1.
   int32_t temp[16];
-  AdstInputPermutation(temp, src, 16);
+  AdstInputPermutation(temp, dst, 16);
   // stage 2.
   for (int i = 0; i < 8; ++i) {
     ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 62 - 8 * i,
@@ -707,15 +695,14 @@
 }
 
 template <int bitdepth, typename Residual>
-void Adst16DcOnly_C(void* dest, const void* source, int8_t range,
-                    bool should_round, int row_shift, bool is_row) {
+void Adst16DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+                    bool is_row) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
 
   // stage 1.
   int32_t temp[16];
   // After the permutation, the dc value is in temp[1].  The remaining values are zero.
-  AdstInputPermutation(temp, src, 16);
+  AdstInputPermutation(temp, dst, 16);
 
   if (is_row && should_round) {
     temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12);
@@ -798,7 +785,7 @@
 //    optimized.
 //
 // The identity transform functions have the following prototype:
-//   void Identity_C(void* dest, const void* source, int8_t shift);
+//   void Identity_C(void* dest, int8_t shift);
 //
 // The |shift| parameter is the amount of shift for the Round2() call. For row
 // transforms, |shift| is 0, 1, or 2. For column transforms, |shift| is always
@@ -852,10 +839,9 @@
 // 4 (2 bits) and |shift| is always 4.
 
 template <typename Residual>
-void Identity4Row_C(void* dest, const void* source, int8_t shift) {
+void Identity4Row_C(void* dest, int8_t shift) {
   assert(shift == 0 || shift == 1);
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
   // If |shift| is 0, |rounding| should be 1 << 11. If |shift| is 1, |rounding|
   // should be (1 + (1 << 1)) << 11. The following expression works for both
   // values of |shift|.
@@ -864,7 +850,7 @@
     // The intermediate value here will have to fit into an int32_t for it to be
     // bitstream conformant. The multiplication is promoted to int32_t by
     // defining kIdentity4Multiplier as int32_t.
-    int32_t dst_i = (src[i] * kIdentity4Multiplier + rounding) >> (12 + shift);
+    int32_t dst_i = (dst[i] * kIdentity4Multiplier + rounding) >> (12 + shift);
     if (sizeof(Residual) == 2) {
       dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
     }
@@ -873,27 +859,24 @@
 }
 
 template <typename Residual>
-void Identity4Column_C(void* dest, const void* source, int8_t /*shift*/) {
+void Identity4Column_C(void* dest, int8_t /*shift*/) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
   const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
   for (int i = 0; i < 4; ++i) {
     // The intermediate value here will have to fit into an int32_t for it to be
     // bitstream conformant. The multiplication is promoted to int32_t by
     // defining kIdentity4Multiplier as int32_t.
-    dst[i] = static_cast<Residual>((src[i] * kIdentity4Multiplier + rounding) >>
+    dst[i] = static_cast<Residual>((dst[i] * kIdentity4Multiplier + rounding) >>
                                    (12 + kTransformColumnShift));
   }
 }
 
 template <int bitdepth, typename Residual>
-void Identity4DcOnly_C(void* dest, const void* source, int8_t /*range*/,
-                       bool should_round, int row_shift, bool is_row) {
+void Identity4DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+                       int row_shift, bool is_row) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
 
   if (is_row) {
-    dst[0] = src[0];
     if (should_round) {
       dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
     }
@@ -911,17 +894,16 @@
   }
 
   const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
-  dst[0] = static_cast<Residual>((src[0] * kIdentity4Multiplier + rounding) >>
+  dst[0] = static_cast<Residual>((dst[0] * kIdentity4Multiplier + rounding) >>
                                  (12 + kTransformColumnShift));
 }
 
 template <typename Residual>
-void Identity8Row_C(void* dest, const void* source, int8_t shift) {
+void Identity8Row_C(void* dest, int8_t shift) {
   assert(shift == 0 || shift == 1 || shift == 2);
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
   for (int i = 0; i < 8; ++i) {
-    int32_t dst_i = RightShiftWithRounding(MultiplyBy2(src[i]), shift);
+    int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[i]), shift);
     if (sizeof(Residual) == 2) {
       dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
     }
@@ -930,23 +912,20 @@
 }
 
 template <typename Residual>
-void Identity8Column_C(void* dest, const void* source, int8_t /*shift*/) {
+void Identity8Column_C(void* dest, int8_t /*shift*/) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
   for (int i = 0; i < 8; ++i) {
     dst[i] = static_cast<Residual>(
-        RightShiftWithRounding(src[i], kTransformColumnShift - 1));
+        RightShiftWithRounding(dst[i], kTransformColumnShift - 1));
   }
 }
 
 template <int bitdepth, typename Residual>
-void Identity8DcOnly_C(void* dest, const void* source, int8_t /*range*/,
-                       bool should_round, int row_shift, bool is_row) {
+void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+                       int row_shift, bool is_row) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
 
   if (is_row) {
-    dst[0] = src[0];
     if (should_round) {
       dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
     }
@@ -969,20 +948,19 @@
   }
 
   dst[0] = static_cast<Residual>(
-      RightShiftWithRounding(src[0], kTransformColumnShift - 1));
+      RightShiftWithRounding(dst[0], kTransformColumnShift - 1));
 }
 
 template <typename Residual>
-void Identity16Row_C(void* dest, const void* source, int8_t shift) {
+void Identity16Row_C(void* dest, int8_t shift) {
   assert(shift == 1 || shift == 2);
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
   const int32_t rounding = (1 + (1 << shift)) << 11;
   for (int i = 0; i < 16; ++i) {
     // The intermediate value here will have to fit into an int32_t for it to be
     // bitstream conformant. The multiplication is promoted to int32_t by
     // defining kIdentity16Multiplier as int32_t.
-    int32_t dst_i = (src[i] * kIdentity16Multiplier + rounding) >> (12 + shift);
+    int32_t dst_i = (dst[i] * kIdentity16Multiplier + rounding) >> (12 + shift);
     if (sizeof(Residual) == 2) {
       dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
     }
@@ -991,28 +969,25 @@
 }
 
 template <typename Residual>
-void Identity16Column_C(void* dest, const void* source, int8_t /*shift*/) {
+void Identity16Column_C(void* dest, int8_t /*shift*/) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
   const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
   for (int i = 0; i < 16; ++i) {
     // The intermediate value here will have to fit into an int32_t for it to be
     // bitstream conformant. The multiplication is promoted to int32_t by
     // defining kIdentity16Multiplier as int32_t.
     dst[i] =
-        static_cast<Residual>((src[i] * kIdentity16Multiplier + rounding) >>
+        static_cast<Residual>((dst[i] * kIdentity16Multiplier + rounding) >>
                               (12 + kTransformColumnShift));
   }
 }
 
 template <int bitdepth, typename Residual>
-void Identity16DcOnly_C(void* dest, const void* source, int8_t /*range*/,
-                        bool should_round, int row_shift, bool is_row) {
+void Identity16DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+                        int row_shift, bool is_row) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
 
   if (is_row) {
-    dst[0] = src[0];
     if (should_round) {
       dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
     }
@@ -1030,17 +1005,16 @@
   }
 
   const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
-  dst[0] = static_cast<Residual>((src[0] * kIdentity16Multiplier + rounding) >>
+  dst[0] = static_cast<Residual>((dst[0] * kIdentity16Multiplier + rounding) >>
                                  (12 + kTransformColumnShift));
 }
 
 template <typename Residual>
-void Identity32Row_C(void* dest, const void* source, int8_t shift) {
+void Identity32Row_C(void* dest, int8_t shift) {
   assert(shift == 1 || shift == 2);
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
   for (int i = 0; i < 32; ++i) {
-    int32_t dst_i = RightShiftWithRounding(MultiplyBy4(src[i]), shift);
+    int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[i]), shift);
     if (sizeof(Residual) == 2) {
       dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
     }
@@ -1049,23 +1023,20 @@
 }
 
 template <typename Residual>
-void Identity32Column_C(void* dest, const void* source, int8_t /*shift*/) {
+void Identity32Column_C(void* dest, int8_t /*shift*/) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
   for (int i = 0; i < 32; ++i) {
     dst[i] = static_cast<Residual>(
-        RightShiftWithRounding(src[i], kTransformColumnShift - 2));
+        RightShiftWithRounding(dst[i], kTransformColumnShift - 2));
   }
 }
 
 template <int bitdepth, typename Residual>
-void Identity32DcOnly_C(void* dest, const void* source, int8_t /*range*/,
-                        bool should_round, int row_shift, bool is_row) {
+void Identity32DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+                        int row_shift, bool is_row) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
 
   if (is_row) {
-    dst[0] = src[0];
     if (should_round) {
       dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
     }
@@ -1081,21 +1052,20 @@
   }
 
   dst[0] = static_cast<Residual>(
-      RightShiftWithRounding(src[0], kTransformColumnShift - 2));
+      RightShiftWithRounding(dst[0], kTransformColumnShift - 2));
 }
 
 //------------------------------------------------------------------------------
 // Walsh Hadamard Transform.
 
 template <typename Residual>
-void Wht4_C(void* dest, const void* source, int8_t shift) {
+void Wht4_C(void* dest, int8_t shift) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
   Residual temp[4];
-  temp[0] = src[0] >> shift;
-  temp[2] = src[1] >> shift;
-  temp[3] = src[2] >> shift;
-  temp[1] = src[3] >> shift;
+  temp[0] = dst[0] >> shift;
+  temp[2] = dst[1] >> shift;
+  temp[3] = dst[2] >> shift;
+  temp[1] = dst[3] >> shift;
   temp[0] += temp[2];
   temp[3] -= temp[1];
   // This signed right shift must be an arithmetic shift.
@@ -1107,13 +1077,12 @@
 }
 
 template <int bitdepth, typename Residual>
-void Wht4DcOnly_C(void* dest, const void* source, int8_t range,
-                  bool /*should_round*/, int /*row_shift*/, bool /*is_row*/) {
+void Wht4DcOnly_C(void* dest, int8_t range, bool /*should_round*/,
+                  int /*row_shift*/, bool /*is_row*/) {
   auto* const dst = static_cast<Residual*>(dest);
-  const auto* const src = static_cast<const Residual*>(source);
   const int shift = range;
 
-  Residual temp = src[0] >> shift;
+  Residual temp = dst[0] >> shift;
   // This signed right shift must be an arithmetic shift.
   Residual e = temp >> 1;
   dst[0] = temp - e;
@@ -1127,20 +1096,18 @@
 //------------------------------------------------------------------------------
 // row/column transform loop
 
-using InverseTransform1DFunc = void (*)(void* dst, const void* src,
-                                        int8_t range);
-using InverseTransformDcOnlyFunc = void (*)(void* dest, const void* source,
-                                            int8_t range, bool should_round,
-                                            int row_shift, bool is_row);
+using InverseTransform1DFunc = void (*)(void* dst, int8_t range);
+using InverseTransformDcOnlyFunc = void (*)(void* dest, int8_t range,
+                                            bool should_round, int row_shift,
+                                            bool is_row);
 
 template <int bitdepth, typename Residual, typename Pixel,
           Transform1D transform1d_type,
           InverseTransformDcOnlyFunc dconly_transform1d,
-          InverseTransform1DFunc row_transform1d_func,
-          InverseTransform1DFunc column_transform1d_func = row_transform1d_func>
+          InverseTransform1DFunc transform1d_func, bool is_row>
 void TransformLoop_C(TransformType tx_type, TransformSize tx_size,
-                     void* src_buffer, int start_x, int start_y,
-                     void* dst_frame, bool is_row, int non_zero_coeff_count) {
+                     int adjusted_tx_height, void* src_buffer, int start_x,
+                     int start_y, void* dst_frame) {
   constexpr bool lossless = transform1d_type == k1DTransformWht;
   constexpr bool is_identity = transform1d_type == k1DTransformIdentity;
   // The transform size of the WHT is always 4x4. Setting tx_width and
@@ -1168,19 +1135,16 @@
     // the fraction 2896 / 2^12.
     const bool should_round = std::abs(tx_width_log2 - tx_height_log2) == 1;
 
-    if (non_zero_coeff_count == 1) {
-      dconly_transform1d(residual[0], residual[0], row_clamp_range,
-                         should_round, row_shift, true);
+    if (adjusted_tx_height == 1) {
+      dconly_transform1d(residual[0], row_clamp_range, should_round, row_shift,
+                         true);
       return;
     }
 
     // Row transforms need to be done only up to 32 because the rest of the rows
     // are always all zero if |tx_height| is 64.  Otherwise, only process the
     // rows that have non-zero coefficients.
-    // TODO(slavarnway): Expand to include other possible non_zero_coeff_count
-    // values.
-    const int num_rows = std::min(tx_height, 32);
-    for (int i = 0; i < num_rows; ++i) {
+    for (int i = 0; i < adjusted_tx_height; ++i) {
       // If lossless, the transform size is 4x4, so should_round is false.
       if (!lossless && should_round) {
         // The last 32 values of every row are always zero if the |tx_width| is
@@ -1190,10 +1154,9 @@
               residual[i][j] * kTransformRowMultiplier, 12);
         }
       }
-      // For identity transform, |row_transform1d_func| also performs the
+      // For identity transform, |transform1d_func| also performs the
       // Round2(T[j], rowShift) call in the spec.
-      row_transform1d_func(residual[i], residual[i],
-                           is_identity ? row_shift : row_clamp_range);
+      transform1d_func(residual[i], is_identity ? row_shift : row_clamp_range);
       if (!lossless && !is_identity && row_shift > 0) {
         for (int j = 0; j < tx_width; ++j) {
           residual[i][j] = RightShiftWithRounding(residual[i][j], row_shift);
@@ -1221,17 +1184,17 @@
   Residual tx_buffer[64];
   for (int j = 0; j < tx_width; ++j) {
     const int flipped_j = flip_columns ? tx_width - j - 1 : j;
-    for (int i = 0; i < tx_height; ++i) {
+    int i = 0;
+    do {
       tx_buffer[i] = residual[i][flipped_j];
-    }
-    if (non_zero_coeff_count == 1) {
-      dconly_transform1d(tx_buffer, tx_buffer, column_clamp_range, false, 0,
-                         false);
+    } while (++i != tx_height);
+    if (adjusted_tx_height == 1) {
+      dconly_transform1d(tx_buffer, column_clamp_range, false, 0, false);
     } else {
-      // For identity transform, |column_transform1d_func| also performs the
+      // For identity transform, |transform1d_func| also performs the
       // Round2(T[i], colShift) call in the spec.
-      column_transform1d_func(tx_buffer, tx_buffer,
-                              is_identity ? column_shift : column_clamp_range);
+      transform1d_func(tx_buffer,
+                       is_identity ? column_shift : column_clamp_range);
     }
     const int x = start_x + j;
     for (int i = 0; i < tx_height; ++i) {
@@ -1249,139 +1212,264 @@
 
 //------------------------------------------------------------------------------
 
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 template <int bitdepth, typename Residual, typename Pixel>
 void InitAll(Dsp* const dsp) {
   // Maximum transform size for Dct is 64.
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
       TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
-                      DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>>;
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
+                      DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
       TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
-                      DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>>;
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
+                      DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>,
+                      /*is_row=*/false>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
       TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
-                      DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>>;
-  dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
+                      DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
       TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
-                      DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>>;
-  dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
+                      DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>,
+                      /*is_row=*/false>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
       TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
-                      DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>>;
+                      DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+                      DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>,
+                      /*is_row=*/false>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+                      DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+                      DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>,
+                      /*is_row=*/false>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+                      DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+                      DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>,
+                      /*is_row=*/false>;
 
   // Maximum transform size for Adst is 16.
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
       TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
-                      Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>>;
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
+                      Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
       TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
-                      Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>>;
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
+                      Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>,
+                      /*is_row=*/false>;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
       TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
-                      Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>>;
+                      Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+                      Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>,
+                      /*is_row=*/false>;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+      TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+                      Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+                      Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>,
+                      /*is_row=*/false>;
 
   // Maximum transform size for Identity transform is 32.
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
       TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
                       Identity4DcOnly_C<bitdepth, Residual>,
-                      Identity4Row_C<Residual>, Identity4Column_C<Residual>>;
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
+                      Identity4Row_C<Residual>, /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+                      Identity4DcOnly_C<bitdepth, Residual>,
+                      Identity4Column_C<Residual>, /*is_row=*/false>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
       TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
                       Identity8DcOnly_C<bitdepth, Residual>,
-                      Identity8Row_C<Residual>, Identity8Column_C<Residual>>;
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
+                      Identity8Row_C<Residual>, /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+                      Identity8DcOnly_C<bitdepth, Residual>,
+                      Identity8Column_C<Residual>, /*is_row=*/false>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
       TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
                       Identity16DcOnly_C<bitdepth, Residual>,
-                      Identity16Row_C<Residual>, Identity16Column_C<Residual>>;
-  dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
+                      Identity16Row_C<Residual>, /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+                      Identity16DcOnly_C<bitdepth, Residual>,
+                      Identity16Column_C<Residual>, /*is_row=*/false>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
       TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
                       Identity32DcOnly_C<bitdepth, Residual>,
-                      Identity32Row_C<Residual>, Identity32Column_C<Residual>>;
+                      Identity32Row_C<Residual>, /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+                      Identity32DcOnly_C<bitdepth, Residual>,
+                      Identity32Column_C<Residual>, /*is_row=*/false>;
 
   // Maximum transform size for Wht is 4.
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
       TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformWht,
-                      Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>>;
+                      Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+      TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformWht,
+                      Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>,
+                      /*is_row=*/false>;
 }
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 
 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
   assert(dsp != nullptr);
   for (auto& inverse_transform_by_size : dsp->inverse_transforms) {
     for (auto& inverse_transform : inverse_transform_by_size) {
-      inverse_transform = nullptr;
+      inverse_transform[kRow] = nullptr;
+      inverse_transform[kColumn] = nullptr;
     }
   }
 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
   InitAll<8, int16_t, uint8_t>(dsp);
 #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
       TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
-                      DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>>;
+                      DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+                      DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
       TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
-                      DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>>;
+                      DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+                      DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
       TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
-                      DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>>;
+                      DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+                      DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct
-  dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
       TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
-                      DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>>;
+                      DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+                      DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct
-  dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
       TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
-                      DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>>;
+                      DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+                      DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
       TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
-                      Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>>;
+                      Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+                      Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
       TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
-                      Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>>;
+                      Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+                      Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
       TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
-                      Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>>;
+                      Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+                      Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
       TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
                       Identity4DcOnly_C<8, int16_t>, Identity4Row_C<int16_t>,
-                      Identity4Column_C<int16_t>>;
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+                      Identity4DcOnly_C<8, int16_t>, Identity4Column_C<int16_t>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
       TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
                       Identity8DcOnly_C<8, int16_t>, Identity8Row_C<int16_t>,
-                      Identity8Column_C<int16_t>>;
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+                      Identity8DcOnly_C<8, int16_t>, Identity8Column_C<int16_t>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
       TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
                       Identity16DcOnly_C<8, int16_t>, Identity16Row_C<int16_t>,
-                      Identity16Column_C<int16_t>>;
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+                      Identity16DcOnly_C<8, int16_t>,
+                      Identity16Column_C<int16_t>, /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity
-  dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
       TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
                       Identity32DcOnly_C<8, int16_t>, Identity32Row_C<int16_t>,
-                      Identity32Column_C<int16_t>>;
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+                      Identity32DcOnly_C<8, int16_t>,
+                      Identity32Column_C<int16_t>, /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
       TransformLoop_C<8, int16_t, uint8_t, k1DTransformWht,
-                      Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>>;
+                      Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+      TransformLoop_C<8, int16_t, uint8_t, k1DTransformWht,
+                      Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>,
+                      /*is_row=*/false>;
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
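For orientation, the table populated above is now indexed as [1-D transform type][1-D transform size][kRow/kColumn], and each entry receives the adjusted transform height directly rather than a non-zero coefficient count. A caller-side sketch with hypothetical variable names (GetDspTable as the read-only counterpart of GetWritableDspTable is an assumption):

  const Dsp* const dsp = GetDspTable(8);
  const auto row_transform =
      dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow];
  const auto column_transform =
      dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn];
  // |residual| and |frame| stand in for the caller's coefficient and frame
  // buffers; the parameter order matches the new TransformLoop_C signature.
  row_transform(kTransformTypeDctDct, kTransformSize8x8, /*adjusted_tx_height=*/8,
                residual, /*start_x=*/0, /*start_y=*/0, frame);
  column_transform(kTransformTypeDctDct, kTransformSize8x8, /*adjusted_tx_height=*/8,
                   residual, /*start_x=*/0, /*start_y=*/0, frame);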
@@ -1392,80 +1480,142 @@
   assert(dsp != nullptr);
   for (auto& inverse_transform_by_size : dsp->inverse_transforms) {
     for (auto& inverse_transform : inverse_transform_by_size) {
-      inverse_transform = nullptr;
+      inverse_transform[kRow] = nullptr;
+      inverse_transform[kColumn] = nullptr;
     }
   }
 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
   InitAll<10, int32_t, uint16_t>(dsp);
 #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 #ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformDct
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
       TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
-                      DctDcOnly_C<10, int32_t, 2>, Dct_C<int32_t, 2>>;
+                      DctDcOnly_C<10, int32_t, 2>, Dct_C<int32_t, 2>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+                      DctDcOnly_C<10, int32_t, 2>, Dct_C<int32_t, 2>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformDct
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
       TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
-                      DctDcOnly_C<10, int32_t, 3>, Dct_C<int32_t, 3>>;
+                      DctDcOnly_C<10, int32_t, 3>, Dct_C<int32_t, 3>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+                      DctDcOnly_C<10, int32_t, 3>, Dct_C<int32_t, 3>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformDct
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
       TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
-                      DctDcOnly_C<10, int32_t, 4>, Dct_C<int32_t, 4>>;
+                      DctDcOnly_C<10, int32_t, 4>, Dct_C<int32_t, 4>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+                      DctDcOnly_C<10, int32_t, 4>, Dct_C<int32_t, 4>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformDct
-  dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
       TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
-                      DctDcOnly_C<10, int32_t, 5>, Dct_C<int32_t, 5>>;
+                      DctDcOnly_C<10, int32_t, 5>, Dct_C<int32_t, 5>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+                      DctDcOnly_C<10, int32_t, 5>, Dct_C<int32_t, 5>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp10bpp_1DTransformSize64_1DTransformDct
-  dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
       TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
-                      DctDcOnly_C<10, int32_t, 6>, Dct_C<int32_t, 6>>;
+                      DctDcOnly_C<10, int32_t, 6>, Dct_C<int32_t, 6>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+                      DctDcOnly_C<10, int32_t, 6>, Dct_C<int32_t, 6>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformAdst
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
       TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
-                      Adst4DcOnly_C<10, int32_t>, Adst4_C<int32_t>>;
+                      Adst4DcOnly_C<10, int32_t>, Adst4_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+                      Adst4DcOnly_C<10, int32_t>, Adst4_C<int32_t>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformAdst
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
       TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
-                      Adst8DcOnly_C<10, int32_t>, Adst8_C<int32_t>>;
+                      Adst8DcOnly_C<10, int32_t>, Adst8_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+                      Adst8DcOnly_C<10, int32_t>, Adst8_C<int32_t>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformAdst
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
       TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
-                      Adst16DcOnly_C<10, int32_t>, Adst16_C<int32_t>>;
+                      Adst16DcOnly_C<10, int32_t>, Adst16_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+                      Adst16DcOnly_C<10, int32_t>, Adst16_C<int32_t>,
+                      /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformIdentity
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
       TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
                       Identity4DcOnly_C<10, int32_t>, Identity4Row_C<int32_t>,
-                      Identity4Column_C<int32_t>>;
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+                      Identity4DcOnly_C<10, int32_t>,
+                      Identity4Column_C<int32_t>, /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformIdentity
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
       TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
                       Identity8DcOnly_C<10, int32_t>, Identity8Row_C<int32_t>,
-                      Identity8Column_C<int32_t>>;
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+                      Identity8DcOnly_C<10, int32_t>,
+                      Identity8Column_C<int32_t>, /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformIdentity
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
       TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
                       Identity16DcOnly_C<10, int32_t>, Identity16Row_C<int32_t>,
-                      Identity16Column_C<int32_t>>;
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+                      Identity16DcOnly_C<10, int32_t>,
+                      Identity16Column_C<int32_t>, /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformIdentity
-  dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
       TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
                       Identity32DcOnly_C<10, int32_t>, Identity32Row_C<int32_t>,
-                      Identity32Column_C<int32_t>>;
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+                      Identity32DcOnly_C<10, int32_t>,
+                      Identity32Column_C<int32_t>, /*is_row=*/false>;
 #endif
 #ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformWht
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
       TransformLoop_C<10, int32_t, uint16_t, k1DTransformWht,
-                      Wht4DcOnly_C<10, int32_t>, Wht4_C<int32_t>>;
+                      Wht4DcOnly_C<10, int32_t>, Wht4_C<int32_t>,
+                      /*is_row=*/true>;
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+      TransformLoop_C<10, int32_t, uint16_t, k1DTransformWht,
+                      Wht4DcOnly_C<10, int32_t>, Wht4_C<int32_t>,
+                      /*is_row=*/false>;
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
diff --git a/libgav1/src/dsp/inverse_transform.inc b/libgav1/src/dsp/inverse_transform.inc
index 1893884..55e68b6 100644
--- a/libgav1/src/dsp/inverse_transform.inc
+++ b/libgav1/src/dsp/inverse_transform.inc
@@ -46,84 +46,6 @@
 
 inline int16_t Sin128(int angle) { return Cos128(angle - 64); }
 
-template <int tx_width>
-LIBGAV1_ALWAYS_INLINE int GetNumRows(TransformType tx_type, int tx_height,
-                                     int non_zero_coeff_count) {
-  const TransformClass tx_class = GetTransformClass(tx_type);
-  // The transform loops process either 4 or a multiple of 8 rows.  Use tx_class
-  // to determine the scan order.  Then return the number of rows based on the
-  // non_zero_coeff_count.
-  if (tx_height > 4) {
-    if (tx_class == kTransformClass2D) {
-      if (tx_width == 4) {
-        if (non_zero_coeff_count <= 10) return 4;
-        if (non_zero_coeff_count <= 29) return 8;
-        return tx_height;
-      }
-      if (tx_width == 8) {
-        if (non_zero_coeff_count <= 10) return 4;
-        if (non_zero_coeff_count <= 43) return 8;
-        if ((non_zero_coeff_count <= 107) & (tx_height > 16)) return 16;
-        if ((non_zero_coeff_count <= 171) & (tx_height > 16)) return 24;
-        return tx_height;
-      }
-      if (tx_width == 16) {
-        if (non_zero_coeff_count <= 10) return 4;
-        if (non_zero_coeff_count <= 36) return 8;
-        if ((non_zero_coeff_count <= 151) & (tx_height > 16)) return 16;
-        if ((non_zero_coeff_count <= 279) & (tx_height > 16)) return 24;
-        return tx_height;
-      }
-      if (tx_width == 32) {
-        if (non_zero_coeff_count <= 10) return 4;
-        if (non_zero_coeff_count <= 36) return 8;
-        if ((non_zero_coeff_count <= 136) & (tx_height > 16)) return 16;
-        if ((non_zero_coeff_count <= 300) & (tx_height > 16)) return 24;
-        return tx_height;
-      }
-    }
-
-    if (tx_class == kTransformClassHorizontal) {
-      if (non_zero_coeff_count <= 4) return 4;
-      if (non_zero_coeff_count <= 8) return 8;
-      if ((non_zero_coeff_count <= 16) & (tx_height > 16)) return 16;
-      if ((non_zero_coeff_count <= 24) & (tx_height > 16)) return 24;
-      return tx_height;
-    }
-
-    if (tx_class == kTransformClassVertical) {
-      if (tx_width == 4) {
-        if (non_zero_coeff_count <= 16) return 4;
-        if (non_zero_coeff_count <= 32) return 8;
-        return tx_height;
-      }
-      if (tx_width == 8) {
-        if (non_zero_coeff_count <= 32) return 4;
-        if (non_zero_coeff_count <= 64) return 8;
-        if ((non_zero_coeff_count <= 128) & (tx_height > 16)) return 16;
-        if ((non_zero_coeff_count <= 192) & (tx_height > 16)) return 24;
-        return tx_height;
-      }
-
-      if (tx_width == 16) {
-        if (non_zero_coeff_count <= 64) return 4;
-        if (non_zero_coeff_count <= 128) return 8;
-        if ((non_zero_coeff_count <= 256) & (tx_height > 16)) return 16;
-        if ((non_zero_coeff_count <= 384) & (tx_height > 16)) return 24;
-        return tx_height;
-      }
-      if (tx_width == 32) {
-        if (non_zero_coeff_count <= 128) return 4;
-        if (non_zero_coeff_count <= 256) return 8;
-        if ((non_zero_coeff_count <= 512) & (tx_height > 16)) return 16;
-        if ((non_zero_coeff_count <= 768) & (tx_height > 16)) return 24;
-        return tx_height;
-      }
-    }
-  }
-  return tx_height;
-}
-
 // The value for index i is derived as:
 // round(sqrt(2) * sin(i * pi / 9) * 2 / 3 * (1 << 12)).
 constexpr int16_t kAdst4Multiplier[4] = {1321, 2482, 3344, 3803};
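
As a quick check of the kAdst4Multiplier comment above, the four entries can be regenerated from the stated formula. This is an illustrative standalone sketch, not part of the patch; it assumes the comment's index i runs over 1..4, with the result stored at index i - 1:

#include <cmath>
#include <cstdio>

int main() {
  const double kPi = 3.141592653589793;
  // Expected output: 1321 2482 3344 3803, matching kAdst4Multiplier.
  for (int i = 1; i <= 4; ++i) {
    const double value =
        std::sqrt(2.0) * std::sin(i * kPi / 9) * 2.0 / 3.0 * (1 << 12);
    std::printf("%ld ", std::lround(value));
  }
  std::printf("\n");
  return 0;
}
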
diff --git a/libgav1/src/dsp/libgav1_dsp.cmake b/libgav1/src/dsp/libgav1_dsp.cmake
index 00574fa..a28334d 100644
--- a/libgav1/src/dsp/libgav1_dsp.cmake
+++ b/libgav1/src/dsp/libgav1_dsp.cmake
@@ -30,6 +30,7 @@
             "${libgav1_source}/dsp/constants.h"
             "${libgav1_source}/dsp/convolve.cc"
             "${libgav1_source}/dsp/convolve.h"
+            "${libgav1_source}/dsp/convolve.inc"
             "${libgav1_source}/dsp/distance_weighted_blend.cc"
             "${libgav1_source}/dsp/distance_weighted_blend.h"
             "${libgav1_source}/dsp/dsp.cc"
@@ -39,8 +40,16 @@
             "${libgav1_source}/dsp/film_grain_common.h"
             "${libgav1_source}/dsp/intra_edge.cc"
             "${libgav1_source}/dsp/intra_edge.h"
+            "${libgav1_source}/dsp/intrapred_cfl.cc"
+            "${libgav1_source}/dsp/intrapred_cfl.h"
+            "${libgav1_source}/dsp/intrapred_directional.cc"
+            "${libgav1_source}/dsp/intrapred_directional.h"
+            "${libgav1_source}/dsp/intrapred_filter.cc"
+            "${libgav1_source}/dsp/intrapred_filter.h"
             "${libgav1_source}/dsp/intrapred.cc"
             "${libgav1_source}/dsp/intrapred.h"
+            "${libgav1_source}/dsp/intrapred_smooth.cc"
+            "${libgav1_source}/dsp/intrapred_smooth.h"
             "${libgav1_source}/dsp/inverse_transform.cc"
             "${libgav1_source}/dsp/inverse_transform.h"
             "${libgav1_source}/dsp/inverse_transform.inc"
@@ -64,6 +73,16 @@
             "${libgav1_source}/dsp/weight_mask.cc"
             "${libgav1_source}/dsp/weight_mask.h")
 
+list(APPEND libgav1_dsp_sources_avx2
+            ${libgav1_dsp_sources_avx2}
+            "${libgav1_source}/dsp/x86/cdef_avx2.cc"
+            "${libgav1_source}/dsp/x86/cdef_avx2.h"
+            "${libgav1_source}/dsp/x86/convolve_avx2.cc"
+            "${libgav1_source}/dsp/x86/convolve_avx2.h"
+            "${libgav1_source}/dsp/x86/loop_restoration_10bit_avx2.cc"
+            "${libgav1_source}/dsp/x86/loop_restoration_avx2.cc"
+            "${libgav1_source}/dsp/x86/loop_restoration_avx2.h")
+
 list(APPEND libgav1_dsp_sources_neon
             ${libgav1_dsp_sources_neon}
             "${libgav1_source}/dsp/arm/average_blend_neon.cc"
@@ -80,11 +99,16 @@
             "${libgav1_source}/dsp/arm/intra_edge_neon.cc"
             "${libgav1_source}/dsp/arm/intra_edge_neon.h"
             "${libgav1_source}/dsp/arm/intrapred_cfl_neon.cc"
+            "${libgav1_source}/dsp/arm/intrapred_cfl_neon.h"
+            "${libgav1_source}/dsp/arm/intrapred_directional_neon.h"
             "${libgav1_source}/dsp/arm/intrapred_directional_neon.cc"
-            "${libgav1_source}/dsp/arm/intrapred_filter_intra_neon.cc"
+            "${libgav1_source}/dsp/arm/intrapred_filter_neon.cc"
+            "${libgav1_source}/dsp/arm/intrapred_filter_neon.h"
             "${libgav1_source}/dsp/arm/intrapred_neon.cc"
             "${libgav1_source}/dsp/arm/intrapred_neon.h"
             "${libgav1_source}/dsp/arm/intrapred_smooth_neon.cc"
+            "${libgav1_source}/dsp/arm/intrapred_smooth_neon.h"
+            "${libgav1_source}/dsp/arm/inverse_transform_10bit_neon.cc"
             "${libgav1_source}/dsp/arm/inverse_transform_neon.cc"
             "${libgav1_source}/dsp/arm/inverse_transform_neon.h"
             "${libgav1_source}/dsp/arm/loop_filter_neon.cc"
@@ -115,18 +139,28 @@
             "${libgav1_source}/dsp/x86/cdef_sse4.h"
             "${libgav1_source}/dsp/x86/convolve_sse4.cc"
             "${libgav1_source}/dsp/x86/convolve_sse4.h"
+            "${libgav1_source}/dsp/x86/convolve_sse4.inc"
             "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.cc"
             "${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.h"
+            "${libgav1_source}/dsp/x86/film_grain_sse4.cc"
+            "${libgav1_source}/dsp/x86/film_grain_sse4.h"
             "${libgav1_source}/dsp/x86/intra_edge_sse4.cc"
             "${libgav1_source}/dsp/x86/intra_edge_sse4.h"
+            "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc"
+            "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.h"
+            "${libgav1_source}/dsp/x86/intrapred_directional_sse4.cc"
+            "${libgav1_source}/dsp/x86/intrapred_directional_sse4.h"
+            "${libgav1_source}/dsp/x86/intrapred_filter_sse4.cc"
+            "${libgav1_source}/dsp/x86/intrapred_filter_sse4.h"
             "${libgav1_source}/dsp/x86/intrapred_sse4.cc"
             "${libgav1_source}/dsp/x86/intrapred_sse4.h"
-            "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc"
             "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.cc"
+            "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.h"
             "${libgav1_source}/dsp/x86/inverse_transform_sse4.cc"
             "${libgav1_source}/dsp/x86/inverse_transform_sse4.h"
             "${libgav1_source}/dsp/x86/loop_filter_sse4.cc"
             "${libgav1_source}/dsp/x86/loop_filter_sse4.h"
+            "${libgav1_source}/dsp/x86/loop_restoration_10bit_sse4.cc"
             "${libgav1_source}/dsp/x86/loop_restoration_sse4.cc"
             "${libgav1_source}/dsp/x86/loop_restoration_sse4.h"
             "${libgav1_source}/dsp/x86/mask_blend_sse4.cc"
@@ -143,12 +177,13 @@
             "${libgav1_source}/dsp/x86/warp_sse4.cc"
             "${libgav1_source}/dsp/x86/warp_sse4.h"
             "${libgav1_source}/dsp/x86/weight_mask_sse4.cc"
-            "${libgav1_source}/dsp/x86/weight_mask_sse4.h"
-            )
+            "${libgav1_source}/dsp/x86/weight_mask_sse4.h")
 
 macro(libgav1_add_dsp_targets)
   unset(dsp_sources)
-  list(APPEND dsp_sources ${libgav1_dsp_sources} ${libgav1_dsp_sources_neon}
+  list(APPEND dsp_sources ${libgav1_dsp_sources}
+              ${libgav1_dsp_sources_neon}
+              ${libgav1_dsp_sources_avx2}
               ${libgav1_dsp_sources_sse4})
 
   libgav1_add_library(NAME
diff --git a/libgav1/src/dsp/loop_restoration.cc b/libgav1/src/dsp/loop_restoration.cc
index fce54f2..1a15d90 100644
--- a/libgav1/src/dsp/loop_restoration.cc
+++ b/libgav1/src/dsp/loop_restoration.cc
@@ -18,6 +18,7 @@
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
+#include <cstring>
 
 #include "src/dsp/common.h"
 #include "src/dsp/dsp.h"
@@ -36,7 +37,7 @@
 // else
 //   a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
 // ma = 256 - a2;
-const uint8_t kSgrMaLookup[256] = {
+alignas(16) const uint8_t kSgrMaLookup[256] = {
     255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14,
     13,  13,  12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,  8,  8,  7,  7,
     7,   7,   7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,
@@ -68,8 +69,7 @@
   constexpr int offset =
       1 << (bitdepth + kWienerFilterBits - kRoundBitsHorizontal - 1);
   constexpr int limit = (offset << 2) - 1;
-  int y = height;
-  do {
+  for (int y = 0; y < height; ++y) {
     int x = 0;
     do {
       // sum fits into 16 bits only when bitdepth = 8.
@@ -84,7 +84,7 @@
     } while (++x != width);
     source += source_stride;
     *wiener_buffer += width;
-  } while (--y != 0);
+  }
 }
 
 template <int bitdepth, typename Pixel>
@@ -143,10 +143,12 @@
 // filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]).
 // Thus in libaom's computation, an offset of 128 is needed for filter[3].
 template <int bitdepth, typename Pixel>
-void WienerFilter_C(const void* const source, void* const dest,
-                    const RestorationUnitInfo& restoration_info,
-                    ptrdiff_t source_stride, ptrdiff_t dest_stride, int width,
-                    int height, RestorationBuffer* const restoration_buffer) {
+void WienerFilter_C(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const ptrdiff_t stride, const void* const top_border,
+    const ptrdiff_t top_border_stride, const void* const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
   constexpr int kCenterTap = kWienerFilterTaps / 2;
   const int16_t* const number_leading_zero_coefficients =
       restoration_info.wiener_info.number_leading_zero_coefficients;
@@ -158,28 +160,51 @@
   // horizontal filtering.
   const int height_horizontal =
       height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
   const int16_t* const filter_horizontal =
       restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
-  const auto* src = static_cast<const Pixel*>(source);
-  src -= (kCenterTap - number_rows_to_skip) * source_stride + kCenterTap;
+  const auto* src = static_cast<const Pixel*>(source) - kCenterTap;
+  const auto* top = static_cast<const Pixel*>(top_border) - kCenterTap;
+  const auto* bottom = static_cast<const Pixel*>(bottom_border) - kCenterTap;
   auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width;
 
   if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
-    WienerHorizontal<bitdepth, Pixel>(src, source_stride, width,
-                                      height_horizontal, filter_horizontal, 0,
+    WienerHorizontal<bitdepth, Pixel>(
+        top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+        height_extra, filter_horizontal, 0, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+                                      filter_horizontal, 0, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+                                      height_extra, filter_horizontal, 0,
                                       &wiener_buffer);
   } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
-    WienerHorizontal<bitdepth, Pixel>(src, source_stride, width,
-                                      height_horizontal, filter_horizontal, 1,
+    WienerHorizontal<bitdepth, Pixel>(
+        top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+        height_extra, filter_horizontal, 1, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+                                      filter_horizontal, 1, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+                                      height_extra, filter_horizontal, 1,
                                       &wiener_buffer);
   } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
-    WienerHorizontal<bitdepth, Pixel>(src, source_stride, width,
-                                      height_horizontal, filter_horizontal, 2,
+    WienerHorizontal<bitdepth, Pixel>(
+        top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+        height_extra, filter_horizontal, 2, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+                                      filter_horizontal, 2, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+                                      height_extra, filter_horizontal, 2,
                                       &wiener_buffer);
   } else {
     assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
-    WienerHorizontal<bitdepth, Pixel>(src, source_stride, width,
-                                      height_horizontal, filter_horizontal, 3,
+    WienerHorizontal<bitdepth, Pixel>(
+        top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+        height_extra, filter_horizontal, 3, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+                                      filter_horizontal, 3, &wiener_buffer);
+    WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+                                      height_extra, filter_horizontal, 3,
                                       &wiener_buffer);
   }
 
@@ -195,28 +220,29 @@
     memcpy(wiener_buffer_org, wiener_buffer_org + width,
            sizeof(*wiener_buffer) * width);
     WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
-                                    filter_vertical, 0, dest, dest_stride);
+                                    filter_vertical, 0, dest, stride);
   } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
     WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
-                                    filter_vertical, 1, dest, dest_stride);
+                                    filter_vertical, 1, dest, stride);
   } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
     WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
-                                    filter_vertical, 2, dest, dest_stride);
+                                    filter_vertical, 2, dest, stride);
   } else {
     assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
     WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
-                                    filter_vertical, 3, dest, dest_stride);
+                                    filter_vertical, 3, dest, stride);
   }
 }
 
 //------------------------------------------------------------------------------
 // SGR
 
+// When |height| is 1, |src_stride| could be set to an arbitrary value.
 template <typename Pixel, int size>
 LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
                                   const int height, const int width,
-                                  uint16_t* sums, uint32_t* square_sums,
-                                  const ptrdiff_t sum_stride) {
+                                  uint16_t* const* sums,
+                                  uint32_t* const* square_sums) {
   int y = height;
   do {
     uint32_t sum = 0;
@@ -226,8 +252,8 @@
       sum += source;
       square_sum += source * source;
     }
-    sums[0] = sum;
-    square_sums[0] = square_sum;
+    (*sums)[0] = sum;
+    (*square_sums)[0] = square_sum;
     int x = 1;
     do {
       const Pixel source0 = src[x - 1];
@@ -236,21 +262,22 @@
       sum += source1;
       square_sum -= source0 * source0;
       square_sum += source1 * source1;
-      sums[x] = sum;
-      square_sums[x] = square_sum;
+      (*sums)[x] = sum;
+      (*square_sums)[x] = square_sum;
     } while (++x != width);
     src += src_stride;
-    sums += sum_stride;
-    square_sums += sum_stride;
+    ++sums;
+    ++square_sums;
   } while (--y != 0);
 }
 
+// When |height| is 1, |src_stride| could be set to an arbitrary value.
 template <typename Pixel>
 LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
                                   const int height, const int width,
-                                  uint16_t* sum3, uint16_t* sum5,
-                                  uint32_t* square_sum3, uint32_t* square_sum5,
-                                  const ptrdiff_t sum_stride) {
+                                  uint16_t* const* sum3, uint16_t* const* sum5,
+                                  uint32_t* const* square_sum3,
+                                  uint32_t* const* square_sum5) {
   int y = height;
   do {
     uint32_t sum = 0;
@@ -266,18 +293,18 @@
       const Pixel source1 = src[x + 4];
       sum -= source0;
       square_sum -= source0 * source0;
-      sum3[x] = sum;
-      square_sum3[x] = square_sum;
+      (*sum3)[x] = sum;
+      (*square_sum3)[x] = square_sum;
       sum += source1;
       square_sum += source1 * source1;
-      sum5[x] = sum + source0;
-      square_sum5[x] = square_sum + source0 * source0;
+      (*sum5)[x] = sum + source0;
+      (*square_sum5)[x] = square_sum + source0 * source0;
     } while (++x != width);
     src += src_stride;
-    sum3 += sum_stride;
-    sum5 += sum_stride;
-    square_sum3 += sum_stride;
-    square_sum5 += sum_stride;
+    ++sum3;
+    ++sum5;
+    ++square_sum3;
+    ++square_sum5;
   } while (--y != 0);
 }
 
@@ -396,20 +423,20 @@
 }
 
 template <typename Pixel>
-inline void BoxFilterPass(const Pixel src0, const Pixel src1,
-                          const uint16_t* const ma565[2],
-                          const uint32_t* const b565[2], const ptrdiff_t x,
-                          int p[2]) {
+inline void BoxFilterPass1Kernel(const Pixel src0, const Pixel src1,
+                                 const uint16_t* const ma565[2],
+                                 const uint32_t* const b565[2],
+                                 const ptrdiff_t x, int p[2]) {
   p[0] = CalculateFilteredOutput<Pixel>(src0, ma565[0][x] + ma565[1][x],
                                         b565[0][x] + b565[1][x], 5);
   p[1] = CalculateFilteredOutput<Pixel>(src1, ma565[1][x], b565[1][x], 4);
 }
 
 template <typename Pixel>
-inline int BoxFilterPass2(const Pixel src, const uint16_t* const ma343[3],
-                          const uint16_t* const ma444,
-                          const uint32_t* const b343[3],
-                          const uint32_t* const b444, const ptrdiff_t x) {
+inline int BoxFilterPass2Kernel(const Pixel src, const uint16_t* const ma343[3],
+                                const uint16_t* const ma444,
+                                const uint32_t* const b343[3],
+                                const uint32_t* const b444, const ptrdiff_t x) {
   const uint32_t ma = ma343[0][x] + ma444[x] + ma343[2][x];
   const uint32_t b = b343[0][x] + b444[x] + b343[2][x];
   return CalculateFilteredOutput<Pixel>(src, ma, b, 5);
@@ -441,37 +468,90 @@
   return SelfGuidedFinal<bitdepth, Pixel>(src, v);
 }
 
-template <typename T>
-void Circulate3PointersBy1(T* p[3]) {
-  T* const p0 = p[0];
-  p[0] = p[1];
-  p[1] = p[2];
-  p[2] = p0;
+template <int bitdepth, typename Pixel>
+inline void BoxFilterPass1(const Pixel* const src, const ptrdiff_t stride,
+                           uint16_t* const sum5[5],
+                           uint32_t* const square_sum5[5], const int width,
+                           const uint32_t scale, const int16_t w0,
+                           SgrBuffer* const sgr_buffer,
+                           uint16_t* const ma565[2], uint32_t* const b565[2],
+                           Pixel* dst) {
+  BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+                                 ma565[1], b565[1]);
+  int x = 0;
+  do {
+    int p[2];
+    BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p);
+    dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0);
+    dst[stride + x] =
+        SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[stride + x], p[1], w0);
+  } while (++x != width);
 }
 
-template <typename T>
-void Circulate4PointersBy2(T* p[4]) {
-  std::swap(p[0], p[2]);
-  std::swap(p[1], p[3]);
+template <int bitdepth, typename Pixel>
+inline void BoxFilterPass2(const Pixel* const src, const Pixel* const src0,
+                           const int width, const uint16_t scale,
+                           const int16_t w0, uint16_t* const sum3[4],
+                           uint32_t* const square_sum3[4],
+                           SgrBuffer* const sgr_buffer,
+                           uint16_t* const ma343[4], uint16_t* const ma444[3],
+                           uint32_t* const b343[4], uint32_t* const b444[3],
+                           Pixel* dst) {
+  BoxSum<Pixel, 3>(src0, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
+  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
+                                 sgr_buffer, ma343[2], b343[2], ma444[1],
+                                 b444[1]);
+  int x = 0;
+  do {
+    const int p =
+        BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
+    dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
+  } while (++x != width);
 }
 
-template <typename T>
-void Circulate5PointersBy2(T* p[5]) {
-  T* const p0 = p[0];
-  T* const p1 = p[1];
-  p[0] = p[2];
-  p[1] = p[3];
-  p[2] = p[4];
-  p[3] = p0;
-  p[4] = p1;
+template <int bitdepth, typename Pixel>
+inline void BoxFilter(const Pixel* const src, const ptrdiff_t stride,
+                      uint16_t* const sum3[4], uint16_t* const sum5[5],
+                      uint32_t* const square_sum3[4],
+                      uint32_t* const square_sum5[5], const int width,
+                      const uint16_t scales[2], const int16_t w0,
+                      const int16_t w2, SgrBuffer* const sgr_buffer,
+                      uint16_t* const ma343[4], uint16_t* const ma444[3],
+                      uint16_t* const ma565[2], uint32_t* const b343[4],
+                      uint32_t* const b444[3], uint32_t* const b565[2],
+                      Pixel* dst) {
+  BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+                                 sgr_buffer, ma565[1], b565[1]);
+  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], true,
+                                 sgr_buffer, ma343[2], b343[2], ma444[1],
+                                 b444[1]);
+  BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
+                                 true, sgr_buffer, ma343[3], b343[3], ma444[2],
+                                 b444[2]);
+  int x = 0;
+  do {
+    int p[2][2];
+    BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p[0]);
+    p[1][0] =
+        BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
+    p[1][1] = BoxFilterPass2Kernel<Pixel>(src[stride + x], ma343 + 1, ma444[1],
+                                          b343 + 1, b444[1], x);
+    dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0],
+                                                         p[1][0], w0, w2);
+    dst[stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(
+        src[stride + x], p[0][1], p[1][1], w0, w2);
+  } while (++x != width);
 }
 
 template <int bitdepth, typename Pixel>
 inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
-                             const Pixel* src, const ptrdiff_t src_stride,
+                             const Pixel* src, const ptrdiff_t stride,
+                             const Pixel* const top_border,
+                             const ptrdiff_t top_border_stride,
+                             const Pixel* bottom_border,
+                             const ptrdiff_t bottom_border_stride,
                              const int width, const int height,
-                             SgrBuffer* const sgr_buffer, Pixel* dst,
-                             const ptrdiff_t dst_stride) {
+                             SgrBuffer* const sgr_buffer, Pixel* dst) {
   const auto temp_stride = Align<ptrdiff_t>(width, 8);
   const ptrdiff_t sum_stride = temp_stride + 8;
   const int sgr_proj_index = restoration_info.sgr_proj_info.index;
@@ -509,10 +589,15 @@
   b565[1] = b565[0] + temp_stride;
   assert(scales[0] != 0);
   assert(scales[1] != 0);
-  BoxSum<Pixel>(src - 2 * src_stride - 3, src_stride, 4, width + 2, sum3[0],
-                sum5[1], square_sum3[0], square_sum5[1], sum_stride);
-  memcpy(sum5[0], sum5[1], sizeof(**sum5) * sum_stride);
-  memcpy(square_sum5[0], square_sum5[1], sizeof(**square_sum5) * sum_stride);
+  BoxSum<Pixel>(top_border, top_border_stride, 2, width + 2, sum3, sum5 + 1,
+                square_sum3, square_sum5 + 1);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  BoxSum<Pixel>(src, stride, 1, width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
+                square_sum5 + 3);
+  const Pixel* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSum<Pixel>(s, 0, 1, width + 2, sum3 + 3, sum5 + 4, square_sum3 + 3,
+                square_sum5 + 4);
   BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
                                  sgr_buffer, ma565[0], b565[0]);
   BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
@@ -521,38 +606,21 @@
   BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
                                  true, sgr_buffer, ma343[1], b343[1], ma444[0],
                                  b444[0]);
-  for (int y = height >> 1; y != 0; --y) {
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
     Circulate4PointersBy2<uint16_t>(sum3);
     Circulate4PointersBy2<uint32_t>(square_sum3);
     Circulate5PointersBy2<uint16_t>(sum5);
     Circulate5PointersBy2<uint32_t>(square_sum5);
-    BoxSum<Pixel>(src + 2 * src_stride - 3, src_stride, 1, width + 2, sum3[2],
-                  sum5[3], square_sum3[2], square_sum5[3], sum_stride);
-    BoxSum<Pixel>(src + 3 * src_stride - 3, src_stride, 1, width + 2, sum3[3],
-                  sum5[4], square_sum3[3], square_sum5[4], sum_stride);
-    BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
-                                   sgr_buffer, ma565[1], b565[1]);
-    BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], true,
-                                   sgr_buffer, ma343[2], b343[2], ma444[1],
-                                   b444[1]);
-    BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
-                                   true, sgr_buffer, ma343[3], b343[3],
-                                   ma444[2], b444[2]);
-    int x = 0;
-    do {
-      int p[2][2];
-      BoxFilterPass<Pixel>(src[x], src[src_stride + x], ma565, b565, x, p[0]);
-      p[1][0] =
-          BoxFilterPass2<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
-      p[1][1] = BoxFilterPass2<Pixel>(src[src_stride + x], ma343 + 1, ma444[1],
-                                      b343 + 1, b444[1], x);
-      dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0],
-                                                           p[1][0], w0, w2);
-      dst[dst_stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(
-          src[src_stride + x], p[0][1], p[1][1], w0, w2);
-    } while (++x != width);
-    src += 2 * src_stride;
-    dst += 2 * dst_stride;
+    BoxSum<Pixel>(src + 2 * stride, stride, 2, width + 2, sum3 + 2, sum5 + 3,
+                  square_sum3 + 2, square_sum5 + 3);
+    BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
+                               square_sum5, width, scales, w0, w2, sgr_buffer,
+                               ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
     Circulate4PointersBy2<uint16_t>(ma343);
     Circulate4PointersBy2<uint32_t>(b343);
     std::swap(ma444[0], ma444[2]);
@@ -560,15 +628,48 @@
     std::swap(ma565[0], ma565[1]);
     std::swap(b565[0], b565[1]);
   }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const Pixel* sr;
+    ptrdiff_t s_stride;
+    if ((height & 1) == 0) {
+      sr = bottom_border;
+      s_stride = bottom_border_stride;
+    } else {
+      sr = src + 2 * stride;
+      s_stride = bottom_border - (src + 2 * stride);
+    }
+    BoxSum<Pixel>(sr, s_stride, 2, width + 2, sum3 + 2, sum5 + 3,
+                  square_sum3 + 2, square_sum5 + 3);
+    BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
+                               square_sum5, width, scales, w0, w2, sgr_buffer,
+                               ma343, ma444, ma565, b343, b444, b565, dst);
+  }
   if ((height & 1) != 0) {
-    Circulate4PointersBy2<uint16_t>(sum3);
-    Circulate4PointersBy2<uint32_t>(square_sum3);
-    Circulate5PointersBy2<uint16_t>(sum5);
-    Circulate5PointersBy2<uint32_t>(square_sum5);
-    BoxSum<Pixel>(src + 2 * src_stride - 3, src_stride, 1, width + 2, sum3[2],
-                  sum5[3], square_sum3[2], square_sum5[3], sum_stride);
-    memcpy(sum5[4], sum5[3], sizeof(**sum5) * sum_stride);
-    memcpy(square_sum5[4], square_sum5[3], sizeof(**square_sum5) * sum_stride);
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxSum<Pixel>(bottom_border + bottom_border_stride, bottom_border_stride, 1,
+                  width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
+                  square_sum5 + 3);
+    sum5[4] = sum5[3];
+    square_sum5[4] = square_sum5[3];
     BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
                                    sgr_buffer, ma565[1], b565[1]);
     BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
@@ -578,8 +679,8 @@
     do {
       const int p0 = CalculateFilteredOutput<Pixel>(
           src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
-      const int p1 =
-          BoxFilterPass2<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
+      const int p1 = BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343,
+                                                 b444[0], x);
       dst[x] =
           SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p0, p1, w0, w2);
     } while (++x != width);
@@ -588,14 +689,17 @@
 
 template <int bitdepth, typename Pixel>
 inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
-                                  const Pixel* src, const ptrdiff_t src_stride,
+                                  const Pixel* src, const ptrdiff_t stride,
+                                  const Pixel* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const Pixel* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
                                   const int width, const int height,
-                                  SgrBuffer* const sgr_buffer, Pixel* dst,
-                                  const ptrdiff_t dst_stride) {
+                                  SgrBuffer* const sgr_buffer, Pixel* dst) {
   const auto temp_stride = Align<ptrdiff_t>(width, 8);
   const ptrdiff_t sum_stride = temp_stride + 8;
   const int sgr_proj_index = restoration_info.sgr_proj_info.index;
-  const uint32_t s = kSgrScaleParameter[sgr_proj_index][0];  // s < 2^12.
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
   const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
   uint16_t *sum5[5], *ma565[2];
   uint32_t *square_sum5[5], *b565[2];
@@ -609,43 +713,63 @@
   ma565[1] = ma565[0] + temp_stride;
   b565[0] = sgr_buffer->b565;
   b565[1] = b565[0] + temp_stride;
-  assert(s != 0);
-  BoxSum<Pixel, 5>(src - 2 * src_stride - 3, src_stride, 4, width + 2, sum5[1],
-                   square_sum5[1], sum_stride);
-  memcpy(sum5[0], sum5[1], sizeof(**sum5) * sum_stride);
-  memcpy(square_sum5[0], square_sum5[1], sizeof(**square_sum5) * sum_stride);
-  BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, s, sgr_buffer,
+  assert(scale != 0);
+  BoxSum<Pixel, 5>(top_border, top_border_stride, 2, width + 2, sum5 + 1,
+                   square_sum5 + 1);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  BoxSum<Pixel, 5>(src, stride, 1, width + 2, sum5 + 3, square_sum5 + 3);
+  const Pixel* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSum<Pixel, 5>(s, 0, 1, width + 2, sum5 + 4, square_sum5 + 4);
+  BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
                                  ma565[0], b565[0]);
-  for (int y = height >> 1; y != 0; --y) {
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
     Circulate5PointersBy2<uint16_t>(sum5);
     Circulate5PointersBy2<uint32_t>(square_sum5);
-    BoxSum<Pixel, 5>(src + 2 * src_stride - 3, src_stride, 1, width + 2,
-                     sum5[3], square_sum5[3], sum_stride);
-    BoxSum<Pixel, 5>(src + 3 * src_stride - 3, src_stride, 1, width + 2,
-                     sum5[4], square_sum5[4], sum_stride);
-    BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, s, sgr_buffer,
-                                   ma565[1], b565[1]);
-    int x = 0;
-    do {
-      int p[2];
-      BoxFilterPass<Pixel>(src[x], src[src_stride + x], ma565, b565, x, p);
-      dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0);
-      dst[dst_stride + x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(
-          src[src_stride + x], p[1], w0);
-    } while (++x != width);
-    src += 2 * src_stride;
-    dst += 2 * dst_stride;
+    BoxSum<Pixel, 5>(src + 2 * stride, stride, 2, width + 2, sum5 + 3,
+                     square_sum5 + 3);
+    BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
+                                    scale, w0, sgr_buffer, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
     std::swap(ma565[0], ma565[1]);
     std::swap(b565[0], b565[1]);
   }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const Pixel* sr;
+    ptrdiff_t s_stride;
+    if ((height & 1) == 0) {
+      sr = bottom_border;
+      s_stride = bottom_border_stride;
+    } else {
+      sr = src + 2 * stride;
+      s_stride = bottom_border - (src + 2 * stride);
+    }
+    BoxSum<Pixel, 5>(sr, s_stride, 2, width + 2, sum5 + 3, square_sum5 + 3);
+    BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
+                                    scale, w0, sgr_buffer, ma565, b565, dst);
+  }
   if ((height & 1) != 0) {
-    Circulate5PointersBy2<uint16_t>(sum5);
-    Circulate5PointersBy2<uint32_t>(square_sum5);
-    BoxSum<Pixel, 5>(src + 2 * src_stride - 3, src_stride, 1, width + 2,
-                     sum5[3], square_sum5[3], sum_stride);
-    memcpy(sum5[4], sum5[3], sizeof(**sum5) * sum_stride);
-    memcpy(square_sum5[4], square_sum5[3], sizeof(**square_sum5) * sum_stride);
-    BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, s, sgr_buffer,
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxSum<Pixel, 5>(bottom_border + bottom_border_stride, bottom_border_stride,
+                     1, width + 2, sum5 + 3, square_sum5 + 3);
+    sum5[4] = sum5[3];
+    square_sum5[4] = square_sum5[3];
+    BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
                                    ma565[1], b565[1]);
     int x = 0;
     do {
@@ -658,17 +782,20 @@
 
 template <int bitdepth, typename Pixel>
 inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
-                                  const Pixel* src, const ptrdiff_t src_stride,
+                                  const Pixel* src, const ptrdiff_t stride,
+                                  const Pixel* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const Pixel* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
                                   const int width, const int height,
-                                  SgrBuffer* const sgr_buffer, Pixel* dst,
-                                  const ptrdiff_t dst_stride) {
+                                  SgrBuffer* const sgr_buffer, Pixel* dst) {
   assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
   const auto temp_stride = Align<ptrdiff_t>(width, 8);
   const ptrdiff_t sum_stride = temp_stride + 8;
   const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
   const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
   const int sgr_proj_index = restoration_info.sgr_proj_info.index;
-  const uint32_t s = kSgrScaleParameter[sgr_proj_index][1];  // s < 2^12.
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
   uint16_t *sum3[3], *ma343[3], *ma444[2];
   uint32_t *square_sum3[3], *b343[3], *b444[2];
   sum3[0] = sgr_buffer->sum3;
@@ -685,34 +812,52 @@
   ma444[1] = ma444[0] + temp_stride;
   b444[0] = sgr_buffer->b444;
   b444[1] = b444[0] + temp_stride;
-  assert(s != 0);
-  BoxSum<Pixel, 3>(src - 2 * src_stride - 2, src_stride, 3, width + 2, sum3[0],
-                   square_sum3[0], sum_stride);
-  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, s, false, sgr_buffer,
-                                 ma343[0], b343[0], nullptr, nullptr);
+  assert(scale != 0);
+  BoxSum<Pixel, 3>(top_border, top_border_stride, 2, width + 2, sum3,
+                   square_sum3);
+  BoxSum<Pixel, 3>(src, stride, 1, width + 2, sum3 + 2, square_sum3 + 2);
+  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, false,
+                                 sgr_buffer, ma343[0], b343[0], nullptr,
+                                 nullptr);
   Circulate3PointersBy1<uint16_t>(sum3);
   Circulate3PointersBy1<uint32_t>(square_sum3);
-  BoxSum<Pixel, 3>(src + src_stride - 2, src_stride, 1, width + 2, sum3[2],
-                   square_sum3[2], sum_stride);
-  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, s, true, sgr_buffer,
-                                 ma343[1], b343[1], ma444[0], b444[0]);
-  int y = height;
+  const Pixel* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSum<Pixel, 3>(s, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
+  BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
+                                 sgr_buffer, ma343[1], b343[1], ma444[0],
+                                 b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2<bitdepth, Pixel>(src + 2, src + 2 * stride, width, scale, w0,
+                                    sum3, square_sum3, sgr_buffer, ma343, ma444,
+                                    b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  src += 2;
+  int y = std::min(height, 2);
   do {
     Circulate3PointersBy1<uint16_t>(sum3);
     Circulate3PointersBy1<uint32_t>(square_sum3);
-    BoxSum<Pixel, 3>(src + 2 * src_stride - 2, src_stride, 1, width + 2,
-                     sum3[2], square_sum3[2], sum_stride);
-    BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, s, true,
-                                   sgr_buffer, ma343[2], b343[2], ma444[1],
-                                   b444[1]);
-    int x = 0;
-    do {
-      const int p =
-          BoxFilterPass2<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
-      dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
-    } while (++x != width);
-    src += src_stride;
-    dst += dst_stride;
+    BoxFilterPass2<bitdepth, Pixel>(src, bottom_border, width, scale, w0, sum3,
+                                    square_sum3, sgr_buffer, ma343, ma444, b343,
+                                    b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
     Circulate3PointersBy1<uint16_t>(ma343);
     Circulate3PointersBy1<uint32_t>(b343);
     std::swap(ma444[0], ma444[1]);
@@ -721,32 +866,35 @@
 }
 
 template <int bitdepth, typename Pixel>
-void SelfGuidedFilter_C(const void* const source, void* const dest,
-                        const RestorationUnitInfo& restoration_info,
-                        ptrdiff_t source_stride, ptrdiff_t dest_stride,
-                        int width, int height,
-                        RestorationBuffer* const restoration_buffer) {
+void SelfGuidedFilter_C(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const ptrdiff_t stride, const void* const top_border,
+    const ptrdiff_t top_border_stride, const void* const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
   const int index = restoration_info.sgr_proj_info.index;
   const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
   const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
   const auto* src = static_cast<const Pixel*>(source);
+  const auto* top = static_cast<const Pixel*>(top_border);
+  const auto* bottom = static_cast<const Pixel*>(bottom_border);
   auto* dst = static_cast<Pixel*>(dest);
   SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
   if (radius_pass_1 == 0) {
     // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
     // following assertion.
     assert(radius_pass_0 != 0);
-    BoxFilterProcessPass1<bitdepth, Pixel>(restoration_info, src, source_stride,
-                                           width, height, sgr_buffer, dst,
-                                           dest_stride);
+    BoxFilterProcessPass1<bitdepth, Pixel>(
+        restoration_info, src - 3, stride, top - 3, top_border_stride,
+        bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
   } else if (radius_pass_0 == 0) {
-    BoxFilterProcessPass2<bitdepth, Pixel>(restoration_info, src, source_stride,
-                                           width, height, sgr_buffer, dst,
-                                           dest_stride);
+    BoxFilterProcessPass2<bitdepth, Pixel>(
+        restoration_info, src - 2, stride, top - 2, top_border_stride,
+        bottom - 2, bottom_border_stride, width, height, sgr_buffer, dst);
   } else {
-    BoxFilterProcess<bitdepth, Pixel>(restoration_info, src, source_stride,
-                                      width, height, sgr_buffer, dst,
-                                      dest_stride);
+    BoxFilterProcess<bitdepth, Pixel>(
+        restoration_info, src - 3, stride, top - 3, top_border_stride,
+        bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
   }
 }
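
The alignas(16) change to kSgrMaLookup above only affects the table's placement; its 256 entries still follow the derivation quoted in the comment next to it. A standalone sketch that regenerates them (illustrative only; it assumes kSgrProjSgrBits == 8, which is implied by the 256-based arithmetic in that comment):

#include <cstdio>

int main() {
  const int kSgrProjSgrBits = 8;  // Assumption: matches the 256 scale in the comment.
  // Expected first entries: 255 128 85 64 51 43 37 32 ..., matching kSgrMaLookup.
  for (int z = 0; z < 256; ++z) {
    int a2;
    if (z >= 255) {
      a2 = 256;
    } else if (z == 0) {
      a2 = 1;
    } else {
      a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
    }
    std::printf("%d%c", 256 - a2, (z % 16 == 15) ? '\n' : ' ');
  }
  return 0;
}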
 
diff --git a/libgav1/src/dsp/loop_restoration.h b/libgav1/src/dsp/loop_restoration.h
index a902e9b..de80926 100644
--- a/libgav1/src/dsp/loop_restoration.h
+++ b/libgav1/src/dsp/loop_restoration.h
@@ -30,6 +30,7 @@
 // The order of includes is important as each tests for a superior version
 // before setting the base.
 // clang-format off
+#include "src/dsp/x86/loop_restoration_avx2.h"
 #include "src/dsp/x86/loop_restoration_sse4.h"
 // clang-format on
 
@@ -53,6 +54,31 @@
 // Initializes Dsp::loop_restorations. This function is not thread-safe.
 void LoopRestorationInit_C();
 
+template <typename T>
+void Circulate3PointersBy1(T* p[3]) {
+  T* const p0 = p[0];
+  p[0] = p[1];
+  p[1] = p[2];
+  p[2] = p0;
+}
+
+template <typename T>
+void Circulate4PointersBy2(T* p[4]) {
+  std::swap(p[0], p[2]);
+  std::swap(p[1], p[3]);
+}
+
+template <typename T>
+void Circulate5PointersBy2(T* p[5]) {
+  T* const p0 = p[0];
+  T* const p1 = p[1];
+  p[0] = p[2];
+  p[1] = p[3];
+  p[2] = p[4];
+  p[3] = p0;
+  p[4] = p1;
+}
+
 }  // namespace dsp
 }  // namespace libgav1
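
The Circulate*PointersBy* helpers that now live in this header rotate a small ring of row-pointer buffers, so the sliding windows of box sums can advance by one or two rows without copying any row data. A minimal standalone sketch of the behaviour (illustrative only, reusing the 3-pointer helper defined above):

#include <cstdint>

template <typename T>
void Circulate3PointersBy1(T* p[3]) {
  T* const p0 = p[0];
  p[0] = p[1];
  p[1] = p[2];
  p[2] = p0;
}

int main() {
  uint16_t row0[8] = {}, row1[8] = {}, row2[8] = {};
  uint16_t* sum3[3] = {row0, row1, row2};
  // Advance the 3-row window by one row: {row0, row1, row2} -> {row1, row2, row0}.
  Circulate3PointersBy1(sum3);
  // sum3[2] (the recycled row0 buffer) can now receive the next row of box sums.
  return (sum3[0] == row1 && sum3[2] == row0) ? 0 : 1;
}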
 
diff --git a/libgav1/src/dsp/mask_blend.cc b/libgav1/src/dsp/mask_blend.cc
index 101c410..15ef821 100644
--- a/libgav1/src/dsp/mask_blend.cc
+++ b/libgav1/src/dsp/mask_blend.cc
@@ -25,8 +25,8 @@
 namespace dsp {
 namespace {
 
-template <int subsampling_x, int subsampling_y>
-uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x) {
+uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x,
+                     int subsampling_x, int subsampling_y) {
   if ((subsampling_x | subsampling_y) == 0) {
     return mask[x];
   }
@@ -63,7 +63,7 @@
   for (int y = 0; y < height; ++y) {
     for (int x = 0; x < width; ++x) {
       const uint8_t mask_value =
-          GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x);
+          GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y);
       if (is_inter_intra) {
         dst[x] = static_cast<Pixel>(RightShiftWithRounding(
             mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6));
@@ -96,7 +96,7 @@
   for (int y = 0; y < height; ++y) {
     for (int x = 0; x < width; ++x) {
       const uint8_t mask_value =
-          GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x);
+          GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y);
       prediction_1[x] = static_cast<uint8_t>(RightShiftWithRounding(
           mask_value * prediction_1[x] + (64 - mask_value) * prediction_0[x],
           6));
@@ -148,6 +148,7 @@
 #ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
   dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>;
 #endif
+  static_cast<void>(GetMaskValue);
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
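
For the inter-intra branch shown in the hunk above, the blend is a 6-bit weighted average of the two predictions. A worked scalar example with hypothetical sample values (not taken from the patch):

#include <cstdio>

int main() {
  // dst = RightShiftWithRounding(mask * pred_1 + (64 - mask) * pred_0, 6).
  const int mask_value = 48;  // hypothetical mask sample in [0, 64]
  const int pred_0 = 100;     // hypothetical predictions
  const int pred_1 = 200;
  const int sum = mask_value * pred_1 + (64 - mask_value) * pred_0;  // 11200
  const int dst = (sum + (1 << 5)) >> 6;  // rounds to 175
  std::printf("blended = %d\n", dst);
  return 0;
}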
 
diff --git a/libgav1/src/dsp/super_res.cc b/libgav1/src/dsp/super_res.cc
index 9379f46..abb01a1 100644
--- a/libgav1/src/dsp/super_res.cc
+++ b/libgav1/src/dsp/super_res.cc
@@ -25,47 +25,57 @@
 namespace {
 
 template <int bitdepth, typename Pixel>
-void ComputeSuperRes(const void* source, const int upscaled_width,
-                     const int initial_subpixel_x, const int step,
-                     void* const dest) {
-  // If (original) upscaled_width is <= 9, the downscaled_width may be
-  // upscaled_width - 1 (i.e. 8, 9), and become the same (i.e. 4) when
-  // subsampled via RightShiftWithRounding. This leads to an edge case where
-  // |step| == 1 << 14.
-  assert(step <= kSuperResScaleMask || upscaled_width <= 4);
-  const auto* src = static_cast<const Pixel*>(source);
+void SuperRes_C(const void* /*coefficients*/, void* const source,
+                const ptrdiff_t source_stride, const int height,
+                const int downscaled_width, const int upscaled_width,
+                const int initial_subpixel_x, const int step, void* const dest,
+                ptrdiff_t dest_stride) {
+  assert(step <= 1 << kSuperResScaleBits);
+  auto* src = static_cast<Pixel*>(source) - DivideBy2(kSuperResFilterTaps);
   auto* dst = static_cast<Pixel*>(dest);
-  src -= DivideBy2(kSuperResFilterTaps);
-  int subpixel_x = initial_subpixel_x;
-  for (int x = 0; x < upscaled_width; ++x) {
-    int sum = 0;
-    const Pixel* const src_x = &src[subpixel_x >> kSuperResScaleBits];
-    const int src_x_subpixel =
-        (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
-    // The sign of each tap is: - + - + + - + -
-    sum -= src_x[0] * kUpscaleFilterUnsigned[src_x_subpixel][0];
-    sum += src_x[1] * kUpscaleFilterUnsigned[src_x_subpixel][1];
-    sum -= src_x[2] * kUpscaleFilterUnsigned[src_x_subpixel][2];
-    sum += src_x[3] * kUpscaleFilterUnsigned[src_x_subpixel][3];
-    sum += src_x[4] * kUpscaleFilterUnsigned[src_x_subpixel][4];
-    sum -= src_x[5] * kUpscaleFilterUnsigned[src_x_subpixel][5];
-    sum += src_x[6] * kUpscaleFilterUnsigned[src_x_subpixel][6];
-    sum -= src_x[7] * kUpscaleFilterUnsigned[src_x_subpixel][7];
-    dst[x] =
-        Clip3(RightShiftWithRounding(sum, kFilterBits), 0, (1 << bitdepth) - 1);
-    subpixel_x += step;
-  }
+  int y = height;
+  do {
+    ExtendLine<Pixel>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                      kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+    // If (original) upscaled_width is <= 9, the downscaled_width may be
+    // upscaled_width - 1 (i.e. 8, 9), and become the same (i.e. 4) when
+    // subsampled via RightShiftWithRounding. This leads to an edge case where
+    // |step| == 1 << 14.
+    int subpixel_x = initial_subpixel_x;
+    int x = 0;
+    do {
+      int sum = 0;
+      const Pixel* const src_x = &src[subpixel_x >> kSuperResScaleBits];
+      const int src_x_subpixel =
+          (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
+      // The sign of each tap is: - + - + + - + -
+      sum -= src_x[0] * kUpscaleFilterUnsigned[src_x_subpixel][0];
+      sum += src_x[1] * kUpscaleFilterUnsigned[src_x_subpixel][1];
+      sum -= src_x[2] * kUpscaleFilterUnsigned[src_x_subpixel][2];
+      sum += src_x[3] * kUpscaleFilterUnsigned[src_x_subpixel][3];
+      sum += src_x[4] * kUpscaleFilterUnsigned[src_x_subpixel][4];
+      sum -= src_x[5] * kUpscaleFilterUnsigned[src_x_subpixel][5];
+      sum += src_x[6] * kUpscaleFilterUnsigned[src_x_subpixel][6];
+      sum -= src_x[7] * kUpscaleFilterUnsigned[src_x_subpixel][7];
+      dst[x] = Clip3(RightShiftWithRounding(sum, kFilterBits), 0,
+                     (1 << bitdepth) - 1);
+      subpixel_x += step;
+    } while (++x < upscaled_width);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
 }
 
 void Init8bpp() {
   Dsp* dsp = dsp_internal::GetWritableDspTable(8);
   assert(dsp != nullptr);
+  dsp->super_res_coefficients = nullptr;
 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
-  dsp->super_res_row = ComputeSuperRes<8, uint8_t>;
+  dsp->super_res = SuperRes_C<8, uint8_t>;
 #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
   static_cast<void>(dsp);
 #ifndef LIBGAV1_Dsp8bpp_SuperRes
-  dsp->super_res_row = ComputeSuperRes<8, uint8_t>;
+  dsp->super_res = SuperRes_C<8, uint8_t>;
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
@@ -74,12 +84,13 @@
 void Init10bpp() {
   Dsp* dsp = dsp_internal::GetWritableDspTable(10);
   assert(dsp != nullptr);
+  dsp->super_res_coefficients = nullptr;
 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
-  dsp->super_res_row = ComputeSuperRes<10, uint16_t>;
+  dsp->super_res = SuperRes_C<10, uint16_t>;
 #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
   static_cast<void>(dsp);
 #ifndef LIBGAV1_Dsp10bpp_SuperRes
-  dsp->super_res_row = ComputeSuperRes<10, uint16_t>;
+  dsp->super_res = SuperRes_C<10, uint16_t>;
 #endif
 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
 }
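
Inside SuperRes_C above, each output pixel splits the fixed-point position subpixel_x into an integer source offset (subpixel_x >> kSuperResScaleBits) and a filter phase ((subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits). A standalone sketch of that split; the constant values 14 and 8 and the example initial_subpixel_x/step are assumptions for illustration, not taken from this patch:

#include <cstdio>

int main() {
  const int kSuperResScaleBits = 14;  // assumed
  const int kSuperResScaleMask = (1 << kSuperResScaleBits) - 1;
  const int kSuperResExtraBits = 8;   // assumed
  const int initial_subpixel_x = 1 << (kSuperResScaleBits - 1);  // hypothetical
  const int step = 3 << (kSuperResScaleBits - 2);  // hypothetical: 0.75 source px per output px
  int subpixel_x = initial_subpixel_x;
  for (int x = 0; x < 4; ++x) {
    const int src_offset = subpixel_x >> kSuperResScaleBits;
    const int filter_phase = (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
    std::printf("x=%d src_offset=%d phase=%d\n", x, src_offset, filter_phase);
    subpixel_x += step;
  }
  return 0;
}
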
diff --git a/libgav1/src/dsp/super_res.h b/libgav1/src/dsp/super_res.h
index cd69474..2ca9d2b 100644
--- a/libgav1/src/dsp/super_res.h
+++ b/libgav1/src/dsp/super_res.h
@@ -38,7 +38,7 @@
 namespace libgav1 {
 namespace dsp {
 
-// Initializes Dsp::super_res_row. This function is not thread-safe.
+// Initializes Dsp::super_res. This function is not thread-safe.
 void SuperResInit_C();
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/x86/average_blend_sse4.cc b/libgav1/src/dsp/x86/average_blend_sse4.cc
index 6c37658..ec9f589 100644
--- a/libgav1/src/dsp/x86/average_blend_sse4.cc
+++ b/libgav1/src/dsp/x86/average_blend_sse4.cc
@@ -15,7 +15,7 @@
 #include "src/dsp/average_blend.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <xmmintrin.h>
 
@@ -30,6 +30,7 @@
 
 namespace libgav1 {
 namespace dsp {
+namespace low_bitdepth {
 namespace {
 
 constexpr int kInterPostRoundBit = 4;
@@ -138,13 +139,232 @@
 }
 
 }  // namespace
+}  // namespace low_bitdepth
 
-void AverageBlendInit_SSE4_1() { Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kInterPostRoundBitPlusOne = 5;
+
+template <const int width, const int offset>
+inline void AverageBlendRow(const uint16_t* prediction_0,
+                            const uint16_t* prediction_1,
+                            const __m128i& compound_offset,
+                            const __m128i& round_offset, const __m128i& max,
+                            const __m128i& zero, uint16_t* dst,
+                            const ptrdiff_t dest_stride) {
+  // pred_0/1 max range is 16b.
+  const __m128i pred_0 = LoadUnaligned16(prediction_0 + offset);
+  const __m128i pred_1 = LoadUnaligned16(prediction_1 + offset);
+  const __m128i pred_00 = _mm_cvtepu16_epi32(pred_0);
+  const __m128i pred_01 = _mm_unpackhi_epi16(pred_0, zero);
+  const __m128i pred_10 = _mm_cvtepu16_epi32(pred_1);
+  const __m128i pred_11 = _mm_unpackhi_epi16(pred_1, zero);
+
+  const __m128i pred_add_0 = _mm_add_epi32(pred_00, pred_10);
+  const __m128i pred_add_1 = _mm_add_epi32(pred_01, pred_11);
+  const __m128i compound_offset_0 = _mm_sub_epi32(pred_add_0, compound_offset);
+  const __m128i compound_offset_1 = _mm_sub_epi32(pred_add_1, compound_offset);
+  // RightShiftWithRounding and Clip3.
+  const __m128i round_0 = _mm_add_epi32(compound_offset_0, round_offset);
+  const __m128i round_1 = _mm_add_epi32(compound_offset_1, round_offset);
+  const __m128i res_0 = _mm_srai_epi32(round_0, kInterPostRoundBitPlusOne);
+  const __m128i res_1 = _mm_srai_epi32(round_1, kInterPostRoundBitPlusOne);
+  const __m128i result = _mm_min_epi16(_mm_packus_epi32(res_0, res_1), max);
+  if (width != 4) {
+    // Store width=8/16/32/64/128.
+    StoreUnaligned16(dst + offset, result);
+    return;
+  }
+  assert(width == 4);
+  StoreLo8(dst, result);
+  StoreHi8(dst + dest_stride, result);
+}
+
+void AverageBlend10bpp_SSE4_1(const void* prediction_0,
+                              const void* prediction_1, const int width,
+                              const int height, void* const dest,
+                              const ptrdiff_t dst_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dest_stride = dst_stride / sizeof(dst[0]);
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  const __m128i compound_offset =
+      _mm_set1_epi32(kCompoundOffset + kCompoundOffset);
+  const __m128i round_offset =
+      _mm_set1_epi32((1 << kInterPostRoundBitPlusOne) >> 1);
+  const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1);
+  const __m128i zero = _mm_setzero_si128();
+  int y = height;
+
+  if (width == 4) {
+    const ptrdiff_t dest_stride2 = dest_stride << 1;
+    const ptrdiff_t width2 = width << 1;
+    do {
+      // row0,1
+      AverageBlendRow<4, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      dst += dest_stride2;
+      pred_0 += width2;
+      pred_1 += width2;
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+  if (width == 8) {
+    const ptrdiff_t dest_stride2 = dest_stride << 1;
+    const ptrdiff_t width2 = width << 1;
+    do {
+      // row0.
+      AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      // row1.
+      AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset,
+                            round_offset, max, zero, dst + dest_stride,
+                            dest_stride);
+      dst += dest_stride2;
+      pred_0 += width2;
+      pred_1 += width2;
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+  if (width == 16) {
+    const ptrdiff_t dest_stride2 = dest_stride << 1;
+    const ptrdiff_t width2 = width << 1;
+    do {
+      // row0.
+      AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      // row1.
+      AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset,
+                            round_offset, max, zero, dst + dest_stride,
+                            dest_stride);
+      AverageBlendRow<8, 8>(pred_0 + width, pred_1 + width, compound_offset,
+                            round_offset, max, zero, dst + dest_stride,
+                            dest_stride);
+      dst += dest_stride2;
+      pred_0 += width2;
+      pred_1 += width2;
+      y -= 2;
+    } while (y != 0);
+    return;
+  }
+  if (width == 32) {
+    do {
+      // pred [0 - 15].
+      AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      // pred [16 - 31].
+      AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      dst += dest_stride;
+      pred_0 += width;
+      pred_1 += width;
+    } while (--y != 0);
+    return;
+  }
+  if (width == 64) {
+    do {
+      // pred [0 - 31].
+      AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+      AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      // pred [32 - 63].
+      AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max,
+                             zero, dst, dest_stride);
+      dst += dest_stride;
+      pred_0 += width;
+      pred_1 += width;
+    } while (--y != 0);
+    return;
+  }
+  assert(width == 128);
+  do {
+    // pred [0 - 31].
+    AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+                          zero, dst, dest_stride);
+    AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+                          zero, dst, dest_stride);
+    AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    // pred [32 - 63].
+    AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+
+    // pred [64 - 95].
+    AverageBlendRow<8, 64>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 72>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 80>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 88>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    // pred [96 - 127].
+    AverageBlendRow<8, 96>(pred_0, pred_1, compound_offset, round_offset, max,
+                           zero, dst, dest_stride);
+    AverageBlendRow<8, 104>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+    AverageBlendRow<8, 112>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+    AverageBlendRow<8, 120>(pred_0, pred_1, compound_offset, round_offset, max,
+                            zero, dst, dest_stride);
+    dst += dest_stride;
+    pred_0 += width;
+    pred_1 += width;
+  } while (--y != 0);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(AverageBlend)
+  dsp->average_blend = AverageBlend10bpp_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void AverageBlendInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 
 namespace libgav1 {
 namespace dsp {
@@ -153,4 +373,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/average_blend_sse4.h b/libgav1/src/dsp/x86/average_blend_sse4.h
index e205c2b..cd07112 100644
--- a/libgav1/src/dsp/x86/average_blend_sse4.h
+++ b/libgav1/src/dsp/x86/average_blend_sse4.h
@@ -31,11 +31,15 @@
 
 // If sse4 is enabled and the baseline isn't set due to a higher level of
 // optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+
 #ifndef LIBGAV1_Dsp8bpp_AverageBlend
 #define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_SSE4_1
 #endif
+#ifndef LIBGAV1_Dsp10bpp_AverageBlend
+#define LIBGAV1_Dsp10bpp_AverageBlend LIBGAV1_CPU_SSE4_1
+#endif
 
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
diff --git a/libgav1/src/dsp/x86/cdef_avx2.cc b/libgav1/src/dsp/x86/cdef_avx2.cc
new file mode 100644
index 0000000..d41dc38
--- /dev/null
+++ b/libgav1/src/dsp/x86/cdef_avx2.cc
@@ -0,0 +1,784 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Used when calculating odd |cost[x]| values.
+// Holds elements 1 3 5 7 1 3 5 7 followed by 7 7 7 7 7 7 7 7.
+alignas(32) constexpr uint32_t kCdefDivisionTableOddPairsPadded[] = {
+    420, 210, 140, 105, 420, 210, 140, 105,
+    105, 105, 105, 105, 105, 105, 105, 105};
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+//   for (int j = 0; j < 8; ++j) {
+//     const int x = 1;
+//     partial[0][i + j] += x;
+//     partial[1][i + j / 2] += x;
+//     partial[2][i] += x;
+//     partial[3][3 + i - j / 2] += x;
+//     partial[4][7 + i - j] += x;
+//     partial[5][3 - i / 2 + j] += x;
+//     partial[6][j] += x;
+//     partial[7][i / 2 + j] += x;
+//   }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16  17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25  26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34  35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43  44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52  53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61  62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70  71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m256i* v_src_16,
+                                            __m256i* partial_lo,
+                                            __m256i* partial_hi) {
+  // 00 01 02 03 04 05 06 07
+  *partial_lo = v_src_16[0];
+  // 00 00 00 00 00 00 00 00
+  *partial_hi = _mm256_setzero_si256();
+
+  // 00 10 11 12 13 14 15 16
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[1], 2));
+  // 17 00 00 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[1], 14));
+
+  // 00 00 20 21 22 23 24 25
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[2], 4));
+  // 26 27 00 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[2], 12));
+
+  // 00 00 00 30 31 32 33 34
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[3], 6));
+  // 35 36 37 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[3], 10));
+
+  // 00 00 00 00 40 41 42 43
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[4], 8));
+  // 44 45 46 47 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[4], 8));
+
+  // 00 00 00 00 00 50 51 52
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[5], 10));
+  // 53 54 55 56 57 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[5], 6));
+
+  // 00 00 00 00 00 00 60 61
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[6], 12));
+  // 62 63 64 65 66 67 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[6], 4));
+
+  // 00 00 00 00 00 00 00 70
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[7], 14));
+  // 71 72 73 74 75 76 77 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00  00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00  00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00  00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00  00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3  00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2  F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1  G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0  H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m256i* v_src_16,
+                                            __m256i* partial_lo,
+                                            __m256i* partial_hi) {
+  __m256i v_d1_temp[8];
+  const __m256i v_zero = _mm256_setzero_si256();
+
+  for (int i = 0; i < 8; ++i) {
+    v_d1_temp[i] = _mm256_hadd_epi16(v_src_16[i], v_zero);
+  }
+
+  *partial_lo = *partial_hi = v_zero;
+  // A0 A1 A2 A3 00 00 00 00
+  *partial_lo = _mm256_add_epi16(*partial_lo, v_d1_temp[0]);
+
+  // 00 B0 B1 B2 B3 00 00 00
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[1], 2));
+
+  // 00 00 C0 C1 C2 C3 00 00
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[2], 4));
+  // 00 00 00 D0 D1 D2 D3 00
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[3], 6));
+  // 00 00 00 00 E0 E1 E2 E3
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[4], 8));
+
+  // 00 00 00 00 00 F0 F1 F2
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[5], 10));
+  // F3 00 00 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[5], 6));
+
+  // 00 00 00 00 00 00 G0 G1
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[6], 12));
+  // G2 G3 00 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[6], 4));
+
+  // 00 00 00 00 00 00 00 H0
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[7], 14));
+  // H1 H2 H3 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17  00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26  27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36  37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45  46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55  56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64  65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74  75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D7_D5(__m256i* v_src, __m256i* partial_lo,
+                                            __m256i* partial_hi) {
+  __m256i v_pair_add[4];
+  // Add vertical source pairs.
+  v_pair_add[0] = _mm256_add_epi16(v_src[0], v_src[1]);
+  v_pair_add[1] = _mm256_add_epi16(v_src[2], v_src[3]);
+  v_pair_add[2] = _mm256_add_epi16(v_src[4], v_src[5]);
+  v_pair_add[3] = _mm256_add_epi16(v_src[6], v_src[7]);
+
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  *partial_lo = v_pair_add[0];
+  // 00 00 00 00 00 00 00 00
+  // 00 00 00 00 00 00 00 00
+  *partial_hi = _mm256_setzero_si256();
+
+  // 00 20 21 22 23 24 25 26
+  // 00 30 31 32 33 34 35 36
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[1], 2));
+  // 27 00 00 00 00 00 00 00
+  // 37 00 00 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[1], 14));
+
+  // 00 00 40 41 42 43 44 45
+  // 00 00 50 51 52 53 54 55
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[2], 4));
+  // 46 47 00 00 00 00 00 00
+  // 56 57 00 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[2], 12));
+
+  // 00 00 00 60 61 62 63 64
+  // 00 00 00 70 71 72 73 74
+  *partial_lo =
+      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[3], 6));
+  // 65 66 67 00 00 00 00 00
+  // 75 76 77 00 00 00 00 00
+  *partial_hi =
+      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[3], 10));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride,
+                                      __m256i* partial) {
+  // 8x8 input
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  // 20 21 22 23 24 25 26 27
+  // 30 31 32 33 34 35 36 37
+  // 40 41 42 43 44 45 46 47
+  // 50 51 52 53 54 55 56 57
+  // 60 61 62 63 64 65 66 67
+  // 70 71 72 73 74 75 76 77
+  __m256i v_src[8];
+  for (auto& i : v_src) {
+    i = _mm256_castsi128_si256(LoadLo8(src));
+    // Dup lower lane.
+    i = _mm256_permute2x128_si256(i, i, 0x0);
+    src += stride;
+  }
+
+  const __m256i v_zero = _mm256_setzero_si256();
+  // partial for direction 2
+  // --------------------------------------------------------------------------
+  // partial[2][i] += x;
+  // 00 10 20 30 40 50 60 70  xx xx xx xx xx xx xx xx
+  // 01 11 21 31 41 51 61 71  xx xx xx xx xx xx xx xx
+  // 02 12 22 32 42 52 62 72  xx xx xx xx xx xx xx xx
+  // 03 13 23 33 43 53 63 73  xx xx xx xx xx xx xx xx
+  // 04 14 24 34 44 54 64 74  xx xx xx xx xx xx xx xx
+  // 05 15 25 35 45 55 65 75  xx xx xx xx xx xx xx xx
+  // 06 16 26 36 46 56 66 76  xx xx xx xx xx xx xx xx
+  // 07 17 27 37 47 57 67 77  xx xx xx xx xx xx xx xx
+  const __m256i v_src_4_0 = _mm256_unpacklo_epi64(v_src[0], v_src[4]);
+  const __m256i v_src_5_1 = _mm256_unpacklo_epi64(v_src[1], v_src[5]);
+  const __m256i v_src_6_2 = _mm256_unpacklo_epi64(v_src[2], v_src[6]);
+  const __m256i v_src_7_3 = _mm256_unpacklo_epi64(v_src[3], v_src[7]);
+  const __m256i v_hsum_4_0 = _mm256_sad_epu8(v_src_4_0, v_zero);
+  const __m256i v_hsum_5_1 = _mm256_sad_epu8(v_src_5_1, v_zero);
+  const __m256i v_hsum_6_2 = _mm256_sad_epu8(v_src_6_2, v_zero);
+  const __m256i v_hsum_7_3 = _mm256_sad_epu8(v_src_7_3, v_zero);
+  const __m256i v_hsum_1_0 = _mm256_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
+  const __m256i v_hsum_3_2 = _mm256_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
+  const __m256i v_hsum_5_4 = _mm256_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
+  const __m256i v_hsum_7_6 = _mm256_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
+  partial[2] =
+      _mm256_unpacklo_epi64(_mm256_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
+                            _mm256_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
+
+  const __m256i extend_reverse = SetrM128i(
+      _mm_set_epi32(static_cast<int>(0x80078006), static_cast<int>(0x80058004),
+                    static_cast<int>(0x80038002), static_cast<int>(0x80018000)),
+      _mm_set_epi32(static_cast<int>(0x80008001), static_cast<int>(0x80028003),
+                    static_cast<int>(0x80048005),
+                    static_cast<int>(0x80068007)));
+
+  for (auto& i : v_src) {
+    // Zero extend unsigned 8 to 16. The upper lane is reversed.
+    i = _mm256_shuffle_epi8(i, extend_reverse);
+  }
+
+  // partial for direction 6
+  // --------------------------------------------------------------------------
+  // partial[6][j] += x;
+  // 00 01 02 03 04 05 06 07  xx xx xx xx xx xx xx xx
+  // 10 11 12 13 14 15 16 17  xx xx xx xx xx xx xx xx
+  // 20 21 22 23 24 25 26 27  xx xx xx xx xx xx xx xx
+  // 30 31 32 33 34 35 36 37  xx xx xx xx xx xx xx xx
+  // 40 41 42 43 44 45 46 47  xx xx xx xx xx xx xx xx
+  // 50 51 52 53 54 55 56 57  xx xx xx xx xx xx xx xx
+  // 60 61 62 63 64 65 66 67  xx xx xx xx xx xx xx xx
+  // 70 71 72 73 74 75 76 77  xx xx xx xx xx xx xx xx
+  partial[6] = v_src[0];
+  for (int i = 1; i < 8; ++i) {
+    partial[6] = _mm256_add_epi16(partial[6], v_src[i]);
+  }
+
+  AddPartial_D0_D4(v_src, &partial[0], &partial[4]);
+  AddPartial_D1_D3(v_src, &partial[1], &partial[3]);
+  AddPartial_D7_D5(v_src, &partial[7], &partial[5]);
+}
+
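+// Sums the four int32 elements in each 128-bit lane of |a|. On return, element
+// 0 of the low lane holds the low-lane total and element 0 of the high lane
+// holds the high-lane total; the remaining elements are don't-cares.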
+inline __m256i SumVectorPair_S32(__m256i a) {
+  a = _mm256_hadd_epi32(a, a);
+  a = _mm256_add_epi32(a, _mm256_srli_si256(a, 4));
+  return a;
+}
+
+// |cost[0]| and |cost[4]| square the input and sum with the corresponding
+// element from the other end of the vector:
+// |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+//             kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
+inline void Cost0Or4_Pair(uint32_t* cost, const __m256i partial_0,
+                          const __m256i partial_4,
+                          const __m256i division_table) {
+  const __m256i division_table_0 =
+      _mm256_permute2x128_si256(division_table, division_table, 0x0);
+  const __m256i division_table_1 =
+      _mm256_permute2x128_si256(division_table, division_table, 0x11);
+
+  // partial_lo
+  const __m256i a = partial_0;
+  // partial_hi
+  const __m256i b = partial_4;
+
+  // Reverse and clear upper 2 bytes.
+  const __m256i reverser = _mm256_broadcastsi128_si256(_mm_set_epi32(
+      static_cast<int>(0x80800100), 0x03020504, 0x07060908, 0x0b0a0d0c));
+
+  // 14 13 12 11 10 09 08 ZZ
+  const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser);
+  // 00 14 01 13 02 12 03 11
+  const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed);
+  // 04 10 05 09 06 08 07 ZZ
+  const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed);
+
+  // Square(partial[0][i]) + Square(partial[0][14 - i])
+  const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo);
+  const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi);
+
+  const __m256i c = _mm256_mullo_epi32(square_lo, division_table_0);
+  const __m256i d = _mm256_mullo_epi32(square_hi, division_table_1);
+  const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d));
+  // Copy upper 32-bit sum to lower lane.
+  const __m128i sums =
+      _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08));
+  cost[0] = _mm_cvtsi128_si32(sums);
+  cost[4] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
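+
+// A scalar sketch of the even-direction cost folded by Cost0Or4_Pair(), shown
+// for reference. The divisor values are written out locally for illustration
+// (840 divided by the position count, matching the tables above); the function
+// name is hypothetical and the function is unused.
+inline uint32_t CostEvenDirectionSketch(const uint16_t partial[15]) {
+  constexpr uint32_t kDivisors[8] = {840, 420, 280, 210, 168, 140, 120, 105};
+  uint32_t cost =
+      static_cast<uint32_t>(partial[7]) * partial[7] * kDivisors[7];
+  for (int i = 0; i < 7; ++i) {
+    const uint32_t pair_square =
+        static_cast<uint32_t>(partial[i]) * partial[i] +
+        static_cast<uint32_t>(partial[14 - i]) * partial[14 - i];
+    cost += pair_square * kDivisors[i];
+  }
+  return cost;
+}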
+
+template <int index_a, int index_b>
+inline void CostOdd_Pair(uint32_t* cost, const __m256i partial_a,
+                         const __m256i partial_b,
+                         const __m256i division_table[2]) {
+  // partial_lo
+  const __m256i a = partial_a;
+  // partial_hi
+  const __m256i b = partial_b;
+
+  // Reverse and clear upper 10 bytes.
+  const __m256i reverser = _mm256_broadcastsi128_si256(
+      _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
+                    static_cast<int>(0x80800100), 0x03020504));
+
+  // 10 09 08 ZZ ZZ ZZ ZZ ZZ
+  const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser);
+  // 00 10 01 09 02 08 03 ZZ
+  const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed);
+  // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
+  const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed);
+
+  // Square(partial[0][i]) + Square(partial[0][14 - i])
+  const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo);
+  const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi);
+
+  const __m256i c = _mm256_mullo_epi32(square_lo, division_table[0]);
+  const __m256i d = _mm256_mullo_epi32(square_hi, division_table[1]);
+  const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d));
+  // Copy upper 32-bit sum to lower lane.
+  const __m128i sums =
+      _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08));
+  cost[index_a] = _mm_cvtsi128_si32(sums);
+  cost[index_b] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
+inline void Cost2And6_Pair(uint32_t* cost, const __m256i partial_a,
+                           const __m256i partial_b,
+                           const __m256i division_table) {
+  // The upper lane is a "don't care", so only use the lower lane for
+  // calculating cost.
+  const __m256i a = _mm256_permute2x128_si256(partial_a, partial_b, 0x20);
+
+  const __m256i square_a = _mm256_madd_epi16(a, a);
+  const __m256i b = _mm256_mullo_epi32(square_a, division_table);
+  const __m256i c = SumVectorPair_S32(b);
+  // Copy upper 32-bit sum to lower lane.
+  const __m128i sums =
+      _mm256_castsi256_si128(_mm256_permute4x64_epi64(c, 0x08));
+  cost[2] = _mm_cvtsi128_si32(sums);
+  cost[6] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
+void CdefDirection_AVX2(const void* const source, ptrdiff_t stride,
+                        uint8_t* const direction, int* const variance) {
+  assert(direction != nullptr);
+  assert(variance != nullptr);
+  const auto* src = static_cast<const uint8_t*>(source);
+  uint32_t cost[8];
+
+  // partial[0] = add partial 0,4 low
+  // partial[1] = add partial 1,3 low
+  // partial[2] = add partial 2 low
+  // partial[3] = add partial 1,3 high
+  // partial[4] = add partial 0,4 high
+  // partial[5] = add partial 7,5 high
+  // partial[6] = add partial 6 low
+  // partial[7] = add partial 7,5 low
+  __m256i partial[8];
+
+  AddPartial(src, stride, partial);
+
+  const __m256i division_table = LoadUnaligned32(kCdefDivisionTable);
+  const __m256i division_table_7 =
+      _mm256_broadcastd_epi32(_mm_cvtsi32_si128(kCdefDivisionTable[7]));
+
+  Cost2And6_Pair(cost, partial[2], partial[6], division_table_7);
+
+  Cost0Or4_Pair(cost, partial[0], partial[4], division_table);
+
+  const __m256i division_table_odd[2] = {
+      LoadUnaligned32(kCdefDivisionTableOddPairsPadded),
+      LoadUnaligned32(kCdefDivisionTableOddPairsPadded + 8)};
+
+  CostOdd_Pair<1, 3>(cost, partial[1], partial[3], division_table_odd);
+  CostOdd_Pair<7, 5>(cost, partial[7], partial[5], division_table_odd);
+
+  uint32_t best_cost = 0;
+  *direction = 0;
+  for (int i = 0; i < 8; ++i) {
+    if (cost[i] > best_cost) {
+      best_cost = cost[i];
+      *direction = i;
+    }
+  }
+  *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
+                          __m128i* output, const int direction) {
+  // Each |direction| describes a different set of source values. Expand this
+  // set by negating each offset pair. For |direction| == 0 this gives a
+  // diagonal line from top right to bottom left. The first value is y, the
+  // second x. Negative y values move up.
+  //    a       b         c       d
+  // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+  //         c
+  //       a
+  //     0
+  //   b
+  // d
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
+  output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
+  output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
+  output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
+                    __m128i* output, const int direction) {
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
+                      src - y_0 * stride + stride - x_0);
+  output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
+                      src + y_0 * stride + stride + x_0);
+  output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
+                      src - y_1 * stride + stride - x_1);
+  output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
+                      src + y_1 * stride + stride + x_1);
+}
+
+inline __m256i Constrain(const __m256i& pixel, const __m256i& reference,
+                         const __m128i& damping, const __m256i& threshold) {
+  const __m256i diff = _mm256_sub_epi16(pixel, reference);
+  const __m256i abs_diff = _mm256_abs_epi16(diff);
+  // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
+  //                    0, std::abs(diff))
+  const __m256i shifted_diff = _mm256_srl_epi16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+  // [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always be
+  // larger than threshold, so subtracting with saturation returns 0 when
+  // pixel == kCdefLargeValue.
+  static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+  const __m256i thresh_minus_shifted_diff =
+      _mm256_subs_epu16(threshold, shifted_diff);
+  const __m256i clamp_abs_diff =
+      _mm256_min_epi16(thresh_minus_shifted_diff, abs_diff);
+  // Restore the sign.
+  return _mm256_sign_epi16(clamp_abs_diff, diff);
+}
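+
+// A scalar sketch of Constrain() above, included for reference. Here
+// |damping_shift| is the already-adjusted shift count
+// (damping - FloorLog2(strength)) that the filter passes in; the function name
+// is hypothetical and the function is unused.
+inline int ConstrainScalarSketch(int pixel, int reference, int threshold,
+                                 int damping_shift) {
+  const int diff = pixel - reference;
+  const int abs_diff = std::abs(diff);
+  int magnitude = threshold - (abs_diff >> damping_shift);
+  if (magnitude < 0) magnitude = 0;
+  if (magnitude > abs_diff) magnitude = abs_diff;
+  return (diff < 0) ? -magnitude : magnitude;
+}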
+
+inline __m256i ApplyConstrainAndTap(const __m256i& pixel, const __m256i& val,
+                                    const __m256i& tap, const __m128i& damping,
+                                    const __m256i& threshold) {
+  const __m256i constrained = Constrain(val, pixel, damping, threshold);
+  return _mm256_mullo_epi16(constrained, tap);
+}
+
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_AVX2(const uint16_t* src, const ptrdiff_t src_stride,
+                     const int height, const int primary_strength,
+                     const int secondary_strength, const int damping,
+                     const int direction, void* dest,
+                     const ptrdiff_t dst_stride) {
+  static_assert(width == 8 || width == 4, "Invalid CDEF width.");
+  static_assert(enable_primary || enable_secondary, "");
+  constexpr bool clipping_required = enable_primary && enable_secondary;
+  auto* dst = static_cast<uint8_t*>(dest);
+  __m128i primary_damping_shift, secondary_damping_shift;
+
+  // FloorLog2() requires input to be > 0.
+  // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+  if (enable_primary) {
+    // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+    // for UV filtering.
+    primary_damping_shift =
+        _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
+  }
+  if (enable_secondary) {
+    // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+    // necessary.
+    assert(damping - FloorLog2(secondary_strength) >= 0);
+    secondary_damping_shift =
+        _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
+  }
+  const __m256i primary_tap_0 = _mm256_broadcastw_epi16(
+      _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][0]));
+  const __m256i primary_tap_1 = _mm256_broadcastw_epi16(
+      _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][1]));
+  const __m256i secondary_tap_0 =
+      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap0));
+  const __m256i secondary_tap_1 =
+      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap1));
+  const __m256i cdef_large_value_mask = _mm256_broadcastw_epi16(
+      _mm_cvtsi32_si128(static_cast<int16_t>(~kCdefLargeValue)));
+  const __m256i primary_threshold =
+      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(primary_strength));
+  const __m256i secondary_threshold =
+      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(secondary_strength));
+
+  int y = height;
+  do {
+    __m128i pixel_128;
+    if (width == 8) {
+      pixel_128 = LoadUnaligned16(src);
+    } else {
+      pixel_128 = LoadHi8(LoadLo8(src), src + src_stride);
+    }
+
+    __m256i pixel = SetrM128i(pixel_128, pixel_128);
+
+    __m256i min = pixel;
+    __m256i max = pixel;
+    __m256i sum_pair;
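+    // Each accumulation into |sum_pair| below handles one tap for two
+    // neighboring positions at once, one per 128-bit lane; the two lanes are
+    // folded into the single 128-bit |sum| once all taps have been applied.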
+
+    if (enable_primary) {
+      // Primary |direction|.
+      __m128i primary_val_128[4];
+      if (width == 8) {
+        LoadDirection(src, src_stride, primary_val_128, direction);
+      } else {
+        LoadDirection4(src, src_stride, primary_val_128, direction);
+      }
+
+      __m256i primary_val[2];
+      primary_val[0] = SetrM128i(primary_val_128[0], primary_val_128[1]);
+      primary_val[1] = SetrM128i(primary_val_128[2], primary_val_128[3]);
+
+      if (clipping_required) {
+        min = _mm256_min_epu16(min, primary_val[0]);
+        min = _mm256_min_epu16(min, primary_val[1]);
+
+        // The source is 16 bits; however, only the lower 8 bits matter. The
+        // upper 8 bits contain the "large" flag. After the final primary max
+        // has been calculated, zero out the upper 8 bits and use the result
+        // to find the "16 bit" max.
+        const __m256i max_p01 = _mm256_max_epu8(primary_val[0], primary_val[1]);
+        max = _mm256_max_epu16(
+            max, _mm256_and_si256(max_p01, cdef_large_value_mask));
+      }
+
+      sum_pair = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
+                                      primary_damping_shift, primary_threshold);
+      sum_pair = _mm256_add_epi16(
+          sum_pair,
+          ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_1,
+                               primary_damping_shift, primary_threshold));
+    } else {
+      sum_pair = _mm256_setzero_si256();
+    }
+
+    if (enable_secondary) {
+      // Secondary |direction| values (+/- 2). Clamp |direction|.
+      __m128i secondary_val_128[8];
+      if (width == 8) {
+        LoadDirection(src, src_stride, secondary_val_128, direction + 2);
+        LoadDirection(src, src_stride, secondary_val_128 + 4, direction - 2);
+      } else {
+        LoadDirection4(src, src_stride, secondary_val_128, direction + 2);
+        LoadDirection4(src, src_stride, secondary_val_128 + 4, direction - 2);
+      }
+
+      __m256i secondary_val[4];
+      secondary_val[0] = SetrM128i(secondary_val_128[0], secondary_val_128[1]);
+      secondary_val[1] = SetrM128i(secondary_val_128[2], secondary_val_128[3]);
+      secondary_val[2] = SetrM128i(secondary_val_128[4], secondary_val_128[5]);
+      secondary_val[3] = SetrM128i(secondary_val_128[6], secondary_val_128[7]);
+
+      if (clipping_required) {
+        min = _mm256_min_epu16(min, secondary_val[0]);
+        min = _mm256_min_epu16(min, secondary_val[1]);
+        min = _mm256_min_epu16(min, secondary_val[2]);
+        min = _mm256_min_epu16(min, secondary_val[3]);
+
+        const __m256i max_s01 =
+            _mm256_max_epu8(secondary_val[0], secondary_val[1]);
+        const __m256i max_s23 =
+            _mm256_max_epu8(secondary_val[2], secondary_val[3]);
+        const __m256i max_s = _mm256_max_epu8(max_s01, max_s23);
+        max = _mm256_max_epu8(max,
+                              _mm256_and_si256(max_s, cdef_large_value_mask));
+      }
+
+      sum_pair = _mm256_add_epi16(
+          sum_pair,
+          ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0,
+                               secondary_damping_shift, secondary_threshold));
+      sum_pair = _mm256_add_epi16(
+          sum_pair,
+          ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_1,
+                               secondary_damping_shift, secondary_threshold));
+      sum_pair = _mm256_add_epi16(
+          sum_pair,
+          ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_0,
+                               secondary_damping_shift, secondary_threshold));
+      sum_pair = _mm256_add_epi16(
+          sum_pair,
+          ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1,
+                               secondary_damping_shift, secondary_threshold));
+    }
+
+    __m128i sum = _mm_add_epi16(_mm256_castsi256_si128(sum_pair),
+                                _mm256_extracti128_si256(sum_pair, 1));
+
+    // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max))
+    const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15);
+    // 8 + sum
+    sum = _mm_add_epi16(sum, _mm_set1_epi16(8));
+    // (... - (sum < 0)) >> 4
+    sum = _mm_add_epi16(sum, sum_lt_0);
+    sum = _mm_srai_epi16(sum, 4);
+    // pixel + ...
+    sum = _mm_add_epi16(sum, _mm256_castsi256_si128(pixel));
+    if (clipping_required) {
+      const __m128i min_128 = _mm_min_epu16(_mm256_castsi256_si128(min),
+                                            _mm256_extracti128_si256(min, 1));
+
+      const __m128i max_128 = _mm_max_epu16(_mm256_castsi256_si128(max),
+                                            _mm256_extracti128_si256(max, 1));
+      // Clip3
+      sum = _mm_min_epi16(sum, max_128);
+      sum = _mm_max_epi16(sum, min_128);
+    }
+
+    const __m128i result = _mm_packus_epi16(sum, sum);
+    if (width == 8) {
+      src += src_stride;
+      StoreLo8(dst, result);
+      dst += dst_stride;
+      --y;
+    } else {
+      src += src_stride << 1;
+      Store4(dst, result);
+      dst += dst_stride;
+      Store4(dst, _mm_srli_si128(result, 4));
+      dst += dst_stride;
+      y -= 2;
+    }
+  } while (y != 0);
+}
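+
+// A scalar sketch of the final rounding and clamp applied in the loop above,
+// Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max), shown for reference;
+// the function name is hypothetical and the function is unused.
+inline int CdefRoundAndClipSketch(int pixel, int sum, int min, int max) {
+  int value = pixel + ((8 + sum - static_cast<int>(sum < 0)) >> 4);
+  if (value < min) value = min;
+  if (value > max) value = max;
+  return value;
+}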
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  assert(dsp != nullptr);
+  dsp->cdef_direction = CdefDirection_AVX2;
+
+  dsp->cdef_filters[0][0] = CdefFilter_AVX2<4>;
+  dsp->cdef_filters[0][1] =
+      CdefFilter_AVX2<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] = CdefFilter_AVX2<4, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_AVX2<8>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_AVX2<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] = CdefFilter_AVX2<8, /*enable_primary=*/false>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void CdefInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+#else   // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_AVX2() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_AVX2
diff --git a/libgav1/src/dsp/x86/cdef_avx2.h b/libgav1/src/dsp/x86/cdef_avx2.h
new file mode 100644
index 0000000..41f2d3f
--- /dev/null
+++ b/libgav1/src/dsp/x86/cdef_avx2.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_AVX2();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_AVX2
+#endif
+
+#endif  // LIBGAV1_TARGETING_AVX2
+
+#endif  // LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
diff --git a/libgav1/src/dsp/x86/cdef_sse4.cc b/libgav1/src/dsp/x86/cdef_sse4.cc
index 4478bde..6ede778 100644
--- a/libgav1/src/dsp/x86/cdef_sse4.cc
+++ b/libgav1/src/dsp/x86/cdef_sse4.cc
@@ -15,7 +15,7 @@
 #include "src/dsp/cdef.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <emmintrin.h>
 #include <tmmintrin.h>
@@ -349,8 +349,8 @@
 inline uint32_t Cost0Or4(const __m128i a, const __m128i b,
                          const __m128i division_table[2]) {
   // Reverse and clear upper 2 bytes.
-  const __m128i reverser =
-      _mm_set_epi32(0x80800100, 0x03020504, 0x07060908, 0x0b0a0d0c);
+  const __m128i reverser = _mm_set_epi32(static_cast<int>(0x80800100),
+                                         0x03020504, 0x07060908, 0x0b0a0d0c);
   // 14 13 12 11 10 09 08 ZZ
   const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
   // 00 14 01 13 02 12 03 11
@@ -371,7 +371,8 @@
                         const __m128i division_table[2]) {
   // Reverse and clear upper 10 bytes.
   const __m128i reverser =
-      _mm_set_epi32(0x80808080, 0x80808080, 0x80800100, 0x03020504);
+      _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
+                    static_cast<int>(0x80800100), 0x03020504);
   // 10 09 08 ZZ ZZ ZZ ZZ ZZ
   const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
   // 00 10 01 09 02 08 03 ZZ
@@ -395,7 +396,7 @@
 }
 
 void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride,
-                          int* const direction, int* const variance) {
+                          uint8_t* const direction, int* const variance) {
   assert(direction != nullptr);
   assert(variance != nullptr);
   const auto* src = static_cast<const uint8_t*>(source);
@@ -414,8 +415,8 @@
   cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
 
   const __m128i division_table_odd[2] = {
-      LoadUnaligned16(kCdefDivisionTableOddPadded),
-      LoadUnaligned16(kCdefDivisionTableOddPadded + 4)};
+      LoadAligned16(kCdefDivisionTableOddPadded),
+      LoadAligned16(kCdefDivisionTableOddPadded + 4)};
 
   cost[1] = CostOdd(partial_lo[1], partial_hi[1], division_table_odd);
   cost[3] = CostOdd(partial_lo[3], partial_hi[3], division_table_odd);
@@ -717,7 +718,7 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 namespace libgav1 {
 namespace dsp {
 
@@ -725,4 +726,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/cdef_sse4.h b/libgav1/src/dsp/x86/cdef_sse4.h
index 2593c72..6631eb7 100644
--- a/libgav1/src/dsp/x86/cdef_sse4.h
+++ b/libgav1/src/dsp/x86/cdef_sse4.h
@@ -30,9 +30,16 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
 #define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
 #define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_SSE4_1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
diff --git a/libgav1/src/dsp/x86/common_avx2.h b/libgav1/src/dsp/x86/common_avx2.h
new file mode 100644
index 0000000..373116a
--- /dev/null
+++ b/libgav1/src/dsp/x86/common_avx2.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+
+#include <immintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+namespace libgav1 {
+namespace dsp {
+namespace avx2 {
+
+#include "src/dsp/x86/common_avx2.inc"
+#include "src/dsp/x86/common_sse4.inc"
+
+}  // namespace avx2
+
+// NOLINTBEGIN(misc-unused-using-decls)
+// These function aliases shall not be visible to external code. They are
+// restricted to x86/*_avx2.cc files only. This scheme exists to distinguish two
+// possible implementations of common functions, which may differ based on
+// whether the compiler is permitted to use avx2 instructions.
+
+// common_sse4.inc
+using avx2::Load2;
+using avx2::Load2x2;
+using avx2::Load4;
+using avx2::Load4x2;
+using avx2::LoadAligned16;
+using avx2::LoadAligned16Msan;
+using avx2::LoadHi8;
+using avx2::LoadHi8Msan;
+using avx2::LoadLo8;
+using avx2::LoadLo8Msan;
+using avx2::LoadUnaligned16;
+using avx2::LoadUnaligned16Msan;
+using avx2::MaskHighNBytes;
+using avx2::RightShiftWithRounding_S16;
+using avx2::RightShiftWithRounding_S32;
+using avx2::RightShiftWithRounding_U16;
+using avx2::RightShiftWithRounding_U32;
+using avx2::Store2;
+using avx2::Store4;
+using avx2::StoreAligned16;
+using avx2::StoreHi8;
+using avx2::StoreLo8;
+using avx2::StoreUnaligned16;
+
+// common_avx2.inc
+using avx2::LoadAligned32;
+using avx2::LoadAligned32Msan;
+using avx2::LoadAligned64;
+using avx2::LoadAligned64Msan;
+using avx2::LoadUnaligned32;
+using avx2::LoadUnaligned32Msan;
+using avx2::SetrM128i;
+using avx2::StoreAligned32;
+using avx2::StoreAligned64;
+using avx2::StoreUnaligned32;
+// NOLINTEND
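+//
+// A typical consumer (file name assumed for illustration) is an x86/*_avx2.cc
+// translation unit:
+//   #include "src/dsp/x86/common_avx2.h"
+//   ...
+//   const __m256i v = LoadUnaligned32(src);  // avx2::LoadUnaligned32
+// so AVX2 files use the helper copies compiled with AVX2 flags, while SSE4
+// files pick up the sse4:: copies through common_sse4.h.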
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#endif  // LIBGAV1_TARGETING_AVX2
+#endif  // LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
diff --git a/libgav1/src/dsp/x86/common_avx2.inc b/libgav1/src/dsp/x86/common_avx2.inc
new file mode 100644
index 0000000..53b4e2e
--- /dev/null
+++ b/libgav1/src/dsp/x86/common_avx2.inc
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//------------------------------------------------------------------------------
+// Compatibility functions.
+
+inline __m256i SetrM128i(const __m128i lo, const __m128i hi) {
+  // For compatibility with older gcc toolchains (< 8) use
+  // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc versions
+  // implement _mm256_setr_m128i similarly to the following; clang uses a
+  // different method, but no differences in assembly have been observed.
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m256i LoadAligned32(const void* a) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  return _mm256_load_si256(static_cast<const __m256i*>(a));
+}
+
+inline void LoadAligned64(const void* a, __m256i dst[2]) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0);
+  dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1);
+}
+
+inline __m256i LoadUnaligned32(const void* a) {
+  return _mm256_loadu_si256(static_cast<const __m256i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m256i MaskOverreads(const __m256i source,
+                             const ptrdiff_t over_read_in_bytes) {
+  __m256i dst = source;
+#if LIBGAV1_MSAN
+  if (over_read_in_bytes >= 32) return _mm256_setzero_si256();
+  if (over_read_in_bytes > 0) {
+    __m128i m = _mm_set1_epi8(-1);
+    for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) {
+      m = _mm_srli_si128(m, 1);
+    }
+    const __m256i mask = (over_read_in_bytes < 16)
+                             ? SetrM128i(_mm_set1_epi8(-1), m)
+                             : SetrM128i(m, _mm_setzero_si128());
+    dst = _mm256_and_si256(dst, mask);
+  }
+#else
+  static_cast<void>(over_read_in_bytes);
+#endif
+  return dst;
+}
+
+inline __m256i LoadAligned32Msan(const void* const source,
+                                 const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+}
+
+inline void LoadAligned64Msan(const void* const source,
+                              const ptrdiff_t over_read_in_bytes,
+                              __m256i dst[2]) {
+  dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+  dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1),
+                         over_read_in_bytes);
+}
+
+inline __m256i LoadUnaligned32Msan(const void* const source,
+                                   const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes);
+}
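+
+// Usage sketch with assumed names: masks the tail of the last unaligned load
+// of a |width|-pixel uint16_t row so MemorySanitizer does not flag the bytes
+// read past the end of the row. The helper is illustrative and unused.
+inline __m256i LoadRowTailSketch(const uint16_t* row, int width, int x) {
+  const ptrdiff_t over_read_in_bytes =
+      static_cast<ptrdiff_t>(sizeof(__m256i)) -
+      static_cast<ptrdiff_t>(sizeof(uint16_t)) * (width - x);
+  return LoadUnaligned32Msan(row + x, over_read_in_bytes);
+}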
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void StoreAligned32(void* a, const __m256i v) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  _mm256_store_si256(static_cast<__m256i*>(a), v);
+}
+
+inline void StoreAligned64(void* a, const __m256i v[2]) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+  _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]);
+  _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]);
+}
+
+inline void StoreUnaligned32(void* a, const __m256i v) {
+  _mm256_storeu_si256(static_cast<__m256i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) {
+  assert(bits <= 16);
+  const __m256i v_bias_d =
+      _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+  const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d);
+  return _mm256_srai_epi16(v_tmp_d, bits);
+}
+
+inline __m256i RightShiftWithRounding_S32(const __m256i v_val_d, int bits) {
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << bits) >> 1);
+  const __m256i v_tmp_d = _mm256_add_epi32(v_val_d, v_bias_d);
+  return _mm256_srai_epi32(v_tmp_d, bits);
+}
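+
+// Scalar equivalent of the rounding shifts above, shown for reference; the
+// function name is hypothetical and the function is unused.
+inline int32_t RightShiftWithRoundingScalarSketch(int32_t value, int bits) {
+  return (value + ((1 << bits) >> 1)) >> bits;
+}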
diff --git a/libgav1/src/dsp/x86/common_sse4.h b/libgav1/src/dsp/x86/common_sse4.h
index 24c801f..41a3a68 100644
--- a/libgav1/src/dsp/x86/common_sse4.h
+++ b/libgav1/src/dsp/x86/common_sse4.h
@@ -20,14 +20,14 @@
 #include "src/utils/compiler_attributes.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <emmintrin.h>
 #include <smmintrin.h>
 
 #include <cassert>
+#include <cstddef>
 #include <cstdint>
-#include <cstdlib>
 #include <cstring>
 
 #if 0
@@ -70,189 +70,61 @@
 #define PR(var, N) PrintReg(var, #var, N)
 #define PD(var) PrintReg(var, #var);
 #define PX(var) PrintRegX(var, #var);
+
+#if LIBGAV1_MSAN
+#include <sanitizer/msan_interface.h>
+
+inline void PrintShadow(const void* r, const char* const name,
+                        const size_t size) {
+  fprintf(stderr, "Shadow for %s:\n", name);
+  __msan_print_shadow(r, size);
+}
+#define PS(var, N) PrintShadow(var, #var, N)
+
+#endif  // LIBGAV1_MSAN
+
 #endif  // 0
 
 namespace libgav1 {
 namespace dsp {
+namespace sse4 {
 
-//------------------------------------------------------------------------------
-// Load functions.
+#include "src/dsp/x86/common_sse4.inc"
 
-inline __m128i Load2(const void* src) {
-  int16_t val;
-  memcpy(&val, src, sizeof(val));
-  return _mm_cvtsi32_si128(val);
-}
+}  // namespace sse4
 
-inline __m128i Load2x2(const void* src1, const void* src2) {
-  uint16_t val1;
-  uint16_t val2;
-  memcpy(&val1, src1, sizeof(val1));
-  memcpy(&val2, src2, sizeof(val2));
-  return _mm_cvtsi32_si128(val1 | (val2 << 16));
-}
-
-// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
-template <int lane>
-inline __m128i Load2(const void* const buf, __m128i val) {
-  uint16_t temp;
-  memcpy(&temp, buf, 2);
-  return _mm_insert_epi16(val, temp, lane);
-}
-
-inline __m128i Load4(const void* src) {
-  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
-  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
-  // movss instruction.
-  //
-  // Until compiler support of _mm_loadu_si32 is widespread, use of
-  // _mm_loadu_si32 is banned.
-  int val;
-  memcpy(&val, src, sizeof(val));
-  return _mm_cvtsi32_si128(val);
-}
-
-inline __m128i Load4x2(const void* src1, const void* src2) {
-  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
-  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
-  // movss instruction.
-  //
-  // Until compiler support of _mm_loadu_si32 is widespread, use of
-  // _mm_loadu_si32 is banned.
-  int val1, val2;
-  memcpy(&val1, src1, sizeof(val1));
-  memcpy(&val2, src2, sizeof(val2));
-  return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
-}
-
-inline __m128i LoadLo8(const void* a) {
-  return _mm_loadl_epi64(static_cast<const __m128i*>(a));
-}
-
-inline __m128i LoadHi8(const __m128i v, const void* a) {
-  const __m128 x =
-      _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
-  return _mm_castps_si128(x);
-}
-
-inline __m128i LoadUnaligned16(const void* a) {
-  return _mm_loadu_si128(static_cast<const __m128i*>(a));
-}
-
-inline __m128i LoadAligned16(const void* a) {
-  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
-  return _mm_load_si128(static_cast<const __m128i*>(a));
-}
-
-//------------------------------------------------------------------------------
-// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
-
-inline __m128i MaskOverreads(const __m128i source,
-                             const int over_read_in_bytes) {
-  __m128i dst = source;
-#if LIBGAV1_MSAN
-  if (over_read_in_bytes > 0) {
-    __m128i mask = _mm_set1_epi8(-1);
-    for (int i = 0; i < over_read_in_bytes; ++i) {
-      mask = _mm_srli_si128(mask, 1);
-    }
-    dst = _mm_and_si128(dst, mask);
-  }
-#else
-  static_cast<void>(over_read_in_bytes);
-#endif
-  return dst;
-}
-
-inline __m128i LoadLo8Msan(const void* const source,
-                           const int over_read_in_bytes) {
-  return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
-}
-
-inline __m128i LoadAligned16Msan(const void* const source,
-                                 const int over_read_in_bytes) {
-  return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
-}
-
-inline __m128i LoadUnaligned16Msan(const void* const source,
-                                   const int over_read_in_bytes) {
-  return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
-}
-
-//------------------------------------------------------------------------------
-// Store functions.
-
-inline void Store2(void* dst, const __m128i x) {
-  const int val = _mm_cvtsi128_si32(x);
-  memcpy(dst, &val, 2);
-}
-
-inline void Store4(void* dst, const __m128i x) {
-  const int val = _mm_cvtsi128_si32(x);
-  memcpy(dst, &val, sizeof(val));
-}
-
-inline void StoreLo8(void* a, const __m128i v) {
-  _mm_storel_epi64(static_cast<__m128i*>(a), v);
-}
-
-inline void StoreHi8(void* a, const __m128i v) {
-  _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
-}
-
-inline void StoreAligned16(void* a, const __m128i v) {
-  _mm_store_si128(static_cast<__m128i*>(a), v);
-}
-
-inline void StoreUnaligned16(void* a, const __m128i v) {
-  _mm_storeu_si128(static_cast<__m128i*>(a), v);
-}
-
-//------------------------------------------------------------------------------
-// Arithmetic utilities.
-
-inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
-  assert(bits <= 16);
-  const __m128i v_bias_d =
-      _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
-  const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
-  return _mm_srli_epi16(v_tmp_d, bits);
-}
-
-inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
-  assert(bits <= 16);
-  const __m128i v_bias_d =
-      _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
-  const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
-  return _mm_srai_epi16(v_tmp_d, bits);
-}
-
-inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
-  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
-  return _mm_srli_epi32(v_tmp_d, bits);
-}
-
-inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
-  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
-  return _mm_srai_epi32(v_tmp_d, bits);
-}
-
-//------------------------------------------------------------------------------
-// Masking utilities
-inline __m128i MaskHighNBytes(int n) {
-  static constexpr uint8_t kMask[32] = {
-      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
-      0,   0,   0,   0,   0,   255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-  };
-
-  return LoadUnaligned16(kMask + n);
-}
+// NOLINTBEGIN(misc-unused-using-decls)
+// These function aliases shall not be visible to external code. They are
+// restricted to x86/*_sse4.cc files only. This scheme exists to distinguish two
+// possible implementations of common functions, which may differ based on
+// whether the compiler is permitted to use avx2 instructions.
+using sse4::Load2;
+using sse4::Load2x2;
+using sse4::Load4;
+using sse4::Load4x2;
+using sse4::LoadAligned16;
+using sse4::LoadAligned16Msan;
+using sse4::LoadHi8;
+using sse4::LoadHi8Msan;
+using sse4::LoadLo8;
+using sse4::LoadLo8Msan;
+using sse4::LoadUnaligned16;
+using sse4::LoadUnaligned16Msan;
+using sse4::MaskHighNBytes;
+using sse4::RightShiftWithRounding_S16;
+using sse4::RightShiftWithRounding_S32;
+using sse4::RightShiftWithRounding_U16;
+using sse4::RightShiftWithRounding_U32;
+using sse4::Store2;
+using sse4::Store4;
+using sse4::StoreAligned16;
+using sse4::StoreHi8;
+using sse4::StoreLo8;
+using sse4::StoreUnaligned16;
+// NOLINTEND
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
 #endif  // LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
diff --git a/libgav1/src/dsp/x86/common_sse4.inc b/libgav1/src/dsp/x86/common_sse4.inc
new file mode 100644
index 0000000..35c56b8
--- /dev/null
+++ b/libgav1/src/dsp/x86/common_sse4.inc
@@ -0,0 +1,206 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m128i Load2(const void* src) {
+  int16_t val;
+  memcpy(&val, src, sizeof(val));
+  return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load2x2(const void* src1, const void* src2) {
+  uint16_t val1;
+  uint16_t val2;
+  memcpy(&val1, src1, sizeof(val1));
+  memcpy(&val2, src2, sizeof(val2));
+  return _mm_cvtsi32_si128(val1 | (val2 << 16));
+}
+
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
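+// For example, Load2<1>(buf, val) replaces bytes 2 and 3 of |val| and leaves
+// the remaining bytes unchanged.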
+template <int lane>
+inline __m128i Load2(const void* const buf, __m128i val) {
+  int16_t temp;
+  memcpy(&temp, buf, 2);
+  return _mm_insert_epi16(val, temp, lane);
+}
+
+inline __m128i Load4(const void* src) {
+  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+  // movss instruction.
+  //
+  // Until compiler support of _mm_loadu_si32 is widespread, use of
+  // _mm_loadu_si32 is banned.
+  int val;
+  memcpy(&val, src, sizeof(val));
+  return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load4x2(const void* src1, const void* src2) {
+  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+  // movss instruction.
+  //
+  // Until compiler support of _mm_loadu_si32 is widespread, use of
+  // _mm_loadu_si32 is banned.
+  int val1, val2;
+  memcpy(&val1, src1, sizeof(val1));
+  memcpy(&val2, src2, sizeof(val2));
+  return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
+}
+
+inline __m128i LoadLo8(const void* a) {
+  return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadHi8(const __m128i v, const void* a) {
+  const __m128 x =
+      _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
+  return _mm_castps_si128(x);
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+  return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadAligned16(const void* a) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+  return _mm_load_si128(static_cast<const __m128i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m128i MaskOverreads(const __m128i source,
+                             const ptrdiff_t over_read_in_bytes) {
+  __m128i dst = source;
+#if LIBGAV1_MSAN
+  if (over_read_in_bytes > 0) {
+    __m128i mask = _mm_set1_epi8(-1);
+    for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) {
+      mask = _mm_srli_si128(mask, 1);
+    }
+    dst = _mm_and_si128(dst, mask);
+  }
+#else
+  static_cast<void>(over_read_in_bytes);
+#endif
+  return dst;
+}
+
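+// The *Msan wrappers below pair a load with MaskOverreads(). LoadLo8Msan()
+// adds 8 to |over_read_in_bytes| because its data only occupies the low 8
+// bytes of the 16-byte register.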
+inline __m128i LoadLo8Msan(const void* const source,
+                           const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
+}
+
+inline __m128i LoadHi8Msan(const __m128i v, const void* source,
+                           const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadHi8(v, source), over_read_in_bytes);
+}
+
+inline __m128i LoadAligned16Msan(const void* const source,
+                                 const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
+}
+
+inline __m128i LoadUnaligned16Msan(const void* const source,
+                                   const ptrdiff_t over_read_in_bytes) {
+  return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void Store2(void* dst, const __m128i x) {
+  const int val = _mm_cvtsi128_si32(x);
+  memcpy(dst, &val, 2);
+}
+
+inline void Store4(void* dst, const __m128i x) {
+  const int val = _mm_cvtsi128_si32(x);
+  memcpy(dst, &val, sizeof(val));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+  _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreHi8(void* a, const __m128i v) {
+  _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
+}
+
+inline void StoreAligned16(void* a, const __m128i v) {
+  assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+  _mm_store_si128(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+  _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
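+// Each RightShiftWithRounding_* helper computes (v + (1 << (bits - 1))) >> bits
+// per lane. The unsigned 16-bit version below avoids overflowing the bias add
+// (e.g. v = 0xffff with bits = 1 would wrap) by shifting out all but one bit
+// first and then using _mm_avg_epu16, which computes (a + b + 1) >> 1 without
+// losing the carry.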
+inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
+  assert(bits <= 16);
+  // Shift out all but the last bit.
+  const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
+  // Avg with zero will shift by 1 and round.
+  return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128());
+}
+
+inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
+  assert(bits < 16);
+  const __m128i v_bias_d =
+      _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+  const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
+  return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+// Use this when |bits| is not an immediate value.
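+// Unlike the versions above, the shift count is passed through a register via
+// _mm_sra_epi32 rather than as an immediate.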
+inline __m128i VariableRightShiftWithRounding_S32(const __m128i v_val_d,
+                                                  int bits) {
+  const __m128i v_bias_d =
+      _mm_set1_epi32(static_cast<int32_t>((1 << bits) >> 1));
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_sra_epi32(v_tmp_d, _mm_cvtsi32_si128(bits));
+}
+
+//------------------------------------------------------------------------------
+// Masking utilities
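+// MaskHighNBytes(n) returns a vector whose upper |n| bytes are 0xff and whose
+// remaining bytes are zero, e.g. MaskHighNBytes(4) is 12 zero bytes followed
+// by 4 bytes of 0xff.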
+inline __m128i MaskHighNBytes(int n) {
+  static constexpr uint8_t kMask[32] = {
+      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+      0,   0,   0,   0,   0,   255, 255, 255, 255, 255, 255,
+      255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+  };
+
+  return LoadUnaligned16(kMask + n);
+}
diff --git a/libgav1/src/dsp/x86/convolve_avx2.cc b/libgav1/src/dsp/x86/convolve_avx2.cc
new file mode 100644
index 0000000..2ecb77c
--- /dev/null
+++ b/libgav1/src/dsp/x86/convolve_avx2.cc
@@ -0,0 +1,1544 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/x86/convolve_sse4.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from outranging int16_t.
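+// _mm256_maddubs_epi16() multiplies unsigned source bytes by signed filter
+// bytes and sums adjacent products, which is why each |src| entry holds
+// interleaved pixel pairs and each |taps| entry holds a tap pair.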
+template <int filter_index>
+__m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
+  __m256i sum;
+  if (filter_index < 2) {
+    // 6 taps.
+    const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]);  // k2k1
+    const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]);  // k4k3
+    const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]);  // k6k5
+    sum = _mm256_add_epi16(v_madd_21, v_madd_43);
+    sum = _mm256_add_epi16(sum, v_madd_65);
+  } else if (filter_index == 2) {
+    // 8 taps.
+    const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]);  // k1k0
+    const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]);  // k3k2
+    const __m256i v_madd_54 = _mm256_maddubs_epi16(src[2], taps[2]);  // k5k4
+    const __m256i v_madd_76 = _mm256_maddubs_epi16(src[3], taps[3]);  // k7k6
+    const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32);
+    const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76);
+    sum = _mm256_add_epi16(v_sum_7654, v_sum_3210);
+  } else if (filter_index == 3) {
+    // 2 taps.
+    sum = _mm256_maddubs_epi16(src[0], taps[0]);  // k4k3
+  } else {
+    // 4 taps.
+    const __m256i v_madd_32 = _mm256_maddubs_epi16(src[0], taps[0]);  // k3k2
+    const __m256i v_madd_54 = _mm256_maddubs_epi16(src[1], taps[1]);  // k5k4
+    sum = _mm256_add_epi16(v_madd_32, v_madd_54);
+  }
+  return sum;
+}
+
+template <int filter_index>
+__m256i SumHorizontalTaps(const __m256i* const src,
+                          const __m256i* const v_tap) {
+  __m256i v_src[4];
+  const __m256i src_long = *src;
+  const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long);
+  const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long);
+
+  if (filter_index < 2) {
+    // 6 taps.
+    v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3);   // _21
+    v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);   // _43
+    v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11);  // _65
+  } else if (filter_index == 2) {
+    // 8 taps.
+    v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1);   // _10
+    v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);   // _32
+    v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);   // _54
+    v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13);  // _76
+  } else if (filter_index == 3) {
+    // 2 taps.
+    v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7);  // _43
+  } else if (filter_index > 3) {
+    // 4 taps.
+    v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5);  // _32
+    v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9);  // _54
+  }
+  return SumOnePassTaps<filter_index>(v_src, v_tap);
+}
+
+template <int filter_index>
+__m256i SimpleHorizontalTaps(const __m256i* const src,
+                             const __m256i* const v_tap) {
+  __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
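+  // With s1 = kInterRoundBitsHorizontal - 1 and
+  // s2 = kFilterBits - kInterRoundBitsHorizontal, the two-stage rounding
+  //   t = (sum + (1 << (s1 - 1))) >> s1
+  //   result = (t + (1 << (s2 - 1))) >> s2
+  // equals the single rounded shift
+  //   result = (sum + (1 << (s1 - 1)) + (1 << (s1 + s2 - 1))) >> (s1 + s2),
+  // which is what the two statements below compute.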
+  sum = _mm256_add_epi16(sum, _mm256_set1_epi16(first_shift_rounding_bit));
+  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+  return _mm256_packus_epi16(sum, sum);
+}
+
+template <int filter_index>
+__m256i HorizontalTaps8To16(const __m256i* const src,
+                            const __m256i* const v_tap) {
+  const __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+// Filter 2xh sizes.
+template <int num_taps, int filter_index, bool is_2d = false,
+          bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+                      void* const dest, const ptrdiff_t pred_stride,
+                      const int /*width*/, const int height,
+                      const __m128i* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+
+  // Horizontal passes only need to account for |num_taps| 2 and 4 when
+  // |width| <= 4.
+  assert(num_taps <= 4);
+  if (num_taps <= 4) {
+    if (!is_compound) {
+      int y = height;
+      if (is_2d) y -= 1;
+      do {
+        if (is_2d) {
+          const __m128i sum =
+              HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+          Store4(&dest16[0], sum);
+          dest16 += pred_stride;
+          Store4(&dest16[0], _mm_srli_si128(sum, 8));
+          dest16 += pred_stride;
+        } else {
+          const __m128i sum =
+              SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+          Store2(dest8, sum);
+          dest8 += pred_stride;
+          Store2(dest8, _mm_srli_si128(sum, 4));
+          dest8 += pred_stride;
+        }
+
+        src += src_stride << 1;
+        y -= 2;
+      } while (y != 0);
+
+      // The 2d filters have an odd |height| because the horizontal pass
+      // generates context for the vertical pass.
+      if (is_2d) {
+        assert(height % 2 == 1);
+        __m128i sum;
+        const __m128i input = LoadLo8(&src[2]);
+        if (filter_index == 3) {
+          // 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_43 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+          sum = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
+        } else {
+          // 02 03 03 04 04 05 05 06 06 07 ....
+          const __m128i v_src_32 =
+              _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+          // 04 05 05 06 06 07 07 08 ...
+          const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+          const __m128i v_madd_32 =
+              _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
+          const __m128i v_madd_54 =
+              _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
+          sum = _mm_add_epi16(v_madd_54, v_madd_32);
+        }
+        sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+        Store4(dest16, sum);
+      }
+    }
+  }
+}
+
+// Filter widths >= 4.
+template <int num_taps, int filter_index, bool is_2d = false,
+          bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+                      void* const dest, const ptrdiff_t pred_stride,
+                      const int width, const int height,
+                      const __m256i* const v_tap) {
+  auto* dest8 = static_cast<uint8_t*>(dest);
+  auto* dest16 = static_cast<uint16_t*>(dest);
+
+  if (width >= 32) {
+    int y = height;
+    do {
+      int x = 0;
+      do {
+        if (is_2d || is_compound) {
+          // Load into 2 128 bit lanes.
+          const __m256i src_long =
+              SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8]));
+          const __m256i result =
+              HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+          const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]),
+                                              LoadUnaligned16(&src[x + 24]));
+          const __m256i result2 =
+              HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+          if (is_2d) {
+            StoreAligned32(&dest16[x], result);
+            StoreAligned32(&dest16[x + 16], result2);
+          } else {
+            StoreUnaligned32(&dest16[x], result);
+            StoreUnaligned32(&dest16[x + 16], result2);
+          }
+        } else {
+          // Load src used to calculate dest8[7:0] and dest8[23:16].
+          const __m256i src_long = LoadUnaligned32(&src[x]);
+          const __m256i result =
+              SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+          // Load src used to calculate dest8[15:8] and dest8[31:24].
+          const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
+          const __m256i result2 =
+              SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+          // Combine results and store.
+          StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
+        }
+        x += 32;
+      } while (x < width);
+      src += src_stride;
+      dest8 += pred_stride;
+      dest16 += pred_stride;
+    } while (--y != 0);
+  } else if (width == 16) {
+    int y = height;
+    if (is_2d) y -= 1;
+    do {
+      if (is_2d || is_compound) {
+        // Load into 2 128 bit lanes.
+        const __m256i src_long =
+            SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
+        const __m256i result =
+            HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+        const __m256i src_long2 =
+            SetrM128i(LoadUnaligned16(&src[src_stride]),
+                      LoadUnaligned16(&src[8 + src_stride]));
+        const __m256i result2 =
+            HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+        if (is_2d) {
+          StoreAligned32(&dest16[0], result);
+          StoreAligned32(&dest16[pred_stride], result2);
+        } else {
+          StoreUnaligned32(&dest16[0], result);
+          StoreUnaligned32(&dest16[pred_stride], result2);
+        }
+      } else {
+        // Load into 2 128 bit lanes.
+        const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
+                                           LoadUnaligned16(&src[src_stride]));
+        const __m256i result =
+            SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+        const __m256i src_long2 = SetrM128i(
+            LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
+        const __m256i result2 =
+            SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+        const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
+        StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
+        StoreUnaligned16(&dest8[pred_stride],
+                         _mm256_extracti128_si256(packed_result, 1));
+      }
+      src += src_stride * 2;
+      dest8 += pred_stride * 2;
+      dest16 += pred_stride * 2;
+      y -= 2;
+    } while (y != 0);
+
+    // The 2d filters have an odd |height| during the horizontal pass, so
+    // filter the remaining row.
+    if (is_2d) {
+      const __m256i src_long =
+          SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
+      const __m256i result =
+          HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+      StoreAligned32(&dest16[0], result);
+    }
+
+  } else if (width == 8) {
+    int y = height;
+    if (is_2d) y -= 1;
+    do {
+      // Load into 2 128 bit lanes.
+      const __m128i this_row = LoadUnaligned16(&src[0]);
+      const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+      const __m256i src_long = SetrM128i(this_row, next_row);
+      if (is_2d || is_compound) {
+        const __m256i result =
+            HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+        if (is_2d) {
+          StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
+          StoreAligned16(&dest16[pred_stride],
+                         _mm256_extracti128_si256(result, 1));
+        } else {
+          StoreUnaligned16(&dest16[0], _mm256_castsi256_si128(result));
+          StoreUnaligned16(&dest16[pred_stride],
+                           _mm256_extracti128_si256(result, 1));
+        }
+      } else {
+        const __m128i this_row = LoadUnaligned16(&src[0]);
+        const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+        // Load into 2 128 bit lanes.
+        const __m256i src_long = SetrM128i(this_row, next_row);
+        const __m256i result =
+            SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+        StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
+        StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+      }
+      src += src_stride * 2;
+      dest8 += pred_stride * 2;
+      dest16 += pred_stride * 2;
+      y -= 2;
+    } while (y != 0);
+
+    // The 2d filters have an odd |height| during the horizontal pass, so
+    // filter the remaining row.
+    if (is_2d) {
+      const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
+      const __m256i result =
+          HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+      StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
+    }
+
+  } else {  // width == 4
+    int y = height;
+    if (is_2d) y -= 1;
+    do {
+      // Load into 2 128 bit lanes.
+      const __m128i this_row = LoadUnaligned16(&src[0]);
+      const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+      const __m256i src_long = SetrM128i(this_row, next_row);
+      if (is_2d || is_compound) {
+        const __m256i result =
+            HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+        StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
+        StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1));
+      } else {
+        const __m128i this_row = LoadUnaligned16(&src[0]);
+        const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+        // Load into 2 128 bit lanes.
+        const __m256i src_long = SetrM128i(this_row, next_row);
+        const __m256i result =
+            SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+        Store4(&dest8[0], _mm256_castsi256_si128(result));
+        Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+      }
+      src += src_stride * 2;
+      dest8 += pred_stride * 2;
+      dest16 += pred_stride * 2;
+      y -= 2;
+    } while (y != 0);
+
+    // The 2d filters have an odd |height| during the horizontal pass, so
+    // filter the remaining row.
+    if (is_2d) {
+      const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
+      const __m256i result =
+          HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+      StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
+    }
+  }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+                                     __m256i* v_tap) {
+  if (num_taps == 8) {
+    if (is_2d_vertical) {
+      v_tap[0] = _mm256_broadcastd_epi32(*filter);                      // k1k0
+      v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4));   // k3k2
+      v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8));   // k5k4
+      v_tap[3] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 12));  // k7k6
+    } else {
+      v_tap[0] = _mm256_broadcastw_epi16(*filter);                     // k1k0
+      v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2));  // k3k2
+      v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4));  // k5k4
+      v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6));  // k7k6
+    }
+  } else if (num_taps == 6) {
+    if (is_2d_vertical) {
+      v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 2));   // k2k1
+      v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6));   // k4k3
+      v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 10));  // k6k5
+    } else {
+      v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1));  // k2k1
+      v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3));  // k4k3
+      v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5));  // k6k5
+    }
+  } else if (num_taps == 4) {
+    if (is_2d_vertical) {
+      v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4));  // k3k2
+      v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8));  // k5k4
+    } else {
+      v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2));  // k3k2
+      v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4));  // k5k4
+    }
+  } else {  // num_taps == 2
+    if (is_2d_vertical) {
+      v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6));  // k4k3
+    } else {
+      v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3));  // k4k3
+    }
+  }
+}
+
+template <int num_taps, bool is_compound>
+__m256i SimpleSum2DVerticalTaps(const __m256i* const src,
+                                const __m256i* const taps) {
+  __m256i sum_lo =
+      _mm256_madd_epi16(_mm256_unpacklo_epi16(src[0], src[1]), taps[0]);
+  __m256i sum_hi =
+      _mm256_madd_epi16(_mm256_unpackhi_epi16(src[0], src[1]), taps[0]);
+  if (num_taps >= 4) {
+    __m256i madd_lo =
+        _mm256_madd_epi16(_mm256_unpacklo_epi16(src[2], src[3]), taps[1]);
+    __m256i madd_hi =
+        _mm256_madd_epi16(_mm256_unpackhi_epi16(src[2], src[3]), taps[1]);
+    sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+    sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+    if (num_taps >= 6) {
+      madd_lo =
+          _mm256_madd_epi16(_mm256_unpacklo_epi16(src[4], src[5]), taps[2]);
+      madd_hi =
+          _mm256_madd_epi16(_mm256_unpackhi_epi16(src[4], src[5]), taps[2]);
+      sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+      sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+      if (num_taps == 8) {
+        madd_lo =
+            _mm256_madd_epi16(_mm256_unpacklo_epi16(src[6], src[7]), taps[3]);
+        madd_hi =
+            _mm256_madd_epi16(_mm256_unpackhi_epi16(src[6], src[7]), taps[3]);
+        sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+        sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+      }
+    }
+  }
+
+  if (is_compound) {
+    return _mm256_packs_epi32(
+        RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+        RightShiftWithRounding_S32(sum_hi,
+                                   kInterRoundBitsCompoundVertical - 1));
+  }
+
+  return _mm256_packs_epi32(
+      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical16xH(const uint16_t* src, void* const dst,
+                          const ptrdiff_t dst_stride, const int width,
+                          const int height, const __m256i* const taps) {
+  assert(width >= 8);
+  constexpr int next_row = num_taps - 1;
+  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+  const ptrdiff_t src_stride = width;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  int x = 0;
+  do {
+    __m256i srcs[8];
+    const uint16_t* src_x = src + x;
+    srcs[0] = LoadAligned32(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = LoadAligned32(src_x);
+      src_x += src_stride;
+      srcs[2] = LoadAligned32(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = LoadAligned32(src_x);
+        src_x += src_stride;
+        srcs[4] = LoadAligned32(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = LoadAligned32(src_x);
+          src_x += src_stride;
+          srcs[6] = LoadAligned32(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    auto* dst8_x = dst8 + x;
+    auto* dst16_x = dst16 + x;
+    int y = height;
+    do {
+      srcs[next_row] = LoadAligned32(src_x);
+      src_x += src_stride;
+
+      const __m256i sum =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+      if (is_compound) {
+        StoreUnaligned32(dst16_x, sum);
+        dst16_x += dst_stride;
+      } else {
+        const __m128i packed_sum = _mm_packus_epi16(
+            _mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+        StoreUnaligned16(dst8_x, packed_sum);
+        dst8_x += dst_stride;
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (--y != 0);
+    x += 16;
+  } while (x < width);
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
+    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+    const ptrdiff_t dst_stride, const int width, const int height,
+    const int filter_id, const int filter_index) {
+  assert(filter_id != 0);
+  __m128i v_tap[4];
+  const __m128i v_horizontal_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+  if (filter_index == 4) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else if (filter_index == 5) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else {  // 2 tap.
+    SetupTaps<2>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  }
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+    const ptrdiff_t dst_stride, const int width, const int height,
+    const int filter_id, const int filter_index) {
+  assert(filter_id != 0);
+  __m256i v_tap[4];
+  const __m128i v_horizontal_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+  if (filter_index == 2) {  // 8 tap.
+    SetupTaps<8>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else if (filter_index == 1) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else if (filter_index == 0) {  // 6 tap.
+    SetupTaps<6>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else if (filter_index == 4) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else if (filter_index == 5) {  // 4 tap.
+    SetupTaps<4>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  } else {  // 2 tap.
+    SetupTaps<2>(&v_horizontal_filter, v_tap);
+    FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
+  }
+}
+
+void Convolve2D_AVX2(const void* const reference,
+                     const ptrdiff_t reference_stride,
+                     const int horizontal_filter_index,
+                     const int vertical_filter_index,
+                     const int horizontal_filter_id,
+                     const int vertical_filter_id, const int width,
+                     const int height, void* prediction,
+                     const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+  // The output of the horizontal filter is guaranteed to fit in 16 bits.
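+  // The vertical pass also reads |vertical_taps| - 1 rows of context, hence
+  // the larger |intermediate_height| below.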
+  alignas(32) uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+  const int intermediate_height = height + vertical_taps - 1;
+
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+  if (width > 2) {
+    DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result,
+                                     width, width, intermediate_height,
+                                     horizontal_filter_id, horiz_filter_index);
+  } else {
+    // Use the non-AVX2 version for smaller widths.
+    DoHorizontalPass2xH</*is_2d=*/true>(
+        src, src_stride, intermediate_result, width, width, intermediate_height,
+        horizontal_filter_id, horiz_filter_index);
+  }
+
+  // Vertical filter.
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+  // Use 256 bits for width > 8.
+  if (width > 8) {
+    __m256i taps_256[4];
+    const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter);
+
+    if (vertical_taps == 8) {
+      SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<8>(intermediate_result, dest, dest_stride, width,
+                              height, taps_256);
+    } else if (vertical_taps == 6) {
+      SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<6>(intermediate_result, dest, dest_stride, width,
+                              height, taps_256);
+    } else if (vertical_taps == 4) {
+      SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<4>(intermediate_result, dest, dest_stride, width,
+                              height, taps_256);
+    } else {  // |vertical_taps| == 2
+      SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<2>(intermediate_result, dest, dest_stride, width,
+                              height, taps_256);
+    }
+  } else {  // width <= 8
+    __m128i taps[4];
+    // Use 128 bit code.
+    if (vertical_taps == 8) {
+      SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 2) {
+        Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else if (width == 4) {
+        Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else {
+        Filter2DVertical<8>(intermediate_result, dest, dest_stride, width,
+                            height, taps);
+      }
+    } else if (vertical_taps == 6) {
+      SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 2) {
+        Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else if (width == 4) {
+        Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else {
+        Filter2DVertical<6>(intermediate_result, dest, dest_stride, width,
+                            height, taps);
+      }
+    } else if (vertical_taps == 4) {
+      SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 2) {
+        Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else if (width == 4) {
+        Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else {
+        Filter2DVertical<4>(intermediate_result, dest, dest_stride, width,
+                            height, taps);
+      }
+    } else {  // |vertical_taps| == 2
+      SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 2) {
+        Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else if (width == 4) {
+        Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+                               taps);
+      } else {
+        Filter2DVertical<2>(intermediate_result, dest, dest_stride, width,
+                            height, taps);
+      }
+    }
+  }
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
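+// The shift is reduced by 1 because the |kHalfSubPixelFilters| taps are
+// already halved.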
+__m256i Compound1DShift(const __m256i sum) {
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int filter_index, bool unpack_high = false>
+__m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) {
+  __m256i v_src[4];
+
+  if (!unpack_high) {
+    if (filter_index < 2) {
+      // 6 taps.
+      v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+      v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
+    } else if (filter_index == 2) {
+      // 8 taps.
+      v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+      v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
+      v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]);
+    } else if (filter_index == 3) {
+      // 2 taps.
+      v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+    } else if (filter_index > 3) {
+      // 4 taps.
+      v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+    }
+  } else {
+    if (filter_index < 2) {
+      // 6 taps.
+      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+      v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
+    } else if (filter_index == 2) {
+      // 8 taps.
+      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+      v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
+      v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]);
+    } else if (filter_index == 3) {
+      // 2 taps.
+      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+    } else if (filter_index > 3) {
+      // 4 taps.
+      v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+      v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+    }
+  }
+  return SumOnePassTaps<filter_index>(v_src, v_tap);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical32xH(const uint8_t* src, const ptrdiff_t src_stride,
+                        void* const dst, const ptrdiff_t dst_stride,
+                        const int width, const int height,
+                        const __m256i* const v_tap) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+  assert(width >= 32);
+  int x = 0;
+  do {
+    const uint8_t* src_x = src + x;
+    __m256i srcs[8];
+    srcs[0] = LoadUnaligned32(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = LoadUnaligned32(src_x);
+      src_x += src_stride;
+      srcs[2] = LoadUnaligned32(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = LoadUnaligned32(src_x);
+        src_x += src_stride;
+        srcs[4] = LoadUnaligned32(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = LoadUnaligned32(src_x);
+          src_x += src_stride;
+          srcs[6] = LoadUnaligned32(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    auto* dst8_x = dst8 + x;
+    auto* dst16_x = dst16 + x;
+    int y = height;
+    do {
+      srcs[next_row] = LoadUnaligned32(src_x);
+      src_x += src_stride;
+
+      const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m256i sums_hi =
+          SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+      if (is_compound) {
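+        // |sums| holds output pixels 0-7 and 16-23; |sums_hi| holds pixels
+        // 8-15 and 24-31. The cross-lane permutes restore linear pixel order
+        // before the 16-bit stores.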
+        const __m256i results =
+            Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
+        const __m256i results_hi =
+            Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31));
+        StoreUnaligned32(dst16_x, results);
+        StoreUnaligned32(dst16_x + 16, results_hi);
+        dst16_x += dst_stride;
+      } else {
+        const __m256i results =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m256i results_hi =
+            RightShiftWithRounding_S16(sums_hi, kFilterBits - 1);
+        const __m256i packed_results = _mm256_packus_epi16(results, results_hi);
+
+        StoreUnaligned32(dst8_x, packed_results);
+        dst8_x += dst_stride;
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (--y != 0);
+    x += 32;
+  } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical16xH(const uint8_t* src, const ptrdiff_t src_stride,
+                        void* const dst, const ptrdiff_t dst_stride,
+                        const int /*width*/, const int height,
+                        const __m256i* const v_tap) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  const uint8_t* src_x = src;
+  __m256i srcs[8 + 1];
+  // The upper 128 bits of srcs[i] hold the source pixels for the next row, so
+  // each loop iteration filters two rows.
+  srcs[0] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+  src_x += src_stride;
+  if (num_taps >= 4) {
+    srcs[1] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+    src_x += src_stride;
+    srcs[0] =
+        _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1);
+    srcs[2] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+    src_x += src_stride;
+    srcs[1] =
+        _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1);
+    if (num_taps >= 6) {
+      srcs[3] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+      src_x += src_stride;
+      srcs[2] =
+          _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1);
+      srcs[4] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+      src_x += src_stride;
+      srcs[3] =
+          _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1);
+      if (num_taps == 8) {
+        srcs[5] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+        src_x += src_stride;
+        srcs[4] = _mm256_inserti128_si256(srcs[4],
+                                          _mm256_castsi256_si128(srcs[5]), 1);
+        srcs[6] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+        src_x += src_stride;
+        srcs[5] = _mm256_inserti128_si256(srcs[5],
+                                          _mm256_castsi256_si128(srcs[6]), 1);
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row - 1] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+    src_x += src_stride;
+
+    srcs[next_row - 2] = _mm256_inserti128_si256(
+        srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1);
+
+    srcs[next_row] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+    src_x += src_stride;
+
+    srcs[next_row - 1] = _mm256_inserti128_si256(
+        srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
+
+    const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+    const __m256i sums_hi =
+        SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+    if (is_compound) {
+      const __m256i results =
+          Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
+      const __m256i results_hi =
+          Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31));
+
+      StoreUnaligned32(dst16, results);
+      StoreUnaligned32(dst16 + dst_stride, results_hi);
+      dst16 += dst_stride << 1;
+    } else {
+      const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m256i results_hi =
+          RightShiftWithRounding_S16(sums_hi, kFilterBits - 1);
+      const __m256i packed_results = _mm256_packus_epi16(results, results_hi);
+      const __m128i this_dst = _mm256_castsi256_si128(packed_results);
+      const auto next_dst = _mm256_extracti128_si256(packed_results, 1);
+
+      StoreUnaligned16(dst8, this_dst);
+      StoreUnaligned16(dst8 + dst_stride, next_dst);
+      dst8 += dst_stride << 1;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
+                       void* const dst, const ptrdiff_t dst_stride,
+                       const int /*width*/, const int height,
+                       const __m256i* const v_tap) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  const uint8_t* src_x = src;
+  __m256i srcs[8 + 1];
+  // The upper 128 bits of srcs[i] hold the source pixels for the next row, so
+  // each loop iteration filters two rows.
+  srcs[0] = _mm256_castsi128_si256(LoadLo8(src_x));
+  src_x += src_stride;
+  if (num_taps >= 4) {
+    srcs[1] = _mm256_castsi128_si256(LoadLo8(src_x));
+    src_x += src_stride;
+    srcs[0] =
+        _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1);
+    srcs[2] = _mm256_castsi128_si256(LoadLo8(src_x));
+    src_x += src_stride;
+    srcs[1] =
+        _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1);
+    if (num_taps >= 6) {
+      srcs[3] = _mm256_castsi128_si256(LoadLo8(src_x));
+      src_x += src_stride;
+      srcs[2] =
+          _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1);
+      srcs[4] = _mm256_castsi128_si256(LoadLo8(src_x));
+      src_x += src_stride;
+      srcs[3] =
+          _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1);
+      if (num_taps == 8) {
+        srcs[5] = _mm256_castsi128_si256(LoadLo8(src_x));
+        src_x += src_stride;
+        srcs[4] = _mm256_inserti128_si256(srcs[4],
+                                          _mm256_castsi256_si128(srcs[5]), 1);
+        srcs[6] = _mm256_castsi128_si256(LoadLo8(src_x));
+        src_x += src_stride;
+        srcs[5] = _mm256_inserti128_si256(srcs[5],
+                                          _mm256_castsi256_si128(srcs[6]), 1);
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row - 1] = _mm256_castsi128_si256(LoadLo8(src_x));
+    src_x += src_stride;
+
+    srcs[next_row - 2] = _mm256_inserti128_si256(
+        srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1);
+
+    srcs[next_row] = _mm256_castsi128_si256(LoadLo8(src_x));
+    src_x += src_stride;
+
+    srcs[next_row - 1] = _mm256_inserti128_si256(
+        srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
+
+    const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+    if (is_compound) {
+      const __m256i results = Compound1DShift(sums);
+      const __m128i this_dst = _mm256_castsi256_si128(results);
+      const auto next_dst = _mm256_extracti128_si256(results, 1);
+
+      StoreUnaligned16(dst16, this_dst);
+      StoreUnaligned16(dst16 + dst_stride, next_dst);
+      dst16 += dst_stride << 1;
+    } else {
+      const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m256i packed_results = _mm256_packus_epi16(results, results);
+      const __m128i this_dst = _mm256_castsi256_si128(packed_results);
+      const auto next_dst = _mm256_extracti128_si256(packed_results, 1);
+
+      StoreLo8(dst8, this_dst);
+      StoreLo8(dst8 + dst_stride, next_dst);
+      dst8 += dst_stride << 1;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
+                       void* const dst, const ptrdiff_t dst_stride,
+                       const int /*width*/, const int height,
+                       const __m128i* const v_tap) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  const uint8_t* src_x = src;
+  __m128i srcs[8];
+  srcs[0] = LoadLo8(src_x);
+  src_x += src_stride;
+  if (num_taps >= 4) {
+    srcs[1] = LoadLo8(src_x);
+    src_x += src_stride;
+    srcs[2] = LoadLo8(src_x);
+    src_x += src_stride;
+    if (num_taps >= 6) {
+      srcs[3] = LoadLo8(src_x);
+      src_x += src_stride;
+      srcs[4] = LoadLo8(src_x);
+      src_x += src_stride;
+      if (num_taps == 8) {
+        srcs[5] = LoadLo8(src_x);
+        src_x += src_stride;
+        srcs[6] = LoadLo8(src_x);
+        src_x += src_stride;
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row] = LoadLo8(src_x);
+    src_x += src_stride;
+
+    const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+    if (is_compound) {
+      const __m128i results = Compound1DShift(sums);
+      StoreUnaligned16(dst16, results);
+      dst16 += dst_stride;
+    } else {
+      const __m128i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      StoreLo8(dst8, _mm_packus_epi16(results, results));
+      dst8 += dst_stride;
+    }
+
+    srcs[0] = srcs[1];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[2];
+      srcs[2] = srcs[3];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[4];
+        srcs[4] = srcs[5];
+        if (num_taps == 8) {
+          srcs[5] = srcs[6];
+          srcs[6] = srcs[7];
+        }
+      }
+    }
+  } while (--y != 0);
+}
+
+void ConvolveVertical_AVX2(const void* const reference,
+                           const ptrdiff_t reference_stride,
+                           const int /*horizontal_filter_index*/,
+                           const int vertical_filter_index,
+                           const int /*horizontal_filter_id*/,
+                           const int vertical_filter_id, const int width,
+                           const int height, void* prediction,
+                           const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+  // Use 256 bits for width > 4.
+  if (width > 4) {
+    __m256i taps_256[4];
+    if (filter_index < 2) {  // 6 tap.
+      SetupTaps<6>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<0>(src, src_stride, dest, dest_stride, width, height,
+                             taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<0>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      } else {
+        FilterVertical32xH<0>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      }
+    } else if (filter_index == 2) {  // 8 tap.
+      SetupTaps<8>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
+                             taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      } else {
+        FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      }
+    } else if (filter_index == 3) {  // 2 tap.
+      SetupTaps<2>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<3>(src, src_stride, dest, dest_stride, width, height,
+                             taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<3>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      } else {
+        FilterVertical32xH<3>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      }
+    } else if (filter_index == 4) {  // 4 tap.
+      SetupTaps<4>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height,
+                             taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<4>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      } else {
+        FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      }
+    } else {
+      SetupTaps<4>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<5>(src, src_stride, dest, dest_stride, width, height,
+                             taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<5>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      } else {
+        FilterVertical32xH<5>(src, src_stride, dest, dest_stride, width, height,
+                              taps_256);
+      }
+    }
+  } else {  // width <= 4
+    // Use 128 bit code.
+    __m128i taps[4];
+
+    if (filter_index < 2) {  // 6 tap.
+      SetupTaps<6>(&v_filter, taps);
+      if (width == 2) {
+        FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      } else {
+        FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      }
+    } else if (filter_index == 2) {  // 8 tap.
+      SetupTaps<8>(&v_filter, taps);
+      if (width == 2) {
+        FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      } else {
+        FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      }
+    } else if (filter_index == 3) {  // 2 tap.
+      SetupTaps<2>(&v_filter, taps);
+      if (width == 2) {
+        FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      } else {
+        FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      }
+    } else if (filter_index == 4) {  // 4 tap.
+      SetupTaps<4>(&v_filter, taps);
+      if (width == 2) {
+        FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      } else {
+        FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      }
+    } else {  // 4 tap.
+      SetupTaps<4>(&v_filter, taps);
+      if (width == 2) {
+        FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      } else {
+        FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height,
+                                taps);
+      }
+    }
+  }
+}
+
+void ConvolveCompoundVertical_AVX2(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int vertical_filter_index,
+    const int /*horizontal_filter_id*/, const int vertical_filter_id,
+    const int width, const int height, void* prediction,
+    const ptrdiff_t /*pred_stride*/) {
+  const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = width;
+  assert(vertical_filter_id != 0);
+
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+  // Use 256 bits for width > 4.
+  if (width > 4) {
+    __m256i taps_256[4];
+    if (filter_index < 2) {  // 6 tap.
+      SetupTaps<6>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<0, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<0, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else {
+        FilterVertical32xH<0, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      }
+    } else if (filter_index == 2) {  // 8 tap.
+      SetupTaps<8>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<2, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<2, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else {
+        FilterVertical32xH<2, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      }
+    } else if (filter_index == 3) {  // 2 tap.
+      SetupTaps<2>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<3, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<3, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else {
+        FilterVertical32xH<3, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      }
+    } else if (filter_index == 4) {  // 4 tap.
+      SetupTaps<4>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<4, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<4, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else {
+        FilterVertical32xH<4, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      }
+    } else {  // 4 tap.
+      SetupTaps<4>(&v_filter, taps_256);
+      if (width == 8) {
+        FilterVertical8xH<5, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else if (width == 16) {
+        FilterVertical16xH<5, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      } else {
+        FilterVertical32xH<5, /*is_compound=*/true>(
+            src, src_stride, dest, dest_stride, width, height, taps_256);
+      }
+    }
+  } else {  // width <= 4
+    // Use 128 bit code.
+    __m128i taps[4];
+
+    if (filter_index < 2) {  // 6 tap.
+      SetupTaps<6>(&v_filter, taps);
+      FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest,
+                                                    dest_stride, height, taps);
+    } else if (filter_index == 2) {  // 8 tap.
+      SetupTaps<8>(&v_filter, taps);
+      FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest,
+                                                    dest_stride, height, taps);
+    } else if (filter_index == 3) {  // 2 tap.
+      SetupTaps<2>(&v_filter, taps);
+      FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest,
+                                                    dest_stride, height, taps);
+    } else if (filter_index == 4) {  // 4 tap.
+      SetupTaps<4>(&v_filter, taps);
+      FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest,
+                                                    dest_stride, height, taps);
+    } else {  // 4 tap.
+      SetupTaps<4>(&v_filter, taps);
+      FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest,
+                                                    dest_stride, height, taps);
+    }
+  }
+}
+
+void ConvolveHorizontal_AVX2(const void* const reference,
+                             const ptrdiff_t reference_stride,
+                             const int horizontal_filter_index,
+                             const int /*vertical_filter_index*/,
+                             const int horizontal_filter_id,
+                             const int /*vertical_filter_id*/, const int width,
+                             const int height, void* prediction,
+                             const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width > 2) {
+    DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+                     horizontal_filter_id, filter_index);
+  } else {
+    // Use the non-AVX2 version for smaller widths.
+    DoHorizontalPass2xH(src, reference_stride, dest, pred_stride, width, height,
+                        horizontal_filter_id, filter_index);
+  }
+}
+
+void ConvolveCompoundHorizontal_AVX2(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int horizontal_filter_index, const int /*vertical_filter_index*/,
+    const int horizontal_filter_id, const int /*vertical_filter_id*/,
+    const int width, const int height, void* prediction,
+    const ptrdiff_t pred_stride) {
+  const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+  // Set |src| to the outermost tap.
+  const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+  auto* dest = static_cast<uint8_t*>(prediction);
+  // All compound functions output to the predictor buffer with |pred_stride|
+  // equal to |width|.
+  assert(pred_stride == width);
+  // Compound functions start at 4x4.
+  assert(width >= 4 && height >= 4);
+
+#ifdef NDEBUG
+  // Quiet the unused variable warning when asserts are compiled out.
+  (void)pred_stride;
+#endif
+
+  DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+      src, reference_stride, dest, width, width, height, horizontal_filter_id,
+      filter_index);
+}
+
+void ConvolveCompound2D_AVX2(const void* const reference,
+                             const ptrdiff_t reference_stride,
+                             const int horizontal_filter_index,
+                             const int vertical_filter_index,
+                             const int horizontal_filter_id,
+                             const int vertical_filter_id, const int width,
+                             const int height, void* prediction,
+                             const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+  // The output of the horizontal filter is guaranteed to fit in 16 bits.
+  alignas(32) uint16_t
+      intermediate_result[kMaxSuperBlockSizeInPixels *
+                          (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+  const int intermediate_height = height + vertical_taps - 1;
+
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      horizontal_filter_id, horiz_filter_index);
+
+  // Vertical filter.
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  assert(vertical_filter_id != 0);
+
+  const __m128i v_filter =
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+  // Use 256 bits for width > 8.
+  if (width > 8) {
+    __m256i taps_256[4];
+    const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter);
+
+    if (vertical_taps == 8) {
+      SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<8, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps_256);
+    } else if (vertical_taps == 6) {
+      SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<6, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps_256);
+    } else if (vertical_taps == 4) {
+      SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<4, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps_256);
+    } else {  // |vertical_taps| == 2
+      SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+      Filter2DVertical16xH<2, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps_256);
+    }
+  } else {  // width <= 8
+    __m128i taps[4];
+    // Use 128 bit code.
+    if (vertical_taps == 8) {
+      SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 4) {
+        Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+                                                     dest_stride, height, taps);
+      } else {
+        Filter2DVertical<8, /*is_compound=*/true>(
+            intermediate_result, dest, dest_stride, width, height, taps);
+      }
+    } else if (vertical_taps == 6) {
+      SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 4) {
+        Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+                                                     dest_stride, height, taps);
+      } else {
+        Filter2DVertical<6, /*is_compound=*/true>(
+            intermediate_result, dest, dest_stride, width, height, taps);
+      }
+    } else if (vertical_taps == 4) {
+      SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 4) {
+        Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+                                                     dest_stride, height, taps);
+      } else {
+        Filter2DVertical<4, /*is_compound=*/true>(
+            intermediate_result, dest, dest_stride, width, height, taps);
+      }
+    } else {  // |vertical_taps| == 2
+      SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+      if (width == 4) {
+        Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+                                                     dest_stride, height, taps);
+      } else {
+        Filter2DVertical<2, /*is_compound=*/true>(
+            intermediate_result, dest, dest_stride, width, height, taps);
+      }
+    }
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->convolve[0][0][0][1] = ConvolveHorizontal_AVX2;
+  dsp->convolve[0][0][1][0] = ConvolveVertical_AVX2;
+  dsp->convolve[0][0][1][1] = Convolve2D_AVX2;
+
+  dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_AVX2;
+  dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_AVX2;
+  dsp->convolve[0][1][1][1] = ConvolveCompound2D_AVX2;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void ConvolveInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_AVX2() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_AVX2
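
The width/filter_index dispatch in the functions above leans on GetNumTapsInFilter(), which this change factors out of convolve_sse4.cc into a shared include (see the deletion further down). As a reading aid only, here is a minimal C++ sketch of the tap-count mapping that helper implements; NumTapsSketch is an illustrative stand-in name, not part of the patch:

int NumTapsSketch(const int filter_index) {
  // kInterpolationFilterEightTap / EightTapSmooth: despite the names these
  // filters only use 6 taps.
  if (filter_index < 2) return 6;
  // kInterpolationFilterEightTapSharp.
  if (filter_index == 2) return 8;
  // kInterpolationFilterBilinear.
  if (filter_index == 3) return 2;
  // Indices 4 and 5: 4 tap replacements used when width/height <= 4.
  return 4;
}

This is why filter_index 0 and 1 both fall into the SetupTaps<6> branch, and why the trailing else branches (filter_index 5) use SetupTaps<4>.
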
diff --git a/libgav1/src/dsp/x86/convolve_avx2.h b/libgav1/src/dsp/x86/convolve_avx2.h
new file mode 100644
index 0000000..e509bc9
--- /dev/null
+++ b/libgav1/src/dsp/x86/convolve_avx2.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve; see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_AVX2();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If avx2 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the avx2 implementation should be used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_AVX2
+#endif
+
+#endif  // LIBGAV1_TARGETING_AVX2
+
+#endif  // LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
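
The LIBGAV1_Dsp8bpp_* defines above follow the library's baseline convention: when a define is set to LIBGAV1_CPU_AVX2, the portable initialization can skip that table entry and leave it to ConvolveInit_AVX2() at runtime. A rough sketch of that convention, assuming the C registration uses the usual #ifndef guard (ConvolveHorizontal_C is a placeholder name here; the [0][0][0][1] index matches the Init8bpp() table above):

  // Only install the C fallback when no SIMD baseline has claimed the entry.
#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
  dsp->convolve[0][0][0][1] = ConvolveHorizontal_C;
#endif
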
diff --git a/libgav1/src/dsp/x86/convolve_sse4.cc b/libgav1/src/dsp/x86/convolve_sse4.cc
index ff9a373..9b72fe4 100644
--- a/libgav1/src/dsp/x86/convolve_sse4.cc
+++ b/libgav1/src/dsp/x86/convolve_sse4.cc
@@ -16,7 +16,7 @@
 #include "src/utils/constants.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 #include <smmintrin.h>
 
 #include <algorithm>
@@ -34,73 +34,7 @@
 namespace low_bitdepth {
 namespace {
 
-// TODO(slavarnway): Move to common neon/sse4 file.
-int GetNumTapsInFilter(const int filter_index) {
-  if (filter_index < 2) {
-    // Despite the names these only use 6 taps.
-    // kInterpolationFilterEightTap
-    // kInterpolationFilterEightTapSmooth
-    return 6;
-  }
-
-  if (filter_index == 2) {
-    // kInterpolationFilterEightTapSharp
-    return 8;
-  }
-
-  if (filter_index == 3) {
-    // kInterpolationFilterBilinear
-    return 2;
-  }
-
-  assert(filter_index > 3);
-  // For small sizes (width/height <= 4) the large filters are replaced with 4
-  // tap options.
-  // If the original filters were |kInterpolationFilterEightTap| or
-  // |kInterpolationFilterEightTapSharp| then it becomes
-  // |kInterpolationFilterSwitchable|.
-  // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
-  // tap filter.
-  return 4;
-}
-
-constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
-constexpr int kHorizontalOffset = 3;
-constexpr int kFilterIndexShift = 6;
-
-// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
-// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
-// sum from outranging int16_t.
-template <int filter_index>
-__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
-  __m128i sum;
-  if (filter_index < 2) {
-    // 6 taps.
-    const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]);  // k2k1
-    const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]);  // k4k3
-    const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]);  // k6k5
-    sum = _mm_add_epi16(v_madd_21, v_madd_43);
-    sum = _mm_add_epi16(sum, v_madd_65);
-  } else if (filter_index == 2) {
-    // 8 taps.
-    const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]);  // k1k0
-    const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]);  // k3k2
-    const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]);  // k5k4
-    const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]);  // k7k6
-    const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
-    const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
-    sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
-  } else if (filter_index == 3) {
-    // 2 taps.
-    sum = _mm_maddubs_epi16(src[0], taps[0]);  // k4k3
-  } else {
-    // 4 taps.
-    const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]);  // k3k2
-    const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]);  // k5k4
-    sum = _mm_add_epi16(v_madd_32, v_madd_54);
-  }
-  return sum;
-}
+#include "src/dsp/x86/convolve_sse4.inc"
 
 template <int filter_index>
 __m128i SumHorizontalTaps(const uint8_t* const src,
@@ -157,68 +91,7 @@
   return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
 }
 
-template <int filter_index>
-__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
-                             const __m128i* const v_tap) {
-  const __m128i input0 = LoadLo8(&src[2]);
-  const __m128i input1 = LoadLo8(&src[2 + src_stride]);
-
-  if (filter_index == 3) {
-    // 03 04 04 05 05 06 06 07 ....
-    const __m128i input0_dup =
-        _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 3);
-    // 13 14 14 15 15 16 16 17 ....
-    const __m128i input1_dup =
-        _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 3);
-    const __m128i v_src_43 = _mm_unpacklo_epi64(input0_dup, input1_dup);
-    const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
-    return v_sum_43;
-  }
-
-  // 02 03 03 04 04 05 05 06 06 07 ....
-  const __m128i input0_dup =
-      _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 1);
-  // 12 13 13 14 14 15 15 16 16 17 ....
-  const __m128i input1_dup =
-      _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 1);
-  // 04 05 05 06 06 07 07 08 ...
-  const __m128i input0_dup_54 = _mm_srli_si128(input0_dup, 4);
-  // 14 15 15 16 16 17 17 18 ...
-  const __m128i input1_dup_54 = _mm_srli_si128(input1_dup, 4);
-  const __m128i v_src_32 = _mm_unpacklo_epi64(input0_dup, input1_dup);
-  const __m128i v_src_54 = _mm_unpacklo_epi64(input0_dup_54, input1_dup_54);
-  const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
-  const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
-  const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
-  return v_sum_5432;
-}
-
-template <int filter_index>
-__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
-                                const __m128i* const v_tap) {
-  __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
-  // Normally the Horizontal pass does the downshift in two passes:
-  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
-  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
-  // requires adding the rounding offset from the skipped shift.
-  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
-  sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
-  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
-  return _mm_packus_epi16(sum, sum);
-}
-
-template <int filter_index>
-__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
-                                const __m128i* const v_tap) {
-  const __m128i sum =
-      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
-  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
-}
-
-template <int num_taps, int step, int filter_index, bool is_2d = false,
+template <int num_taps, int filter_index, bool is_2d = false,
           bool is_compound = false>
 void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
                       void* const dest, const ptrdiff_t pred_stride,
@@ -229,7 +102,7 @@
 
   // 4 tap filters are never used when width > 4.
   if (num_taps != 4 && width > 4) {
-    int y = 0;
+    int y = height;
     do {
       int x = 0;
       do {
@@ -246,12 +119,12 @@
               SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
           StoreLo8(&dest8[x], result);
         }
-        x += step;
+        x += 8;
       } while (x < width);
       src += src_stride;
       dest8 += pred_stride;
       dest16 += pred_stride;
-    } while (++y < height);
+    } while (--y != 0);
     return;
   }
 
@@ -261,7 +134,7 @@
   assert(num_taps <= 4);
   if (num_taps <= 4) {
     if (width == 4) {
-      int y = 0;
+      int y = height;
       do {
         if (is_2d || is_compound) {
           const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
@@ -273,12 +146,13 @@
         src += src_stride;
         dest8 += pred_stride;
         dest16 += pred_stride;
-      } while (++y < height);
+      } while (--y != 0);
       return;
     }
 
     if (!is_compound) {
-      int y = 0;
+      int y = height;
+      if (is_2d) y -= 1;
       do {
         if (is_2d) {
           const __m128i sum =
@@ -297,8 +171,8 @@
         }
 
         src += src_stride << 1;
-        y += 2;
-      } while (y < height - 1);
+        y -= 2;
+      } while (y != 0);
 
       // The 2d filters have an odd |height| because the horizontal pass
       // generates context for the vertical pass.
@@ -330,309 +204,11 @@
   }
 }
 
-template <int num_taps, bool is_2d_vertical = false>
-LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
-                                     __m128i* v_tap) {
-  if (num_taps == 8) {
-    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0);   // k1k0
-    v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
-    v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
-    v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff);  // k7k6
-    if (is_2d_vertical) {
-      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
-      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
-      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
-      v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
-    } else {
-      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
-      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
-      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
-      v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
-    }
-  } else if (num_taps == 6) {
-    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
-    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0);   // k2k1
-    v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
-    v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa);  // k6k5
-    if (is_2d_vertical) {
-      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
-      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
-      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
-    } else {
-      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
-      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
-      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
-    }
-  } else if (num_taps == 4) {
-    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
-    v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
-    if (is_2d_vertical) {
-      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
-      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
-    } else {
-      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
-      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
-    }
-  } else {  // num_taps == 2
-    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
-    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
-    if (is_2d_vertical) {
-      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
-    } else {
-      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
-    }
-  }
-}
-
-template <int num_taps, bool is_compound>
-__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
-                                const __m128i* const taps) {
-  __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
-  __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
-  if (num_taps >= 4) {
-    __m128i madd_lo =
-        _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
-    __m128i madd_hi =
-        _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
-    sum_lo = _mm_add_epi32(sum_lo, madd_lo);
-    sum_hi = _mm_add_epi32(sum_hi, madd_hi);
-    if (num_taps >= 6) {
-      madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
-      madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
-      sum_lo = _mm_add_epi32(sum_lo, madd_lo);
-      sum_hi = _mm_add_epi32(sum_hi, madd_hi);
-      if (num_taps == 8) {
-        madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
-        madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
-        sum_lo = _mm_add_epi32(sum_lo, madd_lo);
-        sum_hi = _mm_add_epi32(sum_hi, madd_hi);
-      }
-    }
-  }
-
-  if (is_compound) {
-    return _mm_packs_epi32(
-        RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
-        RightShiftWithRounding_S32(sum_hi,
-                                   kInterRoundBitsCompoundVertical - 1));
-  }
-
-  return _mm_packs_epi32(
-      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
-      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
-}
-
-template <int num_taps, bool is_compound = false>
-void Filter2DVertical(const uint16_t* src, void* const dst,
-                      const ptrdiff_t dst_stride, const int width,
-                      const int height, const __m128i* const taps) {
-  assert(width >= 8);
-  constexpr int next_row = num_taps - 1;
-  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
-  const ptrdiff_t src_stride = width;
-
-  auto* dst8 = static_cast<uint8_t*>(dst);
-  auto* dst16 = static_cast<uint16_t*>(dst);
-
-  int x = 0;
-  do {
-    __m128i srcs[8];
-    const uint16_t* src_x = src + x;
-    srcs[0] = LoadAligned16(src_x);
-    src_x += src_stride;
-    if (num_taps >= 4) {
-      srcs[1] = LoadAligned16(src_x);
-      src_x += src_stride;
-      srcs[2] = LoadAligned16(src_x);
-      src_x += src_stride;
-      if (num_taps >= 6) {
-        srcs[3] = LoadAligned16(src_x);
-        src_x += src_stride;
-        srcs[4] = LoadAligned16(src_x);
-        src_x += src_stride;
-        if (num_taps == 8) {
-          srcs[5] = LoadAligned16(src_x);
-          src_x += src_stride;
-          srcs[6] = LoadAligned16(src_x);
-          src_x += src_stride;
-        }
-      }
-    }
-
-    int y = 0;
-    do {
-      srcs[next_row] = LoadAligned16(src_x);
-      src_x += src_stride;
-
-      const __m128i sum =
-          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
-      if (is_compound) {
-        StoreUnaligned16(dst16 + x + y * dst_stride, sum);
-      } else {
-        StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(sum, sum));
-      }
-
-      srcs[0] = srcs[1];
-      if (num_taps >= 4) {
-        srcs[1] = srcs[2];
-        srcs[2] = srcs[3];
-        if (num_taps >= 6) {
-          srcs[3] = srcs[4];
-          srcs[4] = srcs[5];
-          if (num_taps == 8) {
-            srcs[5] = srcs[6];
-            srcs[6] = srcs[7];
-          }
-        }
-      }
-    } while (++y < height);
-    x += 8;
-  } while (x < width);
-}
-
-// Take advantage of |src_stride| == |width| to process two rows at a time.
-template <int num_taps, bool is_compound = false>
-void Filter2DVertical4xH(const uint16_t* src, void* const dst,
-                         const ptrdiff_t dst_stride, const int height,
-                         const __m128i* const taps) {
-  auto* dst8 = static_cast<uint8_t*>(dst);
-  auto* dst16 = static_cast<uint16_t*>(dst);
-
-  __m128i srcs[9];
-  srcs[0] = LoadAligned16(src);
-  src += 8;
-  if (num_taps >= 4) {
-    srcs[2] = LoadAligned16(src);
-    src += 8;
-    srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
-    if (num_taps >= 6) {
-      srcs[4] = LoadAligned16(src);
-      src += 8;
-      srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
-      if (num_taps == 8) {
-        srcs[6] = LoadAligned16(src);
-        src += 8;
-        srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
-      }
-    }
-  }
-
-  int y = 0;
-  do {
-    srcs[num_taps] = LoadAligned16(src);
-    src += 8;
-    srcs[num_taps - 1] = _mm_unpacklo_epi64(
-        _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
-
-    const __m128i sum =
-        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
-    if (is_compound) {
-      StoreUnaligned16(dst16, sum);
-      dst16 += 4 << 1;
-    } else {
-      const __m128i results = _mm_packus_epi16(sum, sum);
-      Store4(dst8, results);
-      dst8 += dst_stride;
-      Store4(dst8, _mm_srli_si128(results, 4));
-      dst8 += dst_stride;
-    }
-
-    srcs[0] = srcs[2];
-    if (num_taps >= 4) {
-      srcs[1] = srcs[3];
-      srcs[2] = srcs[4];
-      if (num_taps >= 6) {
-        srcs[3] = srcs[5];
-        srcs[4] = srcs[6];
-        if (num_taps == 8) {
-          srcs[5] = srcs[7];
-          srcs[6] = srcs[8];
-        }
-      }
-    }
-    y += 2;
-  } while (y < height);
-}
-
-// Take advantage of |src_stride| == |width| to process four rows at a time.
-template <int num_taps>
-void Filter2DVertical2xH(const uint16_t* src, void* const dst,
-                         const ptrdiff_t dst_stride, const int height,
-                         const __m128i* const taps) {
-  constexpr int next_row = (num_taps < 6) ? 4 : 8;
-
-  auto* dst8 = static_cast<uint8_t*>(dst);
-
-  __m128i srcs[9];
-  srcs[0] = LoadAligned16(src);
-  src += 8;
-  if (num_taps >= 6) {
-    srcs[4] = LoadAligned16(src);
-    src += 8;
-    srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
-    if (num_taps == 8) {
-      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
-      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
-    }
-  }
-
-  int y = 0;
-  do {
-    srcs[next_row] = LoadAligned16(src);
-    src += 8;
-    if (num_taps == 2) {
-      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
-    } else if (num_taps == 4) {
-      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
-      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
-      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
-    } else if (num_taps == 6) {
-      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
-      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
-      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
-    } else if (num_taps == 8) {
-      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
-      srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
-      srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
-    }
-
-    const __m128i sum =
-        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
-    const __m128i results = _mm_packus_epi16(sum, sum);
-
-    Store2(dst8, results);
-    dst8 += dst_stride;
-    Store2(dst8, _mm_srli_si128(results, 2));
-    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
-    // Therefore we don't need to check this condition when |height| > 4.
-    if (num_taps <= 4 && height == 2) return;
-    dst8 += dst_stride;
-    Store2(dst8, _mm_srli_si128(results, 4));
-    dst8 += dst_stride;
-    Store2(dst8, _mm_srli_si128(results, 6));
-    dst8 += dst_stride;
-
-    srcs[0] = srcs[4];
-    if (num_taps == 6) {
-      srcs[1] = srcs[5];
-      srcs[4] = srcs[8];
-    } else if (num_taps == 8) {
-      srcs[1] = srcs[5];
-      srcs[2] = srcs[6];
-      srcs[3] = srcs[7];
-      srcs[4] = srcs[8];
-    }
-
-    y += 4;
-  } while (y < height);
-}
-
 template <bool is_2d = false, bool is_compound = false>
 LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
     const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
     const ptrdiff_t dst_stride, const int width, const int height,
-    const int subpixel, const int filter_index) {
-  const int filter_id = (subpixel >> 6) & kSubPixelMask;
+    const int filter_id, const int filter_index) {
   assert(filter_id != 0);
   __m128i v_tap[4];
   const __m128i v_horizontal_filter =
@@ -640,37 +216,39 @@
 
   if (filter_index == 2) {  // 8 tap.
     SetupTaps<8>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<8, 8, 2, is_2d, is_compound>(
-        src, src_stride, dst, dst_stride, width, height, v_tap);
+    FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
   } else if (filter_index == 1) {  // 6 tap.
     SetupTaps<6>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<6, 8, 1, is_2d, is_compound>(
-        src, src_stride, dst, dst_stride, width, height, v_tap);
+    FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
   } else if (filter_index == 0) {  // 6 tap.
     SetupTaps<6>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<6, 8, 0, is_2d, is_compound>(
-        src, src_stride, dst, dst_stride, width, height, v_tap);
+    FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
   } else if (filter_index == 4) {  // 4 tap.
     SetupTaps<4>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<4, 8, 4, is_2d, is_compound>(
-        src, src_stride, dst, dst_stride, width, height, v_tap);
+    FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
   } else if (filter_index == 5) {  // 4 tap.
     SetupTaps<4>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<4, 8, 5, is_2d, is_compound>(
-        src, src_stride, dst, dst_stride, width, height, v_tap);
+    FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
   } else {  // 2 tap.
     SetupTaps<2>(&v_horizontal_filter, v_tap);
-    FilterHorizontal<2, 8, 3, is_2d, is_compound>(
-        src, src_stride, dst, dst_stride, width, height, v_tap);
+    FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+                                               width, height, v_tap);
   }
 }
 
 void Convolve2D_SSE4_1(const void* const reference,
                        const ptrdiff_t reference_stride,
                        const int horizontal_filter_index,
-                       const int vertical_filter_index, const int subpixel_x,
-                       const int subpixel_y, const int width, const int height,
-                       void* prediction, const ptrdiff_t pred_stride) {
+                       const int vertical_filter_index,
+                       const int horizontal_filter_id,
+                       const int vertical_filter_id, const int width,
+                       const int height, void* prediction,
+                       const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
@@ -686,18 +264,17 @@
                     (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
 
   DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
-                                   width, intermediate_height, subpixel_x,
-                                   horiz_filter_index);
+                                   width, intermediate_height,
+                                   horizontal_filter_id, horiz_filter_index);
 
   // Vertical filter.
   auto* dest = static_cast<uint8_t*>(prediction);
   const ptrdiff_t dest_stride = pred_stride;
-  const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
-  assert(filter_id != 0);
+  assert(vertical_filter_id != 0);
 
   __m128i taps[4];
   const __m128i v_filter =
-      LoadLo8(kHalfSubPixelFilters[vert_filter_index][filter_id]);
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
 
   if (vertical_taps == 8) {
     SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
@@ -750,39 +327,6 @@
   }
 }
 
-// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
-// Vertical calculations.
-__m128i Compound1DShift(const __m128i sum) {
-  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
-}
-
-template <int filter_index>
-__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
-  __m128i v_src[4];
-
-  if (filter_index < 2) {
-    // 6 taps.
-    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
-    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
-    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
-  } else if (filter_index == 2) {
-    // 8 taps.
-    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
-    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
-    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
-    v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
-  } else if (filter_index == 3) {
-    // 2 taps.
-    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
-  } else if (filter_index > 3) {
-    // 4 taps.
-    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
-    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
-  }
-  const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
-  return sum;
-}
-
 template <int filter_index, bool is_compound = false>
 void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
                     void* const dst, const ptrdiff_t dst_stride,
@@ -819,7 +363,9 @@
       }
     }
 
-    int y = 0;
+    auto* dst8_x = dst8 + x;
+    auto* dst16_x = dst16 + x;
+    int y = height;
     do {
       srcs[next_row] = LoadLo8(src_x);
       src_x += src_stride;
@@ -827,11 +373,13 @@
       const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
       if (is_compound) {
         const __m128i results = Compound1DShift(sums);
-        StoreUnaligned16(dst16 + x + y * dst_stride, results);
+        StoreUnaligned16(dst16_x, results);
+        dst16_x += dst_stride;
       } else {
         const __m128i results =
             RightShiftWithRounding_S16(sums, kFilterBits - 1);
-        StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(results, results));
+        StoreLo8(dst8_x, _mm_packus_epi16(results, results));
+        dst8_x += dst_stride;
       }
 
       srcs[0] = srcs[1];
@@ -847,513 +395,19 @@
           }
         }
       }
-    } while (++y < height);
+    } while (--y != 0);
     x += 8;
   } while (x < width);
 }
 
-template <int filter_index, bool is_compound = false>
-void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
-                       void* const dst, const ptrdiff_t dst_stride,
-                       const int height, const __m128i* const v_tap) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
-  auto* dst8 = static_cast<uint8_t*>(dst);
-  auto* dst16 = static_cast<uint16_t*>(dst);
-
-  __m128i srcs[9];
-
-  if (num_taps == 2) {
-    srcs[2] = _mm_setzero_si128();
-    // 00 01 02 03
-    srcs[0] = Load4(src);
-    src += src_stride;
-
-    int y = 0;
-    do {
-      // 10 11 12 13
-      const __m128i a = Load4(src);
-      // 00 01 02 03 10 11 12 13
-      srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
-      src += src_stride;
-      // 20 21 22 23
-      srcs[2] = Load4(src);
-      src += src_stride;
-      // 10 11 12 13 20 21 22 23
-      srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
-
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
-      if (is_compound) {
-        const __m128i results = Compound1DShift(sums);
-        StoreUnaligned16(dst16, results);
-        dst16 += 4 << 1;
-      } else {
-        const __m128i results_16 =
-            RightShiftWithRounding_S16(sums, kFilterBits - 1);
-        const __m128i results = _mm_packus_epi16(results_16, results_16);
-        Store4(dst8, results);
-        dst8 += dst_stride;
-        Store4(dst8, _mm_srli_si128(results, 4));
-        dst8 += dst_stride;
-      }
-
-      srcs[0] = srcs[2];
-      y += 2;
-    } while (y < height);
-  } else if (num_taps == 4) {
-    srcs[4] = _mm_setzero_si128();
-    // 00 01 02 03
-    srcs[0] = Load4(src);
-    src += src_stride;
-    // 10 11 12 13
-    const __m128i a = Load4(src);
-    // 00 01 02 03 10 11 12 13
-    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
-    src += src_stride;
-    // 20 21 22 23
-    srcs[2] = Load4(src);
-    src += src_stride;
-    // 10 11 12 13 20 21 22 23
-    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
-
-    int y = 0;
-    do {
-      // 30 31 32 33
-      const __m128i b = Load4(src);
-      // 20 21 22 23 30 31 32 33
-      srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
-      src += src_stride;
-      // 40 41 42 43
-      srcs[4] = Load4(src);
-      src += src_stride;
-      // 30 31 32 33 40 41 42 43
-      srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
-
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
-      if (is_compound) {
-        const __m128i results = Compound1DShift(sums);
-        StoreUnaligned16(dst16, results);
-        dst16 += 4 << 1;
-      } else {
-        const __m128i results_16 =
-            RightShiftWithRounding_S16(sums, kFilterBits - 1);
-        const __m128i results = _mm_packus_epi16(results_16, results_16);
-        Store4(dst8, results);
-        dst8 += dst_stride;
-        Store4(dst8, _mm_srli_si128(results, 4));
-        dst8 += dst_stride;
-      }
-
-      srcs[0] = srcs[2];
-      srcs[1] = srcs[3];
-      srcs[2] = srcs[4];
-      y += 2;
-    } while (y < height);
-  } else if (num_taps == 6) {
-    srcs[6] = _mm_setzero_si128();
-    // 00 01 02 03
-    srcs[0] = Load4(src);
-    src += src_stride;
-    // 10 11 12 13
-    const __m128i a = Load4(src);
-    // 00 01 02 03 10 11 12 13
-    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
-    src += src_stride;
-    // 20 21 22 23
-    srcs[2] = Load4(src);
-    src += src_stride;
-    // 10 11 12 13 20 21 22 23
-    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
-    // 30 31 32 33
-    const __m128i b = Load4(src);
-    // 20 21 22 23 30 31 32 33
-    srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
-    src += src_stride;
-    // 40 41 42 43
-    srcs[4] = Load4(src);
-    src += src_stride;
-    // 30 31 32 33 40 41 42 43
-    srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
-
-    int y = 0;
-    do {
-      // 50 51 52 53
-      const __m128i c = Load4(src);
-      // 40 41 42 43 50 51 52 53
-      srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
-      src += src_stride;
-      // 60 61 62 63
-      srcs[6] = Load4(src);
-      src += src_stride;
-      // 50 51 52 53 60 61 62 63
-      srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
-
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
-      if (is_compound) {
-        const __m128i results = Compound1DShift(sums);
-        StoreUnaligned16(dst16, results);
-        dst16 += 4 << 1;
-      } else {
-        const __m128i results_16 =
-            RightShiftWithRounding_S16(sums, kFilterBits - 1);
-        const __m128i results = _mm_packus_epi16(results_16, results_16);
-        Store4(dst8, results);
-        dst8 += dst_stride;
-        Store4(dst8, _mm_srli_si128(results, 4));
-        dst8 += dst_stride;
-      }
-
-      srcs[0] = srcs[2];
-      srcs[1] = srcs[3];
-      srcs[2] = srcs[4];
-      srcs[3] = srcs[5];
-      srcs[4] = srcs[6];
-      y += 2;
-    } while (y < height);
-  } else if (num_taps == 8) {
-    srcs[8] = _mm_setzero_si128();
-    // 00 01 02 03
-    srcs[0] = Load4(src);
-    src += src_stride;
-    // 10 11 12 13
-    const __m128i a = Load4(src);
-    // 00 01 02 03 10 11 12 13
-    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
-    src += src_stride;
-    // 20 21 22 23
-    srcs[2] = Load4(src);
-    src += src_stride;
-    // 10 11 12 13 20 21 22 23
-    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
-    // 30 31 32 33
-    const __m128i b = Load4(src);
-    // 20 21 22 23 30 31 32 33
-    srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
-    src += src_stride;
-    // 40 41 42 43
-    srcs[4] = Load4(src);
-    src += src_stride;
-    // 30 31 32 33 40 41 42 43
-    srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
-    // 50 51 52 53
-    const __m128i c = Load4(src);
-    // 40 41 42 43 50 51 52 53
-    srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
-    src += src_stride;
-    // 60 61 62 63
-    srcs[6] = Load4(src);
-    src += src_stride;
-    // 50 51 52 53 60 61 62 63
-    srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
-
-    int y = 0;
-    do {
-      // 70 71 72 73
-      const __m128i d = Load4(src);
-      // 60 61 62 63 70 71 72 73
-      srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
-      src += src_stride;
-      // 80 81 82 83
-      srcs[8] = Load4(src);
-      src += src_stride;
-      // 70 71 72 73 80 81 82 83
-      srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
-
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
-      if (is_compound) {
-        const __m128i results = Compound1DShift(sums);
-        StoreUnaligned16(dst16, results);
-        dst16 += 4 << 1;
-      } else {
-        const __m128i results_16 =
-            RightShiftWithRounding_S16(sums, kFilterBits - 1);
-        const __m128i results = _mm_packus_epi16(results_16, results_16);
-        Store4(dst8, results);
-        dst8 += dst_stride;
-        Store4(dst8, _mm_srli_si128(results, 4));
-        dst8 += dst_stride;
-      }
-
-      srcs[0] = srcs[2];
-      srcs[1] = srcs[3];
-      srcs[2] = srcs[4];
-      srcs[3] = srcs[5];
-      srcs[4] = srcs[6];
-      srcs[5] = srcs[7];
-      srcs[6] = srcs[8];
-      y += 2;
-    } while (y < height);
-  }
-}
-
-template <int filter_index, bool negative_outside_taps = false>
-void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
-                       void* const dst, const ptrdiff_t dst_stride,
-                       const int height, const __m128i* const v_tap) {
-  const int num_taps = GetNumTapsInFilter(filter_index);
-  auto* dst8 = static_cast<uint8_t*>(dst);
-
-  __m128i srcs[9];
-
-  if (num_taps == 2) {
-    srcs[2] = _mm_setzero_si128();
-    // 00 01
-    srcs[0] = Load2(src);
-    src += src_stride;
-
-    int y = 0;
-    do {
-      // 00 01 10 11
-      srcs[0] = Load2<1>(src, srcs[0]);
-      src += src_stride;
-      // 00 01 10 11 20 21
-      srcs[0] = Load2<2>(src, srcs[0]);
-      src += src_stride;
-      // 00 01 10 11 20 21 30 31
-      srcs[0] = Load2<3>(src, srcs[0]);
-      src += src_stride;
-      // 40 41
-      srcs[2] = Load2<0>(src, srcs[2]);
-      src += src_stride;
-      // 00 01 10 11 20 21 30 31 40 41
-      const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
-      // 10 11 20 21 30 31 40 41
-      srcs[1] = _mm_srli_si128(srcs_0_2, 2);
-      // This uses srcs[0]..srcs[1].
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
-      const __m128i results_16 =
-          RightShiftWithRounding_S16(sums, kFilterBits - 1);
-      const __m128i results = _mm_packus_epi16(results_16, results_16);
-
-      Store2(dst8, results);
-      dst8 += dst_stride;
-      Store2(dst8, _mm_srli_si128(results, 2));
-      if (height == 2) return;
-      dst8 += dst_stride;
-      Store2(dst8, _mm_srli_si128(results, 4));
-      dst8 += dst_stride;
-      Store2(dst8, _mm_srli_si128(results, 6));
-      dst8 += dst_stride;
-
-      srcs[0] = srcs[2];
-      y += 4;
-    } while (y < height);
-  } else if (num_taps == 4) {
-    srcs[4] = _mm_setzero_si128();
-
-    // 00 01
-    srcs[0] = Load2(src);
-    src += src_stride;
-    // 00 01 10 11
-    srcs[0] = Load2<1>(src, srcs[0]);
-    src += src_stride;
-    // 00 01 10 11 20 21
-    srcs[0] = Load2<2>(src, srcs[0]);
-    src += src_stride;
-
-    int y = 0;
-    do {
-      // 00 01 10 11 20 21 30 31
-      srcs[0] = Load2<3>(src, srcs[0]);
-      src += src_stride;
-      // 40 41
-      srcs[4] = Load2<0>(src, srcs[4]);
-      src += src_stride;
-      // 40 41 50 51
-      srcs[4] = Load2<1>(src, srcs[4]);
-      src += src_stride;
-      // 40 41 50 51 60 61
-      srcs[4] = Load2<2>(src, srcs[4]);
-      src += src_stride;
-      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
-      const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
-      // 10 11 20 21 30 31 40 41
-      srcs[1] = _mm_srli_si128(srcs_0_4, 2);
-      // 20 21 30 31 40 41 50 51
-      srcs[2] = _mm_srli_si128(srcs_0_4, 4);
-      // 30 31 40 41 50 51 60 61
-      srcs[3] = _mm_srli_si128(srcs_0_4, 6);
-
-      // This uses srcs[0]..srcs[3].
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
-      const __m128i results_16 =
-          RightShiftWithRounding_S16(sums, kFilterBits - 1);
-      const __m128i results = _mm_packus_epi16(results_16, results_16);
-
-      Store2(dst8, results);
-      dst8 += dst_stride;
-      Store2(dst8, _mm_srli_si128(results, 2));
-      if (height == 2) return;
-      dst8 += dst_stride;
-      Store2(dst8, _mm_srli_si128(results, 4));
-      dst8 += dst_stride;
-      Store2(dst8, _mm_srli_si128(results, 6));
-      dst8 += dst_stride;
-
-      srcs[0] = srcs[4];
-      y += 4;
-    } while (y < height);
-  } else if (num_taps == 6) {
-    // During the vertical pass the number of taps is restricted when
-    // |height| <= 4.
-    assert(height > 4);
-    srcs[8] = _mm_setzero_si128();
-
-    // 00 01
-    srcs[0] = Load2(src);
-    src += src_stride;
-    // 00 01 10 11
-    srcs[0] = Load2<1>(src, srcs[0]);
-    src += src_stride;
-    // 00 01 10 11 20 21
-    srcs[0] = Load2<2>(src, srcs[0]);
-    src += src_stride;
-    // 00 01 10 11 20 21 30 31
-    srcs[0] = Load2<3>(src, srcs[0]);
-    src += src_stride;
-    // 40 41
-    srcs[4] = Load2(src);
-    src += src_stride;
-    // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
-    const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
-    // 10 11 20 21 30 31 40 41
-    srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
-
-    int y = 0;
-    do {
-      // 40 41 50 51
-      srcs[4] = Load2<1>(src, srcs[4]);
-      src += src_stride;
-      // 40 41 50 51 60 61
-      srcs[4] = Load2<2>(src, srcs[4]);
-      src += src_stride;
-      // 40 41 50 51 60 61 70 71
-      srcs[4] = Load2<3>(src, srcs[4]);
-      src += src_stride;
-      // 80 81
-      srcs[8] = Load2<0>(src, srcs[8]);
-      src += src_stride;
-      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
-      const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
-      // 20 21 30 31 40 41 50 51
-      srcs[2] = _mm_srli_si128(srcs_0_4, 4);
-      // 30 31 40 41 50 51 60 61
-      srcs[3] = _mm_srli_si128(srcs_0_4, 6);
-      const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
-      // 50 51 60 61 70 71 80 81
-      srcs[5] = _mm_srli_si128(srcs_4_8, 2);
-
-      // This uses srcs[0]..srcs[5].
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
-      const __m128i results_16 =
-          RightShiftWithRounding_S16(sums, kFilterBits - 1);
-      const __m128i results = _mm_packus_epi16(results_16, results_16);
-
-      Store2(dst8, results);
-      dst8 += dst_stride;
-      Store2(dst8, _mm_srli_si128(results, 2));
-      dst8 += dst_stride;
-      Store2(dst8, _mm_srli_si128(results, 4));
-      dst8 += dst_stride;
-      Store2(dst8, _mm_srli_si128(results, 6));
-      dst8 += dst_stride;
-
-      srcs[0] = srcs[4];
-      srcs[1] = srcs[5];
-      srcs[4] = srcs[8];
-      y += 4;
-    } while (y < height);
-  } else if (num_taps == 8) {
-    // During the vertical pass the number of taps is restricted when
-    // |height| <= 4.
-    assert(height > 4);
-    srcs[8] = _mm_setzero_si128();
-    // 00 01
-    srcs[0] = Load2(src);
-    src += src_stride;
-    // 00 01 10 11
-    srcs[0] = Load2<1>(src, srcs[0]);
-    src += src_stride;
-    // 00 01 10 11 20 21
-    srcs[0] = Load2<2>(src, srcs[0]);
-    src += src_stride;
-    // 00 01 10 11 20 21 30 31
-    srcs[0] = Load2<3>(src, srcs[0]);
-    src += src_stride;
-    // 40 41
-    srcs[4] = Load2(src);
-    src += src_stride;
-    // 40 41 50 51
-    srcs[4] = Load2<1>(src, srcs[4]);
-    src += src_stride;
-    // 40 41 50 51 60 61
-    srcs[4] = Load2<2>(src, srcs[4]);
-    src += src_stride;
-
-    // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
-    const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
-    // 10 11 20 21 30 31 40 41
-    srcs[1] = _mm_srli_si128(srcs_0_4, 2);
-    // 20 21 30 31 40 41 50 51
-    srcs[2] = _mm_srli_si128(srcs_0_4, 4);
-    // 30 31 40 41 50 51 60 61
-    srcs[3] = _mm_srli_si128(srcs_0_4, 6);
-
-    int y = 0;
-    do {
-      // 40 41 50 51 60 61 70 71
-      srcs[4] = Load2<3>(src, srcs[4]);
-      src += src_stride;
-      // 80 81
-      srcs[8] = Load2<0>(src, srcs[8]);
-      src += src_stride;
-      // 80 81 90 91
-      srcs[8] = Load2<1>(src, srcs[8]);
-      src += src_stride;
-      // 80 81 90 91 a0 a1
-      srcs[8] = Load2<2>(src, srcs[8]);
-      src += src_stride;
-
-      // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
-      const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
-      // 50 51 60 61 70 71 80 81
-      srcs[5] = _mm_srli_si128(srcs_4_8, 2);
-      // 60 61 70 71 80 81 90 91
-      srcs[6] = _mm_srli_si128(srcs_4_8, 4);
-      // 70 71 80 81 90 91 a0 a1
-      srcs[7] = _mm_srli_si128(srcs_4_8, 6);
-
-      // This uses srcs[0]..srcs[7].
-      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
-      const __m128i results_16 =
-          RightShiftWithRounding_S16(sums, kFilterBits - 1);
-      const __m128i results = _mm_packus_epi16(results_16, results_16);
-
-      Store2(dst8, results);
-      dst8 += dst_stride;
-      Store2(dst8, _mm_srli_si128(results, 2));
-      dst8 += dst_stride;
-      Store2(dst8, _mm_srli_si128(results, 4));
-      dst8 += dst_stride;
-      Store2(dst8, _mm_srli_si128(results, 6));
-      dst8 += dst_stride;
-
-      srcs[0] = srcs[4];
-      srcs[1] = srcs[5];
-      srcs[2] = srcs[6];
-      srcs[3] = srcs[7];
-      srcs[4] = srcs[8];
-      y += 4;
-    } while (y < height);
-  }
-}
-
 void ConvolveVertical_SSE4_1(const void* const reference,
                              const ptrdiff_t reference_stride,
                              const int /*horizontal_filter_index*/,
                              const int vertical_filter_index,
-                             const int /*subpixel_x*/, const int subpixel_y,
-                             const int width, const int height,
-                             void* prediction, const ptrdiff_t pred_stride) {
+                             const int /*horizontal_filter_id*/,
+                             const int vertical_filter_id, const int width,
+                             const int height, void* prediction,
+                             const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
@@ -1361,19 +415,18 @@
                     (vertical_taps / 2 - 1) * src_stride;
   auto* dest = static_cast<uint8_t*>(prediction);
   const ptrdiff_t dest_stride = pred_stride;
-  const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
-  assert(filter_id != 0);
+  assert(vertical_filter_id != 0);
 
   __m128i taps[4];
   const __m128i v_filter =
-      LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+      LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
 
   if (filter_index < 2) {  // 6 tap.
     SetupTaps<6>(&v_filter, taps);
     if (width == 2) {
-      FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
-      FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
     } else {
       FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
                         taps);
@@ -1381,9 +434,9 @@
   } else if (filter_index == 2) {  // 8 tap.
     SetupTaps<8>(&v_filter, taps);
     if (width == 2) {
-      FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
-      FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
     } else {
       FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
                         taps);
@@ -1391,9 +444,9 @@
   } else if (filter_index == 3) {  // 2 tap.
     SetupTaps<2>(&v_filter, taps);
     if (width == 2) {
-      FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
-      FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
     } else {
       FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
                         taps);
@@ -1401,9 +454,9 @@
   } else if (filter_index == 4) {  // 4 tap.
     SetupTaps<4>(&v_filter, taps);
     if (width == 2) {
-      FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
-      FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
     } else {
       FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
                         taps);
@@ -1414,9 +467,9 @@
     SetupTaps<4>(&v_filter, taps);
 
     if (width == 2) {
-      FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
     } else if (width == 4) {
-      FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps);
+      FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
     } else {
       FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
                         taps);
@@ -1424,11 +477,14 @@
   }
 }
 
-void ConvolveCompoundCopy_SSE4(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
-    const int height, void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveCompoundCopy_SSE4(const void* const reference,
+                               const ptrdiff_t reference_stride,
+                               const int /*horizontal_filter_index*/,
+                               const int /*vertical_filter_index*/,
+                               const int /*horizontal_filter_id*/,
+                               const int /*vertical_filter_id*/,
+                               const int width, const int height,
+                               void* prediction, const ptrdiff_t pred_stride) {
   const auto* src = static_cast<const uint8_t*>(reference);
   const ptrdiff_t src_stride = reference_stride;
   auto* dest = static_cast<uint16_t*>(prediction);
@@ -1485,26 +541,26 @@
 void ConvolveCompoundVertical_SSE4_1(
     const void* const reference, const ptrdiff_t reference_stride,
     const int /*horizontal_filter_index*/, const int vertical_filter_index,
-    const int /*subpixel_x*/, const int subpixel_y, const int width,
-    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+    const int /*horizontal_filter_id*/, const int vertical_filter_id,
+    const int width, const int height, void* prediction,
+    const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
   const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
   const auto* src = static_cast<const uint8_t*>(reference) -
                     (vertical_taps / 2 - 1) * src_stride;
   auto* dest = static_cast<uint16_t*>(prediction);
-  const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
-  assert(filter_id != 0);
+  assert(vertical_filter_id != 0);
 
   __m128i taps[4];
   const __m128i v_filter =
-      LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+      LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
 
   if (filter_index < 2) {  // 6 tap.
     SetupTaps<6>(&v_filter, taps);
     if (width == 4) {
-      FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                 height, taps);
+      FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                    height, taps);
     } else {
       FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps);
@@ -1513,8 +569,8 @@
     SetupTaps<8>(&v_filter, taps);
 
     if (width == 4) {
-      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                 height, taps);
+      FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                    height, taps);
     } else {
       FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps);
@@ -1523,8 +579,8 @@
     SetupTaps<2>(&v_filter, taps);
 
     if (width == 4) {
-      FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                 height, taps);
+      FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                    height, taps);
     } else {
       FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps);
@@ -1533,8 +589,8 @@
     SetupTaps<4>(&v_filter, taps);
 
     if (width == 4) {
-      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                 height, taps);
+      FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                    height, taps);
     } else {
       FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps);
@@ -1543,8 +599,8 @@
     SetupTaps<4>(&v_filter, taps);
 
     if (width == 4) {
-      FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
-                                                 height, taps);
+      FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                    height, taps);
     } else {
       FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
                                               width, height, taps);
@@ -1556,7 +612,8 @@
                                const ptrdiff_t reference_stride,
                                const int horizontal_filter_index,
                                const int /*vertical_filter_index*/,
-                               const int subpixel_x, const int /*subpixel_y*/,
+                               const int horizontal_filter_id,
+                               const int /*vertical_filter_id*/,
                                const int width, const int height,
                                void* prediction, const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
@@ -1565,28 +622,32 @@
   auto* dest = static_cast<uint8_t*>(prediction);
 
   DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
-                   subpixel_x, filter_index);
+                   horizontal_filter_id, filter_index);
 }
 
 void ConvolveCompoundHorizontal_SSE4_1(
     const void* const reference, const ptrdiff_t reference_stride,
     const int horizontal_filter_index, const int /*vertical_filter_index*/,
-    const int subpixel_x, const int /*subpixel_y*/, const int width,
-    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+    const int horizontal_filter_id, const int /*vertical_filter_id*/,
+    const int width, const int height, void* prediction,
+    const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
   auto* dest = static_cast<uint16_t*>(prediction);
 
   DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
-      src, reference_stride, dest, width, width, height, subpixel_x,
+      src, reference_stride, dest, width, width, height, horizontal_filter_id,
       filter_index);
 }
 
-void ConvolveCompound2D_SSE4_1(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int horizontal_filter_index, const int vertical_filter_index,
-    const int subpixel_x, const int subpixel_y, const int width,
-    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+void ConvolveCompound2D_SSE4_1(const void* const reference,
+                               const ptrdiff_t reference_stride,
+                               const int horizontal_filter_index,
+                               const int vertical_filter_index,
+                               const int horizontal_filter_id,
+                               const int vertical_filter_id, const int width,
+                               const int height, void* prediction,
+                               const ptrdiff_t /*pred_stride*/) {
   // The output of the horizontal filter, i.e. the intermediate_result, is
   // guaranteed to fit in int16_t.
   alignas(16) uint16_t
@@ -1609,17 +670,16 @@
 
   DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
       src, src_stride, intermediate_result, width, width, intermediate_height,
-      subpixel_x, horiz_filter_index);
+      horizontal_filter_id, horiz_filter_index);
 
   // Vertical filter.
   auto* dest = static_cast<uint16_t*>(prediction);
-  const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
-  assert(filter_id != 0);
+  assert(vertical_filter_id != 0);
 
   const ptrdiff_t dest_stride = width;
   __m128i taps[4];
   const __m128i v_filter =
-      LoadLo8(kHalfSubPixelFilters[vert_filter_index][filter_id]);
+      LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
 
   if (vertical_taps == 8) {
     SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
@@ -1777,7 +837,11 @@
 template <int num_taps, int grade_x>
 inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices,
                                  __m128i* const source /*[num_taps >> 1]*/) {
-  const __m128i src_vals = LoadUnaligned16(src);
+  // |used_bytes| is only computed in msan builds. Mask away unused bytes for
+  // msan because it incorrectly models the outcome of the shuffles in some
+  // cases. This has not been reproduced out of context.
+  const int used_bytes = _mm_extract_epi8(src_indices, 15) + 1 + num_taps - 2;
+  const __m128i src_vals = LoadUnaligned16Msan(src, 16 - used_bytes);
   source[0] = _mm_shuffle_epi8(src_vals, src_indices);
   if (grade_x == 1) {
     if (num_taps > 2) {
@@ -1793,7 +857,7 @@
     assert(grade_x > 1);
     assert(num_taps != 4);
     // grade_x > 1 also means width >= 8 && num_taps != 4
-    const __m128i src_vals_ext = LoadLo8(src + 16);
+    const __m128i src_vals_ext = LoadLo8Msan(src + 16, 24 - used_bytes);
     if (num_taps > 2) {
       source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2),
                                    src_indices);
@@ -2008,14 +1072,10 @@
 // |width_class| is 2, 4, or 8, according to the Store function that should be
 // used.
 template <int num_taps, int width_class, bool is_compound>
-#if LIBGAV1_MSAN
-__attribute__((no_sanitize_memory)) void ConvolveVerticalScale(
-#else
-inline void ConvolveVerticalScale(
-#endif
-    const int16_t* src, const int width, const int subpixel_y,
-    const int filter_index, const int step_y, const int height, void* dest,
-    const ptrdiff_t dest_stride) {
+inline void ConvolveVerticalScale(const int16_t* src, const int width,
+                                  const int subpixel_y, const int filter_index,
+                                  const int step_y, const int height,
+                                  void* dest, const ptrdiff_t dest_stride) {
   constexpr ptrdiff_t src_stride = kIntermediateStride;
   constexpr int kernel_offset = (8 - num_taps) / 2;
   const int16_t* src_y = src;
@@ -2282,6 +1342,540 @@
   }
 }
 
+inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+  const __m128i left = LoadUnaligned16(src);
+  const __m128i right = LoadUnaligned16(src + 1);
+  StoreUnaligned16(dst, _mm_avg_epu8(left, right));
+}
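
For reference, _mm_avg_epu8 computes the per-byte rounded average (a + b + 1) >> 1, which is exactly the 1D intra block copy filter. A minimal scalar sketch of what HalfAddHorizontal produces for each output byte (hypothetical helper, not part of the patch):

    #include <cstddef>
    #include <cstdint>

    // Average each pixel with its right neighbour, rounding up on ties,
    // exactly as _mm_avg_epu8 does per byte.
    inline void HalfAddHorizontalScalar(const uint8_t* src, uint8_t* dst,
                                        std::size_t width) {
      for (std::size_t x = 0; x < width; ++x) {
        dst[x] = static_cast<uint8_t>((src[x] + src[x + 1] + 1) >> 1);
      }
    }
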
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* src,
+                                     const ptrdiff_t src_stride,
+                                     const int height, uint8_t* dst,
+                                     const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+
+  int y = height;
+  do {
+    HalfAddHorizontal(src, dst);
+    if (width >= 32) {
+      src += 16;
+      dst += 16;
+      HalfAddHorizontal(src, dst);
+      if (width >= 64) {
+        src += 16;
+        dst += 16;
+        HalfAddHorizontal(src, dst);
+        src += 16;
+        dst += 16;
+        HalfAddHorizontal(src, dst);
+        if (width == 128) {
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+        }
+      }
+    }
+    src += src_remainder_stride;
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyHorizontal_SSE4_1(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width == 128) {
+    IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+                                  pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 8) {
+    int y = height;
+    do {
+      const __m128i left = LoadLo8(src);
+      const __m128i right = LoadLo8(src + 1);
+      StoreLo8(dest, _mm_avg_epu8(left, right));
+
+      src += reference_stride;
+      dest += pred_stride;
+    } while (--y != 0);
+  } else if (width == 4) {
+    int y = height;
+    do {
+      __m128i left = Load4(src);
+      __m128i right = Load4(src + 1);
+      src += reference_stride;
+      left = _mm_unpacklo_epi32(left, Load4(src));
+      right = _mm_unpacklo_epi32(right, Load4(src + 1));
+      src += reference_stride;
+
+      const __m128i result = _mm_avg_epu8(left, right);
+
+      Store4(dest, result);
+      dest += pred_stride;
+      Store4(dest, _mm_srli_si128(result, 4));
+      dest += pred_stride;
+      y -= 2;
+    } while (y != 0);
+  } else {
+    assert(width == 2);
+    __m128i left = _mm_setzero_si128();
+    __m128i right = _mm_setzero_si128();
+    int y = height;
+    do {
+      left = Load2<0>(src, left);
+      right = Load2<0>(src + 1, right);
+      src += reference_stride;
+      left = Load2<1>(src, left);
+      right = Load2<1>(src + 1, right);
+      src += reference_stride;
+
+      const __m128i result = _mm_avg_epu8(left, right);
+
+      Store2(dest, result);
+      dest += pred_stride;
+      Store2(dest, _mm_srli_si128(result, 2));
+      dest += pred_stride;
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* src,
+                                   const ptrdiff_t src_stride, const int height,
+                                   uint8_t* dst, const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+  __m128i row[8], below[8];
+
+  row[0] = LoadUnaligned16(src);
+  if (width >= 32) {
+    src += 16;
+    row[1] = LoadUnaligned16(src);
+    if (width >= 64) {
+      src += 16;
+      row[2] = LoadUnaligned16(src);
+      src += 16;
+      row[3] = LoadUnaligned16(src);
+      if (width == 128) {
+        src += 16;
+        row[4] = LoadUnaligned16(src);
+        src += 16;
+        row[5] = LoadUnaligned16(src);
+        src += 16;
+        row[6] = LoadUnaligned16(src);
+        src += 16;
+        row[7] = LoadUnaligned16(src);
+      }
+    }
+  }
+  src += src_remainder_stride;
+
+  int y = height;
+  do {
+    below[0] = LoadUnaligned16(src);
+    if (width >= 32) {
+      src += 16;
+      below[1] = LoadUnaligned16(src);
+      if (width >= 64) {
+        src += 16;
+        below[2] = LoadUnaligned16(src);
+        src += 16;
+        below[3] = LoadUnaligned16(src);
+        if (width == 128) {
+          src += 16;
+          below[4] = LoadUnaligned16(src);
+          src += 16;
+          below[5] = LoadUnaligned16(src);
+          src += 16;
+          below[6] = LoadUnaligned16(src);
+          src += 16;
+          below[7] = LoadUnaligned16(src);
+        }
+      }
+    }
+    src += src_remainder_stride;
+
+    StoreUnaligned16(dst, _mm_avg_epu8(row[0], below[0]));
+    row[0] = below[0];
+    if (width >= 32) {
+      dst += 16;
+      StoreUnaligned16(dst, _mm_avg_epu8(row[1], below[1]));
+      row[1] = below[1];
+      if (width >= 64) {
+        dst += 16;
+        StoreUnaligned16(dst, _mm_avg_epu8(row[2], below[2]));
+        row[2] = below[2];
+        dst += 16;
+        StoreUnaligned16(dst, _mm_avg_epu8(row[3], below[3]));
+        row[3] = below[3];
+        if (width >= 128) {
+          dst += 16;
+          StoreUnaligned16(dst, _mm_avg_epu8(row[4], below[4]));
+          row[4] = below[4];
+          dst += 16;
+          StoreUnaligned16(dst, _mm_avg_epu8(row[5], below[5]));
+          row[5] = below[5];
+          dst += 16;
+          StoreUnaligned16(dst, _mm_avg_epu8(row[6], below[6]));
+          row[6] = below[6];
+          dst += 16;
+          StoreUnaligned16(dst, _mm_avg_epu8(row[7], below[7]));
+          row[7] = below[7];
+        }
+      }
+    }
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyVertical_SSE4_1(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width == 128) {
+    IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+                                pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 8) {
+    __m128i row, below;
+    row = LoadLo8(src);
+    src += reference_stride;
+
+    int y = height;
+    do {
+      below = LoadLo8(src);
+      src += reference_stride;
+
+      StoreLo8(dest, _mm_avg_epu8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (--y != 0);
+  } else if (width == 4) {
+    __m128i row = Load4(src);
+    src += reference_stride;
+
+    int y = height;
+    do {
+      __m128i below = Load4(src);
+      src += reference_stride;
+
+      Store4(dest, _mm_avg_epu8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (--y != 0);
+  } else {
+    assert(width == 2);
+    __m128i row = Load2(src);
+    __m128i below = _mm_setzero_si128();
+    src += reference_stride;
+
+    int y = height;
+    do {
+      below = Load2<0>(src, below);
+      src += reference_stride;
+
+      Store2(dest, _mm_avg_epu8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (--y != 0);
+  }
+}
+
+// Load then add two uint8_t vectors. Return the uint16_t vector result.
+inline __m128i LoadU8AndAddLong(const uint8_t* src, const uint8_t* src1) {
+  const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src));
+  const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1));
+  return _mm_add_epi16(a, b);
+}
+
+inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) {
+  const __m128i a = _mm_add_epi16(v0, v1);
+  const __m128i b = _mm_srli_epi16(a, 1);
+  // Use avg against zero to shift right by 1 with rounding.
+  const __m128i c = _mm_avg_epu16(b, _mm_setzero_si128());
+  return _mm_packus_epi16(c, c);
+}
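
These two helpers implement the 2D intra block copy average (tl + tr + bl + br + 2) >> 2 without widening past 16 bits: the two row sums are added, shifted right once, and _mm_avg_epu16 against zero supplies the second shift with rounding. A scalar model of one output pixel (hypothetical helper name; assumes 8-bit input):

    #include <cstdint>

    // ((sum >> 1) + 1) >> 1 equals (sum + 2) >> 2 for every non-negative sum,
    // which is why the avg-against-zero trick matches the exact filter.
    inline uint8_t IntraBlockCopy2DPixelScalar(uint8_t tl, uint8_t tr,
                                               uint8_t bl, uint8_t br) {
      const uint16_t row = static_cast<uint16_t>(tl + tr);    // LoadU8AndAddLong
      const uint16_t below = static_cast<uint16_t>(bl + br);  // LoadU8AndAddLong
      const uint16_t sum = static_cast<uint16_t>(row + below);
      return static_cast<uint8_t>(((sum >> 1) + 1) >> 1);  // AddU16RightShift2AndPack
    }
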
+
+template <int width>
+inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
+                             const int height, uint8_t* dst,
+                             const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
+  __m128i row[16];
+  row[0] = LoadU8AndAddLong(src, src + 1);
+  if (width >= 16) {
+    src += 8;
+    row[1] = LoadU8AndAddLong(src, src + 1);
+    if (width >= 32) {
+      src += 8;
+      row[2] = LoadU8AndAddLong(src, src + 1);
+      src += 8;
+      row[3] = LoadU8AndAddLong(src, src + 1);
+      if (width >= 64) {
+        src += 8;
+        row[4] = LoadU8AndAddLong(src, src + 1);
+        src += 8;
+        row[5] = LoadU8AndAddLong(src, src + 1);
+        src += 8;
+        row[6] = LoadU8AndAddLong(src, src + 1);
+        src += 8;
+        row[7] = LoadU8AndAddLong(src, src + 1);
+        if (width == 128) {
+          src += 8;
+          row[8] = LoadU8AndAddLong(src, src + 1);
+          src += 8;
+          row[9] = LoadU8AndAddLong(src, src + 1);
+          src += 8;
+          row[10] = LoadU8AndAddLong(src, src + 1);
+          src += 8;
+          row[11] = LoadU8AndAddLong(src, src + 1);
+          src += 8;
+          row[12] = LoadU8AndAddLong(src, src + 1);
+          src += 8;
+          row[13] = LoadU8AndAddLong(src, src + 1);
+          src += 8;
+          row[14] = LoadU8AndAddLong(src, src + 1);
+          src += 8;
+          row[15] = LoadU8AndAddLong(src, src + 1);
+        }
+      }
+    }
+  }
+  src += src_remainder_stride;
+
+  int y = height;
+  do {
+    const __m128i below_0 = LoadU8AndAddLong(src, src + 1);
+    StoreLo8(dst, AddU16RightShift2AndPack(row[0], below_0));
+    row[0] = below_0;
+    if (width >= 16) {
+      src += 8;
+      dst += 8;
+
+      const __m128i below_1 = LoadU8AndAddLong(src, src + 1);
+      StoreLo8(dst, AddU16RightShift2AndPack(row[1], below_1));
+      row[1] = below_1;
+      if (width >= 32) {
+        src += 8;
+        dst += 8;
+
+        const __m128i below_2 = LoadU8AndAddLong(src, src + 1);
+        StoreLo8(dst, AddU16RightShift2AndPack(row[2], below_2));
+        row[2] = below_2;
+        src += 8;
+        dst += 8;
+
+        const __m128i below_3 = LoadU8AndAddLong(src, src + 1);
+        StoreLo8(dst, AddU16RightShift2AndPack(row[3], below_3));
+        row[3] = below_3;
+        if (width >= 64) {
+          src += 8;
+          dst += 8;
+
+          const __m128i below_4 = LoadU8AndAddLong(src, src + 1);
+          StoreLo8(dst, AddU16RightShift2AndPack(row[4], below_4));
+          row[4] = below_4;
+          src += 8;
+          dst += 8;
+
+          const __m128i below_5 = LoadU8AndAddLong(src, src + 1);
+          StoreLo8(dst, AddU16RightShift2AndPack(row[5], below_5));
+          row[5] = below_5;
+          src += 8;
+          dst += 8;
+
+          const __m128i below_6 = LoadU8AndAddLong(src, src + 1);
+          StoreLo8(dst, AddU16RightShift2AndPack(row[6], below_6));
+          row[6] = below_6;
+          src += 8;
+          dst += 8;
+
+          const __m128i below_7 = LoadU8AndAddLong(src, src + 1);
+          StoreLo8(dst, AddU16RightShift2AndPack(row[7], below_7));
+          row[7] = below_7;
+          if (width == 128) {
+            src += 8;
+            dst += 8;
+
+            const __m128i below_8 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[8], below_8));
+            row[8] = below_8;
+            src += 8;
+            dst += 8;
+
+            const __m128i below_9 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[9], below_9));
+            row[9] = below_9;
+            src += 8;
+            dst += 8;
+
+            const __m128i below_10 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[10], below_10));
+            row[10] = below_10;
+            src += 8;
+            dst += 8;
+
+            const __m128i below_11 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[11], below_11));
+            row[11] = below_11;
+            src += 8;
+            dst += 8;
+
+            const __m128i below_12 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[12], below_12));
+            row[12] = below_12;
+            src += 8;
+            dst += 8;
+
+            const __m128i below_13 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[13], below_13));
+            row[13] = below_13;
+            src += 8;
+            dst += 8;
+
+            const __m128i below_14 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[14], below_14));
+            row[14] = below_14;
+            src += 8;
+            dst += 8;
+
+            const __m128i below_15 = LoadU8AndAddLong(src, src + 1);
+            StoreLo8(dst, AddU16RightShift2AndPack(row[15], below_15));
+            row[15] = below_15;
+          }
+        }
+      }
+    }
+    src += src_remainder_stride;
+    dst += dst_remainder_stride;
+  } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopy2D_SSE4_1(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+  // Note: vertical access of up to |height| + 1 rows is allowed. Because this
+  // function is only used for the u/v planes of intra block copy, such access
+  // is guaranteed to be within the prediction block.
+
+  if (width == 128) {
+    IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 8) {
+    IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 4) {
+    __m128i left = _mm_cvtepu8_epi16(Load4(src));
+    __m128i right = _mm_cvtepu8_epi16(Load4(src + 1));
+    src += reference_stride;
+
+    __m128i row = _mm_add_epi16(left, right);
+
+    int y = height;
+    do {
+      left = Load4(src);
+      right = Load4(src + 1);
+      src += reference_stride;
+      left = _mm_unpacklo_epi32(left, Load4(src));
+      right = _mm_unpacklo_epi32(right, Load4(src + 1));
+      src += reference_stride;
+
+      const __m128i below =
+          _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+      const __m128i result =
+          AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
+
+      Store4(dest, result);
+      dest += pred_stride;
+      Store4(dest, _mm_srli_si128(result, 4));
+      dest += pred_stride;
+
+      row = _mm_srli_si128(below, 8);
+      y -= 2;
+    } while (y != 0);
+  } else {
+    __m128i left = Load2(src);
+    __m128i right = Load2(src + 1);
+    src += reference_stride;
+
+    __m128i row =
+        _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+
+    int y = height;
+    do {
+      left = Load2<0>(src, left);
+      right = Load2<0>(src + 1, right);
+      src += reference_stride;
+      left = Load2<2>(src, left);
+      right = Load2<2>(src + 1, right);
+      src += reference_stride;
+
+      const __m128i below =
+          _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+      const __m128i result =
+          AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
+
+      Store2(dest, result);
+      dest += pred_stride;
+      Store2(dest, _mm_srli_si128(result, 4));
+      dest += pred_stride;
+
+      row = _mm_srli_si128(below, 8);
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);
@@ -2294,6 +1888,10 @@
   dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
   dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
 
+  dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_SSE4_1;
+  dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_SSE4_1;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_SSE4_1;
+
   dsp->convolve_scale[0] = ConvolveScale2D_SSE4_1<false>;
   dsp->convolve_scale[1] = ConvolveScale2D_SSE4_1<true>;
 }
@@ -2306,7 +1904,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 namespace libgav1 {
 namespace dsp {
 
@@ -2314,4 +1912,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/convolve_sse4.h b/libgav1/src/dsp/x86/convolve_sse4.h
index e449a87..d6c3155 100644
--- a/libgav1/src/dsp/x86/convolve_sse4.h
+++ b/libgav1/src/dsp/x86/convolve_sse4.h
@@ -32,7 +32,7 @@
 
 // If sse4 is enabled and the baseline isn't set due to a higher level of
 // optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
 #define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_SSE4_1
@@ -70,6 +70,6 @@
 #define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_SSE4_1
 #endif
 
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
diff --git a/libgav1/src/dsp/x86/convolve_sse4.inc b/libgav1/src/dsp/x86/convolve_sse4.inc
new file mode 100644
index 0000000..550d6a4
--- /dev/null
+++ b/libgav1/src/dsp/x86/convolve_sse4.inc
@@ -0,0 +1,934 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Common 128-bit functions used by the sse4/avx2 convolve implementations.
+// This file is included inside an anonymous namespace in the files where
+// these functions are needed.
+
+#include "src/dsp/convolve.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from overflowing int16_t.
+template <int filter_index>
+__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
+  __m128i sum;
+  if (filter_index < 2) {
+    // 6 taps.
+    const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]);  // k2k1
+    const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]);  // k4k3
+    const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]);  // k6k5
+    sum = _mm_add_epi16(v_madd_21, v_madd_43);
+    sum = _mm_add_epi16(sum, v_madd_65);
+  } else if (filter_index == 2) {
+    // 8 taps.
+    const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]);  // k1k0
+    const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]);  // k3k2
+    const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]);  // k5k4
+    const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]);  // k7k6
+    const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
+    const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
+    sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
+  } else if (filter_index == 3) {
+    // 2 taps.
+    sum = _mm_maddubs_epi16(src[0], taps[0]);  // k4k3
+  } else {
+    // 4 taps.
+    const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]);  // k3k2
+    const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]);  // k5k4
+    sum = _mm_add_epi16(v_madd_32, v_madd_54);
+  }
+  return sum;
+}
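
A rough headroom sketch, assuming the AV1 convention that the interpolation filters are normalized to 1 << kFilterBits == 128: halving the taps keeps every tap inside int8_t for _mm_maddubs_epi16 and bounds the one-pass sum well inside int16_t, which is why the callers later shift by kFilterBits - 1 rather than kFilterBits.

    #include <cstdint>

    // With 8-bit pixels (max 255) and halved taps summing to 64, the
    // worst-case non-negative sum is 255 * 64 = 16320; even allowing for the
    // negative taps the magnitude stays comfortably below INT16_MAX.
    static_assert(255 * 64 <= INT16_MAX, "halved-tap sums fit in int16_t");
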
+
+template <int filter_index>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                             const __m128i* const v_tap) {
+  // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+  const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
+
+  if (filter_index == 3) {
+    // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
+    const __m128i v_src_43 = _mm_shuffle_epi8(
+        v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
+    const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]);  // k4k3
+    return v_sum_43;
+  }
+
+  // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
+  const __m128i v_src_32 = _mm_shuffle_epi8(
+      v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302));
+  // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx
+  const __m128i v_src_54 = _mm_shuffle_epi8(
+      v_src, _mm_set_epi32(static_cast<int>(0x800f0f0e), 0x0e0d0d0c,
+                           static_cast<int>(0x80070706), 0x06050504));
+  const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]);  // k3k2
+  const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]);  // k5k4
+  const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+  return v_sum_5432;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                                const __m128i* const v_tap) {
+  __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+  sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+  return _mm_packus_epi16(sum, sum);
+}
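
The constant added above follows from the fact that two floor shifts compose exactly when the inner rounding offset is carried through: ((x + A) >> s1 + B) >> s2 == (x + A + (B << s1)) >> (s1 + s2). With s1 = kInterRoundBitsHorizontal - 1 and s2 = kFilterBits - kInterRoundBitsHorizontal, the combined offset is (1 << (kInterRoundBitsHorizontal - 2)), added explicitly, plus (1 << (kFilterBits - 2)), added by RightShiftWithRounding_S16. A small sketch checking the identity for non-negative inputs (hypothetical helper, not part of the patch):

    // Returns true when the two-pass and combined one-pass rounded shifts agree.
    inline bool RoundedShiftsCompose(int x, int s1, int s2) {
      const int two_pass =
          (((x + (1 << (s1 - 1))) >> s1) + (1 << (s2 - 1))) >> s2;
      const int one_pass =
          (x + (1 << (s1 - 1)) + (1 << (s1 + s2 - 1))) >> (s1 + s2);
      return two_pass == one_pass;
    }
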
+
+template <int filter_index>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                                const __m128i* const v_tap) {
+  const __m128i sum =
+      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+                                     __m128i* v_tap) {
+  if (num_taps == 8) {
+    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0);   // k1k0
+    v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
+    v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
+    v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff);  // k7k6
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+      v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+      v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+    }
+  } else if (num_taps == 6) {
+    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0);   // k2k1
+    v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
+    v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa);  // k6k5
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+      v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+      v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+    }
+  } else if (num_taps == 4) {
+    v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55);  // k3k2
+    v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa);  // k5k4
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+      v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+      v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+    }
+  } else {  // num_taps == 2
+    const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+    v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55);  // k4k3
+    if (is_2d_vertical) {
+      v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+    } else {
+      v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+    }
+  }
+}
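
SetupTaps packs the filter into adjacent tap pairs (k1k0, k3k2, ...) and broadcasts each pair across the register so that _mm_maddubs_epi16 can multiply pixel pairs by tap pairs in a single instruction; for the 2D vertical pass the pairs are instead sign-extended to 16 bits for _mm_madd_epi16. A scalar model of one maddubs lane (ignoring the intrinsic's saturation):

    #include <cstdint>

    // One 16-bit lane of _mm_maddubs_epi16: unsigned pixel times signed tap,
    // adjacent products summed, e.g. k3*p3 + k2*p2 for the "k3k2" pair.
    inline int16_t MaddubsLane(uint8_t p0, uint8_t p1, int8_t k0, int8_t k1) {
      return static_cast<int16_t>(p0 * k0 + p1 * k1);
    }
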
+
+template <int num_taps, bool is_compound>
+__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
+                                const __m128i* const taps) {
+  __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
+  __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
+  if (num_taps >= 4) {
+    __m128i madd_lo =
+        _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
+    __m128i madd_hi =
+        _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
+    sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+    sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+    if (num_taps >= 6) {
+      madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
+      madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
+      sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+      sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+      if (num_taps == 8) {
+        madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
+        madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
+        sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+        sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+      }
+    }
+  }
+
+  if (is_compound) {
+    return _mm_packs_epi32(
+        RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+        RightShiftWithRounding_S32(sum_hi,
+                                   kInterRoundBitsCompoundVertical - 1));
+  }
+
+  return _mm_packs_epi32(
+      RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+      RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
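
SimpleSum2DVerticalTaps consumes the 16-bit intermediate output of the horizontal pass: rows are interleaved so that _mm_madd_epi16 forms column sums two taps at a time, and the total is then rounded down to the output precision (kInterRoundBitsCompoundVertical - 1 for compound, kInterRoundBitsVertical - 1 otherwise, the extra -1 matching the halved taps as in the 1D path). A scalar sketch of one column, ignoring the saturating packs the SIMD code applies:

    #include <cstdint>

    // |rounding_bits| stands in for the appropriate kInterRoundBits* - 1 value.
    inline int32_t Sum2DVerticalColumn(const int16_t* column,
                                       const int16_t* taps, int num_taps,
                                       int rounding_bits) {
      int32_t sum = 0;
      for (int i = 0; i < num_taps; ++i) sum += column[i] * taps[i];
      return (sum + (1 << (rounding_bits - 1))) >> rounding_bits;
    }
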
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical(const uint16_t* src, void* const dst,
+                      const ptrdiff_t dst_stride, const int width,
+                      const int height, const __m128i* const taps) {
+  assert(width >= 8);
+  constexpr int next_row = num_taps - 1;
+  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+  const ptrdiff_t src_stride = width;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  int x = 0;
+  do {
+    __m128i srcs[8];
+    const uint16_t* src_x = src + x;
+    srcs[0] = LoadAligned16(src_x);
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = LoadAligned16(src_x);
+      src_x += src_stride;
+      srcs[2] = LoadAligned16(src_x);
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = LoadAligned16(src_x);
+        src_x += src_stride;
+        srcs[4] = LoadAligned16(src_x);
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = LoadAligned16(src_x);
+          src_x += src_stride;
+          srcs[6] = LoadAligned16(src_x);
+          src_x += src_stride;
+        }
+      }
+    }
+
+    auto* dst8_x = dst8 + x;
+    auto* dst16_x = dst16 + x;
+    int y = height;
+    do {
+      srcs[next_row] = LoadAligned16(src_x);
+      src_x += src_stride;
+
+      const __m128i sum =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+      if (is_compound) {
+        StoreUnaligned16(dst16_x, sum);
+        dst16_x += dst_stride;
+      } else {
+        StoreLo8(dst8_x, _mm_packus_epi16(sum, sum));
+        dst8_x += dst_stride;
+      }
+
+      srcs[0] = srcs[1];
+      if (num_taps >= 4) {
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
+        if (num_taps >= 6) {
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
+          if (num_taps == 8) {
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
+          }
+        }
+      }
+    } while (--y != 0);
+    x += 8;
+  } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+                         const ptrdiff_t dst_stride, const int height,
+                         const __m128i* const taps) {
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  __m128i srcs[9];
+  srcs[0] = LoadAligned16(src);
+  src += 8;
+  if (num_taps >= 4) {
+    srcs[2] = LoadAligned16(src);
+    src += 8;
+    srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
+    if (num_taps >= 6) {
+      srcs[4] = LoadAligned16(src);
+      src += 8;
+      srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
+      if (num_taps == 8) {
+        srcs[6] = LoadAligned16(src);
+        src += 8;
+        srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
+      }
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[num_taps] = LoadAligned16(src);
+    src += 8;
+    srcs[num_taps - 1] = _mm_unpacklo_epi64(
+        _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
+
+    const __m128i sum =
+        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+    if (is_compound) {
+      StoreUnaligned16(dst16, sum);
+      dst16 += 4 << 1;
+    } else {
+      const __m128i results = _mm_packus_epi16(sum, sum);
+      Store4(dst8, results);
+      dst8 += dst_stride;
+      Store4(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+    }
+
+    srcs[0] = srcs[2];
+    if (num_taps >= 4) {
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      if (num_taps >= 6) {
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
+        if (num_taps == 8) {
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
+        }
+      }
+    }
+    y -= 2;
+  } while (y != 0);
+}
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+                         const ptrdiff_t dst_stride, const int height,
+                         const __m128i* const taps) {
+  constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  __m128i srcs[9];
+  srcs[0] = LoadAligned16(src);
+  src += 8;
+  if (num_taps >= 6) {
+    srcs[4] = LoadAligned16(src);
+    src += 8;
+    srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+    if (num_taps == 8) {
+      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+    }
+  }
+
+  int y = height;
+  do {
+    srcs[next_row] = LoadAligned16(src);
+    src += 8;
+    if (num_taps == 2) {
+      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+    } else if (num_taps == 4) {
+      srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+    } else if (num_taps == 6) {
+      srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+      srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+    } else if (num_taps == 8) {
+      srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+      srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
+      srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
+    }
+
+    const __m128i sum =
+        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+    const __m128i results = _mm_packus_epi16(sum, sum);
+
+    Store2(dst8, results);
+    dst8 += dst_stride;
+    Store2(dst8, _mm_srli_si128(results, 2));
+    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+    // Therefore we don't need to check this condition when |height| > 4.
+    if (num_taps <= 4 && height == 2) return;
+    dst8 += dst_stride;
+    Store2(dst8, _mm_srli_si128(results, 4));
+    dst8 += dst_stride;
+    Store2(dst8, _mm_srli_si128(results, 6));
+    dst8 += dst_stride;
+
+    srcs[0] = srcs[4];
+    if (num_taps == 6) {
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+    } else if (num_taps == 8) {
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+    }
+
+    y -= 4;
+  } while (y != 0);
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+__m128i Compound1DShift(const __m128i sum) {
+  return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int filter_index>
+__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
+  __m128i v_src[4];
+
+  if (filter_index < 2) {
+    // 6 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+  } else if (filter_index == 2) {
+    // 8 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+    v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+    v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
+  } else if (filter_index == 3) {
+    // 2 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+  } else if (filter_index > 3) {
+    // 4 taps.
+    v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+    v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+  }
+  const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+  return sum;
+}
+
+// TODO(slavarnway): Use num_taps instead of filter_index for templates. See the
+// 2D version.
+template <int num_taps, int filter_index, bool is_compound = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+                       void* const dst, const ptrdiff_t dst_stride,
+                       const int height, const __m128i* const v_tap) {
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  __m128i srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = _mm_setzero_si128();
+    // 00 01 02 03
+    srcs[0] = Load4(src);
+    src += src_stride;
+
+    int y = height;
+    do {
+      // 10 11 12 13
+      const __m128i a = Load4(src);
+      // 00 01 02 03 10 11 12 13
+      srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+      src += src_stride;
+      // 20 21 22 23
+      srcs[2] = Load4(src);
+      src += src_stride;
+      // 10 11 12 13 20 21 22 23
+      srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const __m128i results_16 =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m128i results = _mm_packus_epi16(results_16, results_16);
+        Store4(dst8, results);
+        dst8 += dst_stride;
+        Store4(dst8, _mm_srli_si128(results, 4));
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      y -= 2;
+    } while (y != 0);
+  } else if (num_taps == 4) {
+    srcs[4] = _mm_setzero_si128();
+    // 00 01 02 03
+    srcs[0] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13
+    const __m128i a = Load4(src);
+    // 00 01 02 03 10 11 12 13
+    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+    src += src_stride;
+    // 20 21 22 23
+    srcs[2] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13 20 21 22 23
+    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+    int y = height;
+    do {
+      // 30 31 32 33
+      const __m128i b = Load4(src);
+      // 20 21 22 23 30 31 32 33
+      srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+      src += src_stride;
+      // 40 41 42 43
+      srcs[4] = Load4(src);
+      src += src_stride;
+      // 30 31 32 33 40 41 42 43
+      srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const __m128i results_16 =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m128i results = _mm_packus_epi16(results_16, results_16);
+        Store4(dst8, results);
+        dst8 += dst_stride;
+        Store4(dst8, _mm_srli_si128(results, 4));
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      y -= 2;
+    } while (y != 0);
+  } else if (num_taps == 6) {
+    srcs[6] = _mm_setzero_si128();
+    // 00 01 02 03
+    srcs[0] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13
+    const __m128i a = Load4(src);
+    // 00 01 02 03 10 11 12 13
+    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+    src += src_stride;
+    // 20 21 22 23
+    srcs[2] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13 20 21 22 23
+    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+    // 30 31 32 33
+    const __m128i b = Load4(src);
+    // 20 21 22 23 30 31 32 33
+    srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+    src += src_stride;
+    // 40 41 42 43
+    srcs[4] = Load4(src);
+    src += src_stride;
+    // 30 31 32 33 40 41 42 43
+    srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+    int y = height;
+    do {
+      // 50 51 52 53
+      const __m128i c = Load4(src);
+      // 40 41 42 43 50 51 52 53
+      srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+      src += src_stride;
+      // 60 61 62 63
+      srcs[6] = Load4(src);
+      src += src_stride;
+      // 50 51 52 53 60 61 62 63
+      srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const __m128i results_16 =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m128i results = _mm_packus_epi16(results_16, results_16);
+        Store4(dst8, results);
+        dst8 += dst_stride;
+        Store4(dst8, _mm_srli_si128(results, 4));
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      srcs[3] = srcs[5];
+      srcs[4] = srcs[6];
+      y -= 2;
+    } while (y != 0);
+  } else if (num_taps == 8) {
+    srcs[8] = _mm_setzero_si128();
+    // 00 01 02 03
+    srcs[0] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13
+    const __m128i a = Load4(src);
+    // 00 01 02 03 10 11 12 13
+    srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+    src += src_stride;
+    // 20 21 22 23
+    srcs[2] = Load4(src);
+    src += src_stride;
+    // 10 11 12 13 20 21 22 23
+    srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+    // 30 31 32 33
+    const __m128i b = Load4(src);
+    // 20 21 22 23 30 31 32 33
+    srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+    src += src_stride;
+    // 40 41 42 43
+    srcs[4] = Load4(src);
+    src += src_stride;
+    // 30 31 32 33 40 41 42 43
+    srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+    // 50 51 52 53
+    const __m128i c = Load4(src);
+    // 40 41 42 43 50 51 52 53
+    srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+    src += src_stride;
+    // 60 61 62 63
+    srcs[6] = Load4(src);
+    src += src_stride;
+    // 50 51 52 53 60 61 62 63
+    srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+    int y = height;
+    do {
+      // 70 71 72 73
+      const __m128i d = Load4(src);
+      // 60 61 62 63 70 71 72 73
+      srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
+      src += src_stride;
+      // 80 81 82 83
+      srcs[8] = Load4(src);
+      src += src_stride;
+      // 70 71 72 73 80 81 82 83
+      srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
+
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      if (is_compound) {
+        const __m128i results = Compound1DShift(sums);
+        StoreUnaligned16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const __m128i results_16 =
+            RightShiftWithRounding_S16(sums, kFilterBits - 1);
+        const __m128i results = _mm_packus_epi16(results_16, results_16);
+        Store4(dst8, results);
+        dst8 += dst_stride;
+        Store4(dst8, _mm_srli_si128(results, 4));
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      srcs[3] = srcs[5];
+      srcs[4] = srcs[6];
+      srcs[5] = srcs[7];
+      srcs[6] = srcs[8];
+      y -= 2;
+    } while (y != 0);
+  }
+}
+
+template <int num_taps, int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+                       void* const dst, const ptrdiff_t dst_stride,
+                       const int height, const __m128i* const v_tap) {
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  __m128i srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = _mm_setzero_si128();
+    // 00 01
+    srcs[0] = Load2(src);
+    src += src_stride;
+
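+    // Gather four 2-wide rows into a single register per iteration so that
+    // one SumVerticalTaps call produces four output rows.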
+    int y = height;
+    do {
+      // 00 01 10 11
+      srcs[0] = Load2<1>(src, srcs[0]);
+      src += src_stride;
+      // 00 01 10 11 20 21
+      srcs[0] = Load2<2>(src, srcs[0]);
+      src += src_stride;
+      // 00 01 10 11 20 21 30 31
+      srcs[0] = Load2<3>(src, srcs[0]);
+      src += src_stride;
+      // 40 41
+      srcs[2] = Load2<0>(src, srcs[2]);
+      src += src_stride;
+      // 00 01 10 11 20 21 30 31 40 41
+      const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
+      // 10 11 20 21 30 31 40 41
+      srcs[1] = _mm_srli_si128(srcs_0_2, 2);
+      // This uses srcs[0]..srcs[1].
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i results_16 =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+      Store2(dst8, results);
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 2));
+      if (height == 2) return;
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 6));
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[2];
+      y -= 4;
+    } while (y != 0);
+  } else if (num_taps == 4) {
+    srcs[4] = _mm_setzero_si128();
+
+    // 00 01
+    srcs[0] = Load2(src);
+    src += src_stride;
+    // 00 01 10 11
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+
+    int y = height;
+    do {
+      // 00 01 10 11 20 21 30 31
+      srcs[0] = Load2<3>(src, srcs[0]);
+      src += src_stride;
+      // 40 41
+      srcs[4] = Load2<0>(src, srcs[4]);
+      src += src_stride;
+      // 40 41 50 51
+      srcs[4] = Load2<1>(src, srcs[4]);
+      src += src_stride;
+      // 40 41 50 51 60 61
+      srcs[4] = Load2<2>(src, srcs[4]);
+      src += src_stride;
+      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+      const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+      // 10 11 20 21 30 31 40 41
+      srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+      // 20 21 30 31 40 41 50 51
+      srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+      // 30 31 40 41 50 51 60 61
+      srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+      // This uses srcs[0]..srcs[3].
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i results_16 =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+      Store2(dst8, results);
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 2));
+      if (height == 2) return;
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 6));
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      y -= 4;
+    } while (y != 0);
+  } else if (num_taps == 6) {
+    // During the vertical pass the number of taps is restricted when
+    // |height| <= 4.
+    assert(height > 4);
+    srcs[8] = _mm_setzero_si128();
+
+    // 00 01
+    srcs[0] = Load2(src);
+    src += src_stride;
+    // 00 01 10 11
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21 30 31
+    srcs[0] = Load2<3>(src, srcs[0]);
+    src += src_stride;
+    // 40 41
+    srcs[4] = Load2(src);
+    src += src_stride;
+    // 00 01 10 11 20 21 30 31 40 41
+    const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+    // 10 11 20 21 30 31 40 41
+    srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
+
+    int y = height;
+    do {
+      // 40 41 50 51
+      srcs[4] = Load2<1>(src, srcs[4]);
+      src += src_stride;
+      // 40 41 50 51 60 61
+      srcs[4] = Load2<2>(src, srcs[4]);
+      src += src_stride;
+      // 40 41 50 51 60 61 70 71
+      srcs[4] = Load2<3>(src, srcs[4]);
+      src += src_stride;
+      // 80 81
+      srcs[8] = Load2<0>(src, srcs[8]);
+      src += src_stride;
+      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+      const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+      // 20 21 30 31 40 41 50 51
+      srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+      // 30 31 40 41 50 51 60 61
+      srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+      const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+      // 50 51 60 61 70 71 80 81
+      srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+
+      // This uses srcs[0]..srcs[5].
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i results_16 =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+      Store2(dst8, results);
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 2));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 6));
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+      y -= 4;
+    } while (y != 0);
+  } else if (num_taps == 8) {
+    // During the vertical pass the number of taps is restricted when
+    // |height| <= 4.
+    assert(height > 4);
+    srcs[8] = _mm_setzero_si128();
+    // 00 01
+    srcs[0] = Load2(src);
+    src += src_stride;
+    // 00 01 10 11
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+    // 00 01 10 11 20 21 30 31
+    srcs[0] = Load2<3>(src, srcs[0]);
+    src += src_stride;
+    // 40 41
+    srcs[4] = Load2(src);
+    src += src_stride;
+    // 40 41 50 51
+    srcs[4] = Load2<1>(src, srcs[4]);
+    src += src_stride;
+    // 40 41 50 51 60 61
+    srcs[4] = Load2<2>(src, srcs[4]);
+    src += src_stride;
+
+    // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+    const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+    // 10 11 20 21 30 31 40 41
+    srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+    // 20 21 30 31 40 41 50 51
+    srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+    // 30 31 40 41 50 51 60 61
+    srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+    int y = height;
+    do {
+      // 40 41 50 51 60 61 70 71
+      srcs[4] = Load2<3>(src, srcs[4]);
+      src += src_stride;
+      // 80 81
+      srcs[8] = Load2<0>(src, srcs[8]);
+      src += src_stride;
+      // 80 81 90 91
+      srcs[8] = Load2<1>(src, srcs[8]);
+      src += src_stride;
+      // 80 81 90 91 a0 a1
+      srcs[8] = Load2<2>(src, srcs[8]);
+      src += src_stride;
+
+      // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
+      const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+      // 50 51 60 61 70 71 80 81
+      srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+      // 60 61 70 71 80 81 90 91
+      srcs[6] = _mm_srli_si128(srcs_4_8, 4);
+      // 70 71 80 81 90 91 a0 a1
+      srcs[7] = _mm_srli_si128(srcs_4_8, 6);
+
+      // This uses srcs[0]..srcs[7].
+      const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+      const __m128i results_16 =
+          RightShiftWithRounding_S16(sums, kFilterBits - 1);
+      const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+      Store2(dst8, results);
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 2));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 4));
+      dst8 += dst_stride;
+      Store2(dst8, _mm_srli_si128(results, 6));
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+      y -= 4;
+    } while (y != 0);
+  }
+}
diff --git a/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc b/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc
index 77517ee..3c29b19 100644
--- a/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc
+++ b/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc
@@ -15,7 +15,7 @@
 #include "src/dsp/distance_weighted_blend.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <xmmintrin.h>
 
@@ -30,6 +30,7 @@
 
 namespace libgav1 {
 namespace dsp {
+namespace low_bitdepth {
 namespace {
 
 constexpr int kInterPostRoundBit = 4;
@@ -212,13 +213,231 @@
 }
 
 }  // namespace
+}  // namespace low_bitdepth
 
-void DistanceWeightedBlendInit_SSE4_1() { Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kMax10bppSample = (1 << 10) - 1;
+constexpr int kInterPostRoundBit = 4;
+
+inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
+                                       const __m128i& pred1,
+                                       const __m128i& weight0,
+                                       const __m128i& weight1) {
+  // This offset combines the rounding factor (to be added) and the round
+  // offset (to be subtracted). kInterPostRoundBit + 4 is the shift used for
+  // bitdepth == 10.
+  constexpr int offset =
+      (1 << ((kInterPostRoundBit + 4) - 1)) - (kCompoundOffset << 4);
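+  // The two distance weights always sum to 16, so the weighted sum carries an
+  // extra factor of 16 and includes 16 * kCompoundOffset from the per-sample
+  // compound offsets; subtracting kCompoundOffset << 4 here and shifting by
+  // kInterPostRoundBit + 4 below removes both.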
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i bias = _mm_set1_epi32(offset);
+  const __m128i clip_high = _mm_set1_epi16(kMax10bppSample);
+
+  __m128i prediction0 = _mm_cvtepu16_epi32(pred0);
+  __m128i mult0 = _mm_mullo_epi32(prediction0, weight0);
+  __m128i prediction1 = _mm_cvtepu16_epi32(pred1);
+  __m128i mult1 = _mm_mullo_epi32(prediction1, weight1);
+  __m128i sum = _mm_add_epi32(mult0, mult1);
+  sum = _mm_add_epi32(sum, bias);
+  const __m128i result0 = _mm_srai_epi32(sum, kInterPostRoundBit + 4);
+
+  prediction0 = _mm_unpackhi_epi16(pred0, zero);
+  mult0 = _mm_mullo_epi32(prediction0, weight0);
+  prediction1 = _mm_unpackhi_epi16(pred1, zero);
+  mult1 = _mm_mullo_epi32(prediction1, weight1);
+  sum = _mm_add_epi32(mult0, mult1);
+  sum = _mm_add_epi32(sum, bias);
+  const __m128i result1 = _mm_srai_epi32(sum, kInterPostRoundBit + 4);
+  const __m128i pack = _mm_packus_epi32(result0, result1);
+
+  return _mm_min_epi16(pack, clip_high);
+}
+
+template <int height>
+inline void DistanceWeightedBlend4xH_SSE4_1(
+    const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const __m128i weight0 = _mm_set1_epi32(weight_0);
+  const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+  int y = height;
+  do {
+    const __m128i src_00 = LoadLo8(pred_0);
+    const __m128i src_10 = LoadLo8(pred_1);
+    pred_0 += 4;
+    pred_1 += 4;
+    __m128i src_0 = LoadHi8(src_00, pred_0);
+    __m128i src_1 = LoadHi8(src_10, pred_1);
+    pred_0 += 4;
+    pred_1 += 4;
+    const __m128i res0 =
+        ComputeWeightedAverage8(src_0, src_1, weight0, weight1);
+
+    const __m128i src_01 = LoadLo8(pred_0);
+    const __m128i src_11 = LoadLo8(pred_1);
+    pred_0 += 4;
+    pred_1 += 4;
+    src_0 = LoadHi8(src_01, pred_0);
+    src_1 = LoadHi8(src_11, pred_1);
+    pred_0 += 4;
+    pred_1 += 4;
+    const __m128i res1 =
+        ComputeWeightedAverage8(src_0, src_1, weight0, weight1);
+
+    StoreLo8(dst, res0);
+    dst += dest_stride;
+    StoreHi8(dst, res0);
+    dst += dest_stride;
+    StoreLo8(dst, res1);
+    dst += dest_stride;
+    StoreHi8(dst, res1);
+    dst += dest_stride;
+    y -= 4;
+  } while (y != 0);
+}
+
+template <int height>
+inline void DistanceWeightedBlend8xH_SSE4_1(
+    const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const __m128i weight0 = _mm_set1_epi32(weight_0);
+  const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+  int y = height;
+  do {
+    const __m128i src_00 = LoadAligned16(pred_0);
+    const __m128i src_10 = LoadAligned16(pred_1);
+    pred_0 += 8;
+    pred_1 += 8;
+    const __m128i res0 =
+        ComputeWeightedAverage8(src_00, src_10, weight0, weight1);
+
+    const __m128i src_01 = LoadAligned16(pred_0);
+    const __m128i src_11 = LoadAligned16(pred_1);
+    pred_0 += 8;
+    pred_1 += 8;
+    const __m128i res1 =
+        ComputeWeightedAverage8(src_01, src_11, weight0, weight1);
+
+    StoreUnaligned16(dst, res0);
+    dst += dest_stride;
+    StoreUnaligned16(dst, res1);
+    dst += dest_stride;
+    y -= 2;
+  } while (y != 0);
+}
+
+inline void DistanceWeightedBlendLarge_SSE4_1(
+    const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
+    const uint8_t weight_1, const int width, const int height, void* const dest,
+    const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const __m128i weight0 = _mm_set1_epi32(weight_0);
+  const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const __m128i src_0_lo = LoadAligned16(pred_0 + x);
+      const __m128i src_1_lo = LoadAligned16(pred_1 + x);
+      const __m128i res_lo =
+          ComputeWeightedAverage8(src_0_lo, src_1_lo, weight0, weight1);
+
+      const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8);
+      const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8);
+      const __m128i res_hi =
+          ComputeWeightedAverage8(src_0_hi, src_1_hi, weight0, weight1);
+
+      StoreUnaligned16(dst + x, res_lo);
+      x += 8;
+      StoreUnaligned16(dst + x, res_hi);
+      x += 8;
+    } while (x < width);
+    dst += dest_stride;
+    pred_0 += width;
+    pred_1 += width;
+  } while (--y != 0);
+}
+
+void DistanceWeightedBlend_SSE4_1(const void* prediction_0,
+                                  const void* prediction_1,
+                                  const uint8_t weight_0,
+                                  const uint8_t weight_1, const int width,
+                                  const int height, void* const dest,
+                                  const ptrdiff_t dest_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
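+  // |dest_stride| is given in bytes; convert it to uint16_t units for the
+  // 10bpp destination.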
+  const ptrdiff_t dst_stride = dest_stride / sizeof(*pred_0);
+  if (width == 4) {
+    if (height == 4) {
+      DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+                                         dest, dst_stride);
+    } else if (height == 8) {
+      DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+                                         dest, dst_stride);
+    } else {
+      assert(height == 16);
+      DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+                                          dest, dst_stride);
+    }
+    return;
+  }
+
+  if (width == 8) {
+    switch (height) {
+      case 4:
+        DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+                                           dest, dst_stride);
+        return;
+      case 8:
+        DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+                                           dest, dst_stride);
+        return;
+      case 16:
+        DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+                                            dest, dst_stride);
+        return;
+      default:
+        assert(height == 32);
+        DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1,
+                                            dest, dst_stride);
+        return;
+    }
+  }
+
+  DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width,
+                                    height, dest, dst_stride);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(DistanceWeightedBlend)
+  dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void DistanceWeightedBlendInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 
 namespace libgav1 {
 namespace dsp {
@@ -227,4 +446,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/distance_weighted_blend_sse4.h b/libgav1/src/dsp/x86/distance_weighted_blend_sse4.h
index 2831ded..dbb9f88 100644
--- a/libgav1/src/dsp/x86/distance_weighted_blend_sse4.h
+++ b/libgav1/src/dsp/x86/distance_weighted_blend_sse4.h
@@ -31,11 +31,15 @@
 
 // If sse4 is enabled and the baseline isn't set due to a higher level of
 // optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 #ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend
 #define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1
 #endif
 
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#ifndef LIBGAV1_Dsp10bpp_DistanceWeightedBlend
+#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
diff --git a/libgav1/src/dsp/x86/film_grain_sse4.cc b/libgav1/src/dsp/x86/film_grain_sse4.cc
new file mode 100644
index 0000000..745c1ca
--- /dev/null
+++ b/libgav1/src/dsp/x86/film_grain_sse4.cc
@@ -0,0 +1,514 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// Load 8 values from source, widening to int16_t intermediate value size.
+// The function is overloaded for each type and bitdepth for simplicity.
+inline __m128i LoadSource(const int8_t* src) {
+  return _mm_cvtepi8_epi16(LoadLo8(src));
+}
+
+// Load 8 values from source, widening to int16_t intermediate value size.
+inline __m128i LoadSource(const uint8_t* src) {
+  return _mm_cvtepu8_epi16(LoadLo8(src));
+}
+
+inline __m128i LoadSourceMsan(const uint8_t* src, const int valid_range) {
+  return _mm_cvtepu8_epi16(LoadLo8Msan(src, 8 - valid_range));
+}
+
+// Store 8 values to dest, narrowing to uint8_t from int16_t intermediate value.
+inline void StoreUnsigned(uint8_t* dest, const __m128i data) {
+  StoreLo8(dest, _mm_packus_epi16(data, data));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// Load 8 values from source.
+inline __m128i LoadSource(const int16_t* src) { return LoadUnaligned16(src); }
+
+// Load 8 values from source.
+inline __m128i LoadSource(const uint16_t* src) { return LoadUnaligned16(src); }
+
+// Store 8 values to dest.
+inline void StoreUnsigned(uint16_t* dest, const __m128i data) {
+  StoreUnaligned16(dest, data);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline __m128i GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
+  if (subsampling_x != 0) {
+    const __m128i src = LoadUnaligned16(luma);
+
+    return RightShiftWithRounding_U16(
+        _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
+                       _mm_unpackhi_epi8(src, _mm_setzero_si128())),
+        1);
+  }
+  return _mm_cvtepu8_epi16(LoadLo8(luma));
+}
+
+inline __m128i GetAverageLumaMsan(const uint8_t* const luma, int subsampling_x,
+                                  int valid_range) {
+  if (subsampling_x != 0) {
+    const __m128i src = LoadUnaligned16Msan(luma, 16 - valid_range);
+
+    return RightShiftWithRounding_U16(
+        _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
+                       _mm_unpackhi_epi8(src, _mm_setzero_si128())),
+        1);
+  }
+  return _mm_cvtepu8_epi16(LoadLo8Msan(luma, 8 - valid_range));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline __m128i GetAverageLuma(const uint16_t* const luma, int subsampling_x) {
+  if (subsampling_x != 0) {
+    return RightShiftWithRounding_U16(
+        _mm_hadd_epi16(LoadUnaligned16(luma), LoadUnaligned16(luma + 8)), 1);
+  }
+  return LoadUnaligned16(luma);
+}
+
+inline __m128i GetAverageLumaMsan(const uint16_t* const luma, int subsampling_x,
+                                  int valid_range) {
+  if (subsampling_x != 0) {
+    return RightShiftWithRounding_U16(
+        _mm_hadd_epi16(
+            LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)),
+            LoadUnaligned16Msan(luma + 8, 32 - valid_range * sizeof(*luma))),
+        1);
+  }
+  return LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma));
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+inline __m128i Clip3(const __m128i value, const __m128i low,
+                     const __m128i high) {
+  const __m128i clipped_to_ceiling = _mm_min_epi16(high, value);
+  return _mm_max_epi16(low, clipped_to_ceiling);
+}
+
+template <int bitdepth, typename Pixel>
+inline __m128i GetScalingFactors(
+    const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* source) {
+  alignas(16) int16_t start_vals[8];
+  if (bitdepth == 8) {
+    // TODO(petersonab): Speed this up by creating a uint16_t scaling_lut.
+    // Currently this code results in a series of movzbl.
+    for (int i = 0; i < 8; ++i) {
+      start_vals[i] = scaling_lut[source[i]];
+    }
+    return LoadAligned16(start_vals);
+  }
+  alignas(16) int16_t end_vals[8];
+  // TODO(petersonab): Precompute this into a larger table for direct lookups.
+  for (int i = 0; i < 8; ++i) {
+    const int index = source[i] >> 2;
+    start_vals[i] = scaling_lut[index];
+    end_vals[i] = scaling_lut[index + 1];
+  }
+  const __m128i start = LoadAligned16(start_vals);
+  const __m128i end = LoadAligned16(end_vals);
+  __m128i remainder = LoadSource(source);
+  remainder = _mm_srli_epi16(_mm_slli_epi16(remainder, 14), 1);
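+  // |remainder| now holds (source & 3) << 13 per lane, so the multiply below
+  // yields ((end - start) * (source & 3) + 2) >> 2: linear interpolation
+  // between the two adjacent scaling_lut entries.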
+  const __m128i delta = _mm_mulhrs_epi16(_mm_sub_epi16(end, start), remainder);
+  return _mm_add_epi16(start, delta);
+}
+
+// |scaling_shift| is in range [8,11].
+template <int bitdepth>
+inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling,
+                          const __m128i scaling_shift) {
+  const __m128i shifted_scale_factors = _mm_sll_epi16(scaling, scaling_shift);
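+  // The callers pass 15 - scaling_shift in |scaling_shift|, and
+  // _mm_mulhrs_epi16 computes (a * b + (1 << 14)) >> 15, so the result equals
+  // RightShiftWithRounding(noise * scaling, scaling_shift).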
+  return _mm_mulhrs_epi16(noise, shifted_scale_factors);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_SSE4_1(
+    const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
+    int width, int height, int start_height,
+    const uint8_t scaling_lut_y[kScalingLookupTableSize],
+    const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
+    ptrdiff_t dest_stride_y) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+  auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
+  dest_stride_y /= sizeof(Pixel);
+  const __m128i floor = _mm_set1_epi16(min_value);
+  const __m128i ceiling = _mm_set1_epi16(max_luma);
+  const int safe_width = width & ~7;
+  const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+  int y = 0;
+  do {
+    int x = 0;
+    for (; x < safe_width; x += 8) {
+      // TODO(b/133525232): Make 16-pixel version of loop body.
+      const __m128i orig = LoadSource(&in_y_row[x]);
+      const __m128i scaling =
+          GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+      __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
+
+      noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
+      const __m128i combined = _mm_add_epi16(orig, noise);
+      StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
+    }
+
+    if (x < width) {
+      Pixel luma_buffer[8];
+      // Prevent arbitrary indices from entering GetScalingFactors.
+      memset(luma_buffer, 0, sizeof(luma_buffer));
+      const int valid_range = width - x;
+      memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0]));
+      luma_buffer[valid_range] = in_y_row[width - 1];
+      const __m128i orig = LoadSource(&in_y_row[x]);
+      const __m128i scaling =
+          GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, luma_buffer);
+      __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
+
+      noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
+      const __m128i combined = _mm_add_epi16(orig, noise);
+      StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
+    }
+    in_y_row += source_stride_y;
+    out_y_row += dest_stride_y;
+  } while (++y < height);
+  out_y_row = static_cast<Pixel*>(dest_plane_y);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+inline __m128i BlendChromaValsWithCfl(
+    const Pixel* average_luma_buffer,
+    const uint8_t scaling_lut[kScalingLookupTableSize],
+    const Pixel* chroma_cursor, const GrainType* noise_image_cursor,
+    const __m128i scaling_shift) {
+  const __m128i scaling =
+      GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
+  const __m128i orig = LoadSource(chroma_cursor);
+  __m128i noise = LoadSource(noise_image_cursor);
+  noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift);
+  return _mm_add_epi16(orig, noise);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
+    const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, int scaling_shift,
+    const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row,
+    ptrdiff_t source_stride_y, const Pixel* in_chroma_row,
+    ptrdiff_t source_stride_chroma, Pixel* out_chroma_row,
+    ptrdiff_t dest_stride) {
+  const __m128i floor = _mm_set1_epi16(min_value);
+  const __m128i ceiling = _mm_set1_epi16(max_chroma);
+  alignas(16) Pixel luma_buffer[16];
+
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  // |chroma_width| is rounded up. If |width| is odd, then the final pixel will
+  // need to be guarded from overread, even if |chroma_width| is divisible by 8.
+  const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
+
+  // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
+  // in GetScalingFactors.
+  Pixel average_luma_buffer[8];
+  assert(start_height % 2 == 0);
+  start_height >>= subsampling_y;
+  const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+  int y = 0;
+  do {
+    int x = 0;
+    for (; x < safe_chroma_width; x += 8) {
+      const int luma_x = x << subsampling_x;
+      // TODO(petersonab): Consider specializing by subsampling_x. In the 444
+      // case &in_y_row[x] can be passed to GetScalingFactors directly.
+      const __m128i average_luma =
+          GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+      StoreUnsigned(average_luma_buffer, average_luma);
+
+      const __m128i blended =
+          BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+              average_luma_buffer, scaling_lut, &in_chroma_row[x],
+              &(noise_image[y + start_height][x]), derived_scaling_shift);
+      StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+    }
+
+    // This section only runs if width % (8 << sub_x) != 0. It should never run
+    // on 720p and above.
+    if (x < chroma_width) {
+      // Prevent huge indices from entering GetScalingFactors due to
+      // uninitialized values. This is not a problem in 8bpp because the table
+      // is made larger than 255 values.
+      if (bitdepth > 8) {
+        memset(luma_buffer, 0, sizeof(luma_buffer));
+      }
+      const int luma_x = x << subsampling_x;
+      const int valid_range = width - luma_x;
+      assert(valid_range < 16);
+      memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
+      luma_buffer[valid_range] = in_y_row[width - 1];
+      const __m128i average_luma =
+          GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
+      StoreUnsigned(average_luma_buffer, average_luma);
+
+      const __m128i blended =
+          BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+              average_luma_buffer, scaling_lut, &in_chroma_row[x],
+              &(noise_image[y + start_height][x]), derived_scaling_shift);
+      StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+    }
+
+    in_y_row += source_stride_y << subsampling_y;
+    in_chroma_row += source_stride_chroma;
+    out_chroma_row += dest_stride;
+  } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_SSE4_1(
+    Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+    int min_value, int max_chroma, int width, int height, int start_height,
+    int subsampling_x, int subsampling_y,
+    const uint8_t scaling_lut[kScalingLookupTableSize],
+    const void* source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  const auto* noise_image =
+      static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+  source_stride_y /= sizeof(Pixel);
+
+  const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+  source_stride_uv /= sizeof(Pixel);
+  auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+  dest_stride_uv /= sizeof(Pixel);
+  BlendChromaPlaneWithCfl_SSE4_1<bitdepth, GrainType, Pixel>(
+      noise_image[plane], min_value, max_chroma, width, height, start_height,
+      subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
+      source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
+}
+
+}  // namespace
+
+namespace low_bitdepth {
+namespace {
+
+// |offset| is the replicated 16-bit chroma offset; it is added after the
+// _mm_madd_epi16 results are narrowed back to 16 bits.
+inline __m128i BlendChromaValsNoCfl8bpp(
+    const uint8_t scaling_lut[kScalingLookupTableSize], const __m128i& orig,
+    const int8_t* noise_image_cursor, const __m128i& average_luma,
+    const __m128i& scaling_shift, const __m128i& offset,
+    const __m128i& weights) {
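+  // Each 32-bit lane of |weights| holds luma_multiplier in its low half and
+  // chroma_multiplier in its high half, so the madd of the interleaved
+  // (average_luma, orig) pairs yields
+  // average_luma * luma_multiplier + orig * chroma_multiplier per pixel,
+  // which is shifted right by 6 and offset to form the scaling_lut index.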
+  uint8_t merged_buffer[8];
+  const __m128i combined_lo =
+      _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights);
+  const __m128i combined_hi =
+      _mm_madd_epi16(_mm_unpackhi_epi16(average_luma, orig), weights);
+  const __m128i merged_base = _mm_packs_epi32(_mm_srai_epi32((combined_lo), 6),
+                                              _mm_srai_epi32((combined_hi), 6));
+
+  const __m128i merged = _mm_add_epi16(merged_base, offset);
+
+  StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged));
+  const __m128i scaling =
+      GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer);
+  __m128i noise = LoadSource(noise_image_cursor);
+  noise = ScaleNoise<8>(noise, scaling, scaling_shift);
+  return _mm_add_epi16(orig, noise);
+}
+
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
+    const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
+    int width, int height, int start_height, int subsampling_x,
+    int subsampling_y, int scaling_shift, int chroma_offset,
+    int chroma_multiplier, int luma_multiplier,
+    const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row,
+    ptrdiff_t source_stride_y, const uint8_t* in_chroma_row,
+    ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row,
+    ptrdiff_t dest_stride) {
+  const __m128i floor = _mm_set1_epi16(min_value);
+  const __m128i ceiling = _mm_set1_epi16(max_chroma);
+
+  const int chroma_height = (height + subsampling_y) >> subsampling_y;
+  const int chroma_width = (width + subsampling_x) >> subsampling_x;
+  // |chroma_width| is rounded up. If |width| is odd, then the final luma pixel
+  // will need to be guarded from overread, even if |chroma_width| is a
+  // multiple of 8.
+  const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
+  alignas(16) uint8_t luma_buffer[16];
+  const __m128i offset = _mm_set1_epi16(chroma_offset);
+  const __m128i multipliers = _mm_set1_epi32(LeftShift(chroma_multiplier, 16) |
+                                             (luma_multiplier & 0xFFFF));
+  const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+
+  start_height >>= subsampling_y;
+  int y = 0;
+  do {
+    int x = 0;
+    for (; x < safe_chroma_width; x += 8) {
+      const int luma_x = x << subsampling_x;
+      const __m128i average_luma =
+          GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+      const __m128i orig_chroma = LoadSource(&in_chroma_row[x]);
+      const __m128i blended = BlendChromaValsNoCfl8bpp(
+          scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+          average_luma, derived_scaling_shift, offset, multipliers);
+      StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+    }
+
+    if (x < chroma_width) {
+      // Begin right edge iteration. Same as the normal iterations, but the
+      // |average_luma| computation requires a duplicated luma value at the
+      // end.
+      const int luma_x = x << subsampling_x;
+      const int valid_range = width - luma_x;
+      assert(valid_range < 16);
+      // There is no need to pre-initialize this buffer, because merged values
+      // used as indices are saturated in the 8bpp case. Uninitialized values
+      // are written outside the frame.
+      memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
+      luma_buffer[valid_range] = in_y_row[width - 1];
+      const int valid_range_chroma = chroma_width - x;
+      uint8_t chroma_buffer[8];
+      memcpy(chroma_buffer, &in_chroma_row[x],
+             valid_range_chroma * sizeof(in_chroma_row[0]));
+
+      const __m128i average_luma =
+          GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
+      const __m128i orig_chroma =
+          LoadSourceMsan(chroma_buffer, valid_range_chroma);
+      const __m128i blended = BlendChromaValsNoCfl8bpp(
+          scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+          average_luma, derived_scaling_shift, offset, multipliers);
+      StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+      // End of right edge iteration.
+    }
+
+    in_y_row += source_stride_y << subsampling_y;
+    in_chroma_row += source_stride_chroma;
+    out_chroma_row += dest_stride;
+  } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+void BlendNoiseWithImageChroma8bpp_SSE4_1(
+    Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+    int min_value, int max_chroma, int width, int height, int start_height,
+    int subsampling_x, int subsampling_y,
+    const uint8_t scaling_lut[kScalingLookupTableSize],
+    const void* source_plane_y, ptrdiff_t source_stride_y,
+    const void* source_plane_uv, ptrdiff_t source_stride_uv,
+    void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+  assert(plane == kPlaneU || plane == kPlaneV);
+  const auto* noise_image =
+      static_cast<const Array2D<int8_t>*>(noise_image_ptr);
+  const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
+  const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
+  auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
+
+  const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+  const int luma_multiplier =
+      (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+  const int multiplier =
+      (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+  BlendChromaPlane8bpp_SSE4_1(
+      noise_image[plane], min_value, max_chroma, width, height, start_height,
+      subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
+      luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
+      source_stride_uv, out_uv, dest_stride_uv);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_SSE4_1<8, int8_t, uint8_t>;
+  dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_SSE4_1<8, int8_t, uint8_t>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+
+  dsp->film_grain.blend_noise_luma =
+      BlendNoiseWithImageLuma_SSE4_1<10, int16_t, uint16_t>;
+  dsp->film_grain.blend_noise_chroma[1] =
+      BlendNoiseWithImageChromaWithCfl_SSE4_1<10, int16_t, uint16_t>;
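+  // blend_noise_chroma[0] (chroma blending without CfL) is not specialized
+  // here; the default implementation remains in use for 10bpp.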
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+}  // namespace film_grain
+
+void FilmGrainInit_SSE4_1() {
+  film_grain::low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  film_grain::high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_ENABLE_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void FilmGrainInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/film_grain_sse4.h b/libgav1/src/dsp/x86/film_grain_sse4.h
new file mode 100644
index 0000000..1cacbac
--- /dev/null
+++ b/libgav1/src/dsp/x86/film_grain_sse4.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initialize members of Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_SSE4_1
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_DSP_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_DSP_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_SSE4_1
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_DSP_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
diff --git a/libgav1/src/dsp/x86/intra_edge_sse4.cc b/libgav1/src/dsp/x86/intra_edge_sse4.cc
index 3635ee1..d6af907 100644
--- a/libgav1/src/dsp/x86/intra_edge_sse4.cc
+++ b/libgav1/src/dsp/x86/intra_edge_sse4.cc
@@ -15,14 +15,14 @@
 #include "src/dsp/intra_edge.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <xmmintrin.h>
 
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
-#include <cstring>  // memcpy
+#include <cstring>
 
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
@@ -259,7 +259,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 namespace libgav1 {
 namespace dsp {
 
@@ -267,4 +267,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/intra_edge_sse4.h b/libgav1/src/dsp/x86/intra_edge_sse4.h
index d6c926e..6ed4d40 100644
--- a/libgav1/src/dsp/x86/intra_edge_sse4.h
+++ b/libgav1/src/dsp/x86/intra_edge_sse4.h
@@ -32,7 +32,7 @@
 
 // If sse4 is enabled and the baseline isn't set due to a higher level of
 // optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 #ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter
 #define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_SSE4_1
 #endif
@@ -41,6 +41,6 @@
 #define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_SSE4_1
 #endif
 
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
diff --git a/libgav1/src/dsp/x86/intrapred_cfl_sse4.cc b/libgav1/src/dsp/x86/intrapred_cfl_sse4.cc
index ddf3a95..f2dcfdb 100644
--- a/libgav1/src/dsp/x86/intrapred_cfl_sse4.cc
+++ b/libgav1/src/dsp/x86/intrapred_cfl_sse4.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <smmintrin.h>
 
@@ -29,9 +29,48 @@
 #include "src/dsp/x86/common_sse4.h"
 #include "src/utils/common.h"
 #include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
 
 namespace libgav1 {
 namespace dsp {
+namespace {
+
+// This duplicates the last two 16-bit values in |row|.
+inline __m128i LastRowSamples(const __m128i row) {
+  return _mm_shuffle_epi32(row, 0xFF);
+}
+
+// This duplicates the last 16-bit value in |row|.
+inline __m128i LastRowResult(const __m128i row) {
+  const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF);
+  return _mm_shuffle_epi32(dup_row, 0xFF);
+}
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
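+// The 2x2 sum is 4 * average; the extra left shift by 1 stores 8 * average,
+// matching the q3 format used by the CfL predictors.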
+inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
+                                     const __m128i vertical_sum1,
+                                     int16_t* luma_ptr) {
+  __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+  result = _mm_slli_epi16(result, 1);
+  StoreLo8(luma_ptr, result);
+  StoreHi8(luma_ptr + kCflLumaBufferStride, result);
+  return result;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
+                                     const __m128i vertical_sum1,
+                                     int16_t* luma_ptr) {
+  __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+  result = _mm_slli_epi16(result, 1);
+  StoreUnaligned16(luma_ptr, result);
+  return result;
+}
+
+}  // namespace
+
 namespace low_bitdepth {
 namespace {
 
@@ -40,8 +79,8 @@
 
 inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
                                    __m128i alpha_sign, __m128i dc_q0) {
-  __m128i ac_q3 = LoadUnaligned16(input);
-  __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+  const __m128i ac_q3 = LoadUnaligned16(input);
+  const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
   __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
   scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
   return _mm_add_epi16(scaled_luma_q0, dc_q0);
@@ -88,8 +127,7 @@
 template <int block_height_log2, bool is_inside>
 void CflSubsampler444_4xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
-    const int /*max_luma_width*/, const int max_luma_height,
-    const void* const source, ptrdiff_t stride) {
+    const int max_luma_height, const void* const source, ptrdiff_t stride) {
   static_assert(block_height_log2 <= 4, "");
   const int block_height = 1 << block_height_log2;
   const int visible_height = max_luma_height;
@@ -119,12 +157,15 @@
   } while (y < visible_height);
 
   if (!is_inside) {
-    int y = visible_height;
+    // Replicate the 2 high lanes.
+    samples = _mm_shuffle_epi32(samples, 0xee);
     do {
+      StoreLo8(luma_ptr, samples);
+      luma_ptr += kCflLumaBufferStride;
       StoreHi8(luma_ptr, samples);
       luma_ptr += kCflLumaBufferStride;
       sum = _mm_add_epi16(sum, samples);
-      ++y;
+      y += 2;
     } while (y < block_height);
   }
 
@@ -152,15 +193,15 @@
   static_assert(block_height_log2 <= 4, "");
   assert(max_luma_width >= 4);
   assert(max_luma_height >= 4);
-  const int block_height = 1 << block_height_log2;
-  const int block_width = 4;
+  static_cast<void>(max_luma_width);
+  constexpr int block_height = 1 << block_height_log2;
 
-  if (block_height <= max_luma_height && block_width <= max_luma_width) {
-    CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(
-        luma, max_luma_width, max_luma_height, source, stride);
+  if (block_height <= max_luma_height) {
+    CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+                                                         source, stride);
   } else {
-    CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(
-        luma, max_luma_width, max_luma_height, source, stride);
+    CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+                                                          source, stride);
   }
 }
 
@@ -302,19 +343,9 @@
   __m128i inner_sum_lo, inner_sum_hi;
   int y = 0;
   do {
-#if LIBGAV1_MSAN  // We can load uninitialized values here. Even though they are
-                  // then masked off by blendv, MSAN isn't smart enough to
-                  // understand that. So we switch to a C implementation here.
-    uint16_t c_arr[16];
-    for (int x = 0; x < 16; x++) {
-      const int x_index = std::min(x, visible_width_16 - 1);
-      c_arr[x] = src[x_index] << 3;
-    }
-    samples0 = LoadUnaligned16(c_arr);
-    samples1 = LoadUnaligned16(c_arr + 8);
-    static_cast<void>(blend_mask_16);
-#else
-    __m128i samples01 = LoadUnaligned16(src);
+    // We can load uninitialized values here. Even though they are then masked
+    // off by blendv, MSAN doesn't model that behavior.
+    __m128i samples01 = LoadUnaligned16Msan(src, invisible_width_16);
 
     if (!inside) {
       const __m128i border16 =
@@ -323,26 +354,15 @@
     }
     samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3);
     samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3);
-#endif  // LIBGAV1_MSAN
 
     StoreUnaligned16(luma_ptr, samples0);
     StoreUnaligned16(luma_ptr + 8, samples1);
     __m128i inner_sum = _mm_add_epi16(samples0, samples1);
 
     if (block_width == 32) {
-#if LIBGAV1_MSAN  // We can load uninitialized values here. Even though they are
-                  // then masked off by blendv, MSAN isn't smart enough to
-                  // understand that. So we switch to a C implementation here.
-      uint16_t c_arr[16];
-      for (int x = 16; x < 32; x++) {
-        const int x_index = std::min(x, visible_width_32 - 1);
-        c_arr[x - 16] = src[x_index] << 3;
-      }
-      samples2 = LoadUnaligned16(c_arr);
-      samples3 = LoadUnaligned16(c_arr + 8);
-      static_cast<void>(blend_mask_32);
-#else
-      __m128i samples23 = LoadUnaligned16(src + 16);
+      // We can load uninitialized values here. Even though they are then masked
+      // off by blendv, MSAN doesn't model that behavior.
+      __m128i samples23 = LoadUnaligned16Msan(src + 16, invisible_width_32);
       if (!inside) {
         const __m128i border32 =
             _mm_set1_epi8(static_cast<int8_t>(src[visible_width_32 - 1]));
@@ -350,7 +370,6 @@
       }
       samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3);
       samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3);
-#endif  // LIBGAV1_MSAN
 
       StoreUnaligned16(luma_ptr + 16, samples2);
       StoreUnaligned16(luma_ptr + 24, samples3);
@@ -418,29 +437,6 @@
   }
 }
 
-// Takes in two sums of input row pairs, and completes the computation for two
-// output rows.
-inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
-                                     const __m128i vertical_sum1,
-                                     int16_t* luma_ptr) {
-  __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
-  result = _mm_slli_epi16(result, 1);
-  StoreLo8(luma_ptr, result);
-  StoreHi8(luma_ptr + kCflLumaBufferStride, result);
-  return result;
-}
-
-// Takes two halves of a vertically added pair of rows and completes the
-// computation for one output row.
-inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
-                                     const __m128i vertical_sum1,
-                                     int16_t* luma_ptr) {
-  __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
-  result = _mm_slli_epi16(result, 1);
-  StoreUnaligned16(luma_ptr, result);
-  return result;
-}
-
 template <int block_height_log2>
 void CflSubsampler420_4xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
@@ -511,17 +507,6 @@
   }
 }
 
-// This duplicates the last two 16-bit values in |row|.
-inline __m128i LastRowSamples(const __m128i row) {
-  return _mm_shuffle_epi32(row, 0xFF);
-}
-
-// This duplicates the last 16-bit value in |row|.
-inline __m128i LastRowResult(const __m128i row) {
-  const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF);
-  return _mm_shuffle_epi32(dup_row, 0xFF);
-}
-
 template <int block_height_log2, int max_luma_width>
 inline void CflSubsampler420Impl_8xH_SSE4_1(
     int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
@@ -655,10 +640,11 @@
   __m128i final_sum = zero;
   const int block_height = 1 << block_height_log2;
   const int luma_height = std::min(block_height, max_luma_height >> 1);
+  static_assert(max_luma_width <= 32, "");
 
   int16_t* luma_ptr = luma[0];
   __m128i final_row_result;
-  // Begin first y section, covering width up to 16.
+  // Begin first y section, covering width up to 32.
   int y = 0;
   do {
     const uint8_t* src_next = src + stride;
@@ -694,29 +680,32 @@
     final_row_result =
         StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
     sum = _mm_add_epi16(sum, final_row_result);
+    if (block_width_log2 == 5) {
+      const __m128i wide_fill = LastRowResult(final_row_result);
+      sum = _mm_add_epi16(sum, wide_fill);
+      sum = _mm_add_epi16(sum, wide_fill);
+    }
     final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
     final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
     src += stride << 1;
     luma_ptr += kCflLumaBufferStride;
   } while (++y < luma_height);
 
-  // Because max_luma_width is at most 32, any values beyond x=16 will
-  // necessarily be duplicated.
-  if (block_width_log2 == 5) {
-    const __m128i wide_fill = LastRowResult(final_row_result);
-    // Multiply duplicated value by number of occurrences, height * 4, since
-    // there are 16 in each row and the value appears in the vector 4 times.
-    final_sum = _mm_add_epi32(
-        final_sum,
-        _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), block_height_log2 + 2));
-  }
-
   // Begin second y section.
   if (y < block_height) {
     const __m128i final_fill0 =
         LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
     const __m128i final_fill1 =
         LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+    __m128i wide_fill;
+
+    if (block_width_log2 == 5) {
+      // For width 32, each row also contributes 16 copies of the duplicated
+      // fill value (columns 16..31) to the sum. The vector holds 4 copies
+      // widened to 32 bits, so shift left by 2 (x4) to cover all 16 per row.
+      wide_fill =
+          _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
+    }
+
     const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
     const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
     const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
@@ -726,6 +715,9 @@
     do {
       StoreUnaligned16(luma_ptr, final_fill0);
       StoreUnaligned16(luma_ptr + 8, final_fill1);
+      if (block_width_log2 == 5) {
+        final_sum = _mm_add_epi32(final_sum, wide_fill);
+      }
       luma_ptr += kCflLumaBufferStride;
 
       final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
@@ -747,14 +739,10 @@
     const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
     final_row_result = _mm_sub_epi16(samples1, averages);
     StoreUnaligned16(luma_ptr + 8, final_row_result);
-  }
-  if (block_width_log2 == 5) {
-    int16_t* wide_luma_ptr = luma[0] + 16;
-    const __m128i wide_fill = LastRowResult(final_row_result);
-    for (int i = 0; i < block_height;
-         ++i, wide_luma_ptr += kCflLumaBufferStride) {
-      StoreUnaligned16(wide_luma_ptr, wide_fill);
-      StoreUnaligned16(wide_luma_ptr + 8, wide_fill);
+    if (block_width_log2 == 5) {
+      const __m128i wide_fill = LastRowResult(final_row_result);
+      StoreUnaligned16(luma_ptr + 16, wide_fill);
+      StoreUnaligned16(luma_ptr + 24, wide_fill);
     }
   }
 }
@@ -958,12 +946,887 @@
 }  // namespace
 }  // namespace low_bitdepth
 
-void IntraPredCflInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_10bpp_SSE4_1
+
+inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
+                                   __m128i alpha_sign, __m128i dc_q0) {
+  const __m128i ac_q3 = LoadUnaligned16(input);
+  const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+  __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+  scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+  return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) {
+  return _mm_max_epi16(_mm_min_epi16(x, max), min);
+}
+
+template <int width, int height>
+void CflIntraPredictor_10bpp_SSE4_1(
+    void* const dest, ptrdiff_t stride,
+    const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int alpha) {
+  constexpr int kCflLumaBufferStrideLog2_16i = 5;
+  constexpr int kCflLumaBufferStrideLog2_128i =
+      kCflLumaBufferStrideLog2_16i - 3;
+  constexpr int kRowIncr = 1 << kCflLumaBufferStrideLog2_128i;
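+  // Illustrative derivation (assuming kCflLumaBufferStride == 32): a luma row
+  // is 32 int16_t values, i.e. 1 << (5 - 3) == 4 __m128i vectors of 8 lanes,
+  // so stepping |row| by kRowIncr advances exactly one buffer row.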
+  auto* dst = static_cast<uint16_t*>(dest);
+  const __m128i alpha_sign = _mm_set1_epi16(alpha);
+  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+  auto* row = reinterpret_cast<const __m128i*>(luma);
+  const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
+  const __m128i dc_val = _mm_set1_epi16(dst[0]);
+  const __m128i min = _mm_setzero_si128();
+  const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1);
+
+  stride >>= 1;
+
+  do {
+    __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
+    res = ClipEpi16(res, min, max);
+    if (width == 4) {
+      StoreLo8(dst, res);
+    } else if (width == 8) {
+      StoreUnaligned16(dst, res);
+    } else if (width == 16) {
+      StoreUnaligned16(dst, res);
+      const __m128i res_1 =
+          CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+      StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
+    } else {
+      StoreUnaligned16(dst, res);
+      const __m128i res_1 =
+          CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+      StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
+      const __m128i res_2 =
+          CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
+      StoreUnaligned16(dst + 16, ClipEpi16(res_2, min, max));
+      const __m128i res_3 =
+          CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
+      StoreUnaligned16(dst + 24, ClipEpi16(res_3, min, max));
+    }
+
+    dst += stride;
+  } while ((row += kRowIncr) < row_end);
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+  static_assert(block_height_log2 <= 4, "");
+  const int block_height = 1 << block_height_log2;
+  const int visible_height = max_luma_height;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  __m128i zero = _mm_setzero_si128();
+  __m128i sum = zero;
+  __m128i samples;
+  int y = visible_height;
+
+  do {
+    samples = LoadHi8(LoadLo8(src), src + src_stride);
+    src += src_stride << 1;
+    sum = _mm_add_epi16(sum, samples);
+    y -= 2;
+  } while (y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    samples = _mm_unpackhi_epi64(samples, samples);
+    do {
+      sum = _mm_add_epi16(sum, samples);
+      y += 2;
+    } while (y < block_height);
+  }
+
+  sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  // The luma samples stored below are left-shifted by 3 for extra precision.
+  // Rather than shifting the average left by 3 to match, fold the 3 into the
+  // right shift: block_height_log2 + 2 (log2 of width 4) - 3 ==
+  // block_height_log2 - 1.
+  __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2 - 1);
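+  // Worked instance of the shift above (illustrative): for a 4x8 block the
+  // plain mean would be sum >> (3 + 2); matching the Q3 samples removes 3,
+  // leaving block_height_log2 - 1 == 2.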
+  averages = _mm_shufflelo_epi16(averages, 0);
+  src = static_cast<const uint16_t*>(source);
+  luma_ptr = luma[0];
+  y = visible_height;
+  do {
+    samples = LoadLo8(src);
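+    // 4:4:4 needs no subsampling; the left shift by 3 below (scalar sketch:
+    // luma_q3 = pixel << 3) stores each pixel at the same Q3 precision the
+    // 420 path produces, so the predictors can consume either buffer
+    // unchanged.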
+    samples = _mm_slli_epi16(samples, 3);
+    StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+    src += src_stride;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    // Replicate last line
+    do {
+      StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+      luma_ptr += kCflLumaBufferStride;
+    } while (++y < block_height);
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  static_cast<void>(max_luma_width);
+  static_cast<void>(max_luma_height);
+  static_assert(block_height_log2 <= 4, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  const int block_height = 1 << block_height_log2;
+
+  if (block_height <= max_luma_height) {
+    CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+                                                         source, stride);
+  } else {
+    CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+                                                          source, stride);
+  }
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_8xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const int visible_height = max_luma_height;
+  const __m128i dup16 = _mm_set1_epi32(0x01000100);
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  const __m128i zero = _mm_setzero_si128();
+  __m128i sum = zero;
+  __m128i samples;
+  int y = visible_height;
+
+  do {
+    samples = LoadUnaligned16(src);
+    src += src_stride;
+    sum = _mm_add_epi16(sum, samples);
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    do {
+      sum = _mm_add_epi16(sum, samples);
+    } while (++y < block_height);
+  }
+
+  sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  // The luma samples stored below are left-shifted by 3 for extra precision;
+  // that 3 is exactly cancelled by the log2 of width 8, leaving a right shift
+  // of block_height_log2.
+  __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2);
+  averages = _mm_shuffle_epi8(averages, dup16);
+
+  src = static_cast<const uint16_t*>(source);
+  luma_ptr = luma[0];
+  y = visible_height;
+  do {
+    samples = LoadUnaligned16(src);
+    samples = _mm_slli_epi16(samples, 3);
+    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+    src += src_stride;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    // Replicate last line
+    do {
+      StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+      luma_ptr += kCflLumaBufferStride;
+    } while (++y < block_height);
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  static_cast<void>(max_luma_width);
+  static_cast<void>(max_luma_height);
+  static_assert(block_height_log2 <= 5, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+  const int block_height = 1 << block_height_log2;
+  const int block_width = 8;
+
+  const int horz_inside = block_width <= max_luma_width;
+  const int vert_inside = block_height <= max_luma_height;
+  if (horz_inside && vert_inside) {
+    CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+                                                         source, stride);
+  } else {
+    CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+                                                          source, stride);
+  }
+}
+
+template <int block_width_log2, int block_height_log2, bool is_inside>
+void CflSubsampler444_WxH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const int visible_height = max_luma_height;
+  const int block_width = 1 << block_width_log2;
+  const __m128i dup16 = _mm_set1_epi32(0x01000100);
+  const __m128i zero = _mm_setzero_si128();
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  __m128i sum = zero;
+  __m128i inner_sum_lo, inner_sum_hi;
+  __m128i samples[4];
+  int y = visible_height;
+
+  do {
+    samples[0] = LoadUnaligned16(src);
+    samples[1] = (max_luma_width >= 16) ? LoadUnaligned16(src + 8)
+                                        : LastRowResult(samples[0]);
+    __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
+    if (block_width == 32) {
+      samples[2] = (max_luma_width >= 24) ? LoadUnaligned16(src + 16)
+                                          : LastRowResult(samples[1]);
+      samples[3] = (max_luma_width == 32) ? LoadUnaligned16(src + 24)
+                                          : LastRowResult(samples[2]);
+
+      inner_sum = _mm_add_epi16(samples[2], inner_sum);
+      inner_sum = _mm_add_epi16(samples[3], inner_sum);
+    }
+    inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+    inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+    sum = _mm_add_epi32(sum, inner_sum_lo);
+    sum = _mm_add_epi32(sum, inner_sum_hi);
+    src += src_stride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
+    if (block_width == 32) {
+      inner_sum = _mm_add_epi16(samples[2], inner_sum);
+      inner_sum = _mm_add_epi16(samples[3], inner_sum);
+    }
+    inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+    inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+    do {
+      sum = _mm_add_epi32(sum, inner_sum_lo);
+      sum = _mm_add_epi32(sum, inner_sum_hi);
+    } while (++y < block_height);
+  }
+
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+  sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  // Here the left shift by 3 (to increase precision) is subtracted in right
+  // shift factor (block_width_log2 + block_height_log2 - 3).
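+  // e.g. (illustrative) for a 16x16 block: a plain mean would be
+  // sum >> (4 + 4); matching the Q3 precision of the stored samples removes
+  // 3, leaving a net shift of 5.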
+  __m128i averages =
+      RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2 - 3);
+  averages = _mm_shuffle_epi8(averages, dup16);
+
+  src = static_cast<const uint16_t*>(source);
+  __m128i samples_ext = zero;
+  luma_ptr = luma[0];
+  y = visible_height;
+  do {
+    int idx = 0;
+    for (int x = 0; x < block_width; x += 8) {
+      if (max_luma_width > x) {
+        samples[idx] = LoadUnaligned16(&src[x]);
+        samples[idx] = _mm_slli_epi16(samples[idx], 3);
+        samples_ext = samples[idx];
+      } else {
+        samples[idx] = LastRowResult(samples_ext);
+      }
+      StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
+    }
+    src += src_stride;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  if (!is_inside) {
+    y = visible_height;
+    // Replicate last line
+    do {
+      int idx = 0;
+      for (int x = 0; x < block_width; x += 8) {
+        StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
+      }
+      luma_ptr += kCflLumaBufferStride;
+    } while (++y < block_height);
+  }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_WxH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  static_assert(block_width_log2 == 4 || block_width_log2 == 5,
+                "This function will only work for block_width 16 and 32.");
+  static_assert(block_height_log2 <= 5, "");
+  assert(max_luma_width >= 4);
+  assert(max_luma_height >= 4);
+
+  const int block_height = 1 << block_height_log2;
+  const int vert_inside = block_height <= max_luma_height;
+  if (vert_inside) {
+    CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, true>(
+        luma, max_luma_width, max_luma_height, source, stride);
+  } else {
+    CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, false>(
+        luma, max_luma_width, max_luma_height, source, stride);
+  }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int /*max_luma_width*/, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  int16_t* luma_ptr = luma[0];
+  const __m128i zero = _mm_setzero_si128();
+  __m128i final_sum = zero;
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int y = luma_height;
+
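+  // Each loop iteration below folds four pairs of source rows into four rows
+  // of the luma buffer. A scalar sketch of each stored value, assuming the
+  // StoreLumaResults4_420 helper (defined elsewhere in this file)
+  // horizontally adds adjacent pairs and doubles the result:
+  //   luma[y][x] = (l(2y, 2x) + l(2y, 2x + 1) +
+  //                 l(2y + 1, 2x) + l(2y + 1, 2x + 1)) << 1
+  // i.e. the 2x2 luma average at Q3 precision.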
+  do {
+    const __m128i samples_row0 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i samples_row1 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);
+
+    const __m128i samples_row2 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i samples_row3 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
+    __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+    luma_ptr += kCflLumaBufferStride << 1;
+
+    const __m128i samples_row4 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i samples_row5 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);
+
+    const __m128i samples_row6 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i samples_row7 = LoadUnaligned16(src);
+    src += src_stride;
+    const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
+    sum = _mm_add_epi16(
+        sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+    luma_ptr += kCflLumaBufferStride << 1;
+
+    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+    y -= 4;
+  } while (y != 0);
+
+  const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
+  const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
+  for (y = luma_height; y < block_height; ++y) {
+    StoreLo8(luma_ptr, final_fill);
+    luma_ptr += kCflLumaBufferStride;
+    final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+  }
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+  __m128i averages = RightShiftWithRounding_U32(
+      final_sum, block_height_log2 + 2 /*log2 of width 4*/);
+
+  averages = _mm_shufflelo_epi16(averages, 0);
+  luma_ptr = luma[0];
+  y = block_height;
+  do {
+    const __m128i samples = LoadLo8(luma_ptr);
+    StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+  const int block_height = 1 << block_height_log2;
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i final_sum = zero;
+  int16_t* luma_ptr = luma[0];
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int y = luma_height;
+
+  do {
+    const __m128i samples_row00 = LoadUnaligned16(src);
+    const __m128i samples_row01 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row00);
+    src += src_stride;
+    const __m128i samples_row10 = LoadUnaligned16(src);
+    const __m128i samples_row11 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row10);
+    src += src_stride;
+    const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
+    const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
+    __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+    luma_ptr += kCflLumaBufferStride;
+
+    const __m128i samples_row20 = LoadUnaligned16(src);
+    const __m128i samples_row21 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row20);
+    src += src_stride;
+    const __m128i samples_row30 = LoadUnaligned16(src);
+    const __m128i samples_row31 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row30);
+    src += src_stride;
+    const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
+    const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
+    sum = _mm_add_epi16(
+        sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    const __m128i samples_row40 = LoadUnaligned16(src);
+    const __m128i samples_row41 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row40);
+    src += src_stride;
+    const __m128i samples_row50 = LoadUnaligned16(src);
+    const __m128i samples_row51 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row50);
+    src += src_stride;
+    const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
+    const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
+    sum = _mm_add_epi16(
+        sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    const __m128i samples_row60 = LoadUnaligned16(src);
+    const __m128i samples_row61 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row60);
+    src += src_stride;
+    const __m128i samples_row70 = LoadUnaligned16(src);
+    const __m128i samples_row71 = (max_luma_width == 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row70);
+    src += src_stride;
+    const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
+    const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
+    sum = _mm_add_epi16(
+        sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+    luma_ptr += kCflLumaBufferStride;
+
+    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+    y -= 4;
+  } while (y != 0);
+
+  // Duplicate the final row downward to the end after max_luma_height.
+  const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+  const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
+  const __m128i final_fill_to_sum1 =
+      _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
+  const __m128i final_fill_to_sum =
+      _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
+  for (y = luma_height; y < block_height; ++y) {
+    StoreUnaligned16(luma_ptr, final_fill);
+    luma_ptr += kCflLumaBufferStride;
+    final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+  }
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+  __m128i averages = RightShiftWithRounding_S32(
+      final_sum, block_height_log2 + 3 /*log2 of width 8*/);
+
+  averages = _mm_shufflelo_epi16(averages, 0);
+  averages = _mm_shuffle_epi32(averages, 0);
+  luma_ptr = luma[0];
+  y = block_height;
+  do {
+    const __m128i samples = LoadUnaligned16(luma_ptr);
+    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  if (max_luma_width == 8) {
+    CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height,
+                                                          source, stride);
+  } else {
+    CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
+        luma, max_luma_height, source, stride);
+  }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_height, const void* const source, ptrdiff_t stride) {
+  const auto* src = static_cast<const uint16_t*>(source);
+  const ptrdiff_t src_stride = stride / sizeof(src[0]);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i final_sum = zero;
+  const int block_height = 1 << block_height_log2;
+  const int luma_height = std::min(block_height, max_luma_height >> 1);
+  int16_t* luma_ptr = luma[0];
+  __m128i final_row_result;
+  // Begin first y section, covering width up to 32.
+  int y = luma_height;
+
+  do {
+    const uint16_t* src_next = src + src_stride;
+    const __m128i samples_row00 = LoadUnaligned16(src);
+    const __m128i samples_row01 = (max_luma_width >= 16)
+                                      ? LoadUnaligned16(src + 8)
+                                      : LastRowSamples(samples_row00);
+    const __m128i samples_row02 = (max_luma_width >= 24)
+                                      ? LoadUnaligned16(src + 16)
+                                      : LastRowSamples(samples_row01);
+    const __m128i samples_row03 = (max_luma_width == 32)
+                                      ? LoadUnaligned16(src + 24)
+                                      : LastRowSamples(samples_row02);
+    const __m128i samples_row10 = LoadUnaligned16(src_next);
+    const __m128i samples_row11 = (max_luma_width >= 16)
+                                      ? LoadUnaligned16(src_next + 8)
+                                      : LastRowSamples(samples_row10);
+    const __m128i samples_row12 = (max_luma_width >= 24)
+                                      ? LoadUnaligned16(src_next + 16)
+                                      : LastRowSamples(samples_row11);
+    const __m128i samples_row13 = (max_luma_width == 32)
+                                      ? LoadUnaligned16(src_next + 24)
+                                      : LastRowSamples(samples_row12);
+    const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
+    const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
+    const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
+    const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
+    __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+    final_row_result =
+        StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+    sum = _mm_add_epi16(sum, final_row_result);
+    final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+    final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+
+    // Because max_luma_width is at most 32, any values beyond x=16 will
+    // necessarily be duplicated.
+    if (block_width_log2 == 5) {
+      const __m128i wide_fill = LastRowResult(final_row_result);
+      // There are 16 duplicated 16-bit fill values per row, but only 4 of
+      // them remain in the vector after widening to 32-bit, so shift left by
+      // 2 (multiply by 4) to account for all 16.
+      final_sum = _mm_add_epi32(
+          final_sum, _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), 2));
+    }
+    src += src_stride << 1;
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+
+  // Begin second y section.
+  y = luma_height;
+  if (y < block_height) {
+    const __m128i final_fill0 =
+        LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+    const __m128i final_fill1 =
+        LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+    __m128i wide_fill;
+    if (block_width_log2 == 5) {
+      // There are 16 duplicated 16-bit fill values per row, but only 4 of
+      // them remain in the vector after widening to 32-bit, so shift left by
+      // 2 (multiply by 4) to account for all 16.
+      wide_fill =
+          _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
+    }
+    const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
+    const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
+    const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
+    const __m128i final_fill_to_sum =
+        _mm_add_epi32(final_inner_sum0, final_inner_sum1);
+
+    do {
+      StoreUnaligned16(luma_ptr, final_fill0);
+      StoreUnaligned16(luma_ptr + 8, final_fill1);
+      if (block_width_log2 == 5) {
+        final_sum = _mm_add_epi32(final_sum, wide_fill);
+      }
+      luma_ptr += kCflLumaBufferStride;
+      final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+    } while (++y < block_height);
+  }  // End second y section.
+
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+  final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+  __m128i averages = RightShiftWithRounding_S32(
+      final_sum, block_width_log2 + block_height_log2);
+  averages = _mm_shufflelo_epi16(averages, 0);
+  averages = _mm_shuffle_epi32(averages, 0);
+
+  luma_ptr = luma[0];
+  y = block_height;
+  do {
+    const __m128i samples0 = LoadUnaligned16(luma_ptr);
+    StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
+    const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
+    final_row_result = _mm_sub_epi16(samples1, averages);
+    StoreUnaligned16(luma_ptr + 8, final_row_result);
+
+    if (block_width_log2 == 5) {
+      const __m128i wide_fill = LastRowResult(final_row_result);
+      StoreUnaligned16(luma_ptr + 16, wide_fill);
+      StoreUnaligned16(luma_ptr + 24, wide_fill);
+    }
+    luma_ptr += kCflLumaBufferStride;
+  } while (--y != 0);
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_SSE4_1(
+    int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+    const int max_luma_width, const int max_luma_height,
+    const void* const source, ptrdiff_t stride) {
+  switch (max_luma_width) {
+    case 8:
+      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
+          luma, max_luma_height, source, stride);
+      return;
+    case 16:
+      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
+          luma, max_luma_height, source, stride);
+      return;
+    case 24:
+      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
+          luma, max_luma_height, source, stride);
+      return;
+    default:
+      assert(max_luma_width == 32);
+      CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
+          luma, max_luma_height, source, stride);
+      return;
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize4x4] =
+      CflIntraPredictor_10bpp_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize4x8] =
+      CflIntraPredictor_10bpp_SSE4_1<4, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize4x16] =
+      CflIntraPredictor_10bpp_SSE4_1<4, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x4] =
+      CflIntraPredictor_10bpp_SSE4_1<8, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x8] =
+      CflIntraPredictor_10bpp_SSE4_1<8, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x16] =
+      CflIntraPredictor_10bpp_SSE4_1<8, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize8x32] =
+      CflIntraPredictor_10bpp_SSE4_1<8, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x4] =
+      CflIntraPredictor_10bpp_SSE4_1<16, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x8] =
+      CflIntraPredictor_10bpp_SSE4_1<16, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x16] =
+      CflIntraPredictor_10bpp_SSE4_1<16, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize16x32] =
+      CflIntraPredictor_10bpp_SSE4_1<16, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize32x8] =
+      CflIntraPredictor_10bpp_SSE4_1<32, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize32x16] =
+      CflIntraPredictor_10bpp_SSE4_1<32, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
+  dsp->cfl_intra_predictors[kTransformSize32x32] =
+      CflIntraPredictor_10bpp_SSE4_1<32, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+      CflSubsampler420_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+      CflSubsampler420_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+      CflSubsampler420_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+      CflSubsampler420_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+      CflSubsampler420_WxH_SSE4_1<5, 5>;
+#endif
+
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+      CflSubsampler444_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+      CflSubsampler444_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+      CflSubsampler444_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+      CflSubsampler444_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
+  dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+      CflSubsampler444_WxH_SSE4_1<5, 5>;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredCflInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else  // !LIBGAV1_TARGETING_SSE4_1
 
 namespace libgav1 {
 namespace dsp {
@@ -973,4 +1836,4 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/intrapred_cfl_sse4.h b/libgav1/src/dsp/x86/intrapred_cfl_sse4.h
new file mode 100644
index 0000000..5d1a425
--- /dev/null
+++ b/libgav1/src/dsp/x86/intrapred_cfl_sse4.h
@@ -0,0 +1,376 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the
+// defines below for specifics. These functions are not thread-safe.
+void IntraPredCflInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+//------------------------------------------------------------------------------
+// 10bpp
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
diff --git a/libgav1/src/dsp/x86/intrapred_directional_sse4.cc b/libgav1/src/dsp/x86/intrapred_directional_sse4.cc
new file mode 100644
index 0000000..e642aee
--- /dev/null
+++ b/libgav1/src/dsp/x86/intrapred_directional_sse4.cc
@@ -0,0 +1,1478 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the formula
+// val = top[top_base_x]*shift + top[top_base_x+1]*(32-shift), reduces to
+// val = top[top_base_x+1] << 5, meaning only the second set of pixels is
+// involved in the output. Hence |top| is offset by 1.
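+// For example (illustrative), with |xstep| == 64 output row y is just the
+// |width| contiguous pixels starting at top[1 + y], which is exactly what the
+// memcpy calls below produce.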
+inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
+                                    const uint8_t* const top, const int width,
+                                    const int height) {
+  ptrdiff_t offset = 1;
+  if (height == 4) {
+    memcpy(dst, top + offset, width);
+    dst += stride;
+    memcpy(dst, top + offset + 1, width);
+    dst += stride;
+    memcpy(dst, top + offset + 2, width);
+    dst += stride;
+    memcpy(dst, top + offset + 3, width);
+    return;
+  }
+  int y = 0;
+  do {
+    memcpy(dst, top + offset, width);
+    dst += stride;
+    memcpy(dst, top + offset + 1, width);
+    dst += stride;
+    memcpy(dst, top + offset + 2, width);
+    dst += stride;
+    memcpy(dst, top + offset + 3, width);
+    dst += stride;
+    memcpy(dst, top + offset + 4, width);
+    dst += stride;
+    memcpy(dst, top + offset + 5, width);
+    dst += stride;
+    memcpy(dst, top + offset + 6, width);
+    dst += stride;
+    memcpy(dst, top + offset + 7, width);
+    dst += stride;
+
+    offset += 8;
+    y += 8;
+  } while (y < height);
+}
+
+inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
+                                 const uint8_t* const top, const int height,
+                                 const int xstep, const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+  const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+  const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
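+  // Illustrative note on the |sampler| shuffle masks defined just below: in
+  // the non-upsampled case the selected bytes pair neighboring pixels
+  // (0,1), (1,2), (2,3), (3,4), so _mm_maddubs_epi16 with the interleaved
+  // (32 - shift, shift) weights forms
+  //   top[i] * (32 - shift) + top[i + 1] * shift
+  // for each of the 4 output pixels. In the upsampled case the interpolated
+  // samples are already interleaved in |top|, so plain pairs (0,1), (2,3),
+  // (4,5), (6,7) are selected.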
+  const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
+                                    : _mm_set_epi64x(0, 0x0403030202010100);
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  // All rows from |min_corner_only_y| down are simply filled with the corner
+  // pixel via memset. |max_base_x| is always greater than |height|, so
+  // clamping |xstep_units| to at least 1 below is enough to make the logic
+  // work.
+  const int xstep_units = std::max(xstep >> scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  // Rows up to this y-value can be computed without checking for bounds.
+  int y = 0;
+  int top_x = xstep;
+
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    const int top_base_x = top_x >> scale_bits;
+
+    // Permit negative values of |top_x|.
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+    const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+
+    // Load 8 values because we will select the sampled values based on
+    // |upsampled|.
+    const __m128i values = LoadLo8(top + top_base_x);
+    const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
+    const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+    __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
+    prod = RightShiftWithRounding_U16(prod, rounding_bits);
+    // Replace pixels from invalid range with top-right corner.
+    prod = _mm_blendv_epi8(prod, final_top_val, past_max);
+    Store4(dst, _mm_packus_epi16(prod, prod));
+  }
+
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    memset(dst, top[max_base_x], /* width */ 4);
+    dst += stride;
+  }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
+                                   const uint8_t* const top_row,
+                                   const int width, const int height,
+                                   const int xstep, const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const __m128i sampler =
+      upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+                : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  const int scale_bits = 6 - upsample_shift;
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+
+  // All rows from |min_corner_only_y| down will simply use memset. |max_base_x|
+  // is always greater than |height|, so clipping to 1 is enough to make the
+  // logic work.
+  const int xstep_units = std::max(xstep >> scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  // Rows up to this y-value can be computed without checking for bounds.
+  const int max_no_corner_y = std::min(
+      LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
+      height);
+  // No need to check for exceeding |max_base_x| in the first loop.
+  int y = 0;
+  int top_x = xstep;
+  for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> scale_bits;
+    // Permit negative values of |top_x|.
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    int x = 0;
+    do {
+      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_maddubs_epi16(vals, shifts);
+      vals = RightShiftWithRounding_U16(vals, rounding_bits);
+      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+      top_base_x += base_step8;
+      x += 8;
+    } while (x < width);
+  }
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+  for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> scale_bits;
+
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    int x = 0;
+    const int min_corner_only_x =
+        std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+    for (; x < min_corner_only_x;
+         x += 8, top_base_x += base_step8,
+         top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+      const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+      // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+      // reading out of bounds. If all indices are past max and we don't need to
+      // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+      // reset for the next |y|.
+      top_base_x &= ~_mm_cvtsi128_si32(past_max);
+      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_maddubs_epi16(vals, shifts);
+      vals = RightShiftWithRounding_U16(vals, rounding_bits);
+      vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+    }
+    // Corner-only section of the row.
+    memset(dest + x, top_row[max_base_x], width - x);
+  }
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    memset(dest, top_row[max_base_x], width);
+    dest += stride;
+  }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
+                                    const uint8_t* const top_row,
+                                    const int width, const int height,
+                                    const int xstep, const bool upsampled) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  if (xstep == 64) {
+    DirectionalZone1_Step64(dest, stride, top_row, width, height);
+    return;
+  }
+  if (width == 4) {
+    DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
+    return;
+  }
+  if (width >= 32) {
+    DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+                           upsampled);
+    return;
+  }
+  const __m128i sampler =
+      upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+                : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  const int scale_bits = 6 - upsample_shift;
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+
+  // No need to check for exceeding |max_base_x| in the loops.
+  if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
+    int top_x = xstep;
+    int y = 0;
+    do {
+      int top_base_x = top_x >> scale_bits;
+      // Permit negative values of |top_x|.
+      const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+      const __m128i shift = _mm_set1_epi8(shift_val);
+      const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+      const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+      int x = 0;
+      do {
+        const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+        __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+        vals = _mm_maddubs_epi16(vals, shifts);
+        vals = RightShiftWithRounding_U16(vals, rounding_bits);
+        StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+        top_base_x += base_step8;
+        x += 8;
+      } while (x < width);
+      dest += stride;
+      top_x += xstep;
+    } while (++y < height);
+    return;
+  }
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+  int top_x = xstep;
+  int y = 0;
+  do {
+    int top_base_x = top_x >> scale_bits;
+
+    if (top_base_x >= max_base_x) {
+      for (int i = y; i < height; ++i) {
+        memset(dest, top_row[max_base_x], width);
+        dest += stride;
+      }
+      return;
+    }
+
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    int x = 0;
+    for (; x < width - 8;
+         x += 8, top_base_x += base_step8,
+         top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+      const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+      // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+      // reading out of bounds. If all indices are past max and we don't need to
+      // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+      // reset for the next |y|.
+      top_base_x &= ~_mm_cvtsi128_si32(past_max);
+      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_maddubs_epi16(vals, shifts);
+      vals = RightShiftWithRounding_U16(vals, rounding_bits);
+      vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+    }
+    const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+    __m128i vals;
+    if (upsampled) {
+      vals = LoadUnaligned16(top_row + top_base_x);
+    } else {
+      const __m128i top_vals = LoadLo8(top_row + top_base_x);
+      vals = _mm_shuffle_epi8(top_vals, sampler);
+      vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
+    }
+    vals = _mm_maddubs_epi16(vals, shifts);
+    vals = RightShiftWithRounding_U16(vals, rounding_bits);
+    vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+    StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+    dest += stride;
+    top_x += xstep;
+  } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
+                                           const void* const top_row,
+                                           const int width, const int height,
+                                           const int xstep,
+                                           const bool upsampled_top) {
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  auto* dst = static_cast<uint8_t*>(dest);
+  DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
+                          upsampled_top);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
+                                 const uint8_t* const left_column,
+                                 const int base_left_y, const int ystep) {
+  // For use in the non-upsampled case.
+  const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+
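+  // Zone 3 predicts along columns of |left_column|. Each result_block[x] holds
+  // one predicted column; the transpose below turns the columns into rows.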
+  __m128i result_block[4];
+  for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
+    const int left_base_y = left_y >> scale_bits;
+    const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i vals;
+    if (upsampled) {
+      vals = LoadLo8(left_column + left_base_y);
+    } else {
+      const __m128i top_vals = LoadLo8(left_column + left_base_y);
+      vals = _mm_shuffle_epi8(top_vals, sampler);
+    }
+    vals = _mm_maddubs_epi16(vals, shifts);
+    vals = RightShiftWithRounding_U16(vals, rounding_bits);
+    result_block[x] = _mm_packus_epi16(vals, vals);
+  }
+  const __m128i result = Transpose4x4_U8(result_block);
+  // This is result_row0.
+  Store4(dest, result);
+  dest += stride;
+  const int result_row1 = _mm_extract_epi32(result, 1);
+  memcpy(dest, &result_row1, sizeof(result_row1));
+  dest += stride;
+  const int result_row2 = _mm_extract_epi32(result, 2);
+  memcpy(dest, &result_row2, sizeof(result_row2));
+  dest += stride;
+  const int result_row3 = _mm_extract_epi32(result, 3);
+  memcpy(dest, &result_row3, sizeof(result_row3));
+}
+
+template <bool upsampled, int height>
+inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
+                                 const uint8_t* const left_column,
+                                 const int base_left_y, const int ystep) {
+  // For use in the non-upsampled case.
+  const __m128i sampler =
+      _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shift = _mm_set1_epi8(32);
+  // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+  const int rounding_bits = 5;
+
+  __m128i result_block[8];
+  for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
+    const int left_base_y = left_y >> scale_bits;
+    const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi8(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+    __m128i vals;
+    if (upsampled) {
+      vals = LoadUnaligned16(left_column + left_base_y);
+    } else {
+      const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
+      vals = _mm_shuffle_epi8(top_vals, sampler);
+    }
+    vals = _mm_maddubs_epi16(vals, shifts);
+    result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
+  }
+  Transpose8x8_U16(result_block, result_block);
+  for (int y = 0; y < height; ++y) {
+    StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
+    dest += stride;
+  }
+}
+
+// 7.11.2.4 (9) angle > 180
+void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
+                                           const void* const left_column,
+                                           const int width, const int height,
+                                           const int ystep,
+                                           const bool upsampled) {
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int upsample_shift = static_cast<int>(upsampled);
+  if (width == 4 || height == 4) {
+    const ptrdiff_t stride4 = stride << 2;
+    if (upsampled) {
+      int left_y = ystep;
+      int x = 0;
+      do {
+        uint8_t* dst_x = dst + x;
+        int y = 0;
+        do {
+          DirectionalZone3_4x4<true>(
+              dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+          dst_x += stride4;
+          y += 4;
+        } while (y < height);
+        left_y += ystep << 2;
+        x += 4;
+      } while (x < width);
+    } else {
+      int left_y = ystep;
+      int x = 0;
+      do {
+        uint8_t* dst_x = dst + x;
+        int y = 0;
+        do {
+          DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
+                                      ystep);
+          dst_x += stride4;
+          y += 4;
+        } while (y < height);
+        left_y += ystep << 2;
+        x += 4;
+      } while (x < width);
+    }
+    return;
+  }
+
+  const ptrdiff_t stride8 = stride << 3;
+  if (upsampled) {
+    int left_y = ystep;
+    int x = 0;
+    do {
+      uint8_t* dst_x = dst + x;
+      int y = 0;
+      do {
+        DirectionalZone3_8xH<true, 8>(
+            dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+        dst_x += stride8;
+        y += 8;
+      } while (y < height);
+      left_y += ystep << 3;
+      x += 8;
+    } while (x < width);
+  } else {
+    int left_y = ystep;
+    int x = 0;
+    do {
+      uint8_t* dst_x = dst + x;
+      int y = 0;
+      do {
+        DirectionalZone3_8xH<false, 8>(
+            dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+        dst_x += stride8;
+        y += 8;
+      } while (y < height);
+      left_y += ystep << 3;
+      x += 8;
+    } while (x < width);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Directional Zone 2 Functions
+// 7.11.2.4 (8)
+
+// DirectionalBlend* selectively overwrites the values written by
+// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
+// row.
+template <int y_selector>
+inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
+                                     const __m128i& dest_index_vect,
+                                     const __m128i& vals,
+                                     const __m128i& zone_bounds) {
+  const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
+  const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+  const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
+  const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+  Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
+                                     const __m128i& dest_index_vect,
+                                     const __m128i& vals,
+                                     const __m128i& zone_bounds,
+                                     const __m128i& bounds_selector) {
+  const __m128i max_dest_x_vect =
+      _mm_shuffle_epi8(zone_bounds, bounds_selector);
+  const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+  const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
+  const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+  StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+constexpr int kDirectionalWeightBits = 5;
+// |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
+// |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
+// shift) and shift. Shift is guaranteed to be between 0 and 32.
+inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
+                                                 const __m128i& shifts,
+                                                 const __m128i& sampler) {
+  const __m128i src_vals = LoadUnaligned16(source);
+  __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
+  vals = _mm_maddubs_epi16(vals, shifts);
+  return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
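+// For example, a relative index of -3 becomes shuffle index 12, which selects
+// left_column_base[-3] once the source address has been lowered by 15.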
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
+    uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
+    __m128i left_y) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shifts = _mm_set1_epi8(32);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+  const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
+  const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
+  // |left_column| and |sampler| are both offset by 15 so the indices are
+  // always positive.
+  const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
+  for (int y = 0; y < 4; dst += stride, ++y) {
+    __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+    offset_y = _mm_packs_epi16(offset_y, offset_y);
+
+    const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+    __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+    // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
+    // can work as shuffle indices. Some values may be out of bounds, but their
+    // pred results will be masked over by top prediction.
+    sampler = _mm_add_epi8(sampler, positive_offset);
+
+    __m128i shifts = _mm_srli_epi16(
+        _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+    shifts = _mm_packus_epi16(shifts, shifts);
+    const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+    shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+        left_column + (y << upsample_shift), shifts, sampler);
+    Store4(dst, _mm_packus_epi16(vals, vals));
+  }
+}
+
+// The height at which a load of 16 bytes will not contain enough source pixels
+// from |left_column| to supply an accurate row when computing 8 pixels at a
+// time. The values are found by inspection. By coincidence, all angles that
+// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
+// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+    1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
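+// For example, (ystep >> 6) == 2 yields 16, so those angles use the shuffle
+// path only for the first 16 rows; later rows use DirectionalZone3_8xH.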
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
+    uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
+    __m128i left_y) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int scale_bits = 6 - upsample_shift;
+  const __m128i max_shifts = _mm_set1_epi8(32);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+  const __m128i index_increment = _mm_set1_epi8(1);
+  const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
+  for (int y = 0; y < 8; dst += stride, ++y) {
+    __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+    offset_y = _mm_packs_epi16(offset_y, offset_y);
+    const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+
+    // Offset the relative index because ystep is negative in Zone 2 and shuffle
+    // indices must be nonnegative.
+    __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+    sampler = _mm_add_epi8(sampler, denegation);
+
+    __m128i shifts = _mm_srli_epi16(
+        _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+    shifts = _mm_packus_epi16(shifts, shifts);
+    const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+    shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+
+    // The specification adds (y << 6) to left_y, which is subject to
+    // upsampling, but this puts sampler indices out of the 0-15 range. It is
+    // equivalent to offset the source address by (y << upsample_shift) instead.
+    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+        left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
+        sampler);
+    StoreLo8(dst, _mm_packus_epi16(vals, vals));
+  }
+}
+
+// |zone_bounds| is an epi16 of the relative x index at which base >= -(1 <<
+// upsampled_top), for each row. When there are 4 values, they can be duplicated
+// with a non-register shuffle mask.
+// |shifts| is one pair of weights that applies throughout a given row.
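+// The y_selector immediates 0x00, 0x55, 0xAA and 0xFF broadcast the bound for
+// rows 0 through 3 across the low lanes before comparing with |dest_index_x|.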
+template <bool upsampled_top>
+inline void DirectionalZone1Blend_4x4(
+    uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+    __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+    const __m128i& dest_index_x, int top_x, const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled_top);
+  const int scale_bits_x = 6 - upsample_shift;
+  top_x -= xstep;
+
+  int top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
+  DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
+  top_x -= xstep;
+  dest += stride;
+
+  top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
+  DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
+  top_x -= xstep;
+  dest += stride;
+
+  top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
+  DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
+  top_x -= xstep;
+  dest += stride;
+
+  top_base_x = (top_x >> scale_bits_x);
+  const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
+      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
+  DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
+}
+
+template <bool upsampled_top, int height>
+inline void DirectionalZone1Blend_8xH(
+    uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+    __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+    const __m128i& dest_index_x, int top_x, const int xstep) {
+  const int upsample_shift = static_cast<int>(upsampled_top);
+  const int scale_bits_x = 6 - upsample_shift;
+
+  __m128i y_selector = _mm_set1_epi32(0x01000100);
+  const __m128i index_increment = _mm_set1_epi32(0x02020202);
+  for (int y = 0; y < height; ++y,
+           y_selector = _mm_add_epi8(y_selector, index_increment),
+           dest += stride) {
+    top_x -= xstep;
+    const int top_base_x = top_x >> scale_bits_x;
+    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+        top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
+    DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector);
+  }
+}
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for this function is to know how many blocks can be processed
+// with just pixels from |top_ptr|, then handle mixed blocks, then handle only
+// blocks that take from |left_ptr|. Additionally, a fast index-shuffle
+// approach is used for pred values from |left_column| in sections that permit
+// it.
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
+                                    const uint8_t* const top_row,
+                                    const uint8_t* const left_column,
+                                    const int width, const int height,
+                                    const int xstep, const int ystep) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  const __m128i max_shift = _mm_set1_epi8(32);
+  const ptrdiff_t stride8 = stride << 3;
+  const __m128i dest_index_x =
+      _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+  const __m128i sampler_top =
+      upsampled_top
+          ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+          : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+  // All columns from |min_top_only_x| to the right will only need |top_row| to
+  // compute. This assumes minimum |xstep| is 3.
+  const int min_top_only_x = std::min((height * xstep) >> 6, width);
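+  // For example, height == 16 with xstep == 128 gives min_top_only_x == 32, so
+  // columns 32 and up are predicted from |top_row| alone.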
+
+  // For steep angles, the source pixels from left_column may not fit in a
+  // 16-byte load for shuffling.
+  // TODO(petersonab): Find a more precise formula for this subject to x.
+  const int max_shuffle_height =
+      std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
+
+  const int xstep8 = xstep << 3;
+  const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
+  // Accumulate xstep across 8 rows.
+  const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+  const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+  const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
+  const __m128i scaled_one = _mm_set1_epi16(-64);
+  __m128i xstep_bounds_base =
+      (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+                    : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+  const int ystep8 = ystep << 3;
+  const int left_base_increment8 = ystep8 >> 6;
+  const int ystep_remainder8 = ystep8 & 0x3F;
+  const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which is covered under the left_column
+  // offset. Following values need the full ystep as a relative offset.
+  const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+  const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+  __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+  left_y = _mm_add_epi16(ystep_init, left_y);
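+  // Lane k of |left_y| is now -ystep_remainder - k * ystep, the 1/64-scaled
+  // offset into |left_column| for column k, relative to |left_offset|.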
+
+  const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
+  int x = 0;
+
+  // This loop treats each set of 8 columns in 3 stages with y-value boundaries.
+  // The first stage, before the first y-loop, covers blocks that are only
+  // computed from the top row. The second stage, comprising two y-loops, covers
+  // blocks that have a mixture of values computed from top or left. The final
+  // stage covers blocks that are only computed from the left.
+  for (int left_offset = -left_base_increment; x < min_top_only_x;
+       x += 8,
+           xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+           // Watch left_y because it can still get big.
+       left_y = _mm_add_epi16(left_y, increment_left8),
+           left_offset -= left_base_increment8) {
+    uint8_t* dst_x = dst + x;
+
+    // Round down to the nearest multiple of 8.
+    const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+    DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+                         max_top_only_y, -xstep, upsampled_top);
+    DirectionalZone1_4xH(dst_x + 4, stride,
+                         top_row + ((x + 4) << upsample_top_shift),
+                         max_top_only_y, -xstep, upsampled_top);
+
+    int y = max_top_only_y;
+    dst_x += stride * y;
+    const int xstep_y = xstep * y;
+    const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+    // All rows from |min_left_only_y| down for this set of columns only need
+    // |left_column| to compute.
+    const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+    // high. This means that max_shuffle_height is unbounded and xstep_bounds
+    // will overflow in 16 bits. This is prevented by stopping the first
+    // blending loop at min_left_only_y for such cases, which means we skip over
+    // the second blending loop as well.
+    const int left_shuffle_stop_y =
+        std::min(max_shuffle_height, min_left_only_y);
+    __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+    __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+    int top_x = -xstep_y;
+
+    for (; y < left_shuffle_stop_y;
+         y += 8, dst_x += stride8,
+         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+         top_x -= xstep8) {
+      DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y);
+
+      __m128i shifts = _mm_srli_epi16(
+          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+                        shift_mask),
+          1);
+      shifts = _mm_packus_epi16(shifts, shifts);
+      __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+      __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+      DirectionalZone1Blend_8xH<upsampled_top, 8>(
+          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+    }
+    // Pick up from the last y-value, using the 10% slower but secure method for
+    // left prediction.
+    const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
+    for (; y < min_left_only_y;
+         y += 8, dst_x += stride8,
+         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+         top_x -= xstep8) {
+      const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+
+      DirectionalZone3_8xH<upsampled_left, 8>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+          -ystep);
+
+      __m128i shifts = _mm_srli_epi16(
+          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+                        shift_mask),
+          1);
+      shifts = _mm_packus_epi16(shifts, shifts);
+      __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+      DirectionalZone1Blend_8xH<upsampled_top, 8>(
+          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+    }
+    // Loop over y for left_only rows.
+    for (; y < height; y += 8, dst_x += stride8) {
+      DirectionalZone3_8xH<upsampled_left, 8>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+          -ystep);
+    }
+  }
+  for (; x < width; x += 4) {
+    DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+                         height, -xstep, upsampled_top);
+  }
+}
+
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
+                                      const uint8_t* const top_row,
+                                      const uint8_t* const left_column,
+                                      const int width, const int height,
+                                      const int xstep, const int ystep) {
+  auto* dst = static_cast<uint8_t*>(dest);
+  const int upsample_left_shift = static_cast<int>(upsampled_left);
+  const int upsample_top_shift = static_cast<int>(upsampled_top);
+  const __m128i max_shift = _mm_set1_epi8(32);
+  const ptrdiff_t stride4 = stride << 2;
+  const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
+  const __m128i sampler_top =
+      upsampled_top
+          ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+          : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+  // All columns from |min_top_only_x| to the right will only need |top_row| to
+  // compute.
+  assert(xstep >= 3);
+  const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+  const int xstep4 = xstep << 2;
+  const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
+  const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+  const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
+  __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+  const __m128i scaled_one = _mm_set1_epi16(-64);
+  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 - 1.
+  __m128i xstep_bounds_base =
+      (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+                    : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+  const int left_base_increment = ystep >> 6;
+  const int ystep_remainder = ystep & 0x3F;
+  const int ystep4 = ystep << 2;
+  const int left_base_increment4 = ystep4 >> 6;
+  // This is guaranteed to be less than 64, but accumulation may bring it past
+  // 64 for higher x values.
+  const int ystep_remainder4 = ystep4 & 0x3F;
+  const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
+  const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
+
+  // If the 64 scaling is regarded as a decimal point, the first value of the
+  // left_y vector omits the portion which will go into the left_column offset.
+  // Following values need the full ystep as a relative offset.
+  const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+  const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+  __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+  left_y = _mm_add_epi16(ystep_init, left_y);
+  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+
+  int x = 0;
+  // Loop over x for columns with a mixture of sources.
+  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
+           xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
+           left_y = _mm_add_epi16(left_y, increment_left4),
+           left_offset -= left_base_increment4) {
+    uint8_t* dst_x = dst + x;
+
+    // Round down to the nearest multiple of 4.
+    const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3;
+    DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+                         max_top_only_y, -xstep, upsampled_top);
+    int y = max_top_only_y;
+    dst_x += stride * y;
+    const int xstep_y = xstep * y;
+    const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+    // All rows from |min_left_only_y| down for this set of columns only need
+    // |left_column| to compute. The loop step rounds it up to a multiple of 4.
+    const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
+
+    __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+    __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+    int top_x = -xstep_y;
+
+    // Loop over y for mixed rows.
+    for (; y < min_left_only_y;
+         y += 4, dst_x += stride4,
+         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
+         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
+         top_x -= xstep4) {
+      DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+          dst_x, stride,
+          left_column + ((left_offset + y) * (1 << upsample_left_shift)),
+          left_y);
+
+      __m128i shifts = _mm_srli_epi16(
+          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+                        shift_mask),
+          1);
+      shifts = _mm_packus_epi16(shifts, shifts);
+      const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+      const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+      DirectionalZone1Blend_4x4<upsampled_top>(
+          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+    }
+    // Loop over y for left-only rows, if any.
+    for (; y < height; y += 4, dst_x += stride4) {
+      DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+          dst_x, stride,
+          left_column + ((left_offset + y) << upsample_left_shift), left_y);
+    }
+  }
+  // Loop over top-only columns, if any.
+  for (; x < width; x += 4) {
+    DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+                         height, -xstep, upsampled_top);
+  }
+}
+
+void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
+                                           const void* const top_row,
+                                           const void* const left_column,
+                                           const int width, const int height,
+                                           const int xstep, const int ystep,
+                                           const bool upsampled_top,
+                                           const bool upsampled_left) {
+  // Increasing the negative buffer for this function allows more rows to be
+  // processed at a time without branching in an inner loop to check the base.
+  uint8_t top_buffer[288];
+  uint8_t left_buffer[288];
+  memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+  memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
+  const uint8_t* top_ptr = top_buffer + 144;
+  const uint8_t* left_ptr = left_buffer + 144;
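+  // Each copy starts 16 bytes before the original pointer, and the working
+  // pointers sit 144 bytes into the local buffers, so the negative offsets
+  // used by the shuffle path always stay inside |top_buffer| and |left_buffer|.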
+  if (width == 4 || height == 4) {
+    if (upsampled_left) {
+      if (upsampled_top) {
+        DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+                                              width, height, xstep, ystep);
+      } else {
+        DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+                                               width, height, xstep, ystep);
+      }
+    } else {
+      if (upsampled_top) {
+        DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+                                               width, height, xstep, ystep);
+      } else {
+        DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+                                                width, height, xstep, ystep);
+      }
+    }
+    return;
+  }
+  if (upsampled_left) {
+    if (upsampled_top) {
+      DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+                                          width, height, xstep, ystep);
+    } else {
+      DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+                                           width, height, xstep, ystep);
+    }
+  } else {
+    if (upsampled_top) {
+      DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+                                           width, height, xstep, ystep);
+    } else {
+      DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+                                            width, height, xstep, ystep);
+    }
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
+  dsp->directional_intra_predictor_zone2 =
+      DirectionalIntraPredictorZone2_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
+  dsp->directional_intra_predictor_zone3 =
+      DirectionalIntraPredictorZone3_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the formula
+// val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
+// val = top[top_base_x] << 5. Because top_base_x is y + 1 on row y, each row
+// copies |top| shifted by one more pixel, i.e. |top| is offset by 1.
+inline void DirectionalZone1_Step64(uint16_t* dst, ptrdiff_t stride,
+                                    const uint16_t* const top, const int width,
+                                    const int height) {
+  ptrdiff_t offset = 1;
+  if (height == 4) {
+    memcpy(dst, top + offset, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
+    return;
+  }
+  int y = height;
+  do {
+    memcpy(dst, top + offset, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 4, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 5, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 6, width * sizeof(dst[0]));
+    dst += stride;
+    memcpy(dst, top + offset + 7, width * sizeof(dst[0]));
+    dst += stride;
+
+    offset += 8;
+    y -= 8;
+  } while (y != 0);
+}
+
+// Produce a weighted average whose weights sum to 32.
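+// With 10-bit pixels the samples no longer fit in bytes, so the 8bpp maddubs
+// pairing is replaced by a 16-bit multiply followed by a horizontal add.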
+inline __m128i CombineTopVals4(const __m128i& top_vals, const __m128i& sampler,
+                               const __m128i& shifts,
+                               const __m128i& top_indices,
+                               const __m128i& final_top_val,
+                               const __m128i& border_index) {
+  const __m128i sampled_values = _mm_shuffle_epi8(top_vals, sampler);
+  __m128i prod = _mm_mullo_epi16(sampled_values, shifts);
+  prod = _mm_hadd_epi16(prod, prod);
+  const __m128i result = RightShiftWithRounding_U16(prod, 5 /*log2(32)*/);
+
+  const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
+  // Replace pixels from invalid range with top-right corner.
+  return _mm_blendv_epi8(result, final_top_val, past_max);
+}
+
+// When width is 4, only one load operation is needed per iteration. We also
+// avoid extra loop precomputations that cause too much overhead.
+inline void DirectionalZone1_4xH(uint16_t* dst, ptrdiff_t stride,
+                                 const uint16_t* const top, const int height,
+                                 const int xstep, const bool upsampled,
+                                 const __m128i& sampler) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+  const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" because
+  // only cmpgt is available.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  int y = 0;
+  int top_x = xstep;
+  const __m128i max_shift = _mm_set1_epi16(32);
+
+  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+    const int top_base_x = top_x >> index_scale_bits;
+
+    // Permit negative values of |top_x|.
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi16(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    // Load 8 values because we will select the sampled values based on
+    // |upsampled|.
+    const __m128i values = LoadUnaligned16(top + top_base_x);
+    const __m128i pred =
+        CombineTopVals4(values, sampler, shifts, top_index_vect, final_top_val,
+                        max_base_x_vect);
+    StoreLo8(dst, pred);
+  }
+
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    Memset(dst, top[max_base_x], /* width */ 4);
+    dst += stride;
+  }
+}
+
+// General purpose combine function.
+// |check_border| means the final source value has to be duplicated into the
+// result. This simplifies the loop structures that use precomputed boundaries
+// to identify sections where it is safe to compute without checking for the
+// right border.
+template <bool check_border>
+inline __m128i CombineTopVals(
+    const __m128i& top_vals_0, const __m128i& top_vals_1,
+    const __m128i& sampler, const __m128i& shifts,
+    const __m128i& top_indices = _mm_setzero_si128(),
+    const __m128i& final_top_val = _mm_setzero_si128(),
+    const __m128i& border_index = _mm_setzero_si128()) {
+  constexpr int scale_int_bits = 5;
+  const __m128i sampled_values_0 = _mm_shuffle_epi8(top_vals_0, sampler);
+  const __m128i sampled_values_1 = _mm_shuffle_epi8(top_vals_1, sampler);
+  const __m128i prod_0 = _mm_mullo_epi16(sampled_values_0, shifts);
+  const __m128i prod_1 = _mm_mullo_epi16(sampled_values_1, shifts);
+  const __m128i combined = _mm_hadd_epi16(prod_0, prod_1);
+  const __m128i result = RightShiftWithRounding_U16(combined, scale_int_bits);
+  if (check_border) {
+    const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
+    // Replace pixels from invalid range with top-right corner.
+    return _mm_blendv_epi8(result, final_top_val, past_max);
+  }
+  return result;
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint16_t* dest, ptrdiff_t stride,
+                                   const uint16_t* const top_row,
+                                   const int width, const int height,
+                                   const int xstep, const bool upsampled,
+                                   const __m128i& sampler) {
+  const int upsample_shift = static_cast<int>(upsampled);
+  const int index_scale_bits = 6 - upsample_shift;
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+  const __m128i max_shift = _mm_set1_epi16(32);
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping to 1 is enough
+  // to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  // Rows up to this y-value can be computed without checking for bounds.
+  const int max_no_corner_y = std::min(
+      LeftShift((max_base_x - (base_step * width)), index_scale_bits) / xstep,
+      height);
+  // No need to check for exceeding |max_base_x| in the first loop.
+  int y = 0;
+  int top_x = xstep;
+  for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+    // Permit negative values of |top_x|.
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi16(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+    int x = 0;
+    do {
+      const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+      const __m128i top_vals_1 =
+          LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+
+      const __m128i pred =
+          CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
+
+      StoreUnaligned16(dest + x, pred);
+      top_base_x += base_step8;
+      x += 8;
+    } while (x < width);
+  }
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to |top_base_x|, it is used to mask values
+  // that pass the end of the |top| buffer. Starting from 1 to simulate "cmpge"
+  // which is not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+  for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi16(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    int x = 0;
+    const int min_corner_only_x =
+        std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+    for (; x < min_corner_only_x;
+         x += 8, top_base_x += base_step8,
+         top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+      const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+      const __m128i top_vals_1 =
+          LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+      const __m128i pred =
+          CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
+                               top_index_vect, final_top_val, max_base_x_vect);
+      StoreUnaligned16(dest + x, pred);
+    }
+    // Corner-only section of the row.
+    Memset(dest + x, top_row[max_base_x], width - x);
+  }
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    Memset(dest, top_row[max_base_x], width);
+    dest += stride;
+  }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalIntraPredictorZone1_SSE4_1(
+    void* dest_ptr, ptrdiff_t stride, const void* const top_ptr,
+    const int width, const int height, const int xstep, const bool upsampled) {
+  const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+  auto* dest = static_cast<uint16_t*>(dest_ptr);
+  stride /= sizeof(uint16_t);
+  const int upsample_shift = static_cast<int>(upsampled);
+  if (xstep == 64) {
+    DirectionalZone1_Step64(dest, stride, top_row, width, height);
+    return;
+  }
+  // Each base pixel paired with its following pixel, for hadd purposes.
+  const __m128i adjacency_shuffler = _mm_set_epi16(
+      0x0908, 0x0706, 0x0706, 0x0504, 0x0504, 0x0302, 0x0302, 0x0100);
+  // This is equivalent to not shuffling at all.
+  const __m128i identity_shuffler = _mm_set_epi16(
+      0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
+  // This represents a trade-off between code size and speed. When |upsampled|
+  // is true, no shuffle is necessary, but skipping it without branching inside
+  // the loop would require two copies of the main function body.
+  const __m128i sampler = upsampled ? identity_shuffler : adjacency_shuffler;
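+  // The adjacency shuffle turns pixels {0..7} into lanes {0,1,1,2,2,3,3,4}, so
+  // the 16-bit multiply and horizontal add yield four weighted averages per
+  // 128-bit register.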
+  if (width == 4) {
+    DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled,
+                         sampler);
+    return;
+  }
+  if (width >= 32) {
+    DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+                           upsampled, sampler);
+    return;
+  }
+  const int index_scale_bits = 6 - upsample_shift;
+  const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+  const __m128i max_shift = _mm_set1_epi16(32);
+  const int base_step = 1 << upsample_shift;
+  const int base_step8 = base_step << 3;
+
+  // No need to check for exceeding |max_base_x| in the loops.
+  if (((xstep * height) >> index_scale_bits) + base_step * width < max_base_x) {
+    int top_x = xstep;
+    int y = height;
+    do {
+      int top_base_x = top_x >> index_scale_bits;
+      // Permit negative values of |top_x|.
+      const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+      const __m128i shift = _mm_set1_epi16(shift_val);
+      const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+      const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+      int x = 0;
+      do {
+        const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+        const __m128i top_vals_1 =
+            LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+        const __m128i pred =
+            CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
+        StoreUnaligned16(dest + x, pred);
+        top_base_x += base_step8;
+        x += 8;
+      } while (x < width);
+      dest += stride;
+      top_x += xstep;
+    } while (--y != 0);
+    return;
+  }
+
+  // General case. Blocks with width less than 32 do not benefit from x-wise
+  // loop splitting, but do benefit from using memset on appropriate rows.
+
+  // Each 16-bit value here corresponds to a position that may exceed
+  // |max_base_x|. When added to the top_base_x, it is used to mask values
+  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+  // not supported for packed integers.
+  const __m128i offsets =
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+
+  // All rows from |min_corner_only_y| down will simply use Memset.
+  // |max_base_x| is always greater than |height|, so clipping the denominator
+  // to 1 is enough to make the logic work.
+  const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+  int top_x = xstep;
+  int y = 0;
+  for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+    int top_base_x = top_x >> index_scale_bits;
+
+    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+    const __m128i shift = _mm_set1_epi16(shift_val);
+    const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+    const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+    for (int x = 0; x < width; x += 8, top_base_x += base_step8,
+             top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+      const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+      const __m128i top_vals_1 =
+          LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+      const __m128i pred =
+          CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
+                               top_index_vect, final_top_val, max_base_x_vect);
+      StoreUnaligned16(dest + x, pred);
+    }
+  }
+
+  // Fill in corner-only rows.
+  for (; y < height; ++y) {
+    Memset(dest, top_row[max_base_x], width);
+    dest += stride;
+  }
+}
+
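For readers tracing the vector code above, the per-pixel math reduces to a simple two-tap blend. A rough scalar sketch, written from the shift/weight setup visible in this function (CombineTopVals is assumed to perform the weighted add and 5-bit rounding; this is an illustration, not a drop-in replacement):

  // Scalar sketch (illustration only). |shift| is the per-row value computed
  // above as (LeftShift(top_x, upsample_shift) & 0x3F) >> 1, so it is in [0, 31].
  inline uint16_t Zone1PixelScalar(const uint16_t* top, int top_base_x,
                                   int shift, int max_base_x) {
    if (top_base_x >= max_base_x) return top[max_base_x];  // corner-only region
    const int val =
        (32 - shift) * top[top_base_x] + shift * top[top_base_x + 1];
    return static_cast<uint16_t>((val + 16) >> 5);  // round, then drop 5 bits
  }

For example, with xstep = 36 and no upsampling, the row where top_x = 108 uses top_base_x = 1 and shift = (108 & 0x3F) >> 1 = 22, i.e. a 10/22 blend of top[1] and top[2].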
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(DirectionalIntraPredictorZone1)
+  dsp->directional_intra_predictor_zone1 =
+      DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredDirectionalInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/intrapred_directional_sse4.h b/libgav1/src/dsp/x86/intrapred_directional_sse4.h
new file mode 100644
index 0000000..b352450
--- /dev/null
+++ b/libgav1/src/dsp/x86/intrapred_directional_sse4.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*; see the defines below
+// for specifics. These functions are not thread-safe.
+void IntraPredDirectionalInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
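A note on the #ifndef pattern above: whichever targeting header claims a baseline macro first wins, so the SSE4 defines only take effect when no higher optimization level has already set them. A hypothetical illustration (the AVX2 value is invented purely to show the precedence; this change adds no AVX2 zone1 predictor):

  #define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_AVX2
  #include "src/dsp/x86/intrapred_directional_sse4.h"
  // The #ifndef above now leaves the baseline untouched, so the
  // DSP_ENABLED_10BPP_SSE4_1(DirectionalIntraPredictorZone1) guard in Init10bpp
  // is false and the SSE4 function is not added to the Dsp table.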
diff --git a/libgav1/src/dsp/x86/intrapred_filter_sse4.cc b/libgav1/src/dsp/x86/intrapred_filter_sse4.cc
new file mode 100644
index 0000000..022af8d
--- /dev/null
+++ b/libgav1/src/dsp/x86/intrapred_filter_sse4.cc
@@ -0,0 +1,432 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_SSE4_1
+// Section 7.11.2.3. Recursive intra prediction process
+// This filter applies recursively to 4x2 sub-blocks within the transform block,
+// meaning that the predicted pixels in each sub-block are used as inputs to
+// sub-blocks below and to the right, if present.
+//
+// Each output value in the sub-block is predicted by a different filter applied
+// to the same array of top-left, top, and left values. If fn refers to the
+// output of the nth filter, given this block:
+// TL T0 T1 T2 T3
+// L0 f0 f1 f2 f3
+// L1 f4 f5 f6 f7
+// The filter input order is p0, p1, p2, p3, p4, p5, p6:
+// p0 p1 p2 p3 p4
+// p5 f0 f1 f2 f3
+// p6 f4 f5 f6 f7
+// Filters usually apply to 8 values for convenience, so in this case we fix
+// the 8th filter tap to 0 and disregard the value of the 8th input.
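As a plain-C reference for the recursion just described, a sketch of how one 4x2 sub-block is produced (it assumes kFilterIntraTaps is laid out as [predictor][output][tap] with the 8th tap zero, which is how the aligned loads below consume it):

  // Scalar sketch, illustration only; the SIMD code below is the real path.
  // |p| holds p0..p6 from the diagram above, |out| receives f0..f7 in raster
  // order within the sub-block.
  inline void FilterIntra4x2_Scalar(const uint8_t p[7], const int pred,
                                    uint8_t out[8]) {
    for (int n = 0; n < 8; ++n) {
      int sum = 0;
      for (int i = 0; i < 7; ++i) sum += kFilterIntraTaps[pred][n][i] * p[i];
      // RightShiftWithRounding(sum, 4), then clamp to [0, 255] the way
      // _mm_packus_epi16 does below.
      out[n] = static_cast<uint8_t>(std::min(std::max((sum + 8) >> 4, 0), 255));
    }
  }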
+
+// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
+// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
+constexpr int kDuplicateFirstHalf = 0x44;
+
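For readability, the same shuffle control written with the _MM_SHUFFLE helper from <xmmintrin.h> (a sanity check only; the code keeps the raw hex constant):

  static_assert(_MM_SHUFFLE(1, 0, 1, 0) == kDuplicateFirstHalf,
                "0x44 selects 32-bit lanes 0, 1, 0, 1");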
+// Apply all filter taps to the given 7 packed 8-bit pixel values, keeping the
+// 8th tap at zero so the unused 8th input does not affect the sum.
+// |pixels| contains p0-p7 in order as shown above.
+// |taps_0_1| contains the filter kernels used to predict f0 and f1, and so on.
+inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
+                             const __m128i& pixels, const __m128i& taps_0_1,
+                             const __m128i& taps_2_3, const __m128i& taps_4_5,
+                             const __m128i& taps_6_7) {
+  const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
+  const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
+  // |output_half| contains 8 partial sums for f0-f3 (two per output).
+  __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
+  __m128i output = _mm_hadd_epi16(output_half, output_half);
+  const __m128i output_row0 =
+      _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+                       /* unused half */ output);
+  Store4(dst, output_row0);
+  const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
+  const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
+  output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
+  output = _mm_hadd_epi16(output_half, output_half);
+  const __m128i output_row1 =
+      _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+                       /* arbitrary pack arg */ output);
+  Store4(dst + stride, output_row1);
+}
+
+// 4xH transform sizes are given special treatment because LoadLo8 goes out
+// of bounds and every block involves the left column. The top-left pixel, p0,
+// is stored in the top buffer for the first 4x2, but comes from the left buffer
+// for successive blocks. This implementation takes advantage of the fact
+// that the p5 and p6 for each sub-block come solely from the |left_ptr| buffer,
+// using shifts to arrange things to fit reusable shuffle vectors.
+inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
+                      const uint8_t* const top_ptr,
+                      const uint8_t* const left_ptr, FilterIntraPredictor pred,
+                      const int height) {
+  // Two filter kernels per vector.
+  const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
+  const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
+  const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
+  const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
+  __m128i top = Load4(top_ptr - 1);
+  __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
+  __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
+  left = _mm_slli_si128(left, 5);
+
+  // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
+  // left[2], left[3], left[4], left[5], left[6], left[7]
+  // Let rn represent a pixel usable as pn for the 4x2 after this one. We get:
+  //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+  // p0 p1 p2 p3 p4 p5 p6 r5 r6 ...
+  //                   r0
+  pixels = _mm_or_si128(left, pixels);
+
+  // Two sets of the same input pixels to apply two filters at once.
+  pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+  Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                   taps_6_7);
+  dest += stride;  // Move to y = 1.
+  pixels = Load4(dest);
+
+  // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
+  // left[0], left[1], ...
+  //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+  // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+  //                         r0
+  pixels = _mm_or_si128(left, pixels);
+
+  // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
+  // byte is an unused value, which shall be multiplied by 0 when we apply the
+  // filter.
+  constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
+
+  // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
+  const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
+  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+  dest += stride;  // Move to y = 2.
+  Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                   taps_6_7);
+  dest += stride;  // Move to y = 3.
+
+  // Compute the middle 8 rows before using common code for the final 4 rows, in
+  // order to fit the assumption that |left| has the next TL at position 8.
+  if (height == 16) {
+    // This shift allows us to use pixel_order2 twice after shifting by 2 later.
+    left = _mm_slli_si128(left, 1);
+    pixels = Load4(dest);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
+    // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
+    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
+    //                                  r0
+    pixels = _mm_or_si128(left, pixels);
+
+    // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 7, 8, 15. The
+    // last byte is an unused value, as above. The top-left was shifted to
+    // position nine to keep two empty spaces after the top pixels.
+    constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
+
+    // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
+    // the end.
+    const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    dest += stride;  // Move to y = 4.
+
+    // First 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+
+    // Clear all but the final pixel in the first 8 bytes of the left column.
+    __m128i keep_top_left = _mm_srli_si128(left, 13);
+    dest += stride;  // Move to y = 5.
+    pixels = Load4(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
+    // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
+    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
+    //                                  r0
+    pixels = _mm_or_si128(left, pixels);
+    left = LoadLo8(left_ptr + 8);
+
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    dest += stride;  // Move to y = 6.
+
+    // Second 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+
+    // Position TL value so we can use pixel_order1.
+    keep_top_left = _mm_slli_si128(keep_top_left, 6);
+    dest += stride;  // Move to y = 7.
+    pixels = Load4(dest);
+    left = _mm_slli_si128(left, 7);
+    left = _mm_or_si128(left, keep_top_left);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+    //                         r0
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 8.
+
+    // Third 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    dest += stride;  // Move to y = 9.
+
+    // Prepare final inputs.
+    pixels = Load4(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+    //                         r0
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 10.
+
+    // Fourth 4x2 in the if body.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    dest += stride;  // Move to y = 11.
+  }
+
+  // In both the 8 and 16 case at this point, we can assume that |left| has the
+  // next TL at position 8.
+  if (height > 4) {
+    // Erase prior left pixels by shifting TL to position 0.
+    left = _mm_srli_si128(left, 8);
+    left = _mm_slli_si128(left, 6);
+    pixels = Load4(dest);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+    //                         r0
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 12 or 4.
+
+    // First of final two 4x2 blocks.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    dest += stride;  // Move to y = 13 or 5.
+    pixels = Load4(dest);
+    left = _mm_srli_si128(left, 2);
+
+    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+    // left[-1], left[0], left[1], left[2], left[3], ...
+    //  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+    // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+    //                         r0
+    pixels = _mm_or_si128(left, pixels);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+    dest += stride;  // Move to y = 14 or 6.
+
+    // Last of final two 4x2 blocks.
+    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+  }
+}
+
+void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride,
+                                 const void* const top_row,
+                                 const void* const left_column,
+                                 FilterIntraPredictor pred, const int width,
+                                 const int height) {
+  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+  auto* dst = static_cast<uint8_t*>(dest);
+  if (width == 4) {
+    Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
+    return;
+  }
+
+  // There is one set of 7 taps for each of the 4x2 output pixels.
+  const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
+  const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
+  const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
+  const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
+
+  // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
+  // the end is an unused value, which shall be multiplied by 0 when we apply
+  // the filter.
+  constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
+
+  // Takes the "left section" and puts it right after p0-p4.
+  const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
+
+  // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
+  // byte is unused as above.
+  constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
+
+  // Shuffles the "top left" from the left section to the front. Used when
+  // grabbing data from left_column rather than top_row.
+  const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
+
+  // This first pass takes care of the cases where the top left pixel comes from
+  // top_row.
+  __m128i pixels = LoadLo8(top_ptr - 1);
+  __m128i left = _mm_slli_si128(Load4(left_column), 8);
+  pixels = _mm_or_si128(pixels, left);
+
+  // Two sets of the same pixels to multiply with two sets of taps.
+  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+  Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7);
+  left = _mm_srli_si128(left, 1);
+
+  // Load the pixels just predicted at y = 1; they provide the top inputs for
+  // the next 4x2 block.
+  pixels = Load4(dst + stride);
+
+  // Because of the above shift, this OR 'invades' the final byte of the first
+  // 8 bytes of |pixels|. This is acceptable because the 8th filter tap is
+  // always a padded 0.
+  pixels = _mm_or_si128(pixels, left);
+  pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+  const ptrdiff_t stride2 = stride << 1;
+  const ptrdiff_t stride4 = stride << 2;
+  Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                   taps_6_7);
+  dst += 4;
+  for (int x = 3; x < width - 4; x += 4) {
+    pixels = Load4(top_ptr + x);
+    pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
+    pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+    pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+    // Duplicate bottom half into upper half.
+    pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+    Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+    pixels = Load4(dst + stride - 1);
+    pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+    pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+    pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
+
+    // Duplicate bottom half into upper half.
+    pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+    Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+                     taps_4_5, taps_6_7);
+    dst += 4;
+  }
+
+  // Handle the remaining rows, which take their top inputs from previously
+  // predicted blocks rather than from top_row.
+  for (int y = 4; y < height; y += 4) {
+    // Leftmost 4x4 block for this height.
+    dst -= width;
+    dst += stride4;
+
+    // Top Left is not available by offset in these leftmost blocks.
+    pixels = Load4(dst - stride);
+    left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
+    left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
+    pixels = _mm_or_si128(pixels, left);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                     taps_6_7);
+
+    // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
+    left = _mm_srli_si128(left, 2);
+    pixels = Load4(dst + stride);
+    pixels = _mm_or_si128(pixels, left);
+    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+    Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+                     taps_4_5, taps_6_7);
+
+    dst += 4;
+
+    // Remaining 4x4 blocks for this height.
+    for (int x = 4; x < width; x += 4) {
+      pixels = Load4(dst - stride - 1);
+      pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
+      pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+      pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+      // Duplicate bottom half into upper half.
+      pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+      Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+                       taps_6_7);
+      pixels = Load4(dst + stride - 1);
+      pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+      pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+      pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
+
+      // Duplicate bottom half into upper half.
+      pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+      Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+                       taps_4_5, taps_6_7);
+      dst += 4;
+    }
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+// These guards check that this version of the function was not superseded by
+// a higher optimization level, such as AVX. The corresponding #define also
+// prevents the C version from being added to the table.
+#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
+  dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
+#endif
+}
+
+}  // namespace
+
+void IntraPredFilterInit_SSE4_1() { Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredFilterInit_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1
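One practical consequence of the empty stub above: callers may invoke the init unconditionally and let the preprocessor decide whether it does anything. A minimal sketch (InitX86FilterIntra is a hypothetical wrapper name; the real dispatch lives in libgav1's dsp initialization, which is not part of this change):

  #include "src/dsp/x86/intrapred_filter_sse4.h"

  void InitX86FilterIntra() {
    // With LIBGAV1_TARGETING_SSE4_1 this registers FilterIntraPredictor_SSE4_1
    // in the 8bpp Dsp table, subject to the DSP_ENABLED_8BPP_SSE4_1 guard;
    // otherwise it is the empty stub defined above.
    libgav1::dsp::IntraPredFilterInit_SSE4_1();
  }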
diff --git a/libgav1/src/dsp/x86/intrapred_filter_sse4.h b/libgav1/src/dsp/x86/intrapred_filter_sse4.h
new file mode 100644
index 0000000..ce28f93
--- /dev/null
+++ b/libgav1/src/dsp/x86/intrapred_filter_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor; see the defines below for
+// specifics. This function is not thread-safe.
+void IntraPredFilterInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
diff --git a/libgav1/src/dsp/x86/intrapred_smooth_sse4.cc b/libgav1/src/dsp/x86/intrapred_smooth_sse4.cc
index a761813..de9f551 100644
--- a/libgav1/src/dsp/x86/intrapred_smooth_sse4.cc
+++ b/libgav1/src/dsp/x86/intrapred_smooth_sse4.cc
@@ -12,22 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_smooth.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <xmmintrin.h>
 
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
-#include <cstring>  // memcpy
 
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
 #include "src/dsp/x86/common_sse4.h"
 #include "src/utils/common.h"
+#include "src/utils/constants.h"
 
 namespace libgav1 {
 namespace dsp {
@@ -67,29 +67,6 @@
   Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
 }
 
-template <int y_mask>
-inline __m128i SmoothVerticalSum4(const __m128i& top, const __m128i& weights,
-                                  const __m128i& scaled_bottom_left) {
-  const __m128i weights_y = _mm_shuffle_epi32(weights, y_mask);
-  const __m128i weighted_top_y = _mm_mullo_epi16(top, weights_y);
-  const __m128i scaled_bottom_left_y =
-      _mm_shuffle_epi32(scaled_bottom_left, y_mask);
-  return _mm_add_epi32(scaled_bottom_left_y, weighted_top_y);
-}
-
-template <int y_mask>
-inline void WriteSmoothVerticalSum4(uint8_t* dest, const __m128i& top,
-                                    const __m128i& weights,
-                                    const __m128i& scaled_bottom_left,
-                                    const __m128i& round) {
-  __m128i pred_sum =
-      SmoothVerticalSum4<y_mask>(top, weights, scaled_bottom_left);
-  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
-  pred_sum = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
-  const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
-  Store4(dest, _mm_shuffle_epi8(pred_sum, cvtepi32_epi8));
-}
-
 // For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
 // |pixels| is a segment of the top row or the whole top row, and |weights| is
 // repeated.
@@ -2649,7 +2626,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else  // !LIBGAV1_TARGETING_SSE4_1
 
 namespace libgav1 {
 namespace dsp {
@@ -2659,4 +2636,4 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/intrapred_smooth_sse4.h b/libgav1/src/dsp/x86/intrapred_smooth_sse4.h
new file mode 100644
index 0000000..9353371
--- /dev/null
+++ b/libgav1/src/dsp/x86/intrapred_smooth_sse4.h
@@ -0,0 +1,318 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_SSE4_1();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+  LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
+
+#endif  // LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
diff --git a/libgav1/src/dsp/x86/intrapred_sse4.cc b/libgav1/src/dsp/x86/intrapred_sse4.cc
index 11ba9aa..063929d 100644
--- a/libgav1/src/dsp/x86/intrapred_sse4.cc
+++ b/libgav1/src/dsp/x86/intrapred_sse4.cc
@@ -15,7 +15,7 @@
 #include "src/dsp/intrapred.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <xmmintrin.h>
 
@@ -23,13 +23,14 @@
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
-#include <cstring>  // memcpy
+#include <cstring>
 
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
 #include "src/dsp/x86/common_sse4.h"
 #include "src/dsp/x86/transpose_sse4.h"
 #include "src/utils/common.h"
+#include "src/utils/constants.h"
 
 namespace libgav1 {
 namespace dsp {
@@ -51,10 +52,6 @@
   return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier));
 }
 
-// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
-// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
-constexpr int kDuplicateFirstHalf = 0x44;
-
 //------------------------------------------------------------------------------
 // DcPredFuncs_SSE4_1
 
@@ -1408,1337 +1405,6 @@
   WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3);
 }
 
-//------------------------------------------------------------------------------
-// 7.11.2.4. Directional intra prediction process
-
-// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
-// upsampling is ruled out. In addition, the bits masked by 0x3F for
-// |shift_val| are 0 for all multiples of 64, so the formula
-// val = top[top_base_x]*shift + top[top_base_x+1]*(32-shift), reduces to
-// val = top[top_base_x+1] << 5, meaning only the second set of pixels is
-// involved in the output. Hence |top| is offset by 1.
-inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
-                                    const uint8_t* const top, const int width,
-                                    const int height) {
-  ptrdiff_t offset = 1;
-  if (height == 4) {
-    memcpy(dst, top + offset, width);
-    dst += stride;
-    memcpy(dst, top + offset + 1, width);
-    dst += stride;
-    memcpy(dst, top + offset + 2, width);
-    dst += stride;
-    memcpy(dst, top + offset + 3, width);
-    return;
-  }
-  int y = 0;
-  do {
-    memcpy(dst, top + offset, width);
-    dst += stride;
-    memcpy(dst, top + offset + 1, width);
-    dst += stride;
-    memcpy(dst, top + offset + 2, width);
-    dst += stride;
-    memcpy(dst, top + offset + 3, width);
-    dst += stride;
-    memcpy(dst, top + offset + 4, width);
-    dst += stride;
-    memcpy(dst, top + offset + 5, width);
-    dst += stride;
-    memcpy(dst, top + offset + 6, width);
-    dst += stride;
-    memcpy(dst, top + offset + 7, width);
-    dst += stride;
-
-    offset += 8;
-    y += 8;
-  } while (y < height);
-}
-
-inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
-                                 const uint8_t* const top, const int height,
-                                 const int xstep, const bool upsampled) {
-  const int upsample_shift = static_cast<int>(upsampled);
-  const int scale_bits = 6 - upsample_shift;
-  const int rounding_bits = 5;
-  const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
-  const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
-  const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
-                                    : _mm_set_epi64x(0, 0x0403030202010100);
-  // Each 16-bit value here corresponds to a position that may exceed
-  // |max_base_x|. When added to the top_base_x, it is used to mask values
-  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
-  // not supported for packed integers.
-  const __m128i offsets =
-      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
-
-  // All rows from |min_corner_only_y| down will simply use memcpy. |max_base_x|
-  // is always greater than |height|, so clipping to 1 is enough to make the
-  // logic work.
-  const int xstep_units = std::max(xstep >> scale_bits, 1);
-  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
-
-  // Rows up to this y-value can be computed without checking for bounds.
-  int y = 0;
-  int top_x = xstep;
-
-  for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
-    const int top_base_x = top_x >> scale_bits;
-
-    // Permit negative values of |top_x|.
-    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
-    const __m128i shift = _mm_set1_epi8(shift_val);
-    const __m128i max_shift = _mm_set1_epi8(32);
-    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
-    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
-    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
-    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
-    const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
-
-    // Load 8 values because we will select the sampled values based on
-    // |upsampled|.
-    const __m128i values = LoadLo8(top + top_base_x);
-    const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
-    const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
-    __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
-    prod = RightShiftWithRounding_U16(prod, rounding_bits);
-    // Replace pixels from invalid range with top-right corner.
-    prod = _mm_blendv_epi8(prod, final_top_val, past_max);
-    Store4(dst, _mm_packus_epi16(prod, prod));
-  }
-
-  // Fill in corner-only rows.
-  for (; y < height; ++y) {
-    memset(dst, top[max_base_x], /* width */ 4);
-    dst += stride;
-  }
-}
-
-// 7.11.2.4 (7) angle < 90
-inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
-                                   const uint8_t* const top_row,
-                                   const int width, const int height,
-                                   const int xstep, const bool upsampled) {
-  const int upsample_shift = static_cast<int>(upsampled);
-  const __m128i sampler =
-      upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
-                : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
-  const int scale_bits = 6 - upsample_shift;
-  const int max_base_x = ((width + height) - 1) << upsample_shift;
-
-  const __m128i max_shift = _mm_set1_epi8(32);
-  const int rounding_bits = 5;
-  const int base_step = 1 << upsample_shift;
-  const int base_step8 = base_step << 3;
-
-  // All rows from |min_corner_only_y| down will simply use memcpy. |max_base_x|
-  // is always greater than |height|, so clipping to 1 is enough to make the
-  // logic work.
-  const int xstep_units = std::max(xstep >> scale_bits, 1);
-  const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
-
-  // Rows up to this y-value can be computed without checking for bounds.
-  const int max_no_corner_y = std::min(
-      LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
-      height);
-  // No need to check for exceeding |max_base_x| in the first loop.
-  int y = 0;
-  int top_x = xstep;
-  for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
-    int top_base_x = top_x >> scale_bits;
-    // Permit negative values of |top_x|.
-    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
-    const __m128i shift = _mm_set1_epi8(shift_val);
-    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
-    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
-    int x = 0;
-    do {
-      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
-      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
-      vals = _mm_maddubs_epi16(vals, shifts);
-      vals = RightShiftWithRounding_U16(vals, rounding_bits);
-      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
-      top_base_x += base_step8;
-      x += 8;
-    } while (x < width);
-  }
-
-  // Each 16-bit value here corresponds to a position that may exceed
-  // |max_base_x|. When added to the top_base_x, it is used to mask values
-  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
-  // not supported for packed integers.
-  const __m128i offsets =
-      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
-
-  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
-  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
-  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
-  for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
-    int top_base_x = top_x >> scale_bits;
-
-    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
-    const __m128i shift = _mm_set1_epi8(shift_val);
-    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
-    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
-    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
-    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
-
-    int x = 0;
-    const int min_corner_only_x =
-        std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
-    for (; x < min_corner_only_x;
-         x += 8, top_base_x += base_step8,
-         top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
-      const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
-      // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
-      // reading out of bounds. If all indices are past max and we don't need to
-      // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
-      // reset for the next |y|.
-      top_base_x &= ~_mm_cvtsi128_si32(past_max);
-      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
-      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
-      vals = _mm_maddubs_epi16(vals, shifts);
-      vals = RightShiftWithRounding_U16(vals, rounding_bits);
-      vals = _mm_blendv_epi8(vals, final_top_val, past_max);
-      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
-    }
-    // Corner-only section of the row.
-    memset(dest + x, top_row[max_base_x], width - x);
-  }
-  // Fill in corner-only rows.
-  for (; y < height; ++y) {
-    memset(dest, top_row[max_base_x], width);
-    dest += stride;
-  }
-}
-
-// 7.11.2.4 (7) angle < 90
-inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
-                                    const uint8_t* const top_row,
-                                    const int width, const int height,
-                                    const int xstep, const bool upsampled) {
-  const int upsample_shift = static_cast<int>(upsampled);
-  if (xstep == 64) {
-    DirectionalZone1_Step64(dest, stride, top_row, width, height);
-    return;
-  }
-  if (width == 4) {
-    DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
-    return;
-  }
-  if (width >= 32) {
-    DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
-                           upsampled);
-    return;
-  }
-  const __m128i sampler =
-      upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
-                : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
-  const int scale_bits = 6 - upsample_shift;
-  const int max_base_x = ((width + height) - 1) << upsample_shift;
-
-  const __m128i max_shift = _mm_set1_epi8(32);
-  const int rounding_bits = 5;
-  const int base_step = 1 << upsample_shift;
-  const int base_step8 = base_step << 3;
-
-  // No need to check for exceeding |max_base_x| in the loops.
-  if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
-    int top_x = xstep;
-    int y = 0;
-    do {
-      int top_base_x = top_x >> scale_bits;
-      // Permit negative values of |top_x|.
-      const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
-      const __m128i shift = _mm_set1_epi8(shift_val);
-      const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
-      const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
-      int x = 0;
-      do {
-        const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
-        __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
-        vals = _mm_maddubs_epi16(vals, shifts);
-        vals = RightShiftWithRounding_U16(vals, rounding_bits);
-        StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
-        top_base_x += base_step8;
-        x += 8;
-      } while (x < width);
-      dest += stride;
-      top_x += xstep;
-    } while (++y < height);
-    return;
-  }
-
-  // Each 16-bit value here corresponds to a position that may exceed
-  // |max_base_x|. When added to the top_base_x, it is used to mask values
-  // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
-  // not supported for packed integers.
-  const __m128i offsets =
-      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
-
-  const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
-  const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
-  const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
-  int top_x = xstep;
-  int y = 0;
-  do {
-    int top_base_x = top_x >> scale_bits;
-
-    if (top_base_x >= max_base_x) {
-      for (int i = y; i < height; ++i) {
-        memset(dest, top_row[max_base_x], width);
-        dest += stride;
-      }
-      return;
-    }
-
-    const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
-    const __m128i shift = _mm_set1_epi8(shift_val);
-    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
-    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
-    __m128i top_index_vect = _mm_set1_epi16(top_base_x);
-    top_index_vect = _mm_add_epi16(top_index_vect, offsets);
-
-    int x = 0;
-    for (; x < width - 8;
-         x += 8, top_base_x += base_step8,
-         top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
-      const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
-      // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
-      // reading out of bounds. If all indices are past max and we don't need to
-      // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
-      // reset for the next |y|.
-      top_base_x &= ~_mm_cvtsi128_si32(past_max);
-      const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
-      __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
-      vals = _mm_maddubs_epi16(vals, shifts);
-      vals = RightShiftWithRounding_U16(vals, rounding_bits);
-      vals = _mm_blendv_epi8(vals, final_top_val, past_max);
-      StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
-    }
-    const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
-    __m128i vals;
-    if (upsampled) {
-      vals = LoadUnaligned16(top_row + top_base_x);
-    } else {
-      const __m128i top_vals = LoadLo8(top_row + top_base_x);
-      vals = _mm_shuffle_epi8(top_vals, sampler);
-      vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
-    }
-    vals = _mm_maddubs_epi16(vals, shifts);
-    vals = RightShiftWithRounding_U16(vals, rounding_bits);
-    vals = _mm_blendv_epi8(vals, final_top_val, past_max);
-    StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
-    dest += stride;
-    top_x += xstep;
-  } while (++y < height);
-}
-
-void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
-                                           const void* const top_row,
-                                           const int width, const int height,
-                                           const int xstep,
-                                           const bool upsampled_top) {
-  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
-  auto* dst = static_cast<uint8_t*>(dest);
-  DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
-                          upsampled_top);
-}
-
-template <bool upsampled>
-inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
-                                 const uint8_t* const left_column,
-                                 const int base_left_y, const int ystep) {
-  // For use in the non-upsampled case.
-  const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
-  const int upsample_shift = static_cast<int>(upsampled);
-  const int scale_bits = 6 - upsample_shift;
-  const __m128i max_shift = _mm_set1_epi8(32);
-  const int rounding_bits = 5;
-
-  __m128i result_block[4];
-  for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
-    const int left_base_y = left_y >> scale_bits;
-    const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
-    const __m128i shift = _mm_set1_epi8(shift_val);
-    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
-    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
-    __m128i vals;
-    if (upsampled) {
-      vals = LoadLo8(left_column + left_base_y);
-    } else {
-      const __m128i top_vals = LoadLo8(left_column + left_base_y);
-      vals = _mm_shuffle_epi8(top_vals, sampler);
-    }
-    vals = _mm_maddubs_epi16(vals, shifts);
-    vals = RightShiftWithRounding_U16(vals, rounding_bits);
-    result_block[x] = _mm_packus_epi16(vals, vals);
-  }
-  const __m128i result = Transpose4x4_U8(result_block);
-  // This is result_row0.
-  Store4(dest, result);
-  dest += stride;
-  const int result_row1 = _mm_extract_epi32(result, 1);
-  memcpy(dest, &result_row1, sizeof(result_row1));
-  dest += stride;
-  const int result_row2 = _mm_extract_epi32(result, 2);
-  memcpy(dest, &result_row2, sizeof(result_row2));
-  dest += stride;
-  const int result_row3 = _mm_extract_epi32(result, 3);
-  memcpy(dest, &result_row3, sizeof(result_row3));
-}
-
-template <bool upsampled, int height>
-inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
-                                 const uint8_t* const left_column,
-                                 const int base_left_y, const int ystep) {
-  // For use in the non-upsampled case.
-  const __m128i sampler =
-      _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
-  const int upsample_shift = static_cast<int>(upsampled);
-  const int scale_bits = 6 - upsample_shift;
-  const __m128i max_shift = _mm_set1_epi8(32);
-  const int rounding_bits = 5;
-
-  __m128i result_block[8];
-  for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
-    const int left_base_y = left_y >> scale_bits;
-    const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
-    const __m128i shift = _mm_set1_epi8(shift_val);
-    const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
-    const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
-    __m128i vals;
-    if (upsampled) {
-      vals = LoadUnaligned16(left_column + left_base_y);
-    } else {
-      const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
-      vals = _mm_shuffle_epi8(top_vals, sampler);
-    }
-    vals = _mm_maddubs_epi16(vals, shifts);
-    result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
-  }
-  Transpose8x8_U16(result_block, result_block);
-  for (int y = 0; y < height; ++y) {
-    StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
-    dest += stride;
-  }
-}
-
-// 7.11.2.4 (9) angle > 180
-void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
-                                           const void* const left_column,
-                                           const int width, const int height,
-                                           const int ystep,
-                                           const bool upsampled) {
-  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
-  auto* dst = static_cast<uint8_t*>(dest);
-  const int upsample_shift = static_cast<int>(upsampled);
-  if (width == 4 || height == 4) {
-    const ptrdiff_t stride4 = stride << 2;
-    if (upsampled) {
-      int left_y = ystep;
-      int x = 0;
-      do {
-        uint8_t* dst_x = dst + x;
-        int y = 0;
-        do {
-          DirectionalZone3_4x4<true>(
-              dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
-          dst_x += stride4;
-          y += 4;
-        } while (y < height);
-        left_y += ystep << 2;
-        x += 4;
-      } while (x < width);
-    } else {
-      int left_y = ystep;
-      int x = 0;
-      do {
-        uint8_t* dst_x = dst + x;
-        int y = 0;
-        do {
-          DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
-                                      ystep);
-          dst_x += stride4;
-          y += 4;
-        } while (y < height);
-        left_y += ystep << 2;
-        x += 4;
-      } while (x < width);
-    }
-    return;
-  }
-
-  const ptrdiff_t stride8 = stride << 3;
-  if (upsampled) {
-    int left_y = ystep;
-    int x = 0;
-    do {
-      uint8_t* dst_x = dst + x;
-      int y = 0;
-      do {
-        DirectionalZone3_8xH<true, 8>(
-            dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
-        dst_x += stride8;
-        y += 8;
-      } while (y < height);
-      left_y += ystep << 3;
-      x += 8;
-    } while (x < width);
-  } else {
-    int left_y = ystep;
-    int x = 0;
-    do {
-      uint8_t* dst_x = dst + x;
-      int y = 0;
-      do {
-        DirectionalZone3_8xH<false, 8>(
-            dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
-        dst_x += stride8;
-        y += 8;
-      } while (y < height);
-      left_y += ystep << 3;
-      x += 8;
-    } while (x < width);
-  }
-}
-
-//------------------------------------------------------------------------------
-// Directional Zone 2 Functions
-// 7.11.2.4 (8)
-
-// DirectionalBlend* selectively overwrites the values written by
-// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
-// row.
-template <int y_selector>
-inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
-                                     const __m128i& dest_index_vect,
-                                     const __m128i& vals,
-                                     const __m128i& zone_bounds) {
-  const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
-  const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
-  const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
-  const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
-  Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
-}
-
-inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
-                                     const __m128i& dest_index_vect,
-                                     const __m128i& vals,
-                                     const __m128i& zone_bounds,
-                                     const __m128i& bounds_selector) {
-  const __m128i max_dest_x_vect =
-      _mm_shuffle_epi8(zone_bounds, bounds_selector);
-  const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
-  const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
-  const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
-  StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
-}
-
-constexpr int kDirectionalWeightBits = 5;
-// |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
-// |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
-// shift) and shift. Shift is guaranteed to be between 0 and 32.
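-// Because the shifts are interleaved as (32 - shift, shift), the multiply-add
-// followed by the rounding shift effectively computes, for each pixel pair
-// (p0, p1), the interpolation (p0 * (32 - shift) + p1 * shift + 16) >> 5.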
-inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
-                                                 const __m128i& shifts,
-                                                 const __m128i& sampler) {
-  const __m128i src_vals = LoadUnaligned16(source);
-  __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
-  vals = _mm_maddubs_epi16(vals, shifts);
-  return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
-}
-
-// Because the source values "move backwards" as the row index increases, the
-// indices derived from ystep are generally negative. This is accommodated by
-// making sure the relative indices are within [-15, 0] when the function is
-// called, and sliding them into the inclusive range [0, 15], relative to a
-// lower base address.
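-// For example, a relative index of -3 becomes shuffle index 12 after the
-// offset is added, and the base load address is lowered by 15 bytes so the
-// same source pixel is still read.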
-constexpr int kPositiveIndexOffset = 15;
-
-template <bool upsampled>
-inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
-    uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
-    __m128i left_y) {
-  const int upsample_shift = static_cast<int>(upsampled);
-  const int scale_bits = 6 - upsample_shift;
-  const __m128i max_shifts = _mm_set1_epi8(32);
-  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
-  const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
-  const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
-  // Left_column and sampler are both offset by 15 so the indices are always
-  // positive.
-  const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
-  for (int y = 0; y < 4; dst += stride, ++y) {
-    __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
-    offset_y = _mm_packs_epi16(offset_y, offset_y);
-
-    const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
-    __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
-    // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
-    // can work as shuffle indices. Some values may be out of bounds, but their
-    // pred results will be masked over by top prediction.
-    sampler = _mm_add_epi8(sampler, positive_offset);
-
-    __m128i shifts = _mm_srli_epi16(
-        _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
-    shifts = _mm_packus_epi16(shifts, shifts);
-    const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
-    shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
-    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
-        left_column + (y << upsample_shift), shifts, sampler);
-    Store4(dst, _mm_packus_epi16(vals, vals));
-  }
-}
-
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
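-// For example, ystep values in [256, 320) all yield index 4, whose entry is
-// 16, so the shuffle-based left prediction is limited to the first 16 rows at
-// those angles.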
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
-    1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
-template <bool upsampled>
-inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
-    uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
-    __m128i left_y) {
-  const int upsample_shift = static_cast<int>(upsampled);
-  const int scale_bits = 6 - upsample_shift;
-  const __m128i max_shifts = _mm_set1_epi8(32);
-  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
-  const __m128i index_increment = _mm_set1_epi8(1);
-  const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
-  for (int y = 0; y < 8; dst += stride, ++y) {
-    __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
-    offset_y = _mm_packs_epi16(offset_y, offset_y);
-    const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
-
-    // Offset the relative index because ystep is negative in Zone 2 and shuffle
-    // indices must be nonnegative.
-    __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
-    sampler = _mm_add_epi8(sampler, denegation);
-
-    __m128i shifts = _mm_srli_epi16(
-        _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
-    shifts = _mm_packus_epi16(shifts, shifts);
-    const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
-    shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
-
-    // The specification adds (y << 6) to left_y, which is subject to
-    // upsampling, but this puts sampler indices out of the 0-15 range. It is
-    // equivalent to offsetting the source address by (y << upsample_shift)
-    // instead.
-    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
-        left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
-        sampler);
-    StoreLo8(dst, _mm_packus_epi16(vals, vals));
-  }
-}
-
-// |zone_bounds| is an epi16 of the relative x index at which base >= -(1 <<
-// upsampled_top), for each row. When there are 4 values, they can be duplicated
-// with a non-register shuffle mask.
-// |shifts| is one pair of weights that applies throughout a given row.
-template <bool upsampled_top>
-inline void DirectionalZone1Blend_4x4(
-    uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
-    __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
-    const __m128i& dest_index_x, int top_x, const int xstep) {
-  const int upsample_shift = static_cast<int>(upsampled_top);
-  const int scale_bits_x = 6 - upsample_shift;
-  top_x -= xstep;
-
-  int top_base_x = (top_x >> scale_bits_x);
-  const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
-      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
-  DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
-  top_x -= xstep;
-  dest += stride;
-
-  top_base_x = (top_x >> scale_bits_x);
-  const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
-      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
-  DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
-  top_x -= xstep;
-  dest += stride;
-
-  top_base_x = (top_x >> scale_bits_x);
-  const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
-      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
-  DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
-  top_x -= xstep;
-  dest += stride;
-
-  top_base_x = (top_x >> scale_bits_x);
-  const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
-      top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
-  DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
-}
-
-template <bool upsampled_top, int height>
-inline void DirectionalZone1Blend_8xH(
-    uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
-    __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
-    const __m128i& dest_index_x, int top_x, const int xstep) {
-  const int upsample_shift = static_cast<int>(upsampled_top);
-  const int scale_bits_x = 6 - upsample_shift;
-
-  __m128i y_selector = _mm_set1_epi32(0x01000100);
-  const __m128i index_increment = _mm_set1_epi32(0x02020202);
-  for (int y = 0; y < height; ++y,
-           y_selector = _mm_add_epi8(y_selector, index_increment),
-           dest += stride) {
-    top_x -= xstep;
-    const int top_base_x = top_x >> scale_bits_x;
-    const __m128i vals = DirectionalZone2FromSource_SSE4_1(
-        top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
-    DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector);
-  }
-}
-
-// 7.11.2.4 (8) 90 < angle < 180
-// The strategy for this function is to know how many blocks can be processed
-// with just pixels from |top_ptr|, then handle mixed blocks, then handle only
-// blocks that take from |left_ptr|. Additionally, a fast index-shuffle
-// approach is used for pred values from |left_column| in sections that permit
-// it.
-template <bool upsampled_left, bool upsampled_top>
-inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
-                                    const uint8_t* const top_row,
-                                    const uint8_t* const left_column,
-                                    const int width, const int height,
-                                    const int xstep, const int ystep) {
-  auto* dst = static_cast<uint8_t*>(dest);
-  const int upsample_left_shift = static_cast<int>(upsampled_left);
-  const int upsample_top_shift = static_cast<int>(upsampled_top);
-  const __m128i max_shift = _mm_set1_epi8(32);
-  const ptrdiff_t stride8 = stride << 3;
-  const __m128i dest_index_x =
-      _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
-  const __m128i sampler_top =
-      upsampled_top
-          ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
-          : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
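-  // The non-upsampled sampler gathers overlapping byte pairs (0,1), (1,2),
-  // ..., (7,8); the upsampled row already stores pixels at twice the density,
-  // so consecutive pairs (0,1), (2,3), ... are used directly.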
-  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
-  // All columns from |min_top_only_x| to the right will only need |top_row| to
-  // compute. This assumes minimum |xstep| is 3.
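-  // For example, with height 32 and xstep 128, (32 * 128) >> 6 = 64, so at
-  // most the first 64 columns can reference |left_column|; any columns beyond
-  // that are predicted from |top_row| alone.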
-  const int min_top_only_x = std::min((height * xstep) >> 6, width);
-
-  // For steep angles, the source pixels from left_column may not fit in a
-  // 16-byte load for shuffling.
-  // TODO(petersonab): Find a more precise formula for this subject to x.
-  const int max_shuffle_height =
-      std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
-
-  const int xstep8 = xstep << 3;
-  const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
-  // Accumulate xstep across 8 rows.
-  const __m128i xstep_dup = _mm_set1_epi16(-xstep);
-  const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
-  const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
-  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
-  const __m128i scaled_one = _mm_set1_epi16(-64);
-  __m128i xstep_bounds_base =
-      (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
-                    : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
-
-  const int left_base_increment = ystep >> 6;
-  const int ystep_remainder = ystep & 0x3F;
-  const int ystep8 = ystep << 3;
-  const int left_base_increment8 = ystep8 >> 6;
-  const int ystep_remainder8 = ystep8 & 0x3F;
-  const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
-
-  // If the 64 scaling is regarded as a decimal point, the first value of the
-  // left_y vector omits the portion which is covered under the left_column
-  // offset. Following values need the full ystep as a relative offset.
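-  // For example, ystep = 70 splits into a whole-pixel part of 1 (70 >> 6),
-  // which is folded into |left_offset|, and a fractional remainder of 6
-  // (70 & 0x3F), which seeds |left_y|.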
-  const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
-  const __m128i ystep_dup = _mm_set1_epi16(-ystep);
-  __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
-  left_y = _mm_add_epi16(ystep_init, left_y);
-
-  const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
-  int x = 0;
-
-  // This loop treats each set of 4 columns in 3 stages with y-value boundaries.
-  // The first stage, before the first y-loop, covers blocks that are only
-  // computed from the top row. The second stage, comprising two y-loops, covers
-  // blocks that have a mixture of values computed from top or left. The final
-  // stage covers blocks that are only computed from the left.
-  for (int left_offset = -left_base_increment; x < min_top_only_x;
-       x += 8,
-           xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
-           // Watch left_y: although only the remainder is added here, its
-           // magnitude can still grow large.
-       left_y = _mm_add_epi16(left_y, increment_left8),
-           left_offset -= left_base_increment8) {
-    uint8_t* dst_x = dst + x;
-
-    // Round down to the nearest multiple of 8.
-    const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
-    DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
-                         max_top_only_y, -xstep, upsampled_top);
-    DirectionalZone1_4xH(dst_x + 4, stride,
-                         top_row + ((x + 4) << upsample_top_shift),
-                         max_top_only_y, -xstep, upsampled_top);
-
-    int y = max_top_only_y;
-    dst_x += stride * y;
-    const int xstep_y = xstep * y;
-    const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
-    // All rows from |min_left_only_y| down for this set of columns only need
-    // |left_column| to compute.
-    const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
-    // At high angles such that min_left_only_y < 8, ystep is low and xstep is
-    // high. This means that max_shuffle_height is unbounded and xstep_bounds
-    // will overflow in 16 bits. This is prevented by stopping the first
-    // blending loop at min_left_only_y for such cases, which means we skip over
-    // the second blending loop as well.
-    const int left_shuffle_stop_y =
-        std::min(max_shuffle_height, min_left_only_y);
-    __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
-    __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
-    int top_x = -xstep_y;
-
-    for (; y < left_shuffle_stop_y;
-         y += 8, dst_x += stride8,
-         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
-         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
-         top_x -= xstep8) {
-      DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
-          dst_x, stride,
-          left_column + ((left_offset + y) << upsample_left_shift), left_y);
-
-      __m128i shifts = _mm_srli_epi16(
-          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
-                        shift_mask),
-          1);
-      shifts = _mm_packus_epi16(shifts, shifts);
-      __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
-      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
-      __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
-      DirectionalZone1Blend_8xH<upsampled_top, 8>(
-          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
-          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
-    }
-    // Pick up from the last y-value, using the 10% slower but secure method for
-    // left prediction.
-    const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
-    for (; y < min_left_only_y;
-         y += 8, dst_x += stride8,
-         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
-         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
-         top_x -= xstep8) {
-      const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
-
-      DirectionalZone3_8xH<upsampled_left, 8>(
-          dst_x, stride,
-          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
-          -ystep);
-
-      __m128i shifts = _mm_srli_epi16(
-          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
-                        shift_mask),
-          1);
-      shifts = _mm_packus_epi16(shifts, shifts);
-      __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
-      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
-      DirectionalZone1Blend_8xH<upsampled_top, 8>(
-          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
-          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
-    }
-    // Loop over y for left_only rows.
-    for (; y < height; y += 8, dst_x += stride8) {
-      DirectionalZone3_8xH<upsampled_left, 8>(
-          dst_x, stride,
-          left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
-          -ystep);
-    }
-  }
-  for (; x < width; x += 4) {
-    DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
-                         height, -xstep, upsampled_top);
-  }
-}
-
-template <bool upsampled_left, bool upsampled_top>
-inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
-                                      const uint8_t* const top_row,
-                                      const uint8_t* const left_column,
-                                      const int width, const int height,
-                                      const int xstep, const int ystep) {
-  auto* dst = static_cast<uint8_t*>(dest);
-  const int upsample_left_shift = static_cast<int>(upsampled_left);
-  const int upsample_top_shift = static_cast<int>(upsampled_top);
-  const __m128i max_shift = _mm_set1_epi8(32);
-  const ptrdiff_t stride4 = stride << 2;
-  const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
-  const __m128i sampler_top =
-      upsampled_top
-          ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
-          : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
-  // All columns from |min_top_only_x| to the right will only need |top_row| to
-  // compute.
-  assert(xstep >= 3);
-  const int min_top_only_x = std::min((height * xstep) >> 6, width);
-
-  const int xstep4 = xstep << 2;
-  const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
-  const __m128i xstep_dup = _mm_set1_epi16(-xstep);
-  const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
-  __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
-  const __m128i scaled_one = _mm_set1_epi16(-64);
-  // Offsets the original zone bound value to simplify x < (y+1)*xstep/64 -1
-  __m128i xstep_bounds_base =
-      (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
-                    : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
-
-  const int left_base_increment = ystep >> 6;
-  const int ystep_remainder = ystep & 0x3F;
-  const int ystep4 = ystep << 2;
-  const int left_base_increment4 = ystep4 >> 6;
-  // This is guaranteed to be less than 64, but accumulation may bring it past
-  // 64 for higher x values.
-  const int ystep_remainder4 = ystep4 & 0x3F;
-  const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
-  const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
-
-  // If the 64 scaling is regarded as a decimal point, the first value of the
-  // left_y vector omits the portion which will go into the left_column offset.
-  // Following values need the full ystep as a relative offset.
-  const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
-  const __m128i ystep_dup = _mm_set1_epi16(-ystep);
-  __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
-  left_y = _mm_add_epi16(ystep_init, left_y);
-  const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
-
-  int x = 0;
-  // Loop over x for columns with a mixture of sources.
-  for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
-           xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
-           left_y = _mm_add_epi16(left_y, increment_left4),
-           left_offset -= left_base_increment4) {
-    uint8_t* dst_x = dst + x;
-
-    // Round down to the nearest multiple of 8.
-    const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4;
-    DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
-                         max_top_only_y, -xstep, upsampled_top);
-    int y = max_top_only_y;
-    dst_x += stride * y;
-    const int xstep_y = xstep * y;
-    const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
-    // All rows from |min_left_only_y| down for this set of columns only need
-    // |left_column| to compute. Rounded up to the nearest multiple of 4.
-    const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
-
-    __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
-    __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
-    int top_x = -xstep_y;
-
-    // Loop over y for mixed rows.
-    for (; y < min_left_only_y;
-         y += 4, dst_x += stride4,
-         xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
-         xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
-         top_x -= xstep4) {
-      DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
-          dst_x, stride,
-          left_column + ((left_offset + y) * (1 << upsample_left_shift)),
-          left_y);
-
-      __m128i shifts = _mm_srli_epi16(
-          _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
-                        shift_mask),
-          1);
-      shifts = _mm_packus_epi16(shifts, shifts);
-      const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
-      shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
-      const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
-      DirectionalZone1Blend_4x4<upsampled_top>(
-          dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
-          xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
-    }
-    // Loop over y for left-only rows, if any.
-    for (; y < height; y += 4, dst_x += stride4) {
-      DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
-          dst_x, stride,
-          left_column + ((left_offset + y) << upsample_left_shift), left_y);
-    }
-  }
-  // Loop over top-only columns, if any.
-  for (; x < width; x += 4) {
-    DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
-                         height, -xstep, upsampled_top);
-  }
-}
-
-void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
-                                           const void* const top_row,
-                                           const void* const left_column,
-                                           const int width, const int height,
-                                           const int xstep, const int ystep,
-                                           const bool upsampled_top,
-                                           const bool upsampled_left) {
-  // Increasing the negative buffer for this function allows more rows to be
-  // processed at a time without branching in an inner loop to check the base.
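-  // The copies start 16 pixels before the frame data and leave 128 bytes of
-  // headroom below that, so base indices as low as -144 relative to |top_ptr|
-  // and |left_ptr| still land inside the local buffers.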
-  uint8_t top_buffer[288];
-  uint8_t left_buffer[288];
-  memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
-  memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
-  const uint8_t* top_ptr = top_buffer + 144;
-  const uint8_t* left_ptr = left_buffer + 144;
-  if (width == 4 || height == 4) {
-    if (upsampled_left) {
-      if (upsampled_top) {
-        DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
-                                              width, height, xstep, ystep);
-      } else {
-        DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
-                                               width, height, xstep, ystep);
-      }
-    } else {
-      if (upsampled_top) {
-        DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
-                                               width, height, xstep, ystep);
-      } else {
-        DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
-                                                width, height, xstep, ystep);
-      }
-    }
-    return;
-  }
-  if (upsampled_left) {
-    if (upsampled_top) {
-      DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
-                                          width, height, xstep, ystep);
-    } else {
-      DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
-                                           width, height, xstep, ystep);
-    }
-  } else {
-    if (upsampled_top) {
-      DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
-                                           width, height, xstep, ystep);
-    } else {
-      DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
-                                            width, height, xstep, ystep);
-    }
-  }
-}
-
-//------------------------------------------------------------------------------
-// FilterIntraPredictor_SSE4_1
-
-// Apply all filter taps to the given 7 packed 8-bit pixel values, keeping the
-// 8th value at zero to preserve the sum.
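-// Each output pixel is the 7-tap sum rounded and clamped to [0, 255], i.e.
-// (sum + 8) >> 4 followed by unsigned saturation in the final pack.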
-inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
-                             const __m128i& pixels, const __m128i& taps_0_1,
-                             const __m128i& taps_2_3, const __m128i& taps_4_5,
-                             const __m128i& taps_6_7) {
-  const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
-  const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
-  // |output_half| contains 8 partial sums.
-  __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
-  __m128i output = _mm_hadd_epi16(output_half, output_half);
-  const __m128i output_row0 =
-      _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
-                       /* arbitrary pack arg */ output);
-  Store4(dst, output_row0);
-  const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
-  const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
-  output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
-  output = _mm_hadd_epi16(output_half, output_half);
-  const __m128i output_row1 =
-      _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
-                       /* arbitrary pack arg */ output);
-  Store4(dst + stride, output_row1);
-}
-
-// 4xH transform sizes are given special treatment because LoadLo8 goes out
-// of bounds and every block involves the left column. This implementation
-// loads TL from the top row for the first block only; subsequent blocks take
-// their top-left value from |left_ptr|.
-inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
-                      const uint8_t* const top_ptr,
-                      const uint8_t* const left_ptr, FilterIntraPredictor pred,
-                      const int height) {
-  const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
-  const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
-  const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
-  const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
-  __m128i top = Load4(top_ptr - 1);
-  __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
-  __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
-  left = _mm_slli_si128(left, 5);
-
-  // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
-  // left[2], left[3], left[4], left[5], left[6], left[7]
-  pixels = _mm_or_si128(left, pixels);
-
-  // Duplicate first 8 bytes.
-  pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
-  Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
-                   taps_6_7);
-  dest += stride;  // Move to y = 1.
-  pixels = Load4(dest);
-
-  // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
-  // left[0], left[1], ...
-  pixels = _mm_or_si128(left, pixels);
-
-  // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
-  // byte is an unused value, which shall be multiplied by 0 when we apply the
-  // filter.
-  constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
-
-  // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
-  const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
-  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
-  dest += stride;  // Move to y = 2.
-  Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
-                   taps_6_7);
-  dest += stride;  // Move to y = 3.
-
-  // Compute the middle 8 rows before using common code for the final 4 rows.
-  // The common code below this block assumes that the next top-left value sits
-  // at byte 8 of |left|, so this path must maintain that invariant.
-  if (height == 16) {
-    // This shift allows us to use pixel_order2 twice after shifting by 2 later.
-    left = _mm_slli_si128(left, 1);
-    pixels = Load4(dest);
-
-    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
-    // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
-    pixels = _mm_or_si128(left, pixels);
-
-    // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 10, 11, 15. The
-    // last byte is an unused value, as above. The top-left was shifted to
-    // position nine to keep two empty spaces after the top pixels.
-    constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
-
-    // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
-    // the end.
-    const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
-    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
-    dest += stride;  // Move to y = 4.
-
-    // First 4x2 in the if body.
-    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
-                     taps_6_7);
-
-    // Clear all but final pixel in the first 8 of left column.
-    __m128i keep_top_left = _mm_srli_si128(left, 13);
-    dest += stride;  // Move to y = 5.
-    pixels = Load4(dest);
-    left = _mm_srli_si128(left, 2);
-
-    // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
-    // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
-    pixels = _mm_or_si128(left, pixels);
-    left = LoadLo8(left_ptr + 8);
-
-    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
-    dest += stride;  // Move to y = 6.
-
-    // Second 4x2 in the if body.
-    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
-                     taps_6_7);
-
-    // Position TL value so we can use pixel_order1.
-    keep_top_left = _mm_slli_si128(keep_top_left, 6);
-    dest += stride;  // Move to y = 7.
-    pixels = Load4(dest);
-    left = _mm_slli_si128(left, 7);
-    left = _mm_or_si128(left, keep_top_left);
-
-    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
-    // left[-1], left[0], left[1], left[2], left[3], ...
-    pixels = _mm_or_si128(left, pixels);
-    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
-    dest += stride;  // Move to y = 8.
-
-    // Third 4x2 in the if body.
-    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
-                     taps_6_7);
-    dest += stride;  // Move to y = 9.
-
-    // Prepare final inputs.
-    pixels = Load4(dest);
-    left = _mm_srli_si128(left, 2);
-
-    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
-    // left[-1], left[0], left[1], left[2], left[3], ...
-    pixels = _mm_or_si128(left, pixels);
-    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
-    dest += stride;  // Move to y = 10.
-
-    // Fourth 4x2 in the if body.
-    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
-                     taps_6_7);
-    dest += stride;  // Move to y = 11.
-  }
-
-  // In both the 8 and 16 case, we assume that the left vector has the next TL
-  // at position 8.
-  if (height > 4) {
-    // Erase prior left pixels by shifting TL to position 0.
-    left = _mm_srli_si128(left, 8);
-    left = _mm_slli_si128(left, 6);
-    pixels = Load4(dest);
-
-    // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
-    // left[-1], left[0], left[1], left[2], left[3], ...
-    pixels = _mm_or_si128(left, pixels);
-    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
-    dest += stride;  // Move to y = 12 or 4.
-
-    // First of final two 4x2 blocks.
-    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
-                     taps_6_7);
-    dest += stride;  // Move to y = 13 or 5.
-    pixels = Load4(dest);
-    left = _mm_srli_si128(left, 2);
-
-    // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
-    // left[-1], left[0], left[1], left[2], left[3], ...
-    pixels = _mm_or_si128(left, pixels);
-    pixels = _mm_shuffle_epi8(pixels, pixel_order1);
-    dest += stride;  // Move to y = 14 or 6.
-
-    // Last of final two 4x2 blocks.
-    Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
-                     taps_6_7);
-  }
-}
-
-void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride,
-                                 const void* const top_row,
-                                 const void* const left_column,
-                                 FilterIntraPredictor pred, const int width,
-                                 const int height) {
-  const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
-  const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
-  auto* dst = static_cast<uint8_t*>(dest);
-  if (width == 4) {
-    Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
-    return;
-  }
-
-  // There is one set of 7 taps for each of the 4x2 output pixels.
-  const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
-  const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
-  const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
-  const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
-
-  // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
-  // the end is an unused value, which shall be multiplied by 0 when we apply
-  // the filter.
-  constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
-
-  // Takes the "left section" and puts it right after p0-p4.
-  const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
-
-  // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
-  // byte is unused as above.
-  constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
-
-  // Shuffles the "top left" from the left section, to the front. Used when
-  // grabbing data from left_column and not top_row.
-  const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
-
-  // This first pass takes care of the cases where the top left pixel comes from
-  // top_row.
-  __m128i pixels = LoadLo8(top_ptr - 1);
-  __m128i left = _mm_slli_si128(Load4(left_column), 8);
-  pixels = _mm_or_si128(pixels, left);
-
-  // Two sets of the same pixels to multiply with two sets of taps.
-  pixels = _mm_shuffle_epi8(pixels, pixel_order1);
-  Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7);
-  left = _mm_srli_si128(left, 1);
-
-  // Load the four pixels written at y = 1; they supply the top neighbors for
-  // the 4x2 block at y = 2 and y = 3.
-  pixels = Load4(dst + stride);
-
-  // Because of the above shift, this OR 'invades' the last of the first 8
-  // bytes of |pixels|. This is acceptable because the 8th filter tap is always
-  // a padded 0.
-  pixels = _mm_or_si128(pixels, left);
-  pixels = _mm_shuffle_epi8(pixels, pixel_order2);
-  const ptrdiff_t stride2 = stride << 1;
-  const ptrdiff_t stride4 = stride << 2;
-  Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
-                   taps_6_7);
-  dst += 4;
-  for (int x = 3; x < width - 4; x += 4) {
-    pixels = Load4(top_ptr + x);
-    pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
-    pixels = _mm_insert_epi8(pixels, dst[-1], 5);
-    pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
-
-    // Duplicate bottom half into upper half.
-    pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
-    Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
-                     taps_6_7);
-    pixels = Load4(dst + stride - 1);
-    pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
-    pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
-    pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
-
-    // Duplicate bottom half into upper half.
-    pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
-    Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
-                     taps_4_5, taps_6_7);
-    dst += 4;
-  }
-
-  // Now we handle heights that reference previous blocks rather than top_row.
-  for (int y = 4; y < height; y += 4) {
-    // Leftmost 4x4 block for this height.
-    dst -= width;
-    dst += stride4;
-
-    // Top Left is not available by offset in these leftmost blocks.
-    pixels = Load4(dst - stride);
-    left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
-    left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
-    pixels = _mm_or_si128(pixels, left);
-    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
-    Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
-                     taps_6_7);
-
-    // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
-    left = _mm_srli_si128(left, 2);
-    pixels = Load4(dst + stride);
-    pixels = _mm_or_si128(pixels, left);
-    pixels = _mm_shuffle_epi8(pixels, pixel_order2);
-    Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
-                     taps_4_5, taps_6_7);
-
-    dst += 4;
-
-    // Remaining 4x4 blocks for this height.
-    for (int x = 4; x < width; x += 4) {
-      pixels = Load4(dst - stride - 1);
-      pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
-      pixels = _mm_insert_epi8(pixels, dst[-1], 5);
-      pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
-
-      // Duplicate bottom half into upper half.
-      pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
-      Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
-                       taps_6_7);
-      pixels = Load4(dst + stride - 1);
-      pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
-      pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
-      pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
-
-      // Duplicate bottom half into upper half.
-      pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
-      Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
-                       taps_4_5, taps_6_7);
-      dst += 4;
-    }
-  }
-}
-
 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);
@@ -2746,21 +1412,6 @@
 // These guards check that this version of the function has not been superseded
 // by a higher optimization level, such as AVX. The corresponding #define also
 // prevents the C version from being added to the table.
-#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
-  dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
-#endif
-#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
-  dsp->directional_intra_predictor_zone1 =
-      DirectionalIntraPredictorZone1_SSE4_1;
-#endif
-#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
-  dsp->directional_intra_predictor_zone2 =
-      DirectionalIntraPredictorZone2_SSE4_1;
-#endif
-#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
-  dsp->directional_intra_predictor_zone3 =
-      DirectionalIntraPredictorZone3_SSE4_1;
-#endif
 #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
   dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
       DcDefs::_4x4::DcTop;
@@ -3524,7 +2175,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 namespace libgav1 {
 namespace dsp {
 
@@ -3532,4 +2183,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/intrapred_sse4.h b/libgav1/src/dsp/x86/intrapred_sse4.h
index eb3825d..1f6f30a 100644
--- a/libgav1/src/dsp/x86/intrapred_sse4.h
+++ b/libgav1/src/dsp/x86/intrapred_sse4.h
@@ -23,36 +23,16 @@
 namespace libgav1 {
 namespace dsp {
 
-// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
-// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
-// Dsp::filter_intra_predictor, see the defines below for specifics. These
-// functions are not thread-safe.
+// Initializes Dsp::intra_predictors. See the defines below for specifics.
+// These functions are not thread-safe.
 void IntraPredInit_SSE4_1();
-void IntraPredCflInit_SSE4_1();
-void IntraPredSmoothInit_SSE4_1();
 
 }  // namespace dsp
 }  // namespace libgav1
 
 // If sse4 is enabled and the baseline isn't set due to a higher level of
 // optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
-#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
-#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1
-#endif
-
+#if LIBGAV1_TARGETING_SSE4_1
 #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop
 #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
 #endif
@@ -138,174 +118,6 @@
   LIBGAV1_CPU_SSE4_1
 #endif
 
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
 #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft
 #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
 #endif
@@ -658,287 +470,6 @@
   LIBGAV1_CPU_SSE4_1
 #endif
 
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
-  LIBGAV1_CPU_SSE4_1
-#endif
-
 //------------------------------------------------------------------------------
 // 10bpp
 
@@ -1055,6 +586,6 @@
   LIBGAV1_CPU_SSE4_1
 #endif
 
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
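The LIBGAV1_Dsp8bpp_TransformSize*_IntraPredictorSmooth* guards removed above follow libgav1's per-function dispatch convention: each macro records which CPU feature supplies the 8bpp implementation for that predictor and transform size, so the portable initializer can skip its C fallback for slots a SIMD target has claimed. Removing them from this header tracks the smooth predictors moving into the dedicated intrapred_smooth sources. A minimal sketch of the pattern, with a hypothetical fallback name (Smooth4x4_C is a placeholder, not taken from this diff):

// In the SIMD header (the kind of block removed above), claim the slot:
#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
#endif

// In the portable initializer (a different file), the C fallback is
// registered only when no SIMD header defined the macro:
#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
  dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] = Smooth4x4_C;
#endif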
diff --git a/libgav1/src/dsp/x86/inverse_transform_sse4.cc b/libgav1/src/dsp/x86/inverse_transform_sse4.cc
index 30ad436..12c008f 100644
--- a/libgav1/src/dsp/x86/inverse_transform_sse4.cc
+++ b/libgav1/src/dsp/x86/inverse_transform_sse4.cc
@@ -15,7 +15,7 @@
 #include "src/dsp/inverse_transform.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <smmintrin.h>
 
@@ -94,8 +94,7 @@
       static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
   const __m128i ba = _mm_unpacklo_epi16(*a, *b);
   const __m128i ab = _mm_unpacklo_epi16(*b, *a);
-  const __m128i sign =
-      _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
+  const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
   // -sin cos, -sin cos, -sin cos, -sin cos
   const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
   const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
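The change above (and the matching one in the next hunk) replaces a four-argument _mm_set_epi32 broadcast with _mm_set1_epi32 plus an explicit cast: 0x80000001 does not fit in a signed int, so the static_cast documents the intended bit pattern and avoids an implicit-conversion warning while producing the same register contents. The constant drives the _mm_sign_epi16 trick: each 32-bit lane holds the 16-bit pair {1, -32768}, so the low (cos) half-lane is kept and the high (sin) half-lane is negated, turning {cos, sin} into {cos, -sin}. A scalar sketch of the per-lane rule:

// Scalar model of _mm_sign_epi16 for one 32-bit lane of psin_pcos.
int16_t lane[2] = {cos128, sin128};    // packed by _mm_set1_epi32 above
const int16_t sign[2] = {1, -32768};   // the two halves of 0x80000001
for (int k = 0; k < 2; ++k) {
  // psignw: keep if sign > 0, negate if sign < 0, zero if sign == 0.
  lane[k] = (sign[k] > 0) ? lane[k] : (sign[k] < 0) ? -lane[k] : 0;
}
// lane is now {cos128, -sin128}, i.e. the "-sin cos" pattern noted above.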
@@ -121,8 +120,7 @@
   const int16_t sin128 = Sin128(angle);
   const __m128i psin_pcos = _mm_set1_epi32(
       static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
-  const __m128i sign =
-      _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
+  const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
   // -sin cos, -sin cos, -sin cos, -sin cos
   const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
   const __m128i ba = _mm_unpacklo_epi16(*a, *b);
@@ -221,20 +219,16 @@
 // Discrete Cosine Transforms (DCT).
 
 template <int width>
-LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, const void* source,
-                                     int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
                                      bool should_round, int row_shift) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
-
-  const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0);
+  const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
   const __m128i v_src =
       (width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0);
-  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
   const __m128i v_kTransformRowMultiplier =
       _mm_set1_epi16(kTransformRowMultiplier << 3);
   const __m128i v_src_round =
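The DctDcOnly rewrite above shows the two themes of this file's update. First, the separate source pointer is gone: callers always passed the same buffer for source and destination, so the transforms now run in place on the coefficient buffer, and non_zero_coeff_count is replaced by adjusted_tx_height, a row count the caller has already clamped (a value of 1 means only the DC coefficient is present, which is what enables this early-out path). Second, the 0xffff literal gains a static_cast<int16_t>, since the unsuffixed literal does not fit in a signed 16-bit value. The rounding itself leans on a standard pmulhrsw identity; a scalar model (illustrative only):

// _mm_mulhrs_epi16(a, b) computes (a * b + (1 << 14)) >> 15 per 16-bit lane,
// so multiplying by (kTransformRowMultiplier << 3) is a rounded Q12 descale:
//   (a * (m << 3) + (1 << 14)) >> 15  ==  (a * m + (1 << 11)) >> 12
inline int16_t RoundedRowMultiply(int16_t a, int16_t m) {
  return static_cast<int16_t>((a * m + (1 << 11)) >> 12);
}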
@@ -266,26 +260,22 @@
 }
 
 template <int height>
-LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, const void* source,
-                                           int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
                                            int width) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   const int16_t cos128 = Cos128(32);
 
   // Calculate dc values for first row.
   if (width == 4) {
-    const __m128i v_src = LoadLo8(src);
+    const __m128i v_src = LoadLo8(dst);
     const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
     StoreLo8(dst, xy);
   } else {
     int i = 0;
     do {
-      const __m128i v_src = LoadUnaligned16(&src[i]);
+      const __m128i v_src = LoadUnaligned16(&dst[i]);
       const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
       StoreUnaligned16(&dst[i], xy);
       i += 8;
@@ -294,21 +284,21 @@
 
   // Copy first row to the rest of the block.
   for (int y = 1; y < height; ++y) {
-    memcpy(&dst[y * width], &src[(y - 1) * width], width * sizeof(dst[0]));
+    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
   }
   return true;
 }
 
-template <ButterflyRotationFunc bufferfly_rotation,
-          bool is_fast_bufferfly = false>
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
 LIBGAV1_ALWAYS_INLINE void Dct4Stages(__m128i* s) {
   // stage 12.
-  if (is_fast_bufferfly) {
+  if (is_fast_butterfly) {
     ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
     ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
   } else {
-    bufferfly_rotation(&s[0], &s[1], 32, true);
-    bufferfly_rotation(&s[2], &s[3], 48, false);
+    butterfly_rotation(&s[0], &s[1], 32, true);
+    butterfly_rotation(&s[2], &s[3], 48, false);
   }
 
   // stage 17.
@@ -317,23 +307,22 @@
 }
 
 // Process 4 dct4 rows or columns, depending on the transpose flag.
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, const void* source,
-                                       int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, int32_t step,
+                                       bool transpose) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   __m128i s[4], x[4];
 
   if (stage_is_rectangular) {
     if (transpose) {
       __m128i input[8];
-      LoadSrc<8, 8>(src, step, 0, input);
+      LoadSrc<8, 8>(dst, step, 0, input);
       Transpose4x8To8x4_U16(input, x);
     } else {
-      LoadSrc<16, 4>(src, step, 0, x);
+      LoadSrc<16, 4>(dst, step, 0, x);
     }
   } else {
-    LoadSrc<8, 4>(src, step, 0, x);
+    LoadSrc<8, 4>(dst, step, 0, x);
     if (transpose) {
       Transpose4x4_U16(x, x);
     }
@@ -345,7 +334,7 @@
   s[2] = x[1];
   s[3] = x[3];
 
-  Dct4Stages<bufferfly_rotation>(s);
+  Dct4Stages<butterfly_rotation>(s);
 
   if (stage_is_rectangular) {
     if (transpose) {
@@ -363,16 +352,16 @@
   }
 }
 
-template <ButterflyRotationFunc bufferfly_rotation,
-          bool is_fast_bufferfly = false>
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
 LIBGAV1_ALWAYS_INLINE void Dct8Stages(__m128i* s) {
   // stage 8.
-  if (is_fast_bufferfly) {
+  if (is_fast_butterfly) {
     ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
     ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
   } else {
-    bufferfly_rotation(&s[4], &s[7], 56, false);
-    bufferfly_rotation(&s[5], &s[6], 24, false);
+    butterfly_rotation(&s[4], &s[7], 56, false);
+    butterfly_rotation(&s[5], &s[6], 24, false);
   }
 
   // stage 13.
@@ -380,7 +369,7 @@
   HadamardRotation(&s[6], &s[7], true);
 
   // stage 18.
-  bufferfly_rotation(&s[6], &s[5], 32, true);
+  butterfly_rotation(&s[6], &s[5], 32, true);
 
   // stage 22.
   HadamardRotation(&s[0], &s[7], false);
@@ -390,28 +379,27 @@
 }
 
 // Process dct8 rows or columns, depending on the transpose flag.
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, const void* source,
-                                       int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, int32_t step,
+                                       bool transpose) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   __m128i s[8], x[8];
 
   if (stage_is_rectangular) {
     if (transpose) {
       __m128i input[4];
-      LoadSrc<16, 4>(src, step, 0, input);
+      LoadSrc<16, 4>(dst, step, 0, input);
       Transpose8x4To4x8_U16(input, x);
     } else {
-      LoadSrc<8, 8>(src, step, 0, x);
+      LoadSrc<8, 8>(dst, step, 0, x);
     }
   } else {
     if (transpose) {
       __m128i input[8];
-      LoadSrc<16, 8>(src, step, 0, input);
+      LoadSrc<16, 8>(dst, step, 0, input);
       Transpose8x8_U16(input, x);
     } else {
-      LoadSrc<16, 8>(src, step, 0, x);
+      LoadSrc<16, 8>(dst, step, 0, x);
     }
   }
 
@@ -426,8 +414,8 @@
   s[6] = x[3];
   s[7] = x[7];
 
-  Dct4Stages<bufferfly_rotation>(s);
-  Dct8Stages<bufferfly_rotation>(s);
+  Dct4Stages<butterfly_rotation>(s);
+  Dct8Stages<butterfly_rotation>(s);
 
   if (stage_is_rectangular) {
     if (transpose) {
@@ -448,20 +436,20 @@
   }
 }
 
-template <ButterflyRotationFunc bufferfly_rotation,
-          bool is_fast_bufferfly = false>
+template <ButterflyRotationFunc butterfly_rotation,
+          bool is_fast_butterfly = false>
 LIBGAV1_ALWAYS_INLINE void Dct16Stages(__m128i* s) {
   // stage 5.
-  if (is_fast_bufferfly) {
+  if (is_fast_butterfly) {
     ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
     ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
     ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
     ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
   } else {
-    bufferfly_rotation(&s[8], &s[15], 60, false);
-    bufferfly_rotation(&s[9], &s[14], 28, false);
-    bufferfly_rotation(&s[10], &s[13], 44, false);
-    bufferfly_rotation(&s[11], &s[12], 12, false);
+    butterfly_rotation(&s[8], &s[15], 60, false);
+    butterfly_rotation(&s[9], &s[14], 28, false);
+    butterfly_rotation(&s[10], &s[13], 44, false);
+    butterfly_rotation(&s[11], &s[12], 12, false);
   }
 
   // stage 9.
@@ -471,8 +459,8 @@
   HadamardRotation(&s[14], &s[15], true);
 
   // stage 14.
-  bufferfly_rotation(&s[14], &s[9], 48, true);
-  bufferfly_rotation(&s[13], &s[10], 112, true);
+  butterfly_rotation(&s[14], &s[9], 48, true);
+  butterfly_rotation(&s[13], &s[10], 112, true);
 
   // stage 19.
   HadamardRotation(&s[8], &s[11], false);
@@ -481,8 +469,8 @@
   HadamardRotation(&s[13], &s[14], true);
 
   // stage 23.
-  bufferfly_rotation(&s[13], &s[10], 32, true);
-  bufferfly_rotation(&s[12], &s[11], 32, true);
+  butterfly_rotation(&s[13], &s[10], 32, true);
+  butterfly_rotation(&s[12], &s[11], 32, true);
 
   // stage 26.
   HadamardRotation(&s[0], &s[15], false);
@@ -496,32 +484,31 @@
 }
 
 // Process dct16 rows or columns, depending on the transpose flag.
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, const void* source,
-                                        int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, int32_t step,
+                                        bool transpose) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   __m128i s[16], x[16];
 
   if (stage_is_rectangular) {
     if (transpose) {
       __m128i input[4];
-      LoadSrc<16, 4>(src, step, 0, input);
+      LoadSrc<16, 4>(dst, step, 0, input);
       Transpose8x4To4x8_U16(input, x);
-      LoadSrc<16, 4>(src, step, 8, input);
+      LoadSrc<16, 4>(dst, step, 8, input);
       Transpose8x4To4x8_U16(input, &x[8]);
     } else {
-      LoadSrc<8, 16>(src, step, 0, x);
+      LoadSrc<8, 16>(dst, step, 0, x);
     }
   } else {
     if (transpose) {
       for (int idx = 0; idx < 16; idx += 8) {
         __m128i input[8];
-        LoadSrc<16, 8>(src, step, idx, input);
+        LoadSrc<16, 8>(dst, step, idx, input);
         Transpose8x8_U16(input, &x[idx]);
       }
     } else {
-      LoadSrc<16, 16>(src, step, 0, x);
+      LoadSrc<16, 16>(dst, step, 0, x);
     }
   }
 
@@ -544,9 +531,9 @@
   s[14] = x[7];
   s[15] = x[15];
 
-  Dct4Stages<bufferfly_rotation>(s);
-  Dct8Stages<bufferfly_rotation>(s);
-  Dct16Stages<bufferfly_rotation>(s);
+  Dct4Stages<butterfly_rotation>(s);
+  Dct8Stages<butterfly_rotation>(s);
+  Dct16Stages<butterfly_rotation>(s);
 
   if (stage_is_rectangular) {
     if (transpose) {
@@ -571,7 +558,7 @@
   }
 }
 
-template <ButterflyRotationFunc bufferfly_rotation,
+template <ButterflyRotationFunc butterfly_rotation,
           bool is_fast_butterfly = false>
 LIBGAV1_ALWAYS_INLINE void Dct32Stages(__m128i* s) {
   // stage 3
@@ -585,14 +572,14 @@
     ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
     ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
   } else {
-    bufferfly_rotation(&s[16], &s[31], 62, false);
-    bufferfly_rotation(&s[17], &s[30], 30, false);
-    bufferfly_rotation(&s[18], &s[29], 46, false);
-    bufferfly_rotation(&s[19], &s[28], 14, false);
-    bufferfly_rotation(&s[20], &s[27], 54, false);
-    bufferfly_rotation(&s[21], &s[26], 22, false);
-    bufferfly_rotation(&s[22], &s[25], 38, false);
-    bufferfly_rotation(&s[23], &s[24], 6, false);
+    butterfly_rotation(&s[16], &s[31], 62, false);
+    butterfly_rotation(&s[17], &s[30], 30, false);
+    butterfly_rotation(&s[18], &s[29], 46, false);
+    butterfly_rotation(&s[19], &s[28], 14, false);
+    butterfly_rotation(&s[20], &s[27], 54, false);
+    butterfly_rotation(&s[21], &s[26], 22, false);
+    butterfly_rotation(&s[22], &s[25], 38, false);
+    butterfly_rotation(&s[23], &s[24], 6, false);
   }
   // stage 6.
   HadamardRotation(&s[16], &s[17], false);
@@ -605,10 +592,10 @@
   HadamardRotation(&s[30], &s[31], true);
 
   // stage 10.
-  bufferfly_rotation(&s[30], &s[17], 24 + 32, true);
-  bufferfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
-  bufferfly_rotation(&s[26], &s[21], 24, true);
-  bufferfly_rotation(&s[25], &s[22], 24 + 64, true);
+  butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+  butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+  butterfly_rotation(&s[26], &s[21], 24, true);
+  butterfly_rotation(&s[25], &s[22], 24 + 64, true);
 
   // stage 15.
   HadamardRotation(&s[16], &s[19], false);
@@ -621,10 +608,10 @@
   HadamardRotation(&s[29], &s[30], true);
 
   // stage 20.
-  bufferfly_rotation(&s[29], &s[18], 48, true);
-  bufferfly_rotation(&s[28], &s[19], 48, true);
-  bufferfly_rotation(&s[27], &s[20], 48 + 64, true);
-  bufferfly_rotation(&s[26], &s[21], 48 + 64, true);
+  butterfly_rotation(&s[29], &s[18], 48, true);
+  butterfly_rotation(&s[28], &s[19], 48, true);
+  butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+  butterfly_rotation(&s[26], &s[21], 48 + 64, true);
 
   // stage 24.
   HadamardRotation(&s[16], &s[23], false);
@@ -637,10 +624,10 @@
   HadamardRotation(&s[27], &s[28], true);
 
   // stage 27.
-  bufferfly_rotation(&s[27], &s[20], 32, true);
-  bufferfly_rotation(&s[26], &s[21], 32, true);
-  bufferfly_rotation(&s[25], &s[22], 32, true);
-  bufferfly_rotation(&s[24], &s[23], 32, true);
+  butterfly_rotation(&s[27], &s[20], 32, true);
+  butterfly_rotation(&s[26], &s[21], 32, true);
+  butterfly_rotation(&s[25], &s[22], 32, true);
+  butterfly_rotation(&s[24], &s[23], 32, true);
 
   // stage 29.
   HadamardRotation(&s[0], &s[31], false);
@@ -662,21 +649,19 @@
 }
 
 // Process dct32 rows or columns, depending on the transpose flag.
-LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const void* source,
-                                        const int32_t step,
+LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const int32_t step,
                                         const bool transpose) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   __m128i s[32], x[32];
 
   if (transpose) {
     for (int idx = 0; idx < 32; idx += 8) {
       __m128i input[8];
-      LoadSrc<16, 8>(src, step, idx, input);
+      LoadSrc<16, 8>(dst, step, idx, input);
       Transpose8x8_U16(input, &x[idx]);
     }
   } else {
-    LoadSrc<16, 32>(src, step, 0, x);
+    LoadSrc<16, 32>(dst, step, 0, x);
   }
 
   // stage 1
@@ -735,10 +720,8 @@
 
 // Allow the compiler to call this function instead of force inlining. Tests
 // show the performance is slightly faster.
-void Dct64_SSE4_1(void* dest, const void* source, int32_t step,
-                  bool transpose) {
+void Dct64_SSE4_1(void* dest, int32_t step, bool transpose) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   __m128i s[64], x[32];
 
   if (transpose) {
@@ -746,13 +729,13 @@
     // 64.
     for (int idx = 0; idx < 32; idx += 8) {
       __m128i input[8];
-      LoadSrc<16, 8>(src, step, idx, input);
+      LoadSrc<16, 8>(dst, step, idx, input);
       Transpose8x8_U16(input, &x[idx]);
     }
   } else {
     // The last 32 values of every column are always zero if the |tx_height| is
     // 64.
-    LoadSrc<16, 32>(src, step, 0, x);
+    LoadSrc<16, 32>(dst, step, 0, x);
   }
 
   // stage 1
@@ -957,22 +940,21 @@
 // Asymmetric Discrete Sine Transforms (ADST).
 
 template <bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, const void* source,
-                                        int32_t step, bool transpose) {
+LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, int32_t step,
+                                        bool transpose) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   __m128i s[8], x[4];
 
   if (stage_is_rectangular) {
     if (transpose) {
       __m128i input[8];
-      LoadSrc<8, 8>(src, step, 0, input);
+      LoadSrc<8, 8>(dst, step, 0, input);
       Transpose4x8To8x4_U16(input, x);
     } else {
-      LoadSrc<16, 4>(src, step, 0, x);
+      LoadSrc<16, 4>(dst, step, 0, x);
     }
   } else {
-    LoadSrc<8, 4>(src, step, 0, x);
+    LoadSrc<8, 4>(dst, step, 0, x);
     if (transpose) {
       Transpose4x4_U16(x, x);
     }
@@ -1049,18 +1031,15 @@
 constexpr int16_t kAdst4DcOnlyMultiplier[8] = {1321, 0, 2482, 0,
                                                3344, 0, 2482, 1321};
 
-LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, const void* source,
-                                       int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
                                        bool should_round, int row_shift) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   const __m128i v_src =
-      _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0), 0);
-  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+      _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
   const __m128i v_kTransformRowMultiplier =
       _mm_set1_epi16(kTransformRowMultiplier << 3);
   const __m128i v_src_round =
@@ -1083,19 +1062,14 @@
   return true;
 }
 
-LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, const void* source,
-                                             int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
                                              int width) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
-
   int i = 0;
   do {
-    const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&src[i]));
+    const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&dst[i]));
     const __m128i kAdst4Multiplier_0 = _mm_set1_epi32(kAdst4Multiplier[0]);
     const __m128i kAdst4Multiplier_1 = _mm_set1_epi32(kAdst4Multiplier[1]);
     const __m128i kAdst4Multiplier_2 = _mm_set1_epi32(kAdst4Multiplier[2]);
@@ -1122,28 +1096,27 @@
   return true;
 }
 
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, const void* source,
-                                        int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, int32_t step,
+                                        bool transpose) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   __m128i s[8], x[8];
 
   if (stage_is_rectangular) {
     if (transpose) {
       __m128i input[4];
-      LoadSrc<16, 4>(src, step, 0, input);
+      LoadSrc<16, 4>(dst, step, 0, input);
       Transpose8x4To4x8_U16(input, x);
     } else {
-      LoadSrc<8, 8>(src, step, 0, x);
+      LoadSrc<8, 8>(dst, step, 0, x);
     }
   } else {
     if (transpose) {
       __m128i input[8];
-      LoadSrc<16, 8>(src, step, 0, input);
+      LoadSrc<16, 8>(dst, step, 0, input);
       Transpose8x8_U16(input, x);
     } else {
-      LoadSrc<16, 8>(src, step, 0, x);
+      LoadSrc<16, 8>(dst, step, 0, x);
     }
   }
 
@@ -1158,10 +1131,10 @@
   s[7] = x[6];
 
   // stage 2.
-  bufferfly_rotation(&s[0], &s[1], 60 - 0, true);
-  bufferfly_rotation(&s[2], &s[3], 60 - 16, true);
-  bufferfly_rotation(&s[4], &s[5], 60 - 32, true);
-  bufferfly_rotation(&s[6], &s[7], 60 - 48, true);
+  butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+  butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+  butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+  butterfly_rotation(&s[6], &s[7], 60 - 48, true);
 
   // stage 3.
   HadamardRotation(&s[0], &s[4], false);
@@ -1170,8 +1143,8 @@
   HadamardRotation(&s[3], &s[7], false);
 
   // stage 4.
-  bufferfly_rotation(&s[4], &s[5], 48 - 0, true);
-  bufferfly_rotation(&s[7], &s[6], 48 - 32, true);
+  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
 
   // stage 5.
   HadamardRotation(&s[0], &s[2], false);
@@ -1180,8 +1153,8 @@
   HadamardRotation(&s[5], &s[7], false);
 
   // stage 6.
-  bufferfly_rotation(&s[2], &s[3], 32, true);
-  bufferfly_rotation(&s[6], &s[7], 32, true);
+  butterfly_rotation(&s[2], &s[3], 32, true);
+  butterfly_rotation(&s[6], &s[7], 32, true);
 
   // stage 7.
   const __m128i v_zero = _mm_setzero_si128();
@@ -1213,19 +1186,16 @@
   }
 }
 
-LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, const void* source,
-                                       int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
                                        bool should_round, int row_shift) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   __m128i s[8];
 
-  const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0);
-  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+  const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
   const __m128i v_kTransformRowMultiplier =
       _mm_set1_epi16(kTransformRowMultiplier << 3);
   const __m128i v_src_round =
@@ -1283,20 +1253,16 @@
   return true;
 }
 
-LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, const void* source,
-                                             int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
                                              int width) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   __m128i s[8];
 
   int i = 0;
   do {
-    const __m128i v_src = LoadLo8(&src[i]);
+    const __m128i v_src = LoadLo8(dst);
     // stage 1.
     s[1] = v_src;
 
@@ -1342,32 +1308,31 @@
   return true;
 }
 
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, const void* source,
-                                         int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, int32_t step,
+                                         bool transpose) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   __m128i s[16], x[16];
 
   if (stage_is_rectangular) {
     if (transpose) {
       __m128i input[4];
-      LoadSrc<16, 4>(src, step, 0, input);
+      LoadSrc<16, 4>(dst, step, 0, input);
       Transpose8x4To4x8_U16(input, x);
-      LoadSrc<16, 4>(src, step, 8, input);
+      LoadSrc<16, 4>(dst, step, 8, input);
       Transpose8x4To4x8_U16(input, &x[8]);
     } else {
-      LoadSrc<8, 16>(src, step, 0, x);
+      LoadSrc<8, 16>(dst, step, 0, x);
     }
   } else {
     if (transpose) {
       for (int idx = 0; idx < 16; idx += 8) {
         __m128i input[8];
-        LoadSrc<16, 8>(src, step, idx, input);
+        LoadSrc<16, 8>(dst, step, idx, input);
         Transpose8x8_U16(input, &x[idx]);
       }
     } else {
-      LoadSrc<16, 16>(src, step, 0, x);
+      LoadSrc<16, 16>(dst, step, 0, x);
     }
   }
 
@@ -1390,14 +1355,14 @@
   s[15] = x[14];
 
   // stage 2.
-  bufferfly_rotation(&s[0], &s[1], 62 - 0, true);
-  bufferfly_rotation(&s[2], &s[3], 62 - 8, true);
-  bufferfly_rotation(&s[4], &s[5], 62 - 16, true);
-  bufferfly_rotation(&s[6], &s[7], 62 - 24, true);
-  bufferfly_rotation(&s[8], &s[9], 62 - 32, true);
-  bufferfly_rotation(&s[10], &s[11], 62 - 40, true);
-  bufferfly_rotation(&s[12], &s[13], 62 - 48, true);
-  bufferfly_rotation(&s[14], &s[15], 62 - 56, true);
+  butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+  butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+  butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+  butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+  butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+  butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+  butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+  butterfly_rotation(&s[14], &s[15], 62 - 56, true);
 
   // stage 3.
   HadamardRotation(&s[0], &s[8], false);
@@ -1410,10 +1375,10 @@
   HadamardRotation(&s[7], &s[15], false);
 
   // stage 4.
-  bufferfly_rotation(&s[8], &s[9], 56 - 0, true);
-  bufferfly_rotation(&s[13], &s[12], 8 + 0, true);
-  bufferfly_rotation(&s[10], &s[11], 56 - 32, true);
-  bufferfly_rotation(&s[15], &s[14], 8 + 32, true);
+  butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+  butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+  butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+  butterfly_rotation(&s[15], &s[14], 8 + 32, true);
 
   // stage 5.
   HadamardRotation(&s[0], &s[4], false);
@@ -1426,10 +1391,10 @@
   HadamardRotation(&s[11], &s[15], false);
 
   // stage 6.
-  bufferfly_rotation(&s[4], &s[5], 48 - 0, true);
-  bufferfly_rotation(&s[12], &s[13], 48 - 0, true);
-  bufferfly_rotation(&s[7], &s[6], 48 - 32, true);
-  bufferfly_rotation(&s[15], &s[14], 48 - 32, true);
+  butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+  butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+  butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+  butterfly_rotation(&s[15], &s[14], 48 - 32, true);
 
   // stage 7.
   HadamardRotation(&s[0], &s[2], false);
@@ -1442,10 +1407,10 @@
   HadamardRotation(&s[13], &s[15], false);
 
   // stage 8.
-  bufferfly_rotation(&s[2], &s[3], 32, true);
-  bufferfly_rotation(&s[6], &s[7], 32, true);
-  bufferfly_rotation(&s[10], &s[11], 32, true);
-  bufferfly_rotation(&s[14], &s[15], 32, true);
+  butterfly_rotation(&s[2], &s[3], 32, true);
+  butterfly_rotation(&s[6], &s[7], 32, true);
+  butterfly_rotation(&s[10], &s[11], 32, true);
+  butterfly_rotation(&s[14], &s[15], 32, true);
 
   // stage 9.
   const __m128i v_zero = _mm_setzero_si128();
@@ -1546,20 +1511,17 @@
   x[15] = _mm_subs_epi16(v_zero, s[1]);
 }
 
-LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, const void* source,
-                                        int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
                                         bool should_round, int row_shift) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
   __m128i s[16];
   __m128i x[16];
 
-  const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0);
-  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+  const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
   const __m128i v_kTransformRowMultiplier =
       _mm_set1_epi16(kTransformRowMultiplier << 3);
   const __m128i v_src_round =
@@ -1589,21 +1551,17 @@
   return true;
 }
 
-LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest, const void* source,
-                                              int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+                                              int adjusted_tx_height,
                                               int width) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
-
   int i = 0;
   do {
     __m128i s[16];
     __m128i x[16];
-    const __m128i v_src = LoadUnaligned16(&src[i]);
+    const __m128i v_src = LoadUnaligned16(dst);
     // stage 1.
     s[1] = v_src;
 
@@ -1623,10 +1581,8 @@
 // Identity Transforms.
 
 template <bool is_row_shift>
-LIBGAV1_ALWAYS_INLINE void Identity4_SSE4_1(void* dest, const void* source,
-                                            int32_t step) {
+LIBGAV1_ALWAYS_INLINE void Identity4_SSE4_1(void* dest, int32_t step) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
 
   if (is_row_shift) {
     const int shift = 1;
@@ -1634,7 +1590,7 @@
     const __m128i v_multiplier_one =
         _mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
     for (int i = 0; i < 4; i += 2) {
-      const __m128i v_src = LoadUnaligned16(&src[i * step]);
+      const __m128i v_src = LoadUnaligned16(&dst[i * step]);
       const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
       const __m128i v_src_round_hi = _mm_unpackhi_epi16(v_dual_round, v_src);
       const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
@@ -1647,7 +1603,7 @@
     const __m128i v_multiplier =
         _mm_set1_epi16(kIdentity4MultiplierFraction << 3);
     for (int i = 0; i < 4; i += 2) {
-      const __m128i v_src = LoadUnaligned16(&src[i * step]);
+      const __m128i v_src = LoadUnaligned16(&dst[i * step]);
       const __m128i a = _mm_mulhrs_epi16(v_src, v_multiplier);
       const __m128i b = _mm_adds_epi16(a, v_src);
       StoreUnaligned16(&dst[i * step], b);
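The Identity4 row path above uses an interleave-and-madd idiom: _mm_unpacklo/unpackhi_epi16 pair each source value with the per-lane rounding constant, and multiplying by the packed {1, kIdentity4Multiplier} pair lets a single _mm_madd_epi16 produce round + src * multiplier as a 32-bit intermediate; the rest of the loop (outside this hunk) shifts and repacks those intermediates. A scalar model of one lane (the combined shift amount is an assumption based on the Q12 scale of the multiplier, not shown in this hunk):

// One interleaved lane, written out as scalars (sketch):
//   v_src_round lane  = {round, src}    v_multiplier_one lane = {1, mult}
//   madd result       = round * 1 + src * mult
int32_t Identity4RowLane(int16_t src, int16_t round, int16_t mult, int shift) {
  const int32_t sum = static_cast<int32_t>(round) + src * mult;
  return sum >> (12 + shift);  // assumed descale: Q12 multiplier + row shift
}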
@@ -1655,18 +1611,14 @@
   }
 }
 
-LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, const void* source,
-                                           int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
                                            bool should_round, int tx_height) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
-
-  const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
-  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+  const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
   const __m128i v_kTransformRowMultiplier =
       _mm_set1_epi16(kTransformRowMultiplier << 3);
   const __m128i v_src_round =
@@ -1786,29 +1738,25 @@
   }
 }
 
-LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, const void* source,
-                                                 int32_t step) {
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, int32_t step) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
 
   // When combining the identity8 multiplier with the row shift, the
   // calculations for tx_height equal to 32 can be simplified from
   // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
   const __m128i v_row_multiplier = _mm_set1_epi16(1 << 14);
   for (int h = 0; h < 4; ++h) {
-    const __m128i v_src = LoadUnaligned16(&src[h * step]);
+    const __m128i v_src = LoadUnaligned16(&dst[h * step]);
     const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_row_multiplier);
     StoreUnaligned16(&dst[h * step], v_src_mult);
   }
 }
 
-LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, const void* source,
-                                                int32_t step) {
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, int32_t step) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
 
   for (int h = 0; h < 4; ++h) {
-    const __m128i v_src = LoadUnaligned16(&src[h * step]);
+    const __m128i v_src = LoadUnaligned16(&dst[h * step]);
     // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
     // saturating add here is ok.
     const __m128i a = _mm_adds_epi16(v_src, v_src);
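Identity8Row32 above folds the identity8 doubling and the tx_height == 32 row shift into a single _mm_mulhrs_epi16 by 1 << 14, which is exactly the ((A + 1) >> 1) simplification its comment describes; Identity8Row4 keeps a plain saturating add because no row shift applies at that height. Spelling out the equivalence:

// pmulhrsw computes (a * b + (1 << 14)) >> 15 per lane, so with b = 1 << 14:
//   (a * (1 << 14) + (1 << 14)) >> 15  ==  (a + 1) >> 1
// which matches (((a * 2) + 2) >> 2), i.e. the x2 identity8 multiplier
// combined with the rounded row shift of 2.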
@@ -1816,18 +1764,14 @@
   }
 }
 
-LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, const void* source,
-                                           int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
                                            bool should_round, int row_shift) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
-
-  const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
-  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+  const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
   const __m128i v_kTransformRowMultiplier =
       _mm_set1_epi16(kTransformRowMultiplier << 3);
   const __m128i v_src_round =
@@ -1884,10 +1828,9 @@
   }
 }
 
-LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, const void* source,
-                                                int32_t step, int shift) {
+LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, int32_t step,
+                                                int shift) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
 
   const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
   const __m128i v_multiplier_one =
@@ -1895,8 +1838,8 @@
   const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
 
   for (int h = 0; h < 4; ++h) {
-    const __m128i v_src = LoadUnaligned16(&src[h * step]);
-    const __m128i v_src2 = LoadUnaligned16(&src[h * step + 8]);
+    const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+    const __m128i v_src2 = LoadUnaligned16(&dst[h * step + 8]);
     const __m128i v_src_round0 = _mm_unpacklo_epi16(v_dual_round, v_src);
     const __m128i v_src_round1 = _mm_unpackhi_epi16(v_dual_round, v_src);
     const __m128i v_src2_round0 = _mm_unpacklo_epi16(v_dual_round, v_src2);
@@ -1914,18 +1857,14 @@
   }
 }
 
-LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, const void* source,
-                                            int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
                                             bool should_round, int shift) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
-
-  const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
-  const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+  const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+  const __m128i v_mask =
+      _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
   const __m128i v_kTransformRowMultiplier =
       _mm_set1_epi16(kTransformRowMultiplier << 3);
   const __m128i v_src_round0 =
@@ -1990,17 +1929,15 @@
 }
 
 LIBGAV1_ALWAYS_INLINE void Identity32Row16_SSE4_1(void* dest,
-                                                  const void* source,
                                                   const int32_t step) {
   auto* const dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
 
   // When combining the identity32 multiplier with the row shift, the
   // calculation for tx_height equal to 16 can be simplified from
   // (((A * 4) + 1) >> 1) to (A * 2).
   for (int h = 0; h < 4; ++h) {
     for (int i = 0; i < 32; i += 8) {
-      const __m128i v_src = LoadUnaligned16(&src[h * step + i]);
+      const __m128i v_src = LoadUnaligned16(&dst[h * step + i]);
       // For bitdepth == 8, the identity row clamps to a signed 16bit value, so
       // saturating add here is ok.
       const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
@@ -2009,16 +1946,12 @@
   }
 }
 
-LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest, const void* source,
-                                            int non_zero_coeff_count) {
-  if (non_zero_coeff_count > 1) {
-    return false;
-  }
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+                                            int adjusted_tx_height) {
+  if (adjusted_tx_height > 1) return false;
 
   auto* dst = static_cast<int16_t*>(dest);
-  const auto* const src = static_cast<const int16_t*>(source);
-
-  const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
+  const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
   const __m128i v_kTransformRowMultiplier =
       _mm_set1_epi16(kTransformRowMultiplier << 3);
   const __m128i v_src = _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
@@ -2063,11 +1996,11 @@
 LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
                                        const int start_x, const int start_y,
                                        const void* source,
-                                       const int non_zero_coeff_count) {
+                                       const int adjusted_tx_height) {
   const auto* const src = static_cast<const int16_t*>(source);
   __m128i s[4], x[4];
 
-  if (non_zero_coeff_count == 1) {
+  if (adjusted_tx_height == 1) {
     // Special case: only src[0] is nonzero.
     //   src[0]  0   0   0
     //       0   0   0   0
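The hunk below applies the change that the rest of this file repeats for every transform type: each *TransformLoop_SSE4_1 entry point that branched on is_row is split into a *TransformLoopRow_SSE4_1 / *TransformLoopColumn_SSE4_1 pair, and adjusted_tx_height (the number of rows the caller already determined to carry nonzero coefficients) replaces non_zero_coeff_count together with the GetNumRows computation that used to live inside the SIMD code. The frame view is likewise only fetched in the Column variants, since the Row variants never touch dst_frame. A rough sketch of how such a pair might be registered in the SSE4_1 init (the table layout and enum names are assumptions, not taken from this diff):

// Hypothetical dispatch-table wiring for the split row/column loops:
dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
    Dct4TransformLoopRow_SSE4_1;
dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
    Dct4TransformLoopColumn_SSE4_1;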
@@ -2292,479 +2225,459 @@
   }
 }
 
-void Dct4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
-                              void* src_buffer, int start_x, int start_y,
-                              void* dst_frame, bool is_row,
-                              int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                 TransformSize tx_size, int adjusted_tx_height,
+                                 void* src_buffer, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
   const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+  const int row_shift = static_cast<int>(tx_height == 16);
 
-  if (is_row) {
-    const bool should_round = (tx_height == 8);
-    const int row_shift = static_cast<int>(tx_height == 16);
-
-    if (DctDcOnly<4>(&src[0], &src[0], non_zero_coeff_count, should_round,
-                     row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<4>(src, num_rows);
-    }
-
-    if (num_rows <= 4) {
-      // Process 4 1d dct4 rows in parallel.
-      Dct4_SSE4_1<ButterflyRotation_4, false>(&src[0], &src[0], /*step=*/4,
-                                              /*transpose=*/true);
-    } else {
-      // Process 8 1d dct4 rows in parallel per iteration.
-      int i = 0;
-      do {
-        Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i * 4], &src[i * 4],
-                                               /*step=*/4, /*transpose=*/true);
-        i += 8;
-      } while (i < num_rows);
-    }
-    if (tx_height == 16) {
-      RowShift<4>(src, num_rows, 1);
-    }
+  if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d dct4 rows in parallel.
+    Dct4_SSE4_1<ButterflyRotation_4, false>(src, /*step=*/4,
+                                            /*transpose=*/true);
+  } else {
+    // Process 8 1d dct4 rows in parallel per iteration.
+    int i = 0;
+    do {
+      Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i * 4], /*step=*/4,
+                                             /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  if (tx_height == 16) {
+    RowShift<4>(src, adjusted_tx_height, 1);
+  }
+}
+
+void Dct4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<4>(src, tx_width);
   }
 
-  if (!DctDcOnlyColumn<4>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+  if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
     if (tx_width == 4) {
       // Process 4 1d dct4 columns in parallel.
-      Dct4_SSE4_1<ButterflyRotation_4, false>(&src[0], &src[0], tx_width,
+      Dct4_SSE4_1<ButterflyRotation_4, false>(src, tx_width,
                                               /*transpose=*/false);
     } else {
       // Process 8 1d dct4 columns in parallel per iteration.
       int i = 0;
       do {
-        Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i], &src[i], tx_width,
+        Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i], tx_width,
                                                /*transpose=*/false);
         i += 8;
       } while (i < tx_width);
     }
   }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound(frame, start_x, start_y, tx_width, 4, src, tx_type);
 }
 
-void Dct8TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
-                              void* src_buffer, int start_x, int start_y,
-                              void* dst_frame, bool is_row,
-                              int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                 TransformSize tx_size, int adjusted_tx_height,
+                                 void* src_buffer, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
 
-  if (is_row) {
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-
-    if (DctDcOnly<8>(&src[0], &src[0], non_zero_coeff_count, should_round,
-                     row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<8>(src, num_rows);
-    }
-
-    if (num_rows <= 4) {
-      // Process 4 1d dct8 rows in parallel.
-      Dct8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], /*step=*/8,
-                                             /*transpose=*/true);
-    } else {
-      // Process 8 1d dct8 rows in parallel per iteration.
-      int i = 0;
-      do {
-        Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], &src[i * 8],
-                                                /*step=*/8, /*transpose=*/true);
-        i += 8;
-      } while (i < num_rows);
-    }
-    if (row_shift > 0) {
-      RowShift<8>(src, num_rows, row_shift);
-    }
+  if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d dct8 rows in parallel.
+    Dct8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+  } else {
+    // Process 8 1d dct8 rows in parallel per iteration.
+    int i = 0;
+    do {
+      Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+                                              /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  if (row_shift > 0) {
+    RowShift<8>(src, adjusted_tx_height, row_shift);
+  }
+}
+
+void Dct8TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int start_x, int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<8>(src, tx_width);
   }
 
-  if (!DctDcOnlyColumn<8>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+  if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
     if (tx_width == 4) {
       // Process 4 1d dct8 columns in parallel.
-      Dct8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
-                                             /*transpose=*/false);
+      Dct8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
     } else {
       // Process 8 1d dct8 columns in parallel per iteration.
       int i = 0;
       do {
-        Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
+        Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
                                                 /*transpose=*/false);
         i += 8;
       } while (i < tx_width);
     }
   }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound(frame, start_x, start_y, tx_width, 8, src, tx_type);
 }
 
-void Dct16TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
-                               void* src_buffer, int start_x, int start_y,
-                               void* dst_frame, bool is_row,
-                               int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
 
-  if (is_row) {
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-
-    if (DctDcOnly<16>(&src[0], &src[0], non_zero_coeff_count, should_round,
-                      row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<16>(src, num_rows);
-    }
-
-    if (num_rows <= 4) {
-      // Process 4 1d dct16 rows in parallel.
-      Dct16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 16,
-                                              /*transpose=*/true);
-    } else {
-      int i = 0;
-      do {
-        // Process 8 1d dct16 rows in parallel per iteration.
-        Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], &src[i * 16], 16,
-                                                 /*transpose=*/true);
-        i += 8;
-      } while (i < num_rows);
-    }
-    // row_shift is always non zero here.
-    RowShift<16>(src, num_rows, row_shift);
-
+  if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d dct16 rows in parallel.
+    Dct16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+  } else {
+    int i = 0;
+    do {
+      // Process 8 1d dct16 rows in parallel per iteration.
+      Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+                                               /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  // row_shift is always non zero here.
+  RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct16TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int start_x, int start_y,
+                                     void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<16>(src, tx_width);
   }
 
-  if (!DctDcOnlyColumn<16>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+  if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
     if (tx_width == 4) {
       // Process 4 1d dct16 columns in parallel.
-      Dct16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
-                                              /*transpose=*/false);
+      Dct16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
     } else {
       int i = 0;
       do {
         // Process 8 1d dct16 columns in parallel per iteration.
-        Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
+        Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
                                                  /*transpose=*/false);
         i += 8;
       } while (i < tx_width);
     }
   }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound(frame, start_x, start_y, tx_width, 16, src, tx_type);
 }
 
-void Dct32TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
-                               void* src_buffer, int start_x, int start_y,
-                               void* dst_frame, bool is_row,
-                               int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
 
-  if (is_row) {
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-
-    if (DctDcOnly<32>(&src[0], &src[0], non_zero_coeff_count, should_round,
-                      row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<32>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<32>(src, num_rows);
-    }
-    // Process 8 1d dct32 rows in parallel per iteration.
-    int i = 0;
-    do {
-      Dct32_SSE4_1(&src[i * 32], &src[i * 32], 32, /*transpose=*/true);
-      i += 8;
-    } while (i < num_rows);
-    // row_shift is always non zero here.
-    RowShift<32>(src, num_rows, row_shift);
-
+  if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
-  if (!DctDcOnlyColumn<32>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+  if (should_round) {
+    ApplyRounding<32>(src, adjusted_tx_height);
+  }
+  // Process 8 1d dct32 rows in parallel per iteration.
+  int i = 0;
+  do {
+    Dct32_SSE4_1(&src[i * 32], 32, /*transpose=*/true);
+    i += 8;
+  } while (i < adjusted_tx_height);
+  // row_shift is always non zero here.
+  RowShift<32>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct32TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int start_x, int start_y,
+                                     void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
     // Process 8 1d dct32 columns in parallel per iteration.
     int i = 0;
     do {
-      Dct32_SSE4_1(&src[i], &src[i], tx_width, /*transpose=*/false);
+      Dct32_SSE4_1(&src[i], tx_width, /*transpose=*/false);
       i += 8;
     } while (i < tx_width);
   }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound(frame, start_x, start_y, tx_width, 32, src, tx_type);
 }
 
-void Dct64TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
-                               void* src_buffer, int start_x, int start_y,
-                               void* dst_frame, bool is_row,
-                               int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct64TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
 
-  if (is_row) {
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-
-    if (DctDcOnly<64>(&src[0], &src[0], non_zero_coeff_count, should_round,
-                      row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<32>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<64>(src, num_rows);
-    }
-    // Process 8 1d dct64 rows in parallel per iteration.
-    int i = 0;
-    do {
-      Dct64_SSE4_1(&src[i * 64], &src[i * 64], 64, /*transpose=*/true);
-      i += 8;
-    } while (i < num_rows);
-    // row_shift is always non zero here.
-    RowShift<64>(src, num_rows, row_shift);
-
+  if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
-  if (!DctDcOnlyColumn<64>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+  if (should_round) {
+    ApplyRounding<64>(src, adjusted_tx_height);
+  }
+  // Process 8 1d dct64 rows in parallel per iteration.
+  int i = 0;
+  do {
+    Dct64_SSE4_1(&src[i * 64], 64, /*transpose=*/true);
+    i += 8;
+  } while (i < adjusted_tx_height);
+  // row_shift is always non zero here.
+  RowShift<64>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct64TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int start_x, int start_y,
+                                     void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
     // Process 8 1d dct64 columns in parallel per iteration.
     int i = 0;
     do {
-      Dct64_SSE4_1(&src[i], &src[i], tx_width, /*transpose=*/false);
+      Dct64_SSE4_1(&src[i], tx_width, /*transpose=*/false);
       i += 8;
     } while (i < tx_width);
   }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound(frame, start_x, start_y, tx_width, 64, src, tx_type);
 }
 
-void Adst4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
-                               void* src_buffer, int start_x, int start_y,
-                               void* dst_frame, bool is_row,
-                               int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Adst4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
   const int tx_height = kTransformHeight[tx_size];
+  const int row_shift = static_cast<int>(tx_height == 16);
+  const bool should_round = (tx_height == 8);
 
-  if (is_row) {
-    const uint8_t row_shift = static_cast<uint8_t>(tx_height == 16);
-    const bool should_round = (tx_height == 8);
-
-    if (Adst4DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
-                    row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<4>(src, num_rows);
-    }
-
-    // Process 4 1d adst4 rows in parallel per iteration.
-    int i = 0;
-    do {
-      Adst4_SSE4_1<false>(&src[i * 4], &src[i * 4], /*step=*/4,
-                          /*transpose=*/true);
-      i += 4;
-    } while (i < num_rows);
-
-    if (row_shift != 0u) {
-      RowShift<4>(src, num_rows, 1);
-    }
+  if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+
+  // Process 4 1d adst4 rows in parallel per iteration.
+  int i = 0;
+  do {
+    Adst4_SSE4_1<false>(&src[i * 4], /*step=*/4, /*transpose=*/true);
+    i += 4;
+  } while (i < adjusted_tx_height);
+
+  if (row_shift != 0) {
+    RowShift<4>(src, adjusted_tx_height, 1);
+  }
+}
+
+void Adst4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int start_x, int start_y,
+                                     void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<4>(src, tx_width);
   }
 
-  if (!Adst4DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+  if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
     // Process 4 1d adst4 columns in parallel per iteration.
     int i = 0;
     do {
-      Adst4_SSE4_1<false>(&src[i], &src[i], tx_width, /*transpose=*/false);
+      Adst4_SSE4_1<false>(&src[i], tx_width, /*transpose=*/false);
       i += 4;
     } while (i < tx_width);
   }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
                                                    tx_width, 4, src, tx_type);
 }
 
-void Adst8TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
-                               void* src_buffer, int start_x, int start_y,
-                               void* dst_frame, bool is_row,
-                               int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Adst8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                  TransformSize tx_size, int adjusted_tx_height,
+                                  void* src_buffer, int /*start_x*/,
+                                  int /*start_y*/, void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
 
-  if (is_row) {
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-
-    if (Adst8DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
-                    row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<8>(src, num_rows);
-    }
-
-    if (num_rows <= 4) {
-      // Process 4 1d adst8 rows in parallel.
-      Adst8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], /*step=*/8,
-                                              /*transpose=*/true);
-    } else {
-      // Process 8 1d adst8 rows in parallel per iteration.
-      int i = 0;
-      do {
-        Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], &src[i * 8],
-                                                 /*step=*/8,
-                                                 /*transpose=*/true);
-        i += 8;
-      } while (i < num_rows);
-    }
-    if (row_shift > 0) {
-      RowShift<8>(src, num_rows, row_shift);
-    }
+  if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d adst8 rows in parallel.
+    Adst8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8,
+                                            /*transpose=*/true);
+  } else {
+    // Process 8 1d adst8 rows in parallel per iteration.
+    int i = 0;
+    do {
+      Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+                                               /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  if (row_shift > 0) {
+    RowShift<8>(src, adjusted_tx_height, row_shift);
+  }
+}
+
+void Adst8TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                     TransformSize tx_size,
+                                     int adjusted_tx_height, void* src_buffer,
+                                     int start_x, int start_y,
+                                     void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<8>(src, tx_width);
   }
 
-  if (!Adst8DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+  if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
     if (tx_width == 4) {
       // Process 4 1d adst8 columns in parallel.
-      Adst8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
-                                              /*transpose=*/false);
+      Adst8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
     } else {
       // Process 8 1d adst8 columns in parallel per iteration.
       int i = 0;
       do {
-        Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
+        Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
                                                  /*transpose=*/false);
         i += 8;
       } while (i < tx_width);
     }
   }
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
                                                    tx_width, 8, src, tx_type);
 }
 
-void Adst16TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
-                                void* src_buffer, int start_x, int start_y,
-                                void* dst_frame, bool is_row,
-                                int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Adst16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                   TransformSize tx_size,
+                                   int adjusted_tx_height, void* src_buffer,
+                                   int /*start_x*/, int /*start_y*/,
+                                   void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
 
-  if (is_row) {
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-
-    if (Adst16DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
-                     row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<16>(src, num_rows);
-    }
-
-    if (num_rows <= 4) {
-      // Process 4 1d adst16 rows in parallel.
-      Adst16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 16,
-                                               /*transpose=*/true);
-    } else {
-      int i = 0;
-      do {
-        // Process 8 1d adst16 rows in parallel per iteration.
-        Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], &src[i * 16],
-                                                  16, /*transpose=*/true);
-        i += 8;
-      } while (i < num_rows);
-    }
-    // row_shift is always non zero here.
-    RowShift<16>(src, num_rows, row_shift);
-
+  if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+
+  if (adjusted_tx_height <= 4) {
+    // Process 4 1d adst16 rows in parallel.
+    Adst16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+  } else {
+    int i = 0;
+    do {
+      // Process 8 1d adst16 rows in parallel per iteration.
+      Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+                                                /*transpose=*/true);
+      i += 8;
+    } while (i < adjusted_tx_height);
+  }
+  // row_shift is always non zero here.
+  RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Adst16TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                      TransformSize tx_size,
+                                      int adjusted_tx_height, void* src_buffer,
+                                      int start_x, int start_y,
+                                      void* dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<16>(src, tx_width);
   }
 
-  if (!Adst16DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+  if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
     if (tx_width == 4) {
       // Process 4 1d adst16 columns in parallel.
-      Adst16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
-                                               /*transpose=*/false);
+      Adst16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
     } else {
       int i = 0;
       do {
         // Process 8 1d adst16 columns in parallel per iteration.
-        Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
+        Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
                                                   /*transpose=*/false);
         i += 8;
       } while (i < tx_width);
@@ -2774,56 +2687,57 @@
                                                    tx_width, 16, src, tx_type);
 }
 
-void Identity4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
-                                   void* src_buffer, int start_x, int start_y,
-                                   void* dst_frame, bool is_row,
-                                   int non_zero_coeff_count) {
+void Identity4TransformLoopRow_SSE4_1(TransformType tx_type,
+                                      TransformSize tx_size,
+                                      int adjusted_tx_height, void* src_buffer,
+                                      int /*start_x*/, int /*start_y*/,
+                                      void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize4x4) {
+    return;
+  }
+
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = (tx_height == 8);
+  if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<4>(src, adjusted_tx_height);
+  }
+  if (tx_height < 16) {
+    int i = 0;
+    do {
+      Identity4_SSE4_1<false>(&src[i * 4], /*step=*/4);
+      i += 4;
+    } while (i < adjusted_tx_height);
+  } else {
+    int i = 0;
+    do {
+      Identity4_SSE4_1<true>(&src[i * 4], /*step=*/4);
+      i += 4;
+    } while (i < adjusted_tx_height);
+  }
+}
+
+void Identity4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                         TransformSize tx_size,
+                                         int adjusted_tx_height,
+                                         void* src_buffer, int start_x,
+                                         int start_y, void* dst_frame) {
   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
   auto* src = static_cast<int16_t*>(src_buffer);
   const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
 
-  if (is_row) {
-    // Special case: Process row calculations during column transform call.
-    // Improves performance.
-    if (tx_type == kTransformTypeIdentityIdentity &&
-        tx_size == kTransformSize4x4) {
-      return;
-    }
-
-    const bool should_round = (tx_height == 8);
-    if (Identity4DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
-                        tx_height)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<4>(src, num_rows);
-    }
-    if (tx_height < 16) {
-      int i = 0;
-      do {
-        Identity4_SSE4_1<false>(&src[i * 4], &src[i * 4], /*step=*/4);
-        i += 4;
-      } while (i < num_rows);
-    } else {
-      int i = 0;
-      do {
-        Identity4_SSE4_1<true>(&src[i * 4], &src[i * 4], /*step=*/4);
-        i += 4;
-      } while (i < num_rows);
-    }
-    return;
-  }
-  assert(!is_row);
-  const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
   // Special case: Process row calculations during column transform call.
   if (tx_type == kTransformTypeIdentityIdentity &&
       (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
-    Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width, height,
-                                   src);
+    Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                                   adjusted_tx_height, src);
     return;
   }
 
@@ -2831,274 +2745,272 @@
     FlipColumns<4>(src, tx_width);
   }
 
-  Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width, height, src);
+  Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                              adjusted_tx_height, src);
 }
 
-void Identity8TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
-                                   void* src_buffer, int start_x, int start_y,
-                                   void* dst_frame, bool is_row,
-                                   int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
-  auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
-
-  if (is_row) {
-    // Special case: Process row calculations during column transform call.
-    // Improves performance.
-    if (tx_type == kTransformTypeIdentityIdentity &&
-        tx_size == kTransformSize8x4) {
-      return;
-    }
-
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-    if (Identity8DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
-                        row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<8>(src, num_rows);
-    }
-
-    // When combining the identity8 multiplier with the row shift, the
-    // calculations for tx_height == 8 and tx_height == 16 can be simplified
-    // from ((A * 2) + 1) >> 1) to A.
-    if ((tx_height & 0x18) != 0) {
-      return;
-    }
-    if (tx_height == 32) {
-      int i = 0;
-      do {
-        Identity8Row32_SSE4_1(&src[i * 8], &src[i * 8], /*step=*/8);
-        i += 4;
-      } while (i < num_rows);
-      return;
-    }
-
-    // Process kTransformSize8x4
-    assert(tx_size == kTransformSize8x4);
-    int i = 0;
-    do {
-      Identity8Row4_SSE4_1(&src[i * 8], &src[i * 8], /*step=*/8);
-      i += 4;
-    } while (i < num_rows);
+void Identity8TransformLoopRow_SSE4_1(TransformType tx_type,
+                                      TransformSize tx_size,
+                                      int adjusted_tx_height, void* src_buffer,
+                                      int /*start_x*/, int /*start_y*/,
+                                      void* /*dst_frame*/) {
+  // Special case: Process row calculations during column transform call.
+  // Improves performance.
+  if (tx_type == kTransformTypeIdentityIdentity &&
+      tx_size == kTransformSize8x4) {
     return;
   }
 
-  assert(!is_row);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_height = kTransformHeight[tx_size];
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+  if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+    return;
+  }
+
+  if (should_round) {
+    ApplyRounding<8>(src, adjusted_tx_height);
+  }
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 16 can be simplified
+  // from (((A * 2) + 1) >> 1) to A.
+  if ((tx_height & 0x18) != 0) {
+    return;
+  }
+  if (tx_height == 32) {
+    int i = 0;
+    do {
+      Identity8Row32_SSE4_1(&src[i * 8], /*step=*/8);
+      i += 4;
+    } while (i < adjusted_tx_height);
+    return;
+  }
+
+  assert(tx_size == kTransformSize8x4);
+  int i = 0;
+  do {
+    Identity8Row4_SSE4_1(&src[i * 8], /*step=*/8);
+    i += 4;
+  } while (i < adjusted_tx_height);
+}
+
+void Identity8TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                         TransformSize tx_size,
+                                         int adjusted_tx_height,
+                                         void* src_buffer, int start_x,
+                                         int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<8>(src, tx_width);
   }
 
-  const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
-  Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width, height,
-                                     src);
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+                                     adjusted_tx_height, src);
 }
 
-void Identity16TransformLoop_SSE4_1(TransformType tx_type,
-                                    TransformSize tx_size, void* src_buffer,
-                                    int start_x, int start_y, void* dst_frame,
-                                    bool is_row, int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Identity16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height, void* src_buffer,
+                                       int /*start_x*/, int /*start_y*/,
+                                       void* /*dst_frame*/) {
   auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
-  const int tx_height = kTransformHeight[tx_size];
-
-  if (is_row) {
-    const bool should_round = kShouldRound[tx_size];
-    const uint8_t row_shift = kTransformRowShift[tx_size];
-    if (Identity16DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
-                         row_shift)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
-    if (should_round) {
-      ApplyRounding<16>(src, num_rows);
-    }
-    int i = 0;
-    do {
-      Identity16Row_SSE4_1(&src[i * 16], &src[i * 16], /*step=*/16,
-                           kTransformRowShift[tx_size]);
-      i += 4;
-    } while (i < num_rows);
+  const bool should_round = kShouldRound[tx_size];
+  const uint8_t row_shift = kTransformRowShift[tx_size];
+  if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
     return;
   }
 
-  assert(!is_row);
+  if (should_round) {
+    ApplyRounding<16>(src, adjusted_tx_height);
+  }
+  int i = 0;
+  do {
+    Identity16Row_SSE4_1(&src[i * 16], /*step=*/16,
+                         kTransformRowShift[tx_size]);
+    i += 4;
+  } while (i < adjusted_tx_height);
+}
+
+void Identity16TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                          TransformSize tx_size,
+                                          int adjusted_tx_height,
+                                          void* src_buffer, int start_x,
+                                          int start_y, void* dst_frame) {
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
   if (kTransformFlipColumnsMask.Contains(tx_type)) {
     FlipColumns<16>(src, tx_width);
   }
-  const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
-  Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width, height,
-                                      src);
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+                                      adjusted_tx_height, src);
 }
 
-void Identity32TransformLoop_SSE4_1(TransformType tx_type,
-                                    TransformSize tx_size, void* src_buffer,
-                                    int start_x, int start_y, void* dst_frame,
-                                    bool is_row, int non_zero_coeff_count) {
-  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
-  auto* src = static_cast<int16_t*>(src_buffer);
-  const int tx_width = kTransformWidth[tx_size];
+void Identity32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+                                       TransformSize tx_size,
+                                       int adjusted_tx_height, void* src_buffer,
+                                       int /*start_x*/, int /*start_y*/,
+                                       void* /*dst_frame*/) {
   const int tx_height = kTransformHeight[tx_size];
-
-  if (is_row) {
-    // When combining the identity32 multiplier with the row shift, the
-    // calculations for tx_height == 8 and tx_height == 32 can be simplified
-    // from ((A * 4) + 2) >> 2) to A.
-    if ((tx_height & 0x28) != 0) {
-      return;
-    }
-
-    // Process kTransformSize32x16. The src is always rounded before the
-    // identity transform and shifted by 1 afterwards.
-
-    if (Identity32DcOnly(&src[0], &src[0], non_zero_coeff_count)) {
-      return;
-    }
-
-    const int num_rows =
-        GetNumRows<32>(tx_type, tx_height, non_zero_coeff_count);
-
-    // Process kTransformSize32x16
-    assert(tx_size == kTransformSize32x16);
-    ApplyRounding<32>(src, num_rows);
-    int i = 0;
-    do {
-      Identity32Row16_SSE4_1(&src[i * 32], &src[i * 32], /*step=*/32);
-      i += 4;
-    } while (i < num_rows);
+  // When combining the identity32 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 32 can be simplified
+  // from (((A * 4) + 2) >> 2) to A.
+  if ((tx_height & 0x28) != 0) {
     return;
   }
 
-  assert(!is_row);
-  const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
-  Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width, height, src);
+  // Process kTransformSize32x16. The src is always rounded before the
+  // identity transform and shifted by 1 afterwards.
+  auto* src = static_cast<int16_t*>(src_buffer);
+  if (Identity32DcOnly(src, adjusted_tx_height)) {
+    return;
+  }
+
+  assert(tx_size == kTransformSize32x16);
+  ApplyRounding<32>(src, adjusted_tx_height);
+  int i = 0;
+  do {
+    Identity32Row16_SSE4_1(&src[i * 32], /*step=*/32);
+    i += 4;
+  } while (i < adjusted_tx_height);
 }
 
-void Wht4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
-                              void* src_buffer, int start_x, int start_y,
-                              void* dst_frame, bool is_row,
-                              int non_zero_coeff_count) {
+void Identity32TransformLoopColumn_SSE4_1(TransformType /*tx_type*/,
+                                          TransformSize tx_size,
+                                          int adjusted_tx_height,
+                                          void* src_buffer, int start_x,
+                                          int start_y, void* dst_frame) {
+  auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+  auto* src = static_cast<int16_t*>(src_buffer);
+  const int tx_width = kTransformWidth[tx_size];
+
+  Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+                               adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_SSE4_1(TransformType tx_type, TransformSize tx_size,
+                                 int /*adjusted_tx_height*/,
+                                 void* /*src_buffer*/, int /*start_x*/,
+                                 int /*start_y*/, void* /*dst_frame*/) {
   assert(tx_type == kTransformTypeDctDct);
   assert(tx_size == kTransformSize4x4);
   static_cast<void>(tx_type);
   static_cast<void>(tx_size);
-  if (is_row) {
-    // Do both row and column transforms in the column-transform pass.
-    return;
-  }
+  // Do both row and column transforms in the column-transform pass.
+}
 
-  assert(!is_row);
+void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type,
+                                    TransformSize tx_size,
+                                    int adjusted_tx_height, void* src_buffer,
+                                    int start_x, int start_y, void* dst_frame) {
+  assert(tx_type == kTransformTypeDctDct);
+  assert(tx_size == kTransformSize4x4);
+  static_cast<void>(tx_type);
+  static_cast<void>(tx_size);
+
+  // Do both row and column transforms in the column-transform pass.
   // Process 4 1d wht4 rows and columns in parallel.
   const auto* src = static_cast<int16_t*>(src_buffer);
   auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
-  Wht4_SSE4_1(frame, start_x, start_y, src, non_zero_coeff_count);
+  Wht4_SSE4_1(frame, start_x, start_y, src, adjusted_tx_height);
 }
 
 //------------------------------------------------------------------------------
 
-template <typename Residual, typename Pixel>
-void InitAll(Dsp* const dsp) {
-  // Maximum transform size for Dct is 64.
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
-      Dct4TransformLoop_SSE4_1;
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
-      Dct8TransformLoop_SSE4_1;
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
-      Dct16TransformLoop_SSE4_1;
-  dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
-      Dct32TransformLoop_SSE4_1;
-  dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
-      Dct64TransformLoop_SSE4_1;
-
-  // Maximum transform size for Adst is 16.
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
-      Adst4TransformLoop_SSE4_1;
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
-      Adst8TransformLoop_SSE4_1;
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
-      Adst16TransformLoop_SSE4_1;
-
-  // Maximum transform size for Identity transform is 32.
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
-      Identity4TransformLoop_SSE4_1;
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
-      Identity8TransformLoop_SSE4_1;
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
-      Identity16TransformLoop_SSE4_1;
-  dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
-      Identity32TransformLoop_SSE4_1;
-
-  // Maximum transform size for Wht is 4.
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
-      Wht4TransformLoop_SSE4_1;
-}
-
 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);
-#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
-  InitAll<int16_t, uint8_t>(dsp);
-#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+  // Maximum transform size for Dct is 64.
 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct)
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
-      Dct4TransformLoop_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+      Dct4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+      Dct4TransformLoopColumn_SSE4_1;
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformDct)
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
-      Dct8TransformLoop_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+      Dct8TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+      Dct8TransformLoopColumn_SSE4_1;
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformDct)
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
-      Dct16TransformLoop_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+      Dct16TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+      Dct16TransformLoopColumn_SSE4_1;
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformDct)
-  dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
-      Dct32TransformLoop_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+      Dct32TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+      Dct32TransformLoopColumn_SSE4_1;
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize64_1DTransformDct)
-  dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
-      Dct64TransformLoop_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+      Dct64TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+      Dct64TransformLoopColumn_SSE4_1;
 #endif
+
+  // Maximum transform size for Adst is 16.
 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst)
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
-      Adst4TransformLoop_SSE4_1;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+      Adst4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+      Adst4TransformLoopColumn_SSE4_1;
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformAdst)
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
-      Adst8TransformLoop_SSE4_1;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+      Adst8TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+      Adst8TransformLoopColumn_SSE4_1;
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformAdst)
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
-      Adst16TransformLoop_SSE4_1;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+      Adst16TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+      Adst16TransformLoopColumn_SSE4_1;
 #endif
+
+  // Maximum transform size for Identity transform is 32.
 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity)
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
-      Identity4TransformLoop_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+      Identity4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+      Identity4TransformLoopColumn_SSE4_1;
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformIdentity)
-  dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
-      Identity8TransformLoop_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+      Identity8TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+      Identity8TransformLoopColumn_SSE4_1;
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformIdentity)
-  dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
-      Identity16TransformLoop_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+      Identity16TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+      Identity16TransformLoopColumn_SSE4_1;
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformIdentity)
-  dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
-      Identity32TransformLoop_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+      Identity32TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+      Identity32TransformLoopColumn_SSE4_1;
 #endif
+
+  // Maximum transform size for Wht is 4.
 #if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht)
-  dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
-      Wht4TransformLoop_SSE4_1;
-#endif
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+      Wht4TransformLoopRow_SSE4_1;
+  dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+      Wht4TransformLoopColumn_SSE4_1;
 #endif
 }
 
@@ -3109,7 +3021,7 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 namespace libgav1 {
 namespace dsp {
 
@@ -3117,4 +3029,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
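For reference, the rewrite above splits each monolithic *TransformLoop_SSE4_1 function (which branched on is_row and derived num_rows from non_zero_coeff_count) into separate Row/Column entry points that take a precomputed adjusted_tx_height, and reindexes the dispatch table as inverse_transforms[transform][size][kRow|kColumn]. A minimal sketch of a caller driving the split entries; the wrapper name is hypothetical, and only the table layout and loop signatures come from the patch:

// Hypothetical wrapper, not part of the patch: runs the row pass then the
// column pass for a 4x4 DCT using the table layout registered in Init8bpp().
void InverseDct4Add_8bpp(const Dsp& dsp, TransformType tx_type,
                         int adjusted_tx_height, int16_t* coeffs, int start_x,
                         int start_y, Array2DView<uint8_t>* frame) {
  // Row pass: transforms |coeffs| in place; the frame pointer is unused.
  dsp.inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow](
      tx_type, kTransformSize4x4, adjusted_tx_height, coeffs, start_x, start_y,
      frame);
  // Column pass: finishes the 2D transform and stores the result to |frame|.
  dsp.inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn](
      tx_type, kTransformSize4x4, adjusted_tx_height, coeffs, start_x, start_y,
      frame);
}
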
diff --git a/libgav1/src/dsp/x86/inverse_transform_sse4.h b/libgav1/src/dsp/x86/inverse_transform_sse4.h
index 423173b..106084b 100644
--- a/libgav1/src/dsp/x86/inverse_transform_sse4.h
+++ b/libgav1/src/dsp/x86/inverse_transform_sse4.h
@@ -32,7 +32,7 @@
 
 // If sse4 is enabled and the baseline isn't set due to a higher level of
 // optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct
 #define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_SSE4_1
@@ -85,5 +85,5 @@
 #ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht
 #define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_SSE4_1
 #endif
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
 #endif  // LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
diff --git a/libgav1/src/dsp/x86/loop_filter_sse4.cc b/libgav1/src/dsp/x86/loop_filter_sse4.cc
index 462b885..b9da2d5 100644
--- a/libgav1/src/dsp/x86/loop_filter_sse4.cc
+++ b/libgav1/src/dsp/x86/loop_filter_sse4.cc
@@ -15,7 +15,7 @@
 #include "src/dsp/loop_filter.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <smmintrin.h>
 
@@ -350,7 +350,7 @@
   const __m128i v_mask =
       _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
 
-  if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
     __m128i oqp1_f6;
     __m128i oqp0_f6;
 
@@ -454,7 +454,7 @@
   const __m128i v_mask =
       _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
 
-  if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
     __m128i oqp1_f6;
     __m128i oqp0_f6;
 
@@ -595,7 +595,7 @@
   const __m128i v_mask =
       _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
 
-  if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
     __m128i oqp2_f8;
     __m128i oqp1_f8;
     __m128i oqp0_f8;
@@ -697,7 +697,7 @@
   const __m128i v_mask =
       _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
 
-  if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
     __m128i oqp2_f8;
     __m128i oqp1_f8;
     __m128i oqp0_f8;
@@ -838,7 +838,7 @@
   const __m128i v_mask =
       _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
 
-  if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
     const __m128i p6 = Load4(dst - 7 * stride);
     const __m128i p5 = Load4(dst - 6 * stride);
     const __m128i p4 = Load4(dst - 5 * stride);
@@ -864,8 +864,7 @@
     oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
     oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
 
-    if (_mm_test_all_zeros(v_flat4_mask,
-                           _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) {
+    if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
       __m128i oqp5_f14;
       __m128i oqp4_f14;
       __m128i oqp3_f14;
@@ -1050,7 +1049,7 @@
   const __m128i v_mask =
       _mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
 
-  if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
     const __m128i v_isflatouter4_mask =
         IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
     const __m128i v_flat4_mask =
@@ -1066,8 +1065,7 @@
     oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
     oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
 
-    if (_mm_test_all_zeros(v_flat4_mask,
-                           _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) {
+    if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
       __m128i oqp5_f14;
       __m128i oqp4_f14;
       __m128i oqp3_f14;
@@ -1458,7 +1456,7 @@
   const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
   const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
 
-  if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
     __m128i oqp1_f6;
     __m128i oqp0_f6;
 
@@ -1572,7 +1570,7 @@
   const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
   const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
 
-  if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
     __m128i oqp1_f6;
     __m128i oqp0_f6;
 
@@ -1711,7 +1709,7 @@
   const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
   const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
 
-  if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
     __m128i oqp2_f8;
     __m128i oqp1_f8;
     __m128i oqp0_f8;
@@ -1821,7 +1819,7 @@
   const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
   const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
 
-  if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
     __m128i oqp2_f8;
     __m128i oqp1_f8;
     __m128i oqp0_f8;
@@ -1957,7 +1955,7 @@
   const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
   const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
 
-  if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
     const __m128i p6 = LoadLo8(dst - 7 * stride);
     const __m128i p5 = LoadLo8(dst - 6 * stride);
     const __m128i p4 = LoadLo8(dst - 5 * stride);
@@ -1984,8 +1982,7 @@
     oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
     oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
 
-    if (_mm_test_all_zeros(v_flat4_mask,
-                           _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) {
+    if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
       __m128i oqp5_f14;
       __m128i oqp4_f14;
       __m128i oqp3_f14;
@@ -2133,7 +2130,7 @@
   const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
   const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
 
-  if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+  if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
     const __m128i v_isflatouter4_mask =
         IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
     const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask);
@@ -2150,8 +2147,7 @@
     oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
     oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
 
-    if (_mm_test_all_zeros(v_flat4_mask,
-                           _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) {
+    if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
       __m128i oqp5_f14;
       __m128i oqp4_f14;
       __m128i oqp3_f14;
@@ -2245,7 +2241,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 namespace libgav1 {
 namespace dsp {
 
@@ -2253,4 +2249,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
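The loop_filter_sse4.cc changes above drop the _mm_cmpeq_epi8/_mm_cmpeq_epi16 all-ones operand from the zero tests: _mm_test_all_zeros(a, mask) returns 1 exactly when (a & mask) == 0, and v & v == v, so testing v_mask against itself is equivalent to testing it against an all-ones mask while saving one instruction. A standalone check illustrating the equivalence (not patch code), assuming an SSE4.1 target:

#include <smmintrin.h>

#include <cassert>

// Returns true when no bit of |v| is set; both forms below agree because
// PTEST evaluates (v & mask) == 0 and v & v == v.
inline bool MaskIsAllZero(const __m128i v) {
  const __m128i all_ones = _mm_cmpeq_epi8(v, v);        // old form's mask
  const bool old_form = _mm_test_all_zeros(v, all_ones) != 0;
  const bool new_form = _mm_test_all_zeros(v, v) != 0;  // patched form
  assert(old_form == new_form);
  static_cast<void>(old_form);
  return new_form;
}
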
diff --git a/libgav1/src/dsp/x86/loop_filter_sse4.h b/libgav1/src/dsp/x86/loop_filter_sse4.h
index b8c1fe5..4795d8b 100644
--- a/libgav1/src/dsp/x86/loop_filter_sse4.h
+++ b/libgav1/src/dsp/x86/loop_filter_sse4.h
@@ -32,7 +32,7 @@
 
 // If sse4 is enabled and the baseline isn't set due to a higher level of
 // optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal
 #define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
@@ -114,6 +114,6 @@
   LIBGAV1_CPU_SSE4_1
 #endif
 
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
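In the new 10-bit AVX2 loop-restoration file below, the Wiener horizontal kernels fold mirrored samples together before multiplying (s[0]+s[6], s[1]+s[5], s[2]+s[4]), relying on the symmetry of the 7-tap Wiener filter so each output needs only four products. A scalar sketch of the same folding (names are illustrative, not libgav1 code):

#include <cstdint>

// c[0..3] holds the four unique coefficients; c[0] also weights s[6], c[1]
// weights s[5], and c[2] weights s[4], mirroring WienerHorizontalTap7Kernel.
inline int WienerTap7Scalar(const int16_t s[7], const int16_t c[4]) {
  return (s[0] + s[6]) * c[0] + (s[1] + s[5]) * c[1] + (s[2] + s[4]) * c[2] +
         s[3] * c[3];
}
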
diff --git a/libgav1/src/dsp/x86/loop_restoration_10bit_avx2.cc b/libgav1/src/dsp/x86/loop_restoration_10bit_avx2.cc
new file mode 100644
index 0000000..b38f322
--- /dev/null
+++ b/libgav1/src/dsp/x86/loop_restoration_10bit_avx2.cc
@@ -0,0 +1,3157 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline void WienerHorizontalClip(const __m256i s[2],
+                                 int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit = (offset << 2) - 1;
+  const __m256i offsets = _mm256_set1_epi16(-offset);
+  const __m256i limits = _mm256_set1_epi16(limit - offset);
+  const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+  const __m256i sum0 = _mm256_add_epi32(s[0], round);
+  const __m256i sum1 = _mm256_add_epi32(s[1], round);
+  const __m256i rounded_sum0 =
+      _mm256_srai_epi32(sum0, kInterRoundBitsHorizontal);
+  const __m256i rounded_sum1 =
+      _mm256_srai_epi32(sum1, kInterRoundBitsHorizontal);
+  const __m256i rounded_sum = _mm256_packs_epi32(rounded_sum0, rounded_sum1);
+  const __m256i d0 = _mm256_max_epi16(rounded_sum, offsets);
+  const __m256i d1 = _mm256_min_epi16(d0, limits);
+  StoreAligned32(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7Kernel(const __m256i s[7],
+                                       const __m256i filter[2],
+                                       int16_t* const wiener_buffer) {
+  const __m256i s06 = _mm256_add_epi16(s[0], s[6]);
+  const __m256i s15 = _mm256_add_epi16(s[1], s[5]);
+  const __m256i s24 = _mm256_add_epi16(s[2], s[4]);
+  const __m256i ss0 = _mm256_unpacklo_epi16(s06, s15);
+  const __m256i ss1 = _mm256_unpackhi_epi16(s06, s15);
+  const __m256i ss2 = _mm256_unpacklo_epi16(s24, s[3]);
+  const __m256i ss3 = _mm256_unpackhi_epi16(s24, s[3]);
+  __m256i madds[4];
+  madds[0] = _mm256_madd_epi16(ss0, filter[0]);
+  madds[1] = _mm256_madd_epi16(ss1, filter[0]);
+  madds[2] = _mm256_madd_epi16(ss2, filter[1]);
+  madds[3] = _mm256_madd_epi16(ss3, filter[1]);
+  madds[0] = _mm256_add_epi32(madds[0], madds[2]);
+  madds[1] = _mm256_add_epi32(madds[1], madds[3]);
+  WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m256i s[5], const __m256i filter,
+                                       int16_t* const wiener_buffer) {
+  const __m256i s04 = _mm256_add_epi16(s[0], s[4]);
+  const __m256i s13 = _mm256_add_epi16(s[1], s[3]);
+  const __m256i s2d = _mm256_add_epi16(s[2], s[2]);
+  const __m256i s0m = _mm256_sub_epi16(s04, s2d);
+  const __m256i s1m = _mm256_sub_epi16(s13, s2d);
+  const __m256i ss0 = _mm256_unpacklo_epi16(s0m, s1m);
+  const __m256i ss1 = _mm256_unpackhi_epi16(s0m, s1m);
+  __m256i madds[2];
+  madds[0] = _mm256_madd_epi16(ss0, filter);
+  madds[1] = _mm256_madd_epi16(ss1, filter);
+  const __m256i s2_lo = _mm256_unpacklo_epi16(s[2], _mm256_setzero_si256());
+  const __m256i s2_hi = _mm256_unpackhi_epi16(s[2], _mm256_setzero_si256());
+  const __m256i s2x128_lo = _mm256_slli_epi32(s2_lo, 7);
+  const __m256i s2x128_hi = _mm256_slli_epi32(s2_hi, 7);
+  madds[0] = _mm256_add_epi32(madds[0], s2x128_lo);
+  madds[1] = _mm256_add_epi32(madds[1], s2x128_hi);
+  WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[3], const __m256i filter,
+                                       int16_t* const wiener_buffer) {
+  const __m256i s02 = _mm256_add_epi16(s[0], s[2]);
+  const __m256i ss0 = _mm256_unpacklo_epi16(s02, s[1]);
+  const __m256i ss1 = _mm256_unpackhi_epi16(s02, s[1]);
+  __m256i madds[2];
+  madds[0] = _mm256_madd_epi16(ss0, filter);
+  madds[1] = _mm256_madd_epi16(ss1, filter);
+  WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m256i* const coefficients,
+                                 int16_t** const wiener_buffer) {
+  __m256i filter[2];
+  filter[0] = _mm256_shuffle_epi32(*coefficients, 0x0);
+  filter[1] = _mm256_shuffle_epi32(*coefficients, 0x55);
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i s[7];
+      s[0] = LoadUnaligned32(src + x + 0);
+      s[1] = LoadUnaligned32(src + x + 1);
+      s[2] = LoadUnaligned32(src + x + 2);
+      s[3] = LoadUnaligned32(src + x + 3);
+      s[4] = LoadUnaligned32(src + x + 4);
+      s[5] = LoadUnaligned32(src + x + 5);
+      s[6] = LoadUnaligned32(src + x + 6);
+      WienerHorizontalTap7Kernel(s, filter, *wiener_buffer + x);
+      x += 16;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m256i* const coefficients,
+                                 int16_t** const wiener_buffer) {
+  const __m256i filter =
+      _mm256_shuffle_epi8(*coefficients, _mm256_set1_epi32(0x05040302));
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i s[5];
+      s[0] = LoadUnaligned32(src + x + 0);
+      s[1] = LoadUnaligned32(src + x + 1);
+      s[2] = LoadUnaligned32(src + x + 2);
+      s[3] = LoadUnaligned32(src + x + 3);
+      s[4] = LoadUnaligned32(src + x + 4);
+      WienerHorizontalTap5Kernel(s, filter, *wiener_buffer + x);
+      x += 16;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m256i* const coefficients,
+                                 int16_t** const wiener_buffer) {
+  const __m256i filter = _mm256_shuffle_epi32(*coefficients, 0x55);
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i s[3];
+      s[0] = LoadUnaligned32(src + x + 0);
+      s[1] = LoadUnaligned32(src + x + 1);
+      s[2] = LoadUnaligned32(src + x + 2);
+      WienerHorizontalTap3Kernel(s, filter, *wiener_buffer + x);
+      x += 16;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap1(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      const __m256i s0 = LoadUnaligned32(src + x);
+      const __m256i d0 = _mm256_slli_epi16(s0, 4);
+      StoreAligned32(*wiener_buffer + x, d0);
+      x += 16;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline __m256i WienerVertical7(const __m256i a[4], const __m256i filter[4]) {
+  const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+  const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+  const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+  const __m256i madd3 = _mm256_madd_epi16(a[3], filter[3]);
+  const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+  const __m256i madd23 = _mm256_add_epi32(madd2, madd3);
+  const __m256i sum = _mm256_add_epi32(madd01, madd23);
+  return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical5(const __m256i a[3], const __m256i filter[3]) {
+  const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+  const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+  const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+  const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+  const __m256i sum = _mm256_add_epi32(madd01, madd2);
+  return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a[2], const __m256i filter[2]) {
+  const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+  const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+  const __m256i sum = _mm256_add_epi32(madd0, madd1);
+  return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
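+// Packs two vectors of 32-bit sums to unsigned 16 bits with saturation and
+// clamps the result to the 10-bit maximum of 1023.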
+inline __m256i WienerVerticalClip(const __m256i s[2]) {
+  const __m256i d = _mm256_packus_epi32(s[0], s[1]);
+  return _mm256_min_epu16(d, _mm256_set1_epi16(1023));
+}
+
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+                                     const __m256i filter[2]) {
+  const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m256i b[4], c[2];
+  b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+  b[2] = _mm256_unpacklo_epi16(a[4], a[5]);
+  b[3] = _mm256_unpacklo_epi16(a[6], round);
+  c[0] = WienerVertical7(b, filter);
+  b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+  b[2] = _mm256_unpackhi_epi16(a[4], a[5]);
+  b[3] = _mm256_unpackhi_epi16(a[6], round);
+  c[1] = WienerVertical7(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+                                     const __m256i filter[3]) {
+  const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m256i b[3], c[2];
+  b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+  b[2] = _mm256_unpacklo_epi16(a[4], round);
+  c[0] = WienerVertical5(b, filter);
+  b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+  b[2] = _mm256_unpackhi_epi16(a[4], round);
+  c[1] = WienerVertical5(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3],
+                                     const __m256i filter[2]) {
+  const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m256i b[2], c[2];
+  b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm256_unpacklo_epi16(a[2], round);
+  c[0] = WienerVertical3(b, filter);
+  b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm256_unpackhi_epi16(a[2], round);
+  c[1] = WienerVertical3(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter[2], __m256i a[7]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+  a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+  a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+  return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter[3], __m256i a[5]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+  return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter[2], __m256i a[3]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter[2], __m256i d[2]) {
+  __m256i a[8];
+  d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
+  d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter[3], __m256i d[2]) {
+  __m256i a[6];
+  d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+  d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter[2], __m256i d[2]) {
+  __m256i a[4];
+  d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
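+// filter[3] pairs coefficients[0] with a weight of 1 so that the rounding
+// bias interleaved as the last unpack operand in WienerVerticalFilter7() is
+// added by the same madd.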
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[4], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+  __m256i filter[4];
+  filter[0] = _mm256_shuffle_epi32(c, 0x0);
+  filter[1] = _mm256_shuffle_epi32(c, 0x55);
+  filter[2] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+  filter[3] =
+      _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2];
+      WienerVerticalTap7Kernel2(wiener_buffer + x, width, filter, d);
+      StoreUnaligned32(dst + x, d[0]);
+      StoreUnaligned32(dst + dst_stride + x, d[1]);
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[7];
+      const __m256i d =
+          WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+      StoreUnaligned32(dst + x, d);
+      x += 16;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[3], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+  __m256i filter[3];
+  filter[0] = _mm256_shuffle_epi32(c, 0x0);
+  filter[1] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+  filter[2] =
+      _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2];
+      WienerVerticalTap5Kernel2(wiener_buffer + x, width, filter, d);
+      StoreUnaligned32(dst + x, d[0]);
+      StoreUnaligned32(dst + dst_stride + x, d[1]);
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[5];
+      const __m256i d =
+          WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+      StoreUnaligned32(dst + x, d);
+      x += 16;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[2], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  __m256i filter[2];
+  filter[0] =
+      _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+  filter[1] =
+      _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2][2];
+      WienerVerticalTap3Kernel2(wiener_buffer + x, width, filter, d[0]);
+      StoreUnaligned32(dst + x, d[0][0]);
+      StoreUnaligned32(dst + dst_stride + x, d[0][1]);
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[3];
+      const __m256i d =
+          WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+      StoreUnaligned32(dst + x, d);
+      x += 16;
+    } while (x < width);
+  }
+}
+
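+// Applies a rounding shift right by 4 and clamps the result to the 10-bit
+// range [0, 1023].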
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+                                     uint16_t* const dst) {
+  const __m256i a = LoadAligned32(wiener_buffer);
+  const __m256i b = _mm256_add_epi16(a, _mm256_set1_epi16(8));
+  const __m256i c = _mm256_srai_epi16(b, 4);
+  const __m256i d = _mm256_max_epi16(c, _mm256_setzero_si256());
+  const __m256i e = _mm256_min_epi16(d, _mm256_set1_epi16(1023));
+  StoreUnaligned32(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               uint16_t* dst, const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+      x += 16;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      x += 16;
+    } while (x < width);
+  }
+}
+
+void WienerFilter_AVX2(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const ptrdiff_t stride, const void* const top_border,
+    const ptrdiff_t top_border_stride, const void* const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 16);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // horizontal filtering.
+  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  const __m128i c =
+      LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+  const __m256i coefficients_horizontal = _mm256_broadcastq_epi64(c);
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, height_extra,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         height_extra, &coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, height_extra,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         height_extra, &coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    // The maximum over-reads happen here.
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         &coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, &coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
+  }
+
+  // vertical filtering.
+  // Over-writes up to 15 values.
+  const int16_t* const filter_vertical =
+      restoration_info.wiener_info.filter[WienerInfo::kVertical];
+  auto* dst = static_cast<uint16_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, the top
+    // and bottom rows of |wiener_buffer| can be duplicated accordingly.
+    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+           sizeof(*wiener_buffer_horizontal) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+                       filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+                       height, filter_vertical + 1, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+                       wiener_stride, height, filter_vertical + 2, dst, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+                       wiener_stride, height, dst, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+constexpr int kSumOffset = 24;
+
+// SIMD loads overread (number of pixels in a SIMD register) - (width % 8) -
+// 2 * padding pixels, where padding is 3 for Pass 1 and 2 for Pass 2. A SIMD
+// register holds 16 bytes for SSE4.1 and 32 bytes for AVX2.
+constexpr int kOverreadInBytesPass1_128 = 4;
+constexpr int kOverreadInBytesPass2_128 = 8;
+constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16;
+constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16;
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               __m256i dst[2]) {
+  dst[0] = LoadAligned32(src[0] + x);
+  dst[1] = LoadAligned32(src[1] + x);
+}
+
+inline void LoadAligned32x2U16Msan(const uint16_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[2]) {
+  dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+  dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+  dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               __m256i dst[3]) {
+  dst[0] = LoadAligned32(src[0] + x);
+  dst[1] = LoadAligned32(src[1] + x);
+  dst[2] = LoadAligned32(src[2] + x);
+}
+
+inline void LoadAligned32x3U16Msan(const uint16_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[3]) {
+  dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+  dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+  dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+  dst[0] = LoadAligned16(src + 0);
+  dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               __m256i dst[2][2]) {
+  LoadAligned64(src[0] + x, dst[0]);
+  LoadAligned64(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32Msan(const uint32_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[2][2]) {
+  LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+  LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+  LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               __m256i dst[3][2]) {
+  LoadAligned64(src[0] + x, dst[0]);
+  LoadAligned64(src[1] + x, dst[1]);
+  LoadAligned64(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32Msan(const uint32_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[3][2]) {
+  LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+  LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+  LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+  StoreAligned16(dst + 0, src[0]);
+  StoreAligned16(dst + 4, src[1]);
+}
+
+// The AVX2 ymm register holds ma[0], ma[1], ..., ma[7], and ma[16], ma[17],
+// ..., ma[23].
+// There is an 8 pixel gap between the first half and the second half.
+constexpr int kMaStoreOffset = 8;
+
+inline void StoreAligned32_ma(uint16_t* src, const __m256i v) {
+  StoreAligned16(src + 0 * 8, _mm256_extracti128_si256(v, 0));
+  StoreAligned16(src + 2 * 8, _mm256_extracti128_si256(v, 1));
+}
+
+inline void StoreAligned64_ma(uint16_t* src, const __m256i v[2]) {
+  // The next 4 lines are much faster than:
+  // StoreAligned32(src + 0, _mm256_permute2x128_si256(v[0], v[1], 0x20));
+  // StoreAligned32(src + 16, _mm256_permute2x128_si256(v[0], v[1], 0x31));
+  StoreAligned16(src + 0 * 8, _mm256_extracti128_si256(v[0], 0));
+  StoreAligned16(src + 1 * 8, _mm256_extracti128_si256(v[1], 0));
+  StoreAligned16(src + 2 * 8, _mm256_extracti128_si256(v[0], 1));
+  StoreAligned16(src + 3 * 8, _mm256_extracti128_si256(v[1], 1));
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate super inefficient code and the whole
+// decoder could be 15% slower.
+
+inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) {
+  const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) {
+  const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VmullNLo8(const __m256i src0, const int src1) {
+  const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m256i VmullNHi8(const __m256i src0, const int src1) {
+  const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullLo16(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullHi16(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrU16(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1)));
+  return _mm_srli_epi16(sum, src1);
+}
+
+inline __m256i VrshrU16(const __m256i src0, const int src1) {
+  const __m256i sum =
+      _mm256_add_epi16(src0, _mm256_set1_epi16(1 << (src1 - 1)));
+  return _mm256_srli_epi16(sum, src1);
+}
+
+inline __m256i VrshrS32(const __m256i src0, const int src1) {
+  const __m256i sum =
+      _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+  return _mm256_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+  return _mm_srli_epi32(sum, src1);
+}
+
+inline __m256i VrshrU32(const __m256i src0, const int src1) {
+  const __m256i sum =
+      _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+  return _mm256_srli_epi32(sum, src1);
+}
+
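+// Unpacks each 16-bit value with a zero high half so that madd produces
+// value * value in each 32-bit output lane.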
+inline void Square(const __m128i src, __m128i dst[2]) {
+  const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
+  dst[0] = _mm_madd_epi16(s0, s0);
+  dst[1] = _mm_madd_epi16(s1, s1);
+}
+
+inline void Square(const __m256i src, __m256i dst[2]) {
+  const __m256i s0 = _mm256_unpacklo_epi16(src, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpackhi_epi16(src, _mm256_setzero_si256());
+  dst[0] = _mm256_madd_epi16(s0, s0);
+  dst[1] = _mm256_madd_epi16(s1, s1);
+}
+
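+// The Prepare*() helpers build the shifted windows needed for 3- and 5-element
+// horizontal sums by aligning |src[1]| over |src[0]| in element-sized steps.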
+inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) {
+  dst[0] = _mm256_alignr_epi8(src[1], src[0], 0);
+  dst[1] = _mm256_alignr_epi8(src[1], src[0], 1);
+  dst[2] = _mm256_alignr_epi8(src[1], src[0], 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 4);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare3_32(const __m256i src[2], __m256i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm256_alignr_epi8(src[1], src[0], 4);
+  dst[2] = _mm256_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_16(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+  dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_32(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 12);
+  dst[4] = src[1];
+}
+
+inline void Prepare5_32(const __m256i src[2], __m256i dst[5]) {
+  Prepare3_32(src, dst);
+  dst[3] = _mm256_alignr_epi8(src[1], src[0], 12);
+  dst[4] = src[1];
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi16(src0, src1);
+  return _mm_add_epi16(sum, src2);
+}
+
+inline __m256i Sum3_16(const __m256i src0, const __m256i src1,
+                       const __m256i src2) {
+  const __m256i sum = _mm256_add_epi16(src0, src1);
+  return _mm256_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_16(const __m256i src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi32(src0, src1);
+  return _mm_add_epi32(sum, src2);
+}
+
+inline __m256i Sum3_32(const __m256i src0, const __m256i src1,
+                       const __m256i src2) {
+  const __m256i sum = _mm256_add_epi32(src0, src1);
+  return _mm256_add_epi32(sum, src2);
+}
+
+inline __m128i Sum3_32(const __m128i src[3]) {
+  return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_32(const __m256i src[3]) {
+  return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m256i Sum3WLo16(const __m256i src[3]) {
+  const __m256i sum = VaddlLo8(src[0], src[1]);
+  return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WHi16(const __m256i src[3]) {
+  const __m256i sum = VaddlHi8(src[0], src[1]);
+  return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+  const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+  const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+  const __m128i sum = _mm_add_epi16(sum01, sum23);
+  return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m256i Sum5_16(const __m256i src[5]) {
+  const __m256i sum01 = _mm256_add_epi16(src[0], src[1]);
+  const __m256i sum23 = _mm256_add_epi16(src[2], src[3]);
+  const __m256i sum = _mm256_add_epi16(sum01, sum23);
+  return _mm256_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+                       const __m128i* const src2, const __m128i* const src3,
+                       const __m128i* const src4) {
+  const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+  const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+  const __m128i sum = _mm_add_epi32(sum01, sum23);
+  return _mm_add_epi32(sum, *src4);
+}
+
+inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1,
+                       const __m256i* const src2, const __m256i* const src3,
+                       const __m256i* const src4) {
+  const __m256i sum01 = _mm256_add_epi32(*src0, *src1);
+  const __m256i sum23 = _mm256_add_epi32(*src2, *src3);
+  const __m256i sum = _mm256_add_epi32(sum01, sum23);
+  return _mm256_add_epi32(sum, *src4);
+}
+
+inline __m128i Sum5_32(const __m128i src[5]) {
+  return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline __m256i Sum5_32(const __m256i src[5]) {
+  return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum3Horizontal16(const __m128i src[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  return Sum3_16(s);
+}
+
+inline __m256i Sum3Horizontal16(const uint16_t* const src,
+                                const ptrdiff_t over_read_in_bytes) {
+  __m256i s[3];
+  s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+  s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+  s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+  return Sum3_16(s);
+}
+
+inline __m128i Sum5Horizontal16(const __m128i src[2]) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  return Sum5_16(s);
+}
+
+inline __m256i Sum5Horizontal16(const uint16_t* const src,
+                                const ptrdiff_t over_read_in_bytes) {
+  __m256i s[5];
+  s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+  s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+  s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+  s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6);
+  s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8);
+  return Sum5_16(s);
+}
+
+inline void SumHorizontal16(const uint16_t* const src,
+                            const ptrdiff_t over_read_in_bytes,
+                            __m256i* const row3, __m256i* const row5) {
+  __m256i s[5];
+  s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+  s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+  s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+  s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6);
+  s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8);
+  const __m256i sum04 = _mm256_add_epi16(s[0], s[4]);
+  *row3 = Sum3_16(s + 1);
+  *row5 = _mm256_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal16(const uint16_t* const src,
+                            const ptrdiff_t over_read_in_bytes,
+                            __m256i* const row3_0, __m256i* const row3_1,
+                            __m256i* const row5_0, __m256i* const row5_1) {
+  SumHorizontal16(src + 0, over_read_in_bytes + 0, row3_0, row5_0);
+  SumHorizontal16(src + 16, over_read_in_bytes + 32, row3_1, row5_1);
+}
+
+inline void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3,
+                            __m128i* const row_sq5) {
+  const __m128i sum04 = _mm_add_epi32(src[0], src[4]);
+  *row_sq3 = Sum3_32(src + 1);
+  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m256i src[5], __m256i* const row_sq3,
+                            __m256i* const row_sq5) {
+  const __m256i sum04 = _mm256_add_epi32(src[0], src[4]);
+  *row_sq3 = Sum3_32(src + 1);
+  *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0,
+                            __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+                            __m128i* const row_sq5_1) {
+  __m128i s[5];
+  Prepare5_32(src + 0, s);
+  SumHorizontal32(s, row_sq3_0, row_sq5_0);
+  Prepare5_32(src + 1, s);
+  SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
+inline void SumHorizontal32(const __m256i src[3], __m256i* const row_sq3_0,
+                            __m256i* const row_sq3_1, __m256i* const row_sq5_0,
+                            __m256i* const row_sq5_1) {
+  __m256i s[5];
+  Prepare5_32(src + 0, s);
+  SumHorizontal32(s, row_sq3_0, row_sq5_0);
+  Prepare5_32(src + 1, s);
+  SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
+inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum3_32(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum3_32(s);
+}
+
+inline void Sum3Horizontal32(const __m256i src[3], __m256i dst[2]) {
+  __m256i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum3_32(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum3_32(s);
+}
+
+inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) {
+  __m128i s[5];
+  Prepare5_32(src + 0, s);
+  dst[0] = Sum5_32(s);
+  Prepare5_32(src + 1, s);
+  dst[1] = Sum5_32(s);
+}
+
+inline void Sum5Horizontal32(const __m256i src[3], __m256i dst[2]) {
+  __m256i s[5];
+  Prepare5_32(src + 0, s);
+  dst[0] = Sum5_32(s);
+  Prepare5_32(src + 1, s);
+  dst[1] = Sum5_32(s);
+}
+
+void SumHorizontal16(const __m128i src[2], __m128i* const row3,
+                     __m128i* const row5) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  const __m128i sum04 = _mm_add_epi16(s[0], s[4]);
+  *row3 = Sum3_16(s + 1);
+  *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+inline __m256i Sum343Lo(const __m256i ma3[3]) {
+  const __m256i sum = Sum3WLo16(ma3);
+  const __m256i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343Hi(const __m256i ma3[3]) {
+  const __m256i sum = Sum3WHi16(ma3);
+  const __m256i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343(const __m256i src[3]) {
+  const __m256i sum = Sum3_32(src);
+  const __m256i sum3 = Sum3_32(sum, sum, sum);
+  return _mm256_add_epi32(sum3, src[1]);
+}
+
+inline void Sum343(const __m256i src[3], __m256i dst[2]) {
+  __m256i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum343(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum343(s);
+}
+
+inline __m256i Sum565Lo(const __m256i src[3]) {
+  const __m256i sum = Sum3WLo16(src);
+  const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+  const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+  return VaddwLo8(sum5, src[1]);
+}
+
+inline __m256i Sum565Hi(const __m256i src[3]) {
+  const __m256i sum = Sum3WHi16(src);
+  const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+  const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+  return VaddwHi8(sum5, src[1]);
+}
+
+inline __m256i Sum565(const __m256i src[3]) {
+  const __m256i sum = Sum3_32(src);
+  const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+  const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+  return _mm256_add_epi32(sum5, src[1]);
+}
+
+inline void Sum565(const __m256i src[3], __m256i dst[2]) {
+  __m256i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum565(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum565(s);
+}
+
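+// Computes horizontal 3-tap and 5-tap sums of pixels and of squared pixels
+// for two input rows, writing the results to the box sum buffers used by the
+// SGR passes.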
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  const ptrdiff_t overread_in_bytes_128 =
+      kOverreadInBytesPass1_128 - sizeof(*src) * width;
+  const ptrdiff_t overread_in_bytes_256 =
+      kOverreadInBytesPass1_256 - sizeof(*src) * width;
+  int y = 2;
+  do {
+    __m128i s0[2], sq_128[4], s3, s5, sq3[2], sq5[2];
+    __m256i sq[8];
+    s0[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0);
+    s0[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+    Square(s0[0], sq_128 + 0);
+    Square(s0[1], sq_128 + 2);
+    SumHorizontal16(s0, &s3, &s5);
+    StoreAligned16(sum3, s3);
+    StoreAligned16(sum5, s5);
+    SumHorizontal32(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]);
+    StoreAligned32U32(square_sum3, sq3);
+    StoreAligned32U32(square_sum5, sq5);
+    src += 8;
+    sum3 += 8;
+    sum5 += 8;
+    square_sum3 += 8;
+    square_sum5 += 8;
+    sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+    sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m256i s[2], row3[2], row5[2], row_sq3[2], row_sq5[2];
+      s[0] = LoadUnaligned32Msan(
+          src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+      s[1] = LoadUnaligned32Msan(
+          src + 24,
+          overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24));
+      Square(s[0], sq + 2);
+      Square(s[1], sq + 6);
+      sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+      sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+      sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+      sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+      SumHorizontal16(
+          src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8),
+          &row3[0], &row3[1], &row5[0], &row5[1]);
+      StoreAligned64(sum3, row3);
+      StoreAligned64(sum5, row5);
+      SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned64(square_sum3 + 0, row_sq3);
+      StoreAligned64(square_sum5 + 0, row_sq5);
+      SumHorizontal32(sq + 4, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned64(square_sum3 + 16, row_sq3);
+      StoreAligned64(square_sum5 + 16, row_sq5);
+      sq[0] = sq[6];
+      sq[1] = sq[7];
+      src += 32;
+      sum3 += 32;
+      sum5 += 32;
+      square_sum3 += 32;
+      square_sum5 += 32;
+      x -= 32;
+    } while (x != 0);
+    src += src_stride - sum_width - 8;
+    sum3 += sum_stride - sum_width - 8;
+    sum5 += sum_stride - sum_width - 8;
+    square_sum3 += sum_stride - sum_width - 8;
+    square_sum5 += sum_stride - sum_width - 8;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  int overread_in_bytes_128, overread_in_bytes_256;
+  if (size == 3) {
+    overread_in_bytes_128 = kOverreadInBytesPass2_128;
+    overread_in_bytes_256 = kOverreadInBytesPass2_256;
+  } else {
+    overread_in_bytes_128 = kOverreadInBytesPass1_128;
+    overread_in_bytes_256 = kOverreadInBytesPass1_256;
+  }
+  overread_in_bytes_128 -= sizeof(*src) * width;
+  overread_in_bytes_256 -= sizeof(*src) * width;
+  int y = 2;
+  do {
+    __m128i s_128[2], ss, sq_128[4], sqs[2];
+    __m256i sq[8];
+    s_128[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128);
+    s_128[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+    Square(s_128[0], sq_128 + 0);
+    Square(s_128[1], sq_128 + 2);
+    if (size == 3) {
+      ss = Sum3Horizontal16(s_128);
+      Sum3Horizontal32(sq_128, sqs);
+    } else {
+      ss = Sum5Horizontal16(s_128);
+      Sum5Horizontal32(sq_128, sqs);
+    }
+    StoreAligned16(sums, ss);
+    StoreAligned32U32(square_sums, sqs);
+    src += 8;
+    sums += 8;
+    square_sums += 8;
+    sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+    sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m256i s[2], row[2], row_sq[4];
+      s[0] = LoadUnaligned32Msan(
+          src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+      s[1] = LoadUnaligned32Msan(
+          src + 24,
+          overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24));
+      Square(s[0], sq + 2);
+      Square(s[1], sq + 6);
+      sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+      sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+      sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+      sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+      if (size == 3) {
+        row[0] = Sum3Horizontal16(
+            src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+        row[1] =
+            Sum3Horizontal16(src + 16, overread_in_bytes_256 +
+                                           sizeof(*src) * (sum_width - x + 24));
+        Sum3Horizontal32(sq + 0, row_sq + 0);
+        Sum3Horizontal32(sq + 4, row_sq + 2);
+      } else {
+        row[0] = Sum5Horizontal16(
+            src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+        row[1] =
+            Sum5Horizontal16(src + 16, overread_in_bytes_256 +
+                                           sizeof(*src) * (sum_width - x + 24));
+        Sum5Horizontal32(sq + 0, row_sq + 0);
+        Sum5Horizontal32(sq + 4, row_sq + 2);
+      }
+      StoreAligned64(sums, row);
+      StoreAligned64(square_sums + 0, row_sq + 0);
+      StoreAligned64(square_sums + 16, row_sq + 2);
+      sq[0] = sq[6];
+      sq[1] = sq[7];
+      src += 32;
+      sums += 32;
+      square_sums += 32;
+      x -= 32;
+    } while (x != 0);
+    src += src_stride - sum_width - 8;
+    sums += sum_stride - sum_width - 8;
+    square_sums += sum_stride - sum_width - 8;
+  } while (--y != 0);
+}
+
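+// The multiplication by n below is done with shifts and additions:
+// x * 9 == x + (x << 3) and x * 25 == x + (x << 3) + (x << 4).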
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
+  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+  const __m128i sub = _mm_sub_epi32(axn, dxd);
+  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i b = VrshrU16(sum, 2);
+  const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+  const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+  const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+  const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+  return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m256i dxd = _mm256_madd_epi16(sum, sum);
+  // _mm256_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
+  __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+  const __m256i sub = _mm256_sub_epi32(axn, dxd);
+  const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+  const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m256i b = VrshrU16(sum, 2);
+  const __m256i sum_lo = _mm256_unpacklo_epi16(b, _mm256_setzero_si256());
+  const __m256i sum_hi = _mm256_unpackhi_epi16(b, _mm256_setzero_si256());
+  const __m256i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+  const __m256i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+  return _mm256_packus_epi32(z0, z1);
+}
+
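+// Note: 164 does not fit in the signed 8-bit operand of _mm_maddubs_epi16(),
+// which is presumably why the quarter value 41 is multiplied in and the
+// rounding shift is reduced by 2 to compensate.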
+inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+  const __m128i m0 = VmullLo16(m, sum);
+  const __m128i m1 = VmullHi16(m, sum);
+  b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB5(const __m256i sum, const __m256i ma, __m256i b[2]) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const __m256i m =
+      _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter));
+  const __m256i m0 = VmullLo16(m, sum);
+  const __m256i m1 = VmullHi16(m, sum);
+  b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m128i m0 = VmullLo16(ma, sum);
+  const __m128i m1 = VmullHi16(ma, sum);
+  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+  b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+  b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateB3(const __m256i sum, const __m256i ma, __m256i b[2]) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m256i m0 = VmullLo16(ma, sum);
+  const __m256i m1 = VmullHi16(ma, sum);
+  const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+  const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+  b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+  b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+                                  const uint32_t scale, __m256i* const sum,
+                                  __m256i* const index) {
+  __m256i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+                                  const uint32_t scale, __m256i* const sum,
+                                  __m256i* const index) {
+  __m256i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+                               __m128i* const ma, __m128i b[2]) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i idx = _mm_packus_epi16(index, index);
+  // The values are not actually stored and reloaded; the compiler keeps
+  // |temp| in a 64-bit general-purpose register, which is faster than using
+  // _mm_extract_epi8().
+  uint8_t temp[8];
+  StoreLo8(temp, idx);
+  *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  if (n == 9) {
+    CalculateB3(sum, maq, b);
+  } else {
+    CalculateB5(sum, maq, b);
+  }
+}
+
+// Repeat the first 48 elements in kSgrMaLookup with a period of 16.
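+// _mm256_shuffle_epi8() shuffles within each 128-bit lane independently;
+// repeating each 16-entry block lets both lanes index the same values.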
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+    15,  14,  13, 13, 12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,
+    15,  14,  13, 13, 12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,
+    8,   8,   7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5,
+    8,   8,   7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5};
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction, or from the sign bit of the index.
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+  __m128i mask;
+  mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+  mask = _mm_or_si128(mask, index);
+  return _mm_shuffle_epi8(table, mask);
+}
+
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+  __m256i mask;
+  mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+  mask = _mm256_or_si256(mask, index);
+  return _mm256_shuffle_epi8(table, mask);
+}
+
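+// The comparison mask is -1 for lanes whose index exceeds |threshold|, so the
+// add below subtracts 1 from those lanes, stepping the value 5 -> 4 -> 3 -> 2
+// -> 1 -> 0 across the thresholds used in CalculateIntermediate().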
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+                           const int threshold) {
+  const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+  const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+  return _mm_add_epi8(value, offset);
+}
+
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+                           const int threshold) {
+  const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+  const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
+  return _mm256_add_epi8(value, offset);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i* const ma, __m128i b0[2],
+                                  __m128i b1[2]) {
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+  const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+  const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+  const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+  __m128i idx;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+  // Elements whose indices are 48 or larger get 0 from the shuffles below and
+  // are handled by the comparisons afterwards.
+  // Get shuffle results for indices in range [0, 15].
+  *ma = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  *ma = _mm_or_si128(*ma, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res2 = ShuffleIndex(c2, idx);
+  *ma = _mm_or_si128(*ma, res2);
+
+  // For elements whose indices are larger than 47, the table values change
+  // only rarely as the index increases, so we compute them with comparison
+  // and arithmetic operations instead of further lookups.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index with value 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index with value 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index with value 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index with value 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index with value 1.
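+  // The adjustments above yield ma = 5 for indices 48..55, 4 for 56..72, 3
+  // for 73..101, 2 for 102..169, 1 for 170..254 and 0 for index 255, which
+  // matches the rest of kSgrMaLookup.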
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  CalculateB3(sum[0], maq0, b0);
+  const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  CalculateB3(sum[1], maq1, b1);
+}
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+                                  __m256i ma[3], __m256i b0[2], __m256i b1[2]) {
+  static_assert(n == 9 || n == 25, "");
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+  const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+  const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+  const __m256i indices = _mm256_packus_epi16(index[0], index[1]);  // 0 2 1 3
+  __m256i idx, mas;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+  // Elements whose indices are larger than 47 will be 0 after the lookups.
+  // Get shuffle results for indices in range [0, 15].
+  mas = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+  const __m256i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  mas = _mm256_or_si256(mas, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+  const __m256i res2 = ShuffleIndex(c2, idx);
+  mas = _mm256_or_si256(mas, res2);
+
+  // For elements whose indices are larger than 47, the table values change
+  // only rarely as the index increases, so we compute them with comparison
+  // and arithmetic operations instead of further lookups.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
+  mas = AdjustValue(mas, idx, 55);   // 55 is the last index with value 5.
+  mas = AdjustValue(mas, idx, 72);   // 72 is the last index with value 4.
+  mas = AdjustValue(mas, idx, 101);  // 101 is the last index with value 3.
+  mas = AdjustValue(mas, idx, 169);  // 169 is the last index with value 2.
+  mas = AdjustValue(mas, idx, 254);  // 254 is the last index with value 1.
+
+  ma[2] = _mm256_permute4x64_epi64(mas, 0x63);     // 32-39 8-15 16-23 24-31
+  ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc);  //  0-7  8-15 16-23 24-31
+  ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21);
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
+  const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
+  __m256i sums[2];
+  sums[0] = _mm256_permute2x128_si256(sum[0], sum[1], 0x20);
+  sums[1] = _mm256_permute2x128_si256(sum[0], sum[1], 0x31);
+  if (n == 9) {
+    CalculateB3(sums[0], maq0, b0);
+    CalculateB3(sums[1], maq1, b1);
+  } else {
+    CalculateB5(sums[0], maq0, b0);
+    CalculateB5(sums[1], maq1, b1);
+  }
+}
+
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i b[2]) {
+  __m128i sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i b[2]) {
+  __m128i sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9>(sum, index, ma, b);
+}
+
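+// Store343_444 computes the horizontally weighted sums used by pass 2: the
+// "444" outputs weight three adjacent values 4:4:4 (4 * the 3-tap sum), and
+// the "343" outputs weight them 3:4:3 (the 444 sum minus the 3-tap sum plus
+// the center value). Adding a 343 row, a 444 row and a 343 row later forms
+// the 3x3 kernel {{3,4,3},{4,4,4},{3,4,3}} of the radius-1 pass.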
+inline void Store343_444(const __m256i b3[3], const ptrdiff_t x,
+                         __m256i sum_b343[2], __m256i sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  __m256i b[3], sum_b111[2];
+  Prepare3_32(b3 + 0, b);
+  sum_b111[0] = Sum3_32(b);
+  sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2);
+  sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]);
+  sum_b343[0] = _mm256_add_epi32(sum_b343[0], b[1]);
+  Prepare3_32(b3 + 1, b);
+  sum_b111[1] = Sum3_32(b);
+  sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2);
+  sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]);
+  sum_b343[1] = _mm256_add_epi32(sum_b343[1], b[1]);
+  StoreAligned64(b444 + x, sum_b444);
+  StoreAligned64(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i* const sum_ma444, __m256i sum_b343[2],
+                           __m256i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m256i sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+  StoreAligned32_ma(ma444 + x, *sum_ma444);
+  const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  StoreAligned32_ma(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i* const sum_ma444, __m256i sum_b343[2],
+                           __m256i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m256i sum_ma111 = Sum3WHi16(ma3);
+  *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+  StoreAligned32_ma(ma444 + x, *sum_ma444);
+  const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+  StoreAligned32_ma(ma343 + x, *sum_ma343);
+  Store343_444(b3, x + kMaStoreOffset, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma444, sum_b444[2];
+  Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma444, sum_b444[2];
+  Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma343, sum_b343[2];
+  Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma343, sum_b343[2];
+  Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+// Don't combine the following 2 functions; combining them would be slower.
+inline void Store343_444(const __m256i ma3[3], const __m256i b3[6],
+                         const ptrdiff_t x, __m256i* const sum_ma343_lo,
+                         __m256i* const sum_ma343_hi,
+                         __m256i* const sum_ma444_lo,
+                         __m256i* const sum_ma444_hi, __m256i sum_b343_lo[2],
+                         __m256i sum_b343_hi[2], __m256i sum_b444_lo[2],
+                         __m256i sum_b444_hi[2], uint16_t* const ma343,
+                         uint16_t* const ma444, uint32_t* const b343,
+                         uint32_t* const b444) {
+  __m256i sum_mat343[2], sum_mat444[2];
+  const __m256i sum_ma111_lo = Sum3WLo16(ma3);
+  sum_mat444[0] = _mm256_slli_epi16(sum_ma111_lo, 2);
+  const __m256i sum333_lo = _mm256_sub_epi16(sum_mat444[0], sum_ma111_lo);
+  sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]);
+  Store343_444(b3, x, sum_b343_lo, sum_b444_lo, b343, b444);
+  const __m256i sum_ma111_hi = Sum3WHi16(ma3);
+  sum_mat444[1] = _mm256_slli_epi16(sum_ma111_hi, 2);
+  *sum_ma444_lo = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x20);
+  *sum_ma444_hi = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x31);
+  StoreAligned32(ma444 + x + 0, *sum_ma444_lo);
+  StoreAligned32(ma444 + x + 16, *sum_ma444_hi);
+  const __m256i sum333_hi = _mm256_sub_epi16(sum_mat444[1], sum_ma111_hi);
+  sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]);
+  *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20);
+  *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31);
+  StoreAligned32(ma343 + x + 0, *sum_ma343_lo);
+  StoreAligned32(ma343 + x + 16, *sum_ma343_hi);
+  Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444_hi, b343, b444);
+}
+
+inline void Store343_444(const __m256i ma3[3], const __m256i b3[6],
+                         const ptrdiff_t x, __m256i* const sum_ma343_lo,
+                         __m256i* const sum_ma343_hi, __m256i sum_b343_lo[2],
+                         __m256i sum_b343_hi[2], uint16_t* const ma343,
+                         uint16_t* const ma444, uint32_t* const b343,
+                         uint32_t* const b444) {
+  __m256i sum_ma444[2], sum_b444[2], sum_mat343[2];
+  const __m256i sum_ma111_lo = Sum3WLo16(ma3);
+  sum_ma444[0] = _mm256_slli_epi16(sum_ma111_lo, 2);
+  const __m256i sum333_lo = _mm256_sub_epi16(sum_ma444[0], sum_ma111_lo);
+  sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]);
+  Store343_444(b3, x, sum_b343_lo, sum_b444, b343, b444);
+  const __m256i sum_ma111_hi = Sum3WHi16(ma3);
+  sum_ma444[1] = _mm256_slli_epi16(sum_ma111_hi, 2);
+  StoreAligned64_ma(ma444 + x, sum_ma444);
+  const __m256i sum333_hi = _mm256_sub_epi16(sum_ma444[1], sum_ma111_hi);
+  sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]);
+  *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20);
+  *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31);
+  StoreAligned32(ma343 + x + 0, *sum_ma343_lo);
+  StoreAligned32(ma343 + x + 16, *sum_ma343_hi);
+  Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444, b343, b444);
+}
+
+inline void PermuteB(const __m256i t[4], __m256i b[7]) {
+  // Input:
+  //                             0     1      2     3  // b[0]
+  //                             4     5      6     7  // b[1]
+  //  8     9     10    11      24    25     26    27  // t[0]
+  // 12    13     14    15      28    29     30    31  // t[1]
+  // 16    17     18    19      32    33     34    35  // t[2]
+  // 20    21     22    23      36    37     38    39  // t[3]
+
+  // Output:
+  //  0     1      2     3       8     9     10    11  // b[0]
+  //  4     5      6     7      12    13     14    15  // b[1]
+  //  8     9     10    11      16    17     18    19  // b[2]
+  // 16    17     18    19      24    25     26    27  // b[3]
+  // 20    21     22    23      28    29     30    31  // b[4]
+  // 24    25     26    27      32    33     34    35  // b[5]
+  // 20    21     22    23      36    37     38    39  // b[6]
+  b[0] = _mm256_permute2x128_si256(b[0], t[0], 0x21);
+  b[1] = _mm256_permute2x128_si256(b[1], t[1], 0x21);
+  b[2] = _mm256_permute2x128_si256(t[0], t[2], 0x20);
+  b[3] = _mm256_permute2x128_si256(t[2], t[0], 0x30);
+  b[4] = _mm256_permute2x128_si256(t[3], t[1], 0x30);
+  b[5] = _mm256_permute2x128_si256(t[0], t[2], 0x31);
+  b[6] = t[3];
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
+    __m128i b[2]) {
+  __m128i s5[2][5], sq5[5][2];
+  Square(s[0][1], sq[0] + 2);
+  Square(s[1][1], sq[1] + 2);
+  s5[0][3] = Sum5Horizontal16(s[0]);
+  StoreAligned16(sum5[3], s5[0][3]);
+  s5[0][4] = Sum5Horizontal16(s[1]);
+  StoreAligned16(sum5[4], s5[0][4]);
+  Sum5Horizontal32(sq[0], sq5[3]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  Sum5Horizontal32(sq[1], sq5[4]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x3U16(sum5, 0, s5[0]);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const uint16_t* const src0, const uint16_t* const src1,
+    const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width,
+    const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m256i sq[2][8], __m256i ma[3],
+    __m256i b[3]) {
+  __m256i s[2], s5[2][5], sq5[5][2], sum[2], index[2], t[4];
+  s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16);
+  s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16);
+  Square(s[0], sq[0] + 2);
+  Square(s[1], sq[1] + 2);
+  sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+  sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21);
+  sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+  sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21);
+  s5[0][3] = Sum5Horizontal16(src0 + 0, over_read_in_bytes + 0);
+  s5[1][3] = Sum5Horizontal16(src0 + 16, over_read_in_bytes + 32);
+  s5[0][4] = Sum5Horizontal16(src1 + 0, over_read_in_bytes + 0);
+  s5[1][4] = Sum5Horizontal16(src1 + 16, over_read_in_bytes + 32);
+  StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+  StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+  Sum5Horizontal32(sq[0], sq5[3]);
+  StoreAligned64(square_sum5[3] + x, sq5[3]);
+  Sum5Horizontal32(sq[1], sq5[4]);
+  StoreAligned64(square_sum5[4] + x, sq5[4]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+  s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48);
+  s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48);
+  Square(s[0], sq[0] + 6);
+  Square(s[1], sq[1] + 6);
+  sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21);
+  sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21);
+  sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21);
+  sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21);
+  Sum5Horizontal32(sq[0] + 4, sq5[3]);
+  StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+  Sum5Horizontal32(sq[1] + 4, sq5[4]);
+  StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+  CalculateIntermediate<25>(sum, index, ma, t, t + 2);
+  PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+    const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma,
+    __m128i b[2]) {
+  __m128i s5[5], sq5[5][2];
+  Square(s[1], sq + 2);
+  s5[3] = s5[4] = Sum5Horizontal16(s);
+  Sum5Horizontal32(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+    const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale,
+    const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+    __m256i sq[3], __m256i ma[3], __m256i b[3]) {
+  const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+  __m256i s5[2][5], sq5[5][2], sum[2], index[2], t[4];
+  Square(s0, sq + 2);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+  s5[0][3] = Sum5Horizontal16(src + 0, over_read_in_bytes + 0);
+  s5[1][3] = Sum5Horizontal16(src + 16, over_read_in_bytes + 32);
+  s5[0][4] = s5[0][3];
+  s5[1][4] = s5[1][3];
+  Sum5Horizontal32(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+  const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+  Square(s1, sq + 6);
+  sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+  sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+  Sum5Horizontal32(sq + 4, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+  CalculateIntermediate<25>(sum, index, ma, t, t + 2);
+  PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+    const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma,
+    __m128i b[2]) {
+  __m128i s3[3], sq3[3][2];
+  Square(s[1], sq + 2);
+  s3[2] = Sum3Horizontal16(s);
+  StoreAligned16(sum3[2], s3[2]);
+  Sum3Horizontal32(sq, sq3[2]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[8],
+    __m256i ma[3], __m256i b[7]) {
+  __m256i s[2], s3[4], sq3[3][2], sum[2], index[2], t[4];
+  s[0] = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+  s[1] = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+  Square(s[0], sq + 2);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+  s3[2] = Sum3Horizontal16(src, over_read_in_bytes);
+  s3[3] = Sum3Horizontal16(src + 16, over_read_in_bytes + 32);
+  StoreAligned64(sum3[2] + x, s3 + 2);
+  Sum3Horizontal32(sq + 0, sq3[2]);
+  StoreAligned64(square_sum3[2] + x, sq3[2]);
+  LoadAligned32x2U16(sum3, x, s3);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+  Square(s[1], sq + 6);
+  sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+  sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+  Sum3Horizontal32(sq + 4, sq3[2]);
+  StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+  CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+  CalculateIntermediate<9>(sum, index, ma, t, t + 2);
+  PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+    const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][3],
+    __m128i b3[2][10], __m128i* const ma5, __m128i b5[2]) {
+  __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+  Square(s[0][1], sq[0] + 2);
+  Square(s[1][1], sq[1] + 2);
+  SumHorizontal16(s[0], &s3[2], &s5[3]);
+  SumHorizontal16(s[1], &s3[3], &s5[4]);
+  StoreAligned16(sum3[2], s3[2]);
+  StoreAligned16(sum3[3], s3[3]);
+  StoreAligned16(sum5[3], s5[3]);
+  StoreAligned16(sum5[4], s5[4]);
+  SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3], sq3[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+  CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
+  ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+  CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+    const uint16_t* const src0, const uint16_t* const src1,
+    const ptrdiff_t over_read_in_bytes, const ptrdiff_t x,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, __m256i sq[2][8], __m256i ma3[2][3],
+    __m256i b3[2][7], __m256i ma5[3], __m256i b5[5]) {
+  __m256i s[2], s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2],
+      index_3[2][2], sum_5[2], index_5[2], t[4];
+  s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16);
+  s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16);
+  Square(s[0], sq[0] + 2);
+  Square(s[1], sq[1] + 2);
+  sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+  sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21);
+  sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+  sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21);
+  SumHorizontal16(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+                  &s5[1][3]);
+  SumHorizontal16(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4],
+                  &s5[1][4]);
+  StoreAligned32(sum3[2] + x + 0, s3[0][2]);
+  StoreAligned32(sum3[2] + x + 16, s3[1][2]);
+  StoreAligned32(sum3[3] + x + 0, s3[0][3]);
+  StoreAligned32(sum3[3] + x + 16, s3[1][3]);
+  StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+  StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+  SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned64(square_sum3[2] + x, sq3[2]);
+  StoreAligned64(square_sum5[3] + x, sq5[3]);
+  StoreAligned64(square_sum3[3] + x, sq3[3]);
+  StoreAligned64(square_sum5[4] + x, sq5[4]);
+  LoadAligned32x2U16(sum3, x, s3[0]);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]);
+  CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0],
+                        &index_3[1][0]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+  s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48);
+  s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48);
+  Square(s[0], sq[0] + 6);
+  Square(s[1], sq[1] + 6);
+  sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21);
+  sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21);
+  sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21);
+  sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21);
+  SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+  StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+  StoreAligned64(square_sum3[3] + x + 16, sq3[3]);
+  StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]);
+  CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1],
+                        &index_3[1][1]);
+  CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], t, t + 2);
+  PermuteB(t, b3[0]);
+  CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], t, t + 2);
+  PermuteB(t, b3[1]);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+  CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2);
+  PermuteB(t, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+    const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3,
+    __m128i* const ma5, __m128i b3[2], __m128i b5[2]) {
+  __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+  Square(s[1], sq + 2);
+  SumHorizontal16(s, &s3[2], &s5[3]);
+  SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  s5[4] = s5[3];
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2],
+    const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+    const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+    __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5],
+    __m256i b5[5]) {
+  const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+  __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2],
+      sum_5[2], index_5[2], t[4];
+  Square(s0, sq + 2);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+  SumHorizontal16(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+                  &s5[1][3]);
+  SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned32x2U16(sum3, x, s3[0]);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+  const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+  Square(s1, sq + 6);
+  sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+  sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+  SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]);
+  CalculateIntermediate<9>(sum_3, index_3, ma3, t, t + 2);
+  PermuteB(t, b3);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+  CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2);
+  PermuteB(t, b5);
+}
+
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+                                    const uint16_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+  __m128i s[2][2], ma0, sq_128[2][4], b0[2];
+  __m256i mas[3], sq[2][8], bs[10];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq_128[0]);
+  Square(s[1][0], sq_128[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0);
+  sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+  sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+  sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+  sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0[0], b0[0]);
+  bs[1] = SetrM128i(b0[1], b0[1]);
+
+  int x = 0;
+  do {
+    __m256i ma5[3], ma[2], b[4];
+    BoxFilterPreProcess5(
+        src0 + x + 8, src1 + x + 8,
+        kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+        x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned64_ma(ma565, ma);
+    Sum565(bs + 0, b + 0);
+    Sum565(bs + 3, b + 2);
+    StoreAligned64(b565, b + 0);
+    StoreAligned64(b565 + 16, b + 2);
+    sq[0][0] = sq[0][6];
+    sq[0][1] = sq[0][7];
+    sq[1][0] = sq[1][6];
+    sq[1][1] = sq[1][7];
+    mas[0] = mas[2];
+    bs[0] = bs[5];
+    bs[1] = bs[6];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint16_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+    uint32_t* b444) {
+  const ptrdiff_t overread_in_bytes_128 =
+      kOverreadInBytesPass2_128 - sizeof(*src) * width;
+  __m128i s[2], ma0, sq_128[4], b0[2];
+  __m256i mas[3], sq[8], bs[7];
+  s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0);
+  s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+  Square(s[0], sq_128);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, b0);
+  sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+  sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0[0], b0[0]);
+  bs[1] = SetrM128i(b0[1], b0[1]);
+
+  int x = 0;
+  do {
+    __m256i ma3[3];
+    BoxFilterPreProcess3(
+        src + x + 8, kOverreadInBytesPass2_256 + sizeof(*src) * (x + 8 - width),
+        x + 8, sum_width, scale, sum3, square_sum3, sq, mas, bs);
+    Prepare3_8(mas, ma3);
+    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
+      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444Hi(ma3, bs + 3, kMaStoreOffset, ma343, ma444, b343, b444);
+      ma444 += 32;
+      b444 += 32;
+    } else {
+      __m256i ma[2], b[4];
+      ma[0] = Sum343Lo(ma3);
+      ma[1] = Sum343Hi(ma3);
+      StoreAligned64_ma(ma343, ma);
+      Sum343(bs + 0, b + 0);
+      Sum343(bs + 3, b + 2);
+      StoreAligned64(b343 + 0, b + 0);
+      StoreAligned64(b343 + 16, b + 2);
+    }
+    sq[0] = sq[6];
+    sq[1] = sq[7];
+    mas[0] = mas[2];
+    bs[0] = bs[5];
+    bs[1] = bs[6];
+    ma343 += 32;
+    b343 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint16_t* const src0, const uint16_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+    uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+    uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+  __m128i s[2][4], ma3_128[2][3], ma5_128[3], sq_128[2][8], b3_128[2][10],
+      b5_128[10];
+  __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq_128[0]);
+  Square(s[1][0], sq_128[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+                        ma3_128, b3_128, &ma5_128[0], b5_128);
+  sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+  sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+  sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+  sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+  ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]);
+  ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]);
+  ma5[0] = SetrM128i(ma5_128[0], ma5_128[0]);
+  b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]);
+  b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]);
+  b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]);
+  b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]);
+  b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+  b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+  int x = 0;
+  do {
+    __m256i ma[2], b[4], ma3x[3], ma5x[3];
+    BoxFilterPreProcess(
+        src0 + x + 8, src1 + x + 8,
+        kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+        scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3,
+        ma5, b5);
+    Prepare3_8(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned64_ma(ma343[0] + x, ma);
+    Sum343(b3[0], b);
+    Sum343(b3[0] + 3, b + 2);
+    StoreAligned64(b343[0] + x, b);
+    StoreAligned64(b343[0] + x + 16, b + 2);
+    Prepare3_8(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 3, x + kMaStoreOffset, ma343[1], ma444,
+                   b343[1], b444);
+    Prepare3_8(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned64_ma(ma565, ma);
+    Sum565(b5, b);
+    StoreAligned64(b565, b);
+    Sum565(b5 + 3, b);
+    StoreAligned64(b565 + 16, b);
+    sq[0][0] = sq[0][6];
+    sq[0][1] = sq[0][7];
+    sq[1][0] = sq[1][6];
+    sq[1][1] = sq[1][7];
+    ma3[0][0] = ma3[0][2];
+    ma3[1][0] = ma3[1][2];
+    ma5[0] = ma5[2];
+    b3[0][0] = b3[0][5];
+    b3[0][1] = b3[0][6];
+    b3[1][0] = b3[1][5];
+    b3[1][1] = b3[1][6];
+    b5[0] = b5[5];
+    b5[1] = b5[6];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+                                       const __m256i b[2]) {
+  const __m256i ma_x_src_lo = VmullLo16(ma, src);
+  const __m256i ma_x_src_hi = VmullHi16(ma, src);
+  const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return _mm256_packs_epi32(dst_lo, dst_hi);  // 13 bits
+}
+
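+// Pass 1 (radius 2) produces two output rows per iteration: the first adds
+// the previous and current 565-filtered intermediates (shift 5), while the
+// second uses the current 565 intermediate alone via
+// CalculateFilteredOutput<4> (shift 4).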
+inline __m256i CalculateFilteredOutputPass1(const __m256i src,
+                                            const __m256i ma[2],
+                                            const __m256i b[2][2]) {
+  const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+  __m256i b_sum[2];
+  b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+  b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src,
+                                            const __m256i ma[3],
+                                            const __m256i b[3][2]) {
+  const __m256i ma_sum = Sum3_16(ma);
+  __m256i b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+  const __m256i v_lo =
+      VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m256i v_hi =
+      VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+  return _mm256_add_epi16(src, vv);
+}
+
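+// Apply both pass weights at once: each 32-bit lane of |w0_w2| holds w0 in
+// its low 16 bits and w2 in its high 16 bits, and the unpack interleaves the
+// two filter outputs, so _mm256_madd_epi16 yields w0 * filter[0] +
+// w2 * filter[1] per pixel in a single instruction.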
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+                                          const __m256i filter[2], const int w0,
+                                          const int w2) {
+  __m256i v[2];
+  const __m256i w0_w2 =
+      _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+  const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+  const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+  v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+  v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+  return SelfGuidedFinal(src, v);
+}
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+                                          const __m256i filter, const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  __m256i v[2];
+  v[0] = VmullNLo8(filter, w0);
+  v[1] = VmullNHi8(filter, w0);
+  return SelfGuidedFinal(src, v);
+}
+
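+// Clamp the filtered result to the 10-bit pixel range [0, 1023] and store.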
+inline void ClipAndStore(uint16_t* const dst, const __m256i val) {
+  const __m256i val0 = _mm256_max_epi16(val, _mm256_setzero_si256());
+  const __m256i val1 = _mm256_min_epi16(val0, _mm256_set1_epi16(1023));
+  StoreUnaligned32(dst, val1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint16_t* const src, const uint16_t* const src0,
+    const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+    uint32_t* const b565[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+  __m128i s[2][2], ma0, sq_128[2][4], b0[2];
+  __m256i mas[3], sq[2][8], bs[7];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq_128[0]);
+  Square(s[1][0], sq_128[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0);
+  sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+  sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+  sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+  sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0[0], b0[0]);
+  bs[1] = SetrM128i(b0[1], b0[1]);
+
+  int x = 0;
+  do {
+    __m256i ma5[3], ma[4], b[4][2];
+    BoxFilterPreProcess5(
+        src0 + x + 8, src1 + x + 8,
+        kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+        x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[2] = Sum565Lo(ma5);
+    ma[3] = Sum565Hi(ma5);
+    ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20);
+    ma[3] = _mm256_permute2x128_si256(ma[2], ma[3], 0x31);
+    StoreAligned32(ma565[1] + x + 0, ma[1]);
+    StoreAligned32(ma565[1] + x + 16, ma[3]);
+    Sum565(bs + 0, b[1]);
+    Sum565(bs + 3, b[3]);
+    StoreAligned64(b565[1] + x, b[1]);
+    StoreAligned64(b565[1] + x + 16, b[3]);
+    const __m256i sr0_lo = LoadUnaligned32(src + x + 0);
+    ma[0] = LoadAligned32(ma565[0] + x);
+    LoadAligned64(b565[0] + x, b[0]);
+    const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+    const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0);
+    ClipAndStore(dst + x + 0, d0);
+    const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+    ma[2] = LoadAligned32(ma565[0] + x + 16);
+    LoadAligned64(b565[0] + x + 16, b[2]);
+    const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma + 2, b + 2);
+    const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0);
+    ClipAndStore(dst + x + 16, d1);
+    const __m256i sr1_lo = LoadUnaligned32(src + stride + x + 0);
+    const __m256i p10 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+    const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p10, w0);
+    ClipAndStore(dst + stride + x + 0, d10);
+    const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16);
+    const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[3], b[3]);
+    const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0);
+    ClipAndStore(dst + stride + x + 16, d11);
+    sq[0][0] = sq[0][6];
+    sq[0][1] = sq[0][7];
+    sq[1][0] = sq[1][6];
+    sq[1][1] = sq[1][7];
+    mas[0] = mas[2];
+    bs[0] = bs[5];
+    bs[1] = bs[6];
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+    uint32_t* b565, uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+  __m128i s[2], ma0[2], sq_128[8], b0[6];
+  __m256i mas[3], sq[8], bs[7];
+  s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq_128);
+  BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq_128, &ma0[0],
+                                b0);
+  sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+  sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+  mas[0] = SetrM128i(ma0[0], ma0[0]);
+  bs[0] = SetrM128i(b0[0], b0[0]);
+  bs[1] = SetrM128i(b0[1], b0[1]);
+
+  int x = 0;
+  do {
+    __m256i ma5[3], ma[4], b[4][2];
+    BoxFilterPreProcess5LastRow(
+        src0 + x + 8,
+        kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+        x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[2] = Sum565Lo(ma5);
+    ma[3] = Sum565Hi(ma5);
+    Sum565(bs + 0, b[1]);
+    Sum565(bs + 3, b[3]);
+    const __m256i sr0_lo = LoadUnaligned32(src + x + 0);
+    ma[0] = LoadAligned32(ma565 + x);
+    ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20);
+    LoadAligned64(b565 + x, b[0]);
+    const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+    const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0);
+    ClipAndStore(dst + x + 0, d0);
+    const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+    ma[0] = LoadAligned32(ma565 + x + 16);
+    ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x31);
+    LoadAligned64(b565 + x + 16, b[2]);
+    const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma, b + 2);
+    const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0);
+    ClipAndStore(dst + x + 16, d1);
+    sq[0] = sq[6];
+    sq[1] = sq[7];
+    mas[0] = mas[2];
+    bs[0] = bs[5];
+    bs[1] = bs[6];
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+    uint32_t* const b444[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes_128 =
+      kOverreadInBytesPass2_128 - sizeof(*src0) * width;
+  __m128i s0[2], ma0, sq_128[4], b0[2];
+  __m256i mas[3], sq[8], bs[7];
+  s0[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes_128 + 0);
+  s0[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes_128 + 16);
+  Square(s0[0], sq_128);
+  BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, b0);
+  sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+  sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0[0], b0[0]);
+  bs[1] = SetrM128i(b0[1], b0[1]);
+
+  int x = 0;
+  do {
+    __m256i ma[4], b[4][2], ma3[3];
+    BoxFilterPreProcess3(
+        src0 + x + 8,
+        kOverreadInBytesPass2_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+        sum_width, scale, sum3, square_sum3, sq, mas, bs);
+    Prepare3_8(mas, ma3);
+    Store343_444(ma3, bs, x, &ma[2], &ma[3], b[2], b[3], ma343[2], ma444[1],
+                 b343[2], b444[1]);
+    const __m256i sr_lo = LoadUnaligned32(src + x + 0);
+    const __m256i sr_hi = LoadUnaligned32(src + x + 16);
+    ma[0] = LoadAligned32(ma343[0] + x);
+    ma[1] = LoadAligned32(ma444[0] + x);
+    LoadAligned64(b343[0] + x, b[0]);
+    LoadAligned64(b444[0] + x, b[1]);
+    const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    ma[1] = LoadAligned32(ma343[0] + x + 16);
+    ma[2] = LoadAligned32(ma444[0] + x + 16);
+    LoadAligned64(b343[0] + x + 16, b[1]);
+    LoadAligned64(b444[0] + x + 16, b[2]);
+    const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1);
+    const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 16, d1);
+    sq[0] = sq[6];
+    sq[1] = sq[7];
+    mas[0] = mas[2];
+    bs[0] = bs[5];
+    bs[1] = bs[6];
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+    const uint16_t* const src, const uint16_t* const src0,
+    const uint16_t* const src1, const ptrdiff_t stride, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4],
+    uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+    uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+  __m128i s[2][4], ma3_128[2][3], ma5_0, sq_128[2][8], b3_128[2][10], b5_128[2];
+  __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq_128[0]);
+  Square(s[1][0], sq_128[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+                        ma3_128, b3_128, &ma5_0, b5_128);
+  sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+  sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+  sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+  sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+  ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]);
+  ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]);
+  ma5[0] = SetrM128i(ma5_0, ma5_0);
+  b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]);
+  b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]);
+  b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]);
+  b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]);
+  b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+  b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+  int x = 0;
+  do {
+    __m256i ma[3][4], mat[3][3], b[3][3][2], bt[3][3][2], p[2][2], ma3x[2][3],
+        ma5x[3];
+    BoxFilterPreProcess(
+        src0 + x + 8, src1 + x + 8,
+        kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+        scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3,
+        ma5, b5);
+    Prepare3_8(ma3[0], ma3x[0]);
+    Prepare3_8(ma3[1], ma3x[1]);
+    Prepare3_8(ma5, ma5x);
+    Store343_444(ma3x[0], b3[0], x, &ma[1][2], &mat[1][2], &ma[2][1],
+                 &mat[2][1], b[1][2], bt[1][2], b[2][1], bt[2][1], ma343[2],
+                 ma444[1], b343[2], b444[1]);
+    Store343_444(ma3x[1], b3[1], x, &ma[2][2], &mat[2][2], b[2][2], bt[2][2],
+                 ma343[3], ma444[2], b343[3], b444[2]);
+
+    ma[0][2] = Sum565Lo(ma5x);
+    ma[0][3] = Sum565Hi(ma5x);
+    ma[0][1] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x20);
+    ma[0][3] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x31);
+    StoreAligned32(ma565[1] + x + 0, ma[0][1]);
+    StoreAligned32(ma565[1] + x + 16, ma[0][3]);
+    Sum565(b5, b[0][1]);
+    StoreAligned64(b565[1] + x, b[0][1]);
+    const __m256i sr0_lo = LoadUnaligned32(src + x);
+    const __m256i sr1_lo = LoadUnaligned32(src + stride + x);
+    ma[0][0] = LoadAligned32(ma565[0] + x);
+    LoadAligned64(b565[0] + x, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+    ma[1][0] = LoadAligned32(ma343[0] + x);
+    ma[1][1] = LoadAligned32(ma444[0] + x);
+    // Keeping the following 4 redundant lines is faster. There are not enough
+    // registers available, so these values would otherwise have to be spilled
+    // and reloaded, which is even slower.
+    ma[1][2] = LoadAligned32(ma343[2] + x);  // Redundant line 1.
+    LoadAligned64(b343[0] + x, b[1][0]);
+    LoadAligned64(b444[0] + x, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+    ma[2][0] = LoadAligned32(ma343[1] + x);
+    ma[2][1] = LoadAligned32(ma444[1] + x);  // Redundant line 2.
+    LoadAligned64(b343[1] + x, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+    const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+    ClipAndStore(dst + x, d00);
+    const __m256i d10x = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+    ClipAndStore(dst + stride + x, d10x);
+
+    Sum565(b5 + 3, bt[0][1]);
+    StoreAligned64(b565[1] + x + 16, bt[0][1]);
+    const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+    const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16);
+    ma[0][2] = LoadAligned32(ma565[0] + x + 16);
+    LoadAligned64(b565[0] + x + 16, bt[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0] + 2, bt[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][3], bt[0][1]);
+    mat[1][0] = LoadAligned32(ma343[0] + x + 16);
+    mat[1][1] = LoadAligned32(ma444[0] + x + 16);
+    mat[1][2] = LoadAligned32(ma343[2] + x + 16);  // Redundant line 3.
+    LoadAligned64(b343[0] + x + 16, bt[1][0]);
+    LoadAligned64(b444[0] + x + 16, bt[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], bt[1]);
+    mat[2][0] = LoadAligned32(ma343[1] + x + 16);
+    mat[2][1] = LoadAligned32(ma444[1] + x + 16);  // Redundant line 4.
+    LoadAligned64(b343[1] + x + 16, bt[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], bt[2]);
+    const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+    ClipAndStore(dst + x + 16, d01);
+    const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+    ClipAndStore(dst + stride + x + 16, d11);
+
+    sq[0][0] = sq[0][6];
+    sq[0][1] = sq[0][7];
+    sq[1][0] = sq[1][6];
+    sq[1][1] = sq[1][7];
+    ma3[0][0] = ma3[0][2];
+    ma3[1][0] = ma3[1][2];
+    ma5[0] = ma5[2];
+    b3[0][0] = b3[0][5];
+    b3[0][1] = b3[0][6];
+    b3[1][0] = b3[1][5];
+    b3[1][1] = b3[1][6];
+    b5[0] = b5[5];
+    b5[1] = b5[6];
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+    const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+    uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+  __m128i s[2], ma3_0, ma5_0, sq_128[4], b3_128[2], b5_128[2];
+  __m256i ma3[3], ma5[3], sq[8], b3[7], b5[7];
+  s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq_128);
+  BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+                               sq_128, &ma3_0, &ma5_0, b3_128, b5_128);
+  sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+  sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+  ma3[0] = SetrM128i(ma3_0, ma3_0);
+  ma5[0] = SetrM128i(ma5_0, ma5_0);
+  b3[0] = SetrM128i(b3_128[0], b3_128[0]);
+  b3[1] = SetrM128i(b3_128[1], b3_128[1]);
+  b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+  b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+  int x = 0;
+  do {
+    __m256i ma[4], mat[4], b[3][2], bt[3][2], ma3x[3], ma5x[3], p[2];
+    BoxFilterPreProcessLastRow(
+        src0 + x + 8,
+        kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+        x + 8, scales, sum3, sum5, square_sum3, square_sum5, sq, ma3, ma5, b3,
+        b5);
+    Prepare3_8(ma3, ma3x);
+    Prepare3_8(ma5, ma5x);
+    ma[2] = Sum565Lo(ma5x);
+    Sum565(b5, b[1]);
+    mat[1] = Sum565Hi(ma5x);
+    Sum565(b5 + 3, bt[1]);
+    ma[3] = Sum343Lo(ma3x);
+    Sum343(b3, b[2]);
+    mat[2] = Sum343Hi(ma3x);
+    Sum343(b3 + 3, bt[2]);
+
+    const __m256i sr_lo = LoadUnaligned32(src + x);
+    ma[0] = LoadAligned32(ma565 + x);
+    ma[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x20);
+    mat[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x31);
+    LoadAligned64(b565 + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    ma[0] = LoadAligned32(ma343 + x);
+    ma[1] = LoadAligned32(ma444 + x);
+    ma[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x20);
+    LoadAligned64(b343 + x, b[0]);
+    LoadAligned64(b444 + x, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+    const __m256i sr_hi = LoadUnaligned32(src + x + 16);
+    mat[0] = LoadAligned32(ma565 + x + 16);
+    LoadAligned64(b565 + x + 16, bt[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_hi, mat, bt);
+    mat[0] = LoadAligned32(ma343 + x + 16);
+    mat[1] = LoadAligned32(ma444 + x + 16);
+    mat[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x31);
+    LoadAligned64(b343 + x + 16, bt[0]);
+    LoadAligned64(b444 + x + 16, bt[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_hi, mat, bt);
+    const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 16, d1);
+
+    sq[0] = sq[6];
+    sq[1] = sq[7];
+    ma3[0] = ma3[2];
+    ma5[0] = ma5[2];
+    b3[0] = b3[5];
+    b3[1] = b3[6];
+    b5[0] = b5[5];
+    b5[1] = b5[6];
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint16_t* src,
+    const ptrdiff_t stride, const uint16_t* const top_border,
+    const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3 + kSumOffset;
+  square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0],
+         sum5[1], square_sum3[0], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, sum_width, ma343, ma444[0], ma565[0],
+                         b343, b444[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+              ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint16_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+              b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+                     b444[0], b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint16_t* src, const ptrdiff_t stride,
+                                  const uint16_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint16_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride,
+            sum5[1], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+                          ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint16_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   sum_width, scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
+                          b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint16_t* src, const ptrdiff_t stride,
+                                  const uint16_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint16_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3 + kSumOffset;
+  square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, temp_stride,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint16_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 32, up to 31 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output, as it
+// will not be part of the visible frame.
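+// An illustrative case: with |width| == 33 the column loop still runs a full
+// second 32-pixel iteration starting at x == 32, writing pixels 33..63 of the
+// row, i.e. 31 pixels past |width|.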
+void SelfGuidedFilter_AVX2(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const ptrdiff_t stride, const void* const top_border,
+    const ptrdiff_t top_border_stride, const void* const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  auto* const dst = static_cast<uint16_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+                          top_border_stride, bottom - 3, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+                          top_border_stride, bottom - 2, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else {
+    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+                     top_border_stride, bottom - 3, bottom_border_stride, width,
+                     height, sgr_buffer, dst);
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_AVX2(WienerFilter)
+  dsp->loop_restorations[0] = WienerFilter_AVX2;
+#endif
+#if DSP_ENABLED_10BPP_AVX2(SelfGuidedFilter)
+  dsp->loop_restorations[1] = SelfGuidedFilter_AVX2;
+#endif
+}
+
+}  // namespace
+
+void LoopRestorationInit10bpp_AVX2() { Init10bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_AVX2() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/libgav1/src/dsp/x86/loop_restoration_10bit_sse4.cc b/libgav1/src/dsp/x86/loop_restoration_10bit_sse4.cc
new file mode 100644
index 0000000..96380e3
--- /dev/null
+++ b/libgav1/src/dsp/x86/loop_restoration_10bit_sse4.cc
@@ -0,0 +1,2530 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline void WienerHorizontalClip(const __m128i s[2],
+                                 int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit = (offset << 2) - 1;
+  const __m128i offsets = _mm_set1_epi16(-offset);
+  const __m128i limits = _mm_set1_epi16(limit - offset);
+  const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+  const __m128i sum0 = _mm_add_epi32(s[0], round);
+  const __m128i sum1 = _mm_add_epi32(s[1], round);
+  const __m128i rounded_sum0 = _mm_srai_epi32(sum0, kInterRoundBitsHorizontal);
+  const __m128i rounded_sum1 = _mm_srai_epi32(sum1, kInterRoundBitsHorizontal);
+  const __m128i rounded_sum = _mm_packs_epi32(rounded_sum0, rounded_sum1);
+  const __m128i d0 = _mm_max_epi16(rounded_sum, offsets);
+  const __m128i d1 = _mm_min_epi16(d0, limits);
+  StoreAligned16(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m128i coefficients,
+                                 int16_t** const wiener_buffer) {
+  __m128i filter[2];
+  filter[0] = _mm_shuffle_epi32(coefficients, 0x0);
+  filter[1] = _mm_shuffle_epi32(coefficients, 0x55);
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i s[7], madds[4];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      s[3] = LoadUnaligned16(src + x + 3);
+      s[4] = LoadUnaligned16(src + x + 4);
+      s[5] = LoadUnaligned16(src + x + 5);
+      s[6] = LoadUnaligned16(src + x + 6);
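+      // The Wiener filter taps are symmetric, so samples that share a
+      // coefficient (s[0]/s[6], s[1]/s[5], s[2]/s[4]) are summed first and
+      // each _mm_madd_epi16() below then applies two coefficients at once.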
+      const __m128i s06 = _mm_add_epi16(s[0], s[6]);
+      const __m128i s15 = _mm_add_epi16(s[1], s[5]);
+      const __m128i s24 = _mm_add_epi16(s[2], s[4]);
+      const __m128i ss0 = _mm_unpacklo_epi16(s06, s15);
+      const __m128i ss1 = _mm_unpackhi_epi16(s06, s15);
+      const __m128i ss2 = _mm_unpacklo_epi16(s24, s[3]);
+      const __m128i ss3 = _mm_unpackhi_epi16(s24, s[3]);
+      madds[0] = _mm_madd_epi16(ss0, filter[0]);
+      madds[1] = _mm_madd_epi16(ss1, filter[0]);
+      madds[2] = _mm_madd_epi16(ss2, filter[1]);
+      madds[3] = _mm_madd_epi16(ss3, filter[1]);
+      madds[0] = _mm_add_epi32(madds[0], madds[2]);
+      madds[1] = _mm_add_epi32(madds[1], madds[3]);
+      WienerHorizontalClip(madds, *wiener_buffer + x);
+      x += 8;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m128i coefficients,
+                                 int16_t** const wiener_buffer) {
+  const __m128i filter =
+      _mm_shuffle_epi8(coefficients, _mm_set1_epi32(0x05040302));
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i s[5], madds[2];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      s[3] = LoadUnaligned16(src + x + 3);
+      s[4] = LoadUnaligned16(src + x + 4);
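+      // The 5 symmetric taps sum to 1 << kWienerFilterBits, so the center tap
+      // equals 128 - 2 * (coefficients[1] + coefficients[2]). Subtracting
+      // 2 * s[2] from the paired sums and adding s[2] << 7 back below applies
+      // the center tap without an extra multiply.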
+      const __m128i s04 = _mm_add_epi16(s[0], s[4]);
+      const __m128i s13 = _mm_add_epi16(s[1], s[3]);
+      const __m128i s2d = _mm_add_epi16(s[2], s[2]);
+      const __m128i s0m = _mm_sub_epi16(s04, s2d);
+      const __m128i s1m = _mm_sub_epi16(s13, s2d);
+      const __m128i ss0 = _mm_unpacklo_epi16(s0m, s1m);
+      const __m128i ss1 = _mm_unpackhi_epi16(s0m, s1m);
+      madds[0] = _mm_madd_epi16(ss0, filter);
+      madds[1] = _mm_madd_epi16(ss1, filter);
+      const __m128i s2_lo = _mm_unpacklo_epi16(s[2], _mm_setzero_si128());
+      const __m128i s2_hi = _mm_unpackhi_epi16(s[2], _mm_setzero_si128());
+      const __m128i s2x128_lo = _mm_slli_epi32(s2_lo, 7);
+      const __m128i s2x128_hi = _mm_slli_epi32(s2_hi, 7);
+      madds[0] = _mm_add_epi32(madds[0], s2x128_lo);
+      madds[1] = _mm_add_epi32(madds[1], s2x128_hi);
+      WienerHorizontalClip(madds, *wiener_buffer + x);
+      x += 8;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m128i coefficients,
+                                 int16_t** const wiener_buffer) {
+  const auto filter = _mm_shuffle_epi32(coefficients, 0x55);
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i s[3], madds[2];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      const __m128i s02 = _mm_add_epi16(s[0], s[2]);
+      const __m128i ss0 = _mm_unpacklo_epi16(s02, s[1]);
+      const __m128i ss1 = _mm_unpackhi_epi16(s02, s[1]);
+      madds[0] = _mm_madd_epi16(ss0, filter);
+      madds[1] = _mm_madd_epi16(ss1, filter);
+      WienerHorizontalClip(madds, *wiener_buffer + x);
+      x += 8;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap1(const uint16_t* src,
+                                 const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      const __m128i s = LoadUnaligned16(src + x);
+      const __m128i d = _mm_slli_epi16(s, 4);
+      StoreAligned16(*wiener_buffer + x, d);
+      x += 8;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline __m128i WienerVertical7(const __m128i a[4], const __m128i filter[4]) {
+  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+  const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
+  const __m128i madd3 = _mm_madd_epi16(a[3], filter[3]);
+  const __m128i madd01 = _mm_add_epi32(madd0, madd1);
+  const __m128i madd23 = _mm_add_epi32(madd2, madd3);
+  const __m128i sum = _mm_add_epi32(madd01, madd23);
+  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical5(const __m128i a[3], const __m128i filter[3]) {
+  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+  const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
+  const __m128i madd01 = _mm_add_epi32(madd0, madd1);
+  const __m128i sum = _mm_add_epi32(madd01, madd2);
+  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical3(const __m128i a[2], const __m128i filter[2]) {
+  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+  const __m128i sum = _mm_add_epi32(madd0, madd1);
+  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVerticalClip(const __m128i s[2]) {
+  const __m128i d = _mm_packus_epi32(s[0], s[1]);
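+  // Clamp to the 10-bit maximum pixel value (1023).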
+  return _mm_min_epu16(d, _mm_set1_epi16(1023));
+}
+
+inline __m128i WienerVerticalFilter7(const __m128i a[7],
+                                     const __m128i filter[2]) {
+  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m128i b[4], c[2];
+  b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+  b[2] = _mm_unpacklo_epi16(a[4], a[5]);
+  b[3] = _mm_unpacklo_epi16(a[6], round);
+  c[0] = WienerVertical7(b, filter);
+  b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+  b[2] = _mm_unpackhi_epi16(a[4], a[5]);
+  b[3] = _mm_unpackhi_epi16(a[6], round);
+  c[1] = WienerVertical7(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter5(const __m128i a[5],
+                                     const __m128i filter[3]) {
+  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m128i b[3], c[2];
+  b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+  b[2] = _mm_unpacklo_epi16(a[4], round);
+  c[0] = WienerVertical5(b, filter);
+  b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+  b[2] = _mm_unpackhi_epi16(a[4], round);
+  c[1] = WienerVertical5(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter3(const __m128i a[3],
+                                     const __m128i filter[2]) {
+  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m128i b[2], c[2];
+  b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+  b[1] = _mm_unpacklo_epi16(a[2], round);
+  c[0] = WienerVertical3(b, filter);
+  b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+  b[1] = _mm_unpackhi_epi16(a[2], round);
+  c[1] = WienerVertical3(b, filter);
+  return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m128i filter[2], __m128i a[7]) {
+  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+  a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+  a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
+  return WienerVerticalFilter7(a, filter);
+}
+
+inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m128i filter[3], __m128i a[5]) {
+  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+  return WienerVerticalFilter5(a, filter);
+}
+
+inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m128i filter[2], __m128i a[3]) {
+  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+  return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[4], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m128i c = LoadLo8(coefficients);
+  __m128i filter[4];
+  filter[0] = _mm_shuffle_epi32(c, 0x0);
+  filter[1] = _mm_shuffle_epi32(c, 0x55);
+  filter[2] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+  filter[3] =
+      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[8], d[2];
+      d[0] = WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+      a[7] = LoadAligned16(wiener_buffer + x + 7 * width);
+      d[1] = WienerVerticalFilter7(a + 1, filter);
+      StoreAligned16(dst + x, d[0]);
+      StoreAligned16(dst + dst_stride + x, d[1]);
+      x += 8;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[7];
+      const __m128i d =
+          WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+      StoreAligned16(dst + x, d);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[3], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m128i c = LoadLo8(coefficients);
+  __m128i filter[3];
+  filter[0] = _mm_shuffle_epi32(c, 0x0);
+  filter[1] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+  filter[2] =
+      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[6], d[2];
+      d[0] = WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+      a[5] = LoadAligned16(wiener_buffer + x + 5 * width);
+      d[1] = WienerVerticalFilter5(a + 1, filter);
+      StoreAligned16(dst + x, d[0]);
+      StoreAligned16(dst + dst_stride + x, d[1]);
+      x += 8;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[5];
+      const __m128i d =
+          WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+      StoreAligned16(dst + x, d);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[2], uint16_t* dst,
+                               const ptrdiff_t dst_stride) {
+  __m128i filter[2];
+  filter[0] = _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+  filter[1] =
+      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[4], d[2];
+      d[0] = WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+      a[3] = LoadAligned16(wiener_buffer + x + 3 * width);
+      d[1] = WienerVerticalFilter3(a + 1, filter);
+      StoreAligned16(dst + x, d[0]);
+      StoreAligned16(dst + dst_stride + x, d[1]);
+      x += 8;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m128i a[3];
+      const __m128i d =
+          WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+      StoreAligned16(dst + x, d);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+                                     uint16_t* const dst) {
+  const __m128i a = LoadAligned16(wiener_buffer);
+  const __m128i b = _mm_add_epi16(a, _mm_set1_epi16(8));
+  const __m128i c = _mm_srai_epi16(b, 4);
+  const __m128i d = _mm_max_epi16(c, _mm_setzero_si128());
+  const __m128i e = _mm_min_epi16(d, _mm_set1_epi16(1023));
+  StoreAligned16(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               uint16_t* dst, const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+      x += 8;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      x += 8;
+    } while (x < width);
+  }
+}
+
+void WienerFilter_SSE4_1(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const ptrdiff_t stride, const void* const top_border,
+    const ptrdiff_t top_border_stride, const void* const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 16);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // horizontal filtering.
+  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
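+  // Illustrative arithmetic: kWienerFilterTaps is 7, so with
+  // |number_rows_to_skip| == 1 (no leading zero vertical coefficients)
+  // |height_horizontal| is height + 4 and |height_extra| is 2.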
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  const __m128i coefficients_horizontal =
+      LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    // The maximum over-reads happen here.
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
+  }
+
+  // vertical filtering.
+  // Over-writes up to 15 values.
+  const int16_t* const filter_vertical =
+      restoration_info.wiener_info.filter[WienerInfo::kVertical];
+  auto* dst = static_cast<uint16_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and the
+    // bottom row of |source| is a duplicate of its above row, we can duplicate
+    // the top and bottom row of |wiener_buffer| accordingly.
+    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+           sizeof(*wiener_buffer_horizontal) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+                       filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+                       height, filter_vertical + 1, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+                       wiener_stride, height, filter_vertical + 2, dst, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+                       wiener_stride, height, dst, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
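+// Illustrative arithmetic, assuming |width| is a multiple of 8: Pass 1 reads
+// 8 - 2 * 3 = 2 pixels (4 bytes of uint16_t) past the end of a row, and
+// Pass 2 reads 8 - 2 * 2 = 4 pixels (8 bytes), matching the constants below.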
+constexpr int kOverreadInBytesPass1 = 4;
+constexpr int kOverreadInBytesPass2 = 8;
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[2]) {
+  dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+  dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+  dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[3]) {
+  dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+  dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+  dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+  dst[0] = LoadAligned16(src + 0);
+  dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+                                 const ptrdiff_t border, __m128i dst[2]) {
+  dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
+  dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[2][2]) {
+  LoadAligned32U32Msan(src[0], x, border, dst[0]);
+  LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+  LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[3][2]) {
+  LoadAligned32U32Msan(src[0], x, border, dst[0]);
+  LoadAligned32U32Msan(src[1], x, border, dst[1]);
+  LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
+  StoreAligned16(dst + 0, src[0]);
+  StoreAligned16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+  StoreAligned16(dst + 0, src[0]);
+  StoreAligned16(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
+  StoreAligned32U32(dst + 0, src + 0);
+  StoreAligned32U32(dst + 8, src + 2);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate super inefficient code and the whole
+// decoder could be 15% slower.
+
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VmullNLo8(const __m128i src0, const int src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullNHi8(const __m128i src0, const int src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrU16(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1)));
+  return _mm_srli_epi16(sum, src1);
+}
+
+inline __m128i VrshrS32(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+  return _mm_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+  return _mm_srli_epi32(sum, src1);
+}
+
+inline void Square(const __m128i src, __m128i dst[2]) {
+  const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
+  dst[0] = _mm_madd_epi16(s0, s0);
+  dst[1] = _mm_madd_epi16(s1, s1);
+}
+
+template <int offset>
+inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 4);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_16(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+  dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_32(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 12);
+  dst[4] = src[1];
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi16(src0, src1);
+  return _mm_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi32(src0, src1);
+  return _mm_add_epi32(sum, src2);
+}
+
+inline __m128i Sum3_32(const __m128i src[3]) {
+  return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+  const __m128i sum = VaddlLo8(src[0], src[1]);
+  return VaddwLo8(sum, src[2]);
+}
+
+inline __m128i Sum3WHi16(const __m128i src[3]) {
+  const __m128i sum = VaddlHi8(src[0], src[1]);
+  return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+  const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+  const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+  const __m128i sum = _mm_add_epi16(sum01, sum23);
+  return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+                       const __m128i* const src2, const __m128i* const src3,
+                       const __m128i* const src4) {
+  const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+  const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+  const __m128i sum = _mm_add_epi32(sum01, sum23);
+  return _mm_add_epi32(sum, *src4);
+}
+
+inline __m128i Sum5_32(const __m128i src[5]) {
+  return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum3Horizontal16(const __m128i src[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  return Sum3_16(s);
+}
+
+inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum3_32(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum3_32(s);
+}
+
+inline __m128i Sum5Horizontal16(const __m128i src[2]) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  return Sum5_16(s);
+}
+
+inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) {
+  __m128i s[5];
+  Prepare5_32(src + 0, s);
+  dst[0] = Sum5_32(s);
+  Prepare5_32(src + 1, s);
+  dst[1] = Sum5_32(s);
+}
+
+void SumHorizontal16(const __m128i src[2], __m128i* const row3,
+                     __m128i* const row5) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  const __m128i sum04 = _mm_add_epi16(s[0], s[4]);
+  *row3 = Sum3_16(s + 1);
+  *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal16(const __m128i src[3], __m128i* const row3_0,
+                            __m128i* const row3_1, __m128i* const row5_0,
+                            __m128i* const row5_1) {
+  SumHorizontal16(src + 0, row3_0, row5_0);
+  SumHorizontal16(src + 1, row3_1, row5_1);
+}
+
+void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3,
+                     __m128i* const row_sq5) {
+  const __m128i sum04 = _mm_add_epi32(src[0], src[4]);
+  *row_sq3 = Sum3_32(src + 1);
+  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0,
+                            __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+                            __m128i* const row_sq5_1) {
+  __m128i s[5];
+  Prepare5_32(src + 0, s);
+  SumHorizontal32(s, row_sq3_0, row_sq5_0);
+  Prepare5_32(src + 1, s);
+  SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
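+// Sum343* compute the 3-4-3 weighted sum of three neighboring values:
+// 3 * (a + b + c) + b == 3 * a + 4 * b + 3 * c.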
+inline __m128i Sum343Lo(const __m128i ma3[3]) {
+  const __m128i sum = Sum3WLo16(ma3);
+  const __m128i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343Hi(const __m128i ma3[3]) {
+  const __m128i sum = Sum3WHi16(ma3);
+  const __m128i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343(const __m128i src[3]) {
+  const __m128i sum = Sum3_32(src);
+  const __m128i sum3 = Sum3_32(sum, sum, sum);
+  return _mm_add_epi32(sum3, src[1]);
+}
+
+inline void Sum343(const __m128i src[3], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum343(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum343(s);
+}
+
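+// Sum565* compute the 5-6-5 weighted sum of three neighboring values:
+// 5 * (a + b + c) + b == 5 * a + 6 * b + 5 * c.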
+inline __m128i Sum565Lo(const __m128i src[3]) {
+  const __m128i sum = Sum3WLo16(src);
+  const __m128i sum4 = _mm_slli_epi16(sum, 2);
+  const __m128i sum5 = _mm_add_epi16(sum4, sum);
+  return VaddwLo8(sum5, src[1]);
+}
+
+inline __m128i Sum565Hi(const __m128i src[3]) {
+  const __m128i sum = Sum3WHi16(src);
+  const __m128i sum4 = _mm_slli_epi16(sum, 2);
+  const __m128i sum5 = _mm_add_epi16(sum4, sum);
+  return VaddwHi8(sum5, src[1]);
+}
+
+inline __m128i Sum565(const __m128i src[3]) {
+  const __m128i sum = Sum3_32(src);
+  const __m128i sum4 = _mm_slli_epi32(sum, 2);
+  const __m128i sum5 = _mm_add_epi32(sum4, sum);
+  return _mm_add_epi32(sum5, src[1]);
+}
+
+inline void Sum565(const __m128i src[3], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_32(src + 0, s);
+  dst[0] = Sum565(s);
+  Prepare3_32(src + 1, s);
+  dst[1] = Sum565(s);
+}
+
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src) * width;
+  int y = 2;
+  do {
+    __m128i s[3], sq[6];
+    s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
+    Square(s[0], sq);
+    ptrdiff_t x = sum_width;
+    do {
+      __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
+      s[1] = LoadUnaligned16Msan(
+          src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+      x -= 16;
+      src += 16;
+      s[2] = LoadUnaligned16Msan(
+          src, overread_in_bytes + sizeof(*src) * (sum_width - x));
+      Square(s[1], sq + 2);
+      Square(s[2], sq + 4);
+      SumHorizontal16(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+      StoreAligned32U16(sum3, row3);
+      StoreAligned32U16(sum5, row5);
+      SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 0, row_sq3);
+      StoreAligned32U32(square_sum5 + 0, row_sq5);
+      SumHorizontal32(sq + 2, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+                      &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 8, row_sq3);
+      StoreAligned32U32(square_sum5 + 8, row_sq5);
+      s[0] = s[2];
+      sq[0] = sq[4];
+      sq[1] = sq[5];
+      sum3 += 16;
+      sum5 += 16;
+      square_sum3 += 16;
+      square_sum5 += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sum3 += sum_stride - sum_width;
+    sum5 += sum_stride - sum_width;
+    square_sum3 += sum_stride - sum_width;
+    square_sum5 += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  const ptrdiff_t overread_in_bytes =
+      ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) -
+      sizeof(*src) * width;
+  int y = 2;
+  do {
+    __m128i s[3], sq[6];
+    s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
+    Square(s[0], sq);
+    ptrdiff_t x = sum_width;
+    do {
+      __m128i row[2], row_sq[4];
+      s[1] = LoadUnaligned16Msan(
+          src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+      x -= 16;
+      src += 16;
+      s[2] = LoadUnaligned16Msan(
+          src, overread_in_bytes + sizeof(*src) * (sum_width - x));
+      Square(s[1], sq + 2);
+      Square(s[2], sq + 4);
+      if (size == 3) {
+        row[0] = Sum3Horizontal16(s + 0);
+        row[1] = Sum3Horizontal16(s + 1);
+        Sum3Horizontal32(sq + 0, row_sq + 0);
+        Sum3Horizontal32(sq + 2, row_sq + 2);
+      } else {
+        row[0] = Sum5Horizontal16(s + 0);
+        row[1] = Sum5Horizontal16(s + 1);
+        Sum5Horizontal32(sq + 0, row_sq + 0);
+        Sum5Horizontal32(sq + 2, row_sq + 2);
+      }
+      StoreAligned32U16(sums, row);
+      StoreAligned64U32(square_sums, row_sq);
+      s[0] = s[2];
+      sq[0] = sq[4];
+      sq[1] = sq[5];
+      sums += 16;
+      square_sums += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sums += sum_stride - sum_width;
+    square_sums += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
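+  // n == 9:  sum_sq * 9  == sum_sq + (sum_sq << 3)
+  // n == 25: sum_sq * 25 == sum_sq * 9 + (sum_sq << 4)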
+  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+  const __m128i sub = _mm_sub_epi32(axn, dxd);
+  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
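+  // The 10-bit box sums are scaled down (|sum| by 4, |sum_sq| by 16) so the
+  // intermediate values below stay in roughly the same range as the 8 bpp
+  // path.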
+  const __m128i b = VrshrU16(sum, 2);
+  const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+  const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+  const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+  const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+  return _mm_packus_epi32(z0, z1);
+}
+
+inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
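+  // |one_over_n_quarter| fits the signed 8-bit operand of _mm_maddubs_epi16()
+  // (164 would not); shifting by 2 fewer bits below restores the missing
+  // factor of 4.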
+  const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+  const __m128i m0 = VmullLo16(m, sum);
+  const __m128i m1 = VmullHi16(m, sum);
+  b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m128i m0 = VmullLo16(ma, sum);
+  const __m128i m1 = VmullHi16(ma, sum);
+  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+  b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+  b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+                               __m128i* const ma, __m128i b[2]) {
+  static_assert(n == 9 || n == 25, "");
+  static_assert(offset == 0 || offset == 8, "");
+  const __m128i idx = _mm_packus_epi16(index, index);
+  // The data is not actually stored and reloaded; the compiler keeps it in a
+  // 64-bit general-purpose register, which is faster than using
+  // _mm_extract_epi8().
+  uint8_t temp[8];
+  StoreLo8(temp, idx);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  __m128i maq;
+  if (offset == 0) {
+    maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  } else {
+    maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  }
+  if (n == 9) {
+    CalculateB3(sum, maq, b);
+  } else {
+    CalculateB5(sum, maq, b);
+  }
+}
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction or from the sign bit of the index.
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+  __m128i mask;
+  mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+  mask = _mm_or_si128(mask, index);
+  return _mm_shuffle_epi8(table, mask);
+}
+
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+                           const int threshold) {
+  const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+  const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
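+  // The comparison yields 0 or -1 (0xff) per lane, so |value| is decremented
+  // by 1 exactly in the lanes whose |index| exceeds |threshold|.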
+  return _mm_add_epi8(value, offset);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i* const ma, __m128i b0[2],
+                                  __m128i b1[2]) {
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+  const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+  const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+  const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+  __m128i idx;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+  // Elements whose indices are 48 or larger end up as 0 from the shuffles.
+  // Get shuffle results for indices in range [0, 15].
+  *ma = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  *ma = _mm_or_si128(*ma, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res2 = ShuffleIndex(c2, idx);
+  *ma = _mm_or_si128(*ma, res2);
+
+  // For elements whose indices are larger than 47, the lookup values change
+  // only rarely as the index increases, so comparison and arithmetic
+  // operations are used to calculate them.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
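+  // Net result for indices >= 48: 48..55 -> 5, 56..72 -> 4, 73..101 -> 3,
+  // 102..169 -> 2, 170..254 -> 1, 255 -> 0.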
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  CalculateB3(sum[0], maq0, b0);
+  const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  CalculateB3(sum[1], maq1, b1);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i ma[2], __m128i b[4]) {
+  __m128i mas;
+  CalculateIntermediate(sum, index, &mas, b + 0, b + 2);
+  ma[0] = _mm_unpacklo_epi64(ma[0], mas);
+  ma[1] = _mm_srli_si128(mas, 8);
+}
+
+// Note: Replacing the slow LookupIntermediate() with CalculateIntermediate()
+// when calculating 16 intermediate data points has been tried, but the
+// compiler generates even slower code.
+template <int offset>
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i b[2]) {
+  static_assert(offset == 0 || offset == 8, "");
+  __m128i sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i b[2]) {
+  __m128i sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m128i b3[3], const ptrdiff_t x,
+                         __m128i sum_b343[2], __m128i sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  __m128i b[3], sum_b111[2];
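+  // For three adjacent values b0, b1, b2 this computes
+  //   sum_b444 = 4 * (b0 + b1 + b2)
+  //   sum_b343 = 3 * (b0 + b1 + b2) + b1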
+  Prepare3_32(b3 + 0, b);
+  sum_b111[0] = Sum3_32(b);
+  sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
+  sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
+  sum_b343[0] = _mm_add_epi32(sum_b343[0], b[1]);
+  Prepare3_32(b3 + 1, b);
+  sum_b111[1] = Sum3_32(b);
+  sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
+  sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
+  sum_b343[1] = _mm_add_epi32(sum_b343[1], b[1]);
+  StoreAligned32U32(b444 + x, sum_b444);
+  StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[3],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i* const sum_ma444, __m128i sum_b343[2],
+                           __m128i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m128i sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+  StoreAligned16(ma444 + x, *sum_ma444);
+  const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  StoreAligned16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[3],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i* const sum_ma444, __m128i sum_b343[2],
+                           __m128i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m128i sum_ma111 = Sum3WHi16(ma3);
+  *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+  StoreAligned16(ma444 + x, *sum_ma444);
+  const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+  StoreAligned16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma444, sum_b444[2];
+  Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma444, sum_b444[2];
+  Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma343, sum_b343[2];
+  Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma343, sum_b343[2];
+  Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    const __m128i s[2][4], const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i* const ma,
+    __m128i b[2]) {
+  __m128i s5[2][5], sq5[5][2];
+  Square(s[0][1], sq[0] + 2);
+  Square(s[1][1], sq[1] + 2);
+  s5[0][3] = Sum5Horizontal16(s[0]);
+  StoreAligned16(sum5[3], s5[0][3]);
+  s5[0][4] = Sum5Horizontal16(s[1]);
+  StoreAligned16(sum5[4], s5[0][4]);
+  Sum5Horizontal32(sq[0], sq5[3]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  Sum5Horizontal32(sq[1], sq5[4]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x3U16(sum5, 0, s5[0]);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const __m128i s[2][4], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma[2],
+    __m128i b[6]) {
+  __m128i s5[2][5], sq5[5][2];
+  Square(s[0][2], sq[0] + 4);
+  Square(s[1][2], sq[1] + 4);
+  s5[0][3] = Sum5Horizontal16(s[0] + 1);
+  s5[1][3] = Sum5Horizontal16(s[0] + 2);
+  StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+  s5[0][4] = Sum5Horizontal16(s[1] + 1);
+  s5[1][4] = Sum5Horizontal16(s[1] + 2);
+  StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+  Sum5Horizontal32(sq[0] + 2, sq5[3]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  Sum5Horizontal32(sq[1] + 2, sq5[4]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+  Square(s[0][3], sq[0] + 6);
+  Square(s[1][3], sq[1] + 6);
+  Sum5Horizontal32(sq[0] + 4, sq5[3]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  Sum5Horizontal32(sq[1] + 4, sq5[4]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+    const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma,
+    __m128i b[2]) {
+  __m128i s5[5], sq5[5][2];
+  Square(s[1], sq + 2);
+  s5[3] = s5[4] = Sum5Horizontal16(s);
+  Sum5Horizontal32(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+    const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma[2],
+    __m128i b[6]) {
+  __m128i s5[2][5], sq5[5][2];
+  Square(s[2], sq + 4);
+  s5[0][3] = Sum5Horizontal16(s + 1);
+  s5[1][3] = Sum5Horizontal16(s + 2);
+  s5[0][4] = s5[0][3];
+  s5[1][4] = s5[1][3];
+  Sum5Horizontal32(sq + 2, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+  Square(s[3], sq + 6);
+  Sum5Horizontal32(sq + 4, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+    const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma,
+    __m128i b[2]) {
+  __m128i s3[3], sq3[3][2];
+  Square(s[1], sq + 2);
+  s3[2] = Sum3Horizontal16(s);
+  StoreAligned16(sum3[2], s3[2]);
+  Sum3Horizontal32(sq, sq3[2]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const __m128i s[4], const ptrdiff_t x, const ptrdiff_t sum_width,
+    const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], __m128i sq[8], __m128i ma[2],
+    __m128i b[6]) {
+  __m128i s3[4], sq3[3][2], sum[2], index[2];
+  Square(s[2], sq + 4);
+  s3[2] = Sum3Horizontal16(s + 1);
+  s3[3] = Sum3Horizontal16(s + 2);
+  StoreAligned32U16(sum3[2] + x, s3 + 2);
+  Sum3Horizontal32(sq + 2, sq3[2]);
+  StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+  LoadAligned16x2U16(sum3, x, s3);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+  Square(s[3], sq + 6);
+  Sum3Horizontal32(sq + 4, sq3[2]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma, b + 2);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+    const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][2],
+    __m128i b3[2][6], __m128i* const ma5, __m128i b5[2]) {
+  __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+  Square(s[0][1], sq[0] + 2);
+  Square(s[1][1], sq[1] + 2);
+  SumHorizontal16(s[0], &s3[2], &s5[3]);
+  SumHorizontal16(s[1], &s3[3], &s5[4]);
+  StoreAligned16(sum3[2], s3[2]);
+  StoreAligned16(sum3[3], s3[3]);
+  StoreAligned16(sum5[3], s5[3]);
+  StoreAligned16(sum5[4], s5[4]);
+  SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3], sq3[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+  CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
+  ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+    const __m128i s[2][4], const ptrdiff_t x, const uint16_t scales[2],
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, __m128i sq[2][8], __m128i ma3[2][2],
+    __m128i b3[2][6], __m128i ma5[2], __m128i b5[6]) {
+  __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
+  SumHorizontal16(s[0] + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  StoreAligned16(sum3[2] + x + 0, s3[0][2]);
+  StoreAligned16(sum3[2] + x + 8, s3[1][2]);
+  StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+  SumHorizontal16(s[1] + 1, &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+  StoreAligned16(sum3[3] + x + 0, s3[0][3]);
+  StoreAligned16(sum3[3] + x + 8, s3[1][3]);
+  StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+  Square(s[0][2], sq[0] + 4);
+  Square(s[1][2], sq[1] + 4);
+  SumHorizontal32(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  SumHorizontal32(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+  CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+                        &index[1][0]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], b5 + 2);
+
+  Square(s[0][3], sq[0] + 6);
+  Square(s[1][3], sq[1] + 6);
+  SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+  CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+                        &index[1][1]);
+  CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 2);
+  CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 2);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], b5 + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+    const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3,
+    __m128i* const ma5, __m128i b3[2], __m128i b5[2]) {
+  __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+  Square(s[1], sq + 2);
+  SumHorizontal16(s, &s3[2], &s5[3]);
+  SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  s5[4] = s5[3];
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma3[2],
+    __m128i ma5[2], __m128i b3[6], __m128i b5[6]) {
+  __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
+  Square(s[2], sq + 4);
+  SumHorizontal16(s + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  SumHorizontal32(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 2);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+  Square(s[3], sq + 6);
+  SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 4);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma3, b3 + 2);
+}
+
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+                                    const uint16_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  __m128i s[2][4], mas[2], sq[2][8], bs[6];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    __m128i ma5[3], ma[2], b[4];
+    s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+                                  overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+                                  overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+                                  overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+                                  overread_in_bytes + sizeof(*src1) * (x + 24));
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned32U16(ma565, ma);
+    Sum565(bs + 0, b + 0);
+    Sum565(bs + 2, b + 2);
+    StoreAligned64U32(b565, b);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint16_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+    uint32_t* b444) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass2 - sizeof(*src) * width;
+  __m128i s[4], mas[2], sq[8], bs[6];
+  s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes + 0);
+  s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    s[2] = LoadUnaligned16Msan(src + x + 16,
+                               overread_in_bytes + sizeof(*src) * (x + 16));
+    s[3] = LoadUnaligned16Msan(src + x + 24,
+                               overread_in_bytes + sizeof(*src) * (x + 24));
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    __m128i ma3[3];
+    Prepare3_8<0>(mas, ma3);
+    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
+      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444Hi(ma3, bs + 2, 8, ma343, ma444, b343, b444);
+      ma444 += 16;
+      b444 += 16;
+    } else {
+      __m128i ma[2], b[4];
+      ma[0] = Sum343Lo(ma3);
+      ma[1] = Sum343Hi(ma3);
+      StoreAligned32U16(ma343, ma);
+      Sum343(bs + 0, b + 0);
+      Sum343(bs + 2, b + 2);
+      StoreAligned64U32(b343, b);
+    }
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma343 += 16;
+    b343 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint16_t* const src0, const uint16_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+    uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+    uint32_t* b565) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], b5);
+
+  int x = 0;
+  do {
+    __m128i ma[2], b[4], ma3x[3], ma5x[3];
+    s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+                                  overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+                                  overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+                                  overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+                                  overread_in_bytes + sizeof(*src1) * (x + 24));
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+
+    Prepare3_8<0>(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned32U16(ma343[0] + x, ma);
+    Sum343(b3[0] + 0, b + 0);
+    Sum343(b3[0] + 2, b + 2);
+    StoreAligned64U32(b343[0] + x, b);
+    Sum565(b5 + 0, b + 0);
+    Sum565(b5 + 2, b + 2);
+    StoreAligned64U32(b565, b);
+    Prepare3_8<0>(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 2, x + 8, ma343[1], ma444, b343[1], b444);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned32U16(ma565, ma);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][4];
+    b3[0][1] = b3[0][5];
+    b3[1][0] = b3[1][4];
+    b3[1][1] = b3[1][5];
+    b5[0] = b5[4];
+    b5[1] = b5[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <int shift>
+inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const __m128i v = _mm_sub_epi32(b, ma_x_src);
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
+                                       const __m128i b[2]) {
+  const __m128i ma_x_src_lo = VmullLo16(ma, src);
+  const __m128i ma_x_src_hi = VmullHi16(ma, src);
+  const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return _mm_packs_epi32(dst_lo, dst_hi);  // 13 bits
+}
+
+inline __m128i CalculateFilteredOutputPass1(const __m128i src,
+                                            const __m128i ma[2],
+                                            const __m128i b[2][2]) {
+  const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
+  __m128i b_sum[2];
+  b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+  b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
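+  // Two rows of Sum565() output are combined here, hence the <5> shift;
+  // CalculateFilteredOutput<4>() is used where only a single row is available.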
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i CalculateFilteredOutputPass2(const __m128i src,
+                                            const __m128i ma[3],
+                                            const __m128i b[3][2]) {
+  const __m128i ma_sum = Sum3_16(ma);
+  __m128i b_sum[2];
+  Sum3_32(b, b_sum);
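+  // The three rows carry 3-4-3, 4-4-4 and 3-4-3 weights (10 + 12 + 10 = 32),
+  // matching the <5> shift.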
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
+  const __m128i v_lo =
+      VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m128i v_hi =
+      VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
+  return _mm_add_epi16(src, vv);
+}
+
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+                                          const __m128i filter[2], const int w0,
+                                          const int w2) {
+  __m128i v[2];
+  const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+  const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
+  const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
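+  // Each 32-bit lane of |w0_w2| holds w0 (low) and w2 (high), so
+  // _mm_madd_epi16() yields w0 * filter[0] + w2 * filter[1] per lane.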
+  v[0] = _mm_madd_epi16(w0_w2, f_lo);
+  v[1] = _mm_madd_epi16(w0_w2, f_hi);
+  return SelfGuidedFinal(src, v);
+}
+
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+                                          const __m128i filter, const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  __m128i v[2];
+  v[0] = VmullNLo8(filter, w0);
+  v[1] = VmullNHi8(filter, w0);
+  return SelfGuidedFinal(src, v);
+}
+
+inline void ClipAndStore(uint16_t* const dst, const __m128i val) {
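+  // Clamp the filtered result to the 10-bit pixel range [0, 1023].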
+  const __m128i val0 = _mm_max_epi16(val, _mm_setzero_si128());
+  const __m128i val1 = _mm_min_epi16(val0, _mm_set1_epi16(1023));
+  StoreAligned16(dst, val1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint16_t* const src, const uint16_t* const src0,
+    const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+    uint32_t* const b565[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  __m128i s[2][4], mas[2], sq[2][8], bs[6];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    __m128i ma[2], ma5[3], b[2][2], p[2];
+    s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+                                  overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+                                  overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+                                  overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+                                  overread_in_bytes + sizeof(*src1) * (x + 24));
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    StoreAligned16(ma565[1] + x, ma[1]);
+    Sum565(bs, b[1]);
+    StoreAligned32U32(b565[1] + x, b[1]);
+    const __m128i sr0_lo = LoadAligned16(src + x + 0);
+    const __m128i sr1_lo = LoadAligned16(src + stride + x + 0);
+    ma[0] = LoadAligned16(ma565[0] + x);
+    LoadAligned32U32(b565[0] + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+    const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+    const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned16(ma565[1] + x + 8, ma[1]);
+    Sum565(bs + 2, b[1]);
+    StoreAligned32U32(b565[1] + x + 8, b[1]);
+    const __m128i sr0_hi = LoadAligned16(src + x + 8);
+    const __m128i sr1_hi = LoadAligned16(src + stride + x + 8);
+    ma[0] = LoadAligned16(ma565[0] + x + 8);
+    LoadAligned32U32(b565[0] + x + 8, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+    const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+    ClipAndStore(dst + x + 0, d00);
+    ClipAndStore(dst + x + 8, d01);
+    const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+    ClipAndStore(dst + stride + x + 0, d10);
+    ClipAndStore(dst + stride + x + 8, d11);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+    uint32_t* b565, uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  __m128i s[4], mas[2], sq[8], bs[6];
+  s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    __m128i ma[2], ma5[3], b[2][2];
+    s[2] = LoadUnaligned16Msan(src0 + x + 16,
+                               overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[3] = LoadUnaligned16Msan(src0 + x + 24,
+                               overread_in_bytes + sizeof(*src0) * (x + 24));
+    BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+                                sq, mas, bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    Sum565(bs, b[1]);
+    ma[0] = LoadAligned16(ma565);
+    LoadAligned32U32(b565, b[0]);
+    const __m128i sr_lo = LoadAligned16(src + x + 0);
+    __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
+
+    ma[1] = Sum565Hi(ma5);
+    Sum565(bs + 2, b[1]);
+    ma[0] = LoadAligned16(ma565 + 8);
+    LoadAligned32U32(b565 + 8, b[0]);
+    const __m128i sr_hi = LoadAligned16(src + x + 8);
+    p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+    const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 8, d1);
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+    uint32_t* const b444[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass2 - sizeof(*src0) * width;
+  __m128i s[4], mas[2], sq[8], bs[6];
+  s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+  int x = 0;
+  do {
+    s[2] = LoadUnaligned16Msan(src0 + x + 16,
+                               overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[3] = LoadUnaligned16Msan(src0 + x + 24,
+                               overread_in_bytes + sizeof(*src0) * (x + 24));
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    __m128i ma[3], b[3][2], ma3[3];
+    Prepare3_8<0>(mas, ma3);
+    Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+                   b444[1]);
+    const __m128i sr_lo = LoadAligned16(src + x + 0);
+    ma[0] = LoadAligned16(ma343[0] + x);
+    ma[1] = LoadAligned16(ma444[0] + x);
+    LoadAligned32U32(b343[0] + x, b[0]);
+    LoadAligned32U32(b444[0] + x, b[1]);
+    const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+    Store343_444Hi(ma3, bs + 2, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+                   b343[2], b444[1]);
+    const __m128i sr_hi = LoadAligned16(src + x + 8);
+    ma[0] = LoadAligned16(ma343[0] + x + 8);
+    ma[1] = LoadAligned16(ma444[0] + x + 8);
+    LoadAligned32U32(b343[0] + x + 8, b[0]);
+    LoadAligned32U32(b444[0] + x + 8, b[1]);
+    const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+    const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 8, d1);
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    mas[0] = mas[1];
+    bs[0] = bs[4];
+    bs[1] = bs[5];
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+    const uint16_t* const src, const uint16_t* const src0,
+    const uint16_t* const src1, const ptrdiff_t stride, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4],
+    uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+    uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6];
+  s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+  s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+  Square(s[0][0], sq[0]);
+  Square(s[1][0], sq[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], b5);
+
+  int x = 0;
+  do {
+    __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+    s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+                                  overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+                                  overread_in_bytes + sizeof(*src0) * (x + 24));
+    s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+                                  overread_in_bytes + sizeof(*src1) * (x + 16));
+    s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+                                  overread_in_bytes + sizeof(*src1) * (x + 24));
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+    Prepare3_8<0>(ma3[0], ma3x[0]);
+    Prepare3_8<0>(ma3[1], ma3x[1]);
+    Prepare3_8<0>(ma5, ma5x);
+    Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+                   ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+                   b343[3], b444[2]);
+    ma[0][1] = Sum565Lo(ma5x);
+    StoreAligned16(ma565[1] + x, ma[0][1]);
+    Sum565(b5, b[0][1]);
+    StoreAligned32U32(b565[1] + x, b[0][1]);
+    const __m128i sr0_lo = LoadAligned16(src + x);
+    const __m128i sr1_lo = LoadAligned16(src + stride + x);
+    ma[0][0] = LoadAligned16(ma565[0] + x);
+    LoadAligned32U32(b565[0] + x, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+    ma[1][0] = LoadAligned16(ma343[0] + x);
+    ma[1][1] = LoadAligned16(ma444[0] + x);
+    LoadAligned32U32(b343[0] + x, b[1][0]);
+    LoadAligned32U32(b444[0] + x, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+    const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+    ma[2][0] = LoadAligned16(ma343[1] + x);
+    LoadAligned32U32(b343[1] + x, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+    const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+    Store343_444Hi(ma3x[0], b3[0] + 2, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+                   b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Hi(ma3x[1], b3[1] + 2, x + 8, &ma[2][2], b[2][2], ma343[3],
+                   ma444[2], b343[3], b444[2]);
+    ma[0][1] = Sum565Hi(ma5x);
+    StoreAligned16(ma565[1] + x + 8, ma[0][1]);
+    Sum565(b5 + 2, b[0][1]);
+    StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+    const __m128i sr0_hi = LoadAligned16(src + x + 8);
+    const __m128i sr1_hi = LoadAligned16(src + stride + x + 8);
+    ma[0][0] = LoadAligned16(ma565[0] + x + 8);
+    LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+    ma[1][0] = LoadAligned16(ma343[0] + x + 8);
+    ma[1][1] = LoadAligned16(ma444[0] + x + 8);
+    LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+    LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+    const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+    ClipAndStore(dst + x + 0, d00);
+    ClipAndStore(dst + x + 8, d01);
+    ma[2][0] = LoadAligned16(ma343[1] + x + 8);
+    LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+    const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+    ClipAndStore(dst + stride + x + 0, d10);
+    ClipAndStore(dst + stride + x + 8, d11);
+    s[0][0] = s[0][2];
+    s[0][1] = s[0][3];
+    s[1][0] = s[1][2];
+    s[1][1] = s[1][3];
+    sq[0][2] = sq[0][6];
+    sq[0][3] = sq[0][7];
+    sq[1][2] = sq[1][6];
+    sq[1][3] = sq[1][7];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][4];
+    b3[0][1] = b3[0][5];
+    b3[1][0] = b3[1][4];
+    b3[1][1] = b3[1][5];
+    b5[0] = b5[4];
+    b5[1] = b5[5];
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+    const uint16_t* const src, const uint16_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+    const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+    uint16_t* const dst) {
+  const ptrdiff_t overread_in_bytes =
+      kOverreadInBytesPass1 - sizeof(*src0) * width;
+  __m128i s[4], ma3[2], ma5[2], sq[8], b3[6], b5[6], ma[3], b[3][2];
+  s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+  s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+  Square(s[0], sq);
+  BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+                               sq, &ma3[0], &ma5[0], b3, b5);
+
+  int x = 0;
+  do {
+    __m128i ma3x[3], ma5x[3], p[2];
+    s[2] = LoadUnaligned16Msan(src0 + x + 16,
+                               overread_in_bytes + sizeof(*src0) * (x + 16));
+    s[3] = LoadUnaligned16Msan(src0 + x + 24,
+                               overread_in_bytes + sizeof(*src0) * (x + 24));
+    BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+                               square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+    Prepare3_8<0>(ma3, ma3x);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[1] = Sum565Lo(ma5x);
+    Sum565(b5, b[1]);
+    ma[2] = Sum343Lo(ma3x);
+    Sum343(b3, b[2]);
+    const __m128i sr_lo = LoadAligned16(src + x + 0);
+    ma[0] = LoadAligned16(ma565 + x);
+    LoadAligned32U32(b565 + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    ma[0] = LoadAligned16(ma343 + x);
+    ma[1] = LoadAligned16(ma444 + x);
+    LoadAligned32U32(b343 + x, b[0]);
+    LoadAligned32U32(b444 + x, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+    ma[1] = Sum565Hi(ma5x);
+    Sum565(b5 + 2, b[1]);
+    ma[2] = Sum343Hi(ma3x);
+    Sum343(b3 + 2, b[2]);
+    const __m128i sr_hi = LoadAligned16(src + x + 8);
+    ma[0] = LoadAligned16(ma565 + x + 8);
+    LoadAligned32U32(b565 + x + 8, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+    ma[0] = LoadAligned16(ma343 + x + 8);
+    ma[1] = LoadAligned16(ma444 + x + 8);
+    LoadAligned32U32(b343 + x + 8, b[0]);
+    LoadAligned32U32(b444 + x + 8, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+    const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+    ClipAndStore(dst + x + 0, d0);
+    ClipAndStore(dst + x + 8, d1);
+    s[1] = s[3];
+    sq[2] = sq[6];
+    sq[3] = sq[7];
+    ma3[0] = ma3[1];
+    ma5[0] = ma5[1];
+    b3[0] = b3[4];
+    b3[1] = b3[5];
+    b5[0] = b5[4];
+    b5[1] = b5[5];
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint16_t* src,
+    const ptrdiff_t stride, const uint16_t* const top_border,
+    const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+         sum5[1], square_sum3[0], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
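+  // The two assignments above make the first two rows of the 5-row window
+  // share the top border sums.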
+  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, sum_width, ma343, ma444[0], ma565[0],
+                         b343, b444[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+              ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint16_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+              b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+                     b444[0], b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint16_t* src, const ptrdiff_t stride,
+                                  const uint16_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint16_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum5[1], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+                          ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint16_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   sum_width, scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
+                          b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint16_t* src, const ptrdiff_t stride,
+                                  const uint16_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint16_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint16_t* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint16_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 16, up to 15 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it
+// will not be part of the visible frame.
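+// For example (illustrative): with width == 36, rows are covered in 16-pixel
+// steps, so 48 outputs are produced and 12 of them fall past the nominal row
+// end; those extra writes are outside the visible frame and thus harmless.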
+void SelfGuidedFilter_SSE4_1(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const ptrdiff_t stride, const void* const top_border,
+    const ptrdiff_t top_border_stride, const void* const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint16_t*>(source);
+  const auto* const top = static_cast<const uint16_t*>(top_border);
+  const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+  auto* const dst = static_cast<uint16_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+                          top_border_stride, bottom - 3, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+                          top_border_stride, bottom - 2, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else {
+    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+                     top_border_stride, bottom - 3, bottom_border_stride, width,
+                     height, sgr_buffer, dst);
+  }
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(WienerFilter)
+  dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+  static_cast<void>(WienerFilter_SSE4_1);
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(SelfGuidedFilter)
+  dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+  static_cast<void>(SelfGuidedFilter_SSE4_1);
+#endif
+}
+
+}  // namespace
+
+void LoopRestorationInit10bpp_SSE4_1() { Init10bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_SSE4_1() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/libgav1/src/dsp/x86/loop_restoration_avx2.cc b/libgav1/src/dsp/x86/loop_restoration_avx2.cc
new file mode 100644
index 0000000..351a324
--- /dev/null
+++ b/libgav1/src/dsp/x86/loop_restoration_avx2.cc
@@ -0,0 +1,2941 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+inline void WienerHorizontalClip(const __m256i s[2], const __m256i s_3x128,
+                                 int16_t* const wiener_buffer) {
+  constexpr int offset =
+      1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit =
+      (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
+  const __m256i offsets = _mm256_set1_epi16(-offset);
+  const __m256i limits = _mm256_set1_epi16(limit - offset);
+  const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsHorizontal - 1));
+  // The sum range here is [-128 * 255, 90 * 255].
+  const __m256i madd = _mm256_add_epi16(s[0], s[1]);
+  const __m256i sum = _mm256_add_epi16(madd, round);
+  const __m256i rounded_sum0 =
+      _mm256_srai_epi16(sum, kInterRoundBitsHorizontal);
+  // Add back scaled down offset correction.
+  const __m256i rounded_sum1 = _mm256_add_epi16(rounded_sum0, s_3x128);
+  const __m256i d0 = _mm256_max_epi16(rounded_sum1, offsets);
+  const __m256i d1 = _mm256_min_epi16(d0, limits);
+  StoreAligned32(wiener_buffer, d1);
+}
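+// Rough numbers for WienerHorizontalClip() above, assuming the usual libgav1
+// values kWienerFilterBits == 7 and kInterRoundBitsHorizontal == 3: offset ==
+// 1 << 11 == 2048 and limit == (1 << 13) - 1 == 8191, so the stored sums fit
+// in 13 bits (see the note in WienerFilter_AVX2() below). |s_3x128| adds back
+// the 128 * center-pixel term dropped by the -128 filter bias, already scaled
+// down by the same rounding shift.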
+
+// Using _mm256_alignr_epi8() is about 8% faster than loading all the bytes
+// and unpacking, because the compiler generates redundant code for the latter.
+inline void WienerHorizontalTap7Kernel(const __m256i s[2],
+                                       const __m256i filter[4],
+                                       int16_t* const wiener_buffer) {
+  const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+  const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+  const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9);
+  const auto s67 = _mm256_alignr_epi8(s[1], s[0], 13);
+  __m256i madds[4];
+  madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+  madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+  madds[2] = _mm256_maddubs_epi16(s45, filter[2]);
+  madds[3] = _mm256_maddubs_epi16(s67, filter[3]);
+  madds[0] = _mm256_add_epi16(madds[0], madds[2]);
+  madds[1] = _mm256_add_epi16(madds[1], madds[3]);
+  const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s23, 8),
+                                            7 - kInterRoundBitsHorizontal);
+  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m256i s[2],
+                                       const __m256i filter[3],
+                                       int16_t* const wiener_buffer) {
+  const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+  const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+  const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9);
+  __m256i madds[3];
+  madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+  madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+  madds[2] = _mm256_maddubs_epi16(s45, filter[2]);
+  madds[0] = _mm256_add_epi16(madds[0], madds[2]);
+  const __m256i s_3x128 = _mm256_srli_epi16(_mm256_slli_epi16(s23, 8),
+                                            kInterRoundBitsHorizontal + 1);
+  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[2],
+                                       const __m256i filter[2],
+                                       int16_t* const wiener_buffer) {
+  const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+  const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+  __m256i madds[2];
+  madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+  madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+  const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s01, 8),
+                                            7 - kInterRoundBitsHorizontal);
+  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m256i coefficients,
+                                 int16_t** const wiener_buffer) {
+  __m256i filter[4];
+  filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0100));
+  filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
+  filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0102));
+  filter[3] = _mm256_shuffle_epi8(
+      coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8000)));
+  for (int y = height; y != 0; --y) {
+    __m256i s = LoadUnaligned32(src);
+    __m256i ss[4];
+    ss[0] = _mm256_unpacklo_epi8(s, s);
+    ptrdiff_t x = 0;
+    do {
+      ss[1] = _mm256_unpackhi_epi8(s, s);
+      s = LoadUnaligned32(src + x + 32);
+      ss[3] = _mm256_unpacklo_epi8(s, s);
+      ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+      WienerHorizontalTap7Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+      WienerHorizontalTap7Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+      ss[0] = ss[3];
+      x += 32;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m256i coefficients,
+                                 int16_t** const wiener_buffer) {
+  __m256i filter[3];
+  filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0201));
+  filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0203));
+  filter[2] = _mm256_shuffle_epi8(
+      coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8001)));
+  for (int y = height; y != 0; --y) {
+    __m256i s = LoadUnaligned32(src);
+    __m256i ss[4];
+    ss[0] = _mm256_unpacklo_epi8(s, s);
+    ptrdiff_t x = 0;
+    do {
+      ss[1] = _mm256_unpackhi_epi8(s, s);
+      s = LoadUnaligned32(src + x + 32);
+      ss[3] = _mm256_unpacklo_epi8(s, s);
+      ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+      WienerHorizontalTap5Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+      WienerHorizontalTap5Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+      ss[0] = ss[3];
+      x += 32;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 const __m256i coefficients,
+                                 int16_t** const wiener_buffer) {
+  __m256i filter[2];
+  filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
+  filter[1] = _mm256_shuffle_epi8(
+      coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8002)));
+  for (int y = height; y != 0; --y) {
+    __m256i s = LoadUnaligned32(src);
+    __m256i ss[4];
+    ss[0] = _mm256_unpacklo_epi8(s, s);
+    ptrdiff_t x = 0;
+    do {
+      ss[1] = _mm256_unpackhi_epi8(s, s);
+      s = LoadUnaligned32(src + x + 32);
+      ss[3] = _mm256_unpacklo_epi8(s, s);
+      ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+      WienerHorizontalTap3Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+      WienerHorizontalTap3Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+      ss[0] = ss[3];
+      x += 32;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
+
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+                                 const ptrdiff_t width, const int height,
+                                 int16_t** const wiener_buffer) {
+  for (int y = height; y != 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      const __m256i s = LoadUnaligned32(src + x);
+      const __m256i s0 = _mm256_unpacklo_epi8(s, _mm256_setzero_si256());
+      const __m256i s1 = _mm256_unpackhi_epi8(s, _mm256_setzero_si256());
+      __m256i d[2];
+      d[0] = _mm256_slli_epi16(s0, 4);
+      d[1] = _mm256_slli_epi16(s1, 4);
+      StoreAligned64(*wiener_buffer + x, d);
+      x += 32;
+    } while (x < width);
+    src += src_stride;
+    *wiener_buffer += width;
+  }
+}
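+// In the tap-1 case above only the center tap is nonzero, and the taps sum to
+// 1 << kWienerFilterBits, so the filter collapses to a shift (assuming the
+// usual values 7 and 3 for those constants): (src * 128) >>
+// kInterRoundBitsHorizontal == src << 4, and the rounding term would add
+// nothing since src * 128 is already a multiple of
+// 1 << kInterRoundBitsHorizontal.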
+
+inline __m256i WienerVertical7(const __m256i a[2], const __m256i filter[2]) {
+  const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+  const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+  const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+  const __m256i sum0 = _mm256_add_epi32(round, madd0);
+  const __m256i sum1 = _mm256_add_epi32(sum0, madd1);
+  return _mm256_srai_epi32(sum1, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical5(const __m256i a[2], const __m256i filter[2]) {
+  const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+  const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+  const __m256i sum = _mm256_add_epi32(madd0, madd1);
+  return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a, const __m256i filter) {
+  const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+  const __m256i madd = _mm256_madd_epi16(a, filter);
+  const __m256i sum = _mm256_add_epi32(round, madd);
+  return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+                                     const __m256i filter[2]) {
+  __m256i b[2];
+  const __m256i a06 = _mm256_add_epi16(a[0], a[6]);
+  const __m256i a15 = _mm256_add_epi16(a[1], a[5]);
+  const __m256i a24 = _mm256_add_epi16(a[2], a[4]);
+  b[0] = _mm256_unpacklo_epi16(a06, a15);
+  b[1] = _mm256_unpacklo_epi16(a24, a[3]);
+  const __m256i sum0 = WienerVertical7(b, filter);
+  b[0] = _mm256_unpackhi_epi16(a06, a15);
+  b[1] = _mm256_unpackhi_epi16(a24, a[3]);
+  const __m256i sum1 = WienerVertical7(b, filter);
+  return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+                                     const __m256i filter[2]) {
+  const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+  __m256i b[2];
+  const __m256i a04 = _mm256_add_epi16(a[0], a[4]);
+  const __m256i a13 = _mm256_add_epi16(a[1], a[3]);
+  b[0] = _mm256_unpacklo_epi16(a04, a13);
+  b[1] = _mm256_unpacklo_epi16(a[2], round);
+  const __m256i sum0 = WienerVertical5(b, filter);
+  b[0] = _mm256_unpackhi_epi16(a04, a13);
+  b[1] = _mm256_unpackhi_epi16(a[2], round);
+  const __m256i sum1 = WienerVertical5(b, filter);
+  return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3], const __m256i filter) {
+  __m256i b;
+  const __m256i a02 = _mm256_add_epi16(a[0], a[2]);
+  b = _mm256_unpacklo_epi16(a02, a[1]);
+  const __m256i sum0 = WienerVertical3(b, filter);
+  b = _mm256_unpackhi_epi16(a02, a[1]);
+  const __m256i sum1 = WienerVertical3(b, filter);
+  return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter[2], __m256i a[7]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+  a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+  a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+  return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter[2], __m256i a[5]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+  return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+                                        const ptrdiff_t wiener_stride,
+                                        const __m256i filter, __m256i a[3]) {
+  a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+  a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+  a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+  return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter[2], __m256i d[2]) {
+  __m256i a[8];
+  d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
+  d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter[2], __m256i d[2]) {
+  __m256i a[6];
+  d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+  d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+                                      const ptrdiff_t wiener_stride,
+                                      const __m256i filter, __m256i d[2]) {
+  __m256i a[4];
+  d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+  a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+  d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[4], uint8_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+  __m256i filter[2];
+  filter[0] = _mm256_shuffle_epi32(c, 0x0);
+  filter[1] = _mm256_shuffle_epi32(c, 0x55);
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2][2];
+      WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+      WienerVerticalTap7Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+      StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+      StoreUnaligned32(dst + dst_stride + x,
+                       _mm256_packus_epi16(d[0][1], d[1][1]));
+      x += 32;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[7];
+      const __m256i d0 =
+          WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
+      const __m256i d1 =
+          WienerVerticalTap7Kernel(wiener_buffer + x + 16, width, filter, a);
+      StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+      x += 32;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[3], uint8_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m256i c = _mm256_broadcastd_epi32(Load4(coefficients));
+  __m256i filter[2];
+  filter[0] = _mm256_shuffle_epi32(c, 0);
+  filter[1] =
+      _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
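+  // Each 32-bit lane of filter[1] holds {coefficients[2], 1} (low word, high
+  // word), so the _mm256_madd_epi16() in WienerVerticalFilter5() applied to
+  // b[1] = {a[2], round} evaluates a[2] * coefficients[2] + round directly.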
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2][2];
+      WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+      WienerVerticalTap5Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+      StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+      StoreUnaligned32(dst + dst_stride + x,
+                       _mm256_packus_epi16(d[0][1], d[1][1]));
+      x += 32;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[5];
+      const __m256i d0 =
+          WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
+      const __m256i d1 =
+          WienerVerticalTap5Kernel(wiener_buffer + x + 16, width, filter, a);
+      StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+      x += 32;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               const int16_t coefficients[2], uint8_t* dst,
+                               const ptrdiff_t dst_stride) {
+  const __m256i filter =
+      _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i d[2][2];
+      WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+      WienerVerticalTap3Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+      StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+      StoreUnaligned32(dst + dst_stride + x,
+                       _mm256_packus_epi16(d[0][1], d[1][1]));
+      x += 32;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      __m256i a[3];
+      const __m256i d0 =
+          WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
+      const __m256i d1 =
+          WienerVerticalTap3Kernel(wiener_buffer + x + 16, width, filter, a);
+      StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+      x += 32;
+    } while (x < width);
+  }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+                                     uint8_t* const dst) {
+  const __m256i a0 = LoadAligned32(wiener_buffer + 0);
+  const __m256i a1 = LoadAligned32(wiener_buffer + 16);
+  const __m256i b0 = _mm256_add_epi16(a0, _mm256_set1_epi16(8));
+  const __m256i b1 = _mm256_add_epi16(a1, _mm256_set1_epi16(8));
+  const __m256i c0 = _mm256_srai_epi16(b0, 4);
+  const __m256i c1 = _mm256_srai_epi16(b1, 4);
+  const __m256i d = _mm256_packus_epi16(c0, c1);
+  StoreUnaligned32(dst, d);
+}
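+// A rough check of the rounding above, assuming kInterRoundBitsVertical ==
+// 11: with a lone center tap of 128, (b * 128 + (1 << 10)) >> 11 ==
+// (b + 8) >> 4 for any buffer value b, i.e. exactly the add-8-then-shift-4
+// done in the kernel.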
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+                               const ptrdiff_t width, const int height,
+                               uint8_t* dst, const ptrdiff_t dst_stride) {
+  for (int y = height >> 1; y > 0; --y) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+      x += 32;
+    } while (x < width);
+    dst += 2 * dst_stride;
+    wiener_buffer += 2 * width;
+  }
+
+  if ((height & 1) != 0) {
+    ptrdiff_t x = 0;
+    do {
+      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+      x += 32;
+    } while (x < width);
+  }
+}
+
+void WienerFilter_AVX2(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const ptrdiff_t stride, const void* const top_border,
+    const ptrdiff_t top_border_stride, const void* const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
+  const int16_t* const number_leading_zero_coefficients =
+      restoration_info.wiener_info.number_leading_zero_coefficients;
+  const int number_rows_to_skip = std::max(
+      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+      1);
+  const ptrdiff_t wiener_stride = Align(width, 32);
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+  // The values are saturated to 13 bits before storing.
+  int16_t* wiener_buffer_horizontal =
+      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // horizontal filtering.
+  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+  const int height_horizontal =
+      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* const top = static_cast<const uint8_t*>(top_border);
+  const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+  const __m128i c =
+      LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+  // In order to keep the horizontal pass intermediate values within 16 bits we
+  // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
+  __m128i c_horizontal =
+      _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
+  c_horizontal = _mm_packs_epi16(c_horizontal, c_horizontal);
+  const __m256i coefficients_horizontal = _mm256_broadcastd_epi32(c_horizontal);
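+  // The -128 bias on |filter[3]| lets the center tap fit the int8_t lanes fed
+  // to _mm256_maddubs_epi16(). The dropped 128 * center-pixel contribution is
+  // center << 7, which the horizontal kernels add back after the rounding
+  // shift as center << (7 - kInterRoundBitsHorizontal), i.e. the |s_3x128|
+  // term in WienerHorizontalClip().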
+  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+    // The maximum over-reads happen here.
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         coefficients_horizontal, &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
+  }
+
+  // vertical filtering.
+  // Over-writes up to 15 values.
+  const int16_t* const filter_vertical =
+      restoration_info.wiener_info.filter[WienerInfo::kVertical];
+  auto* dst = static_cast<uint8_t*>(dest);
+  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+           sizeof(*wiener_buffer_horizontal) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+                       filter_vertical, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+                       height, filter_vertical + 1, dst, stride);
+  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+                       wiener_stride, height, filter_vertical + 2, dst, stride);
+  } else {
+    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+                       wiener_stride, height, dst, stride);
+  }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+constexpr int kSumOffset = 24;
+
+// SIMD over-reads (the number of bytes in a SIMD register) - (width % 16) -
+// 2 * padding pixels, where padding is 3 for Pass 1 and 2 for Pass 2. A SIMD
+// register holds 16 bytes for SSE4.1 and 32 bytes for AVX2.
+constexpr int kOverreadInBytesPass1_128 = 10;
+constexpr int kOverreadInBytesPass2_128 = 12;
+constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16;
+constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16;
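+// E.g., kOverreadInBytesPass1_128 == 16 - 2 * 3 and kOverreadInBytesPass2_128
+// == 16 - 2 * 2; the AVX2 variants just add the 16 extra bytes of a 256-bit
+// register.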
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               __m256i dst[2]) {
+  dst[0] = LoadAligned32(src[0] + x);
+  dst[1] = LoadAligned32(src[1] + x);
+}
+
+inline void LoadAligned32x2U16Msan(const uint16_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[2]) {
+  dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+  dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+  dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               __m256i dst[3]) {
+  dst[0] = LoadAligned32(src[0] + x);
+  dst[1] = LoadAligned32(src[1] + x);
+  dst[2] = LoadAligned32(src[2] + x);
+}
+
+inline void LoadAligned32x3U16Msan(const uint16_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[3]) {
+  dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+  dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+  dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+  dst[0] = LoadAligned16(src + 0);
+  dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               __m256i dst[2][2]) {
+  LoadAligned64(src[0] + x, dst[0]);
+  LoadAligned64(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32Msan(const uint32_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[2][2]) {
+  LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+  LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+  LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               __m256i dst[3][2]) {
+  LoadAligned64(src[0] + x, dst[0]);
+  LoadAligned64(src[1] + x, dst[1]);
+  LoadAligned64(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32Msan(const uint32_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m256i dst[3][2]) {
+  LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+  LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+  LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+  StoreAligned16(dst + 0, src[0]);
+  StoreAligned16(dst + 4, src[1]);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate super inefficient code and the whole
+// decoder could be 15% slower.
+
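+// The helpers below follow NEON-style naming: the "l" variants widen both
+// operands before adding (e.g. VaddlLo8() zero-extends the low 8 bytes of
+// each 128-bit lane to 16 bits and adds them), the "w" variants widen only
+// the second operand, and the "mull" variants perform a widening multiply.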
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(s0, s1);
+}
+
+inline __m256i VaddlLo16(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+  return _mm256_add_epi32(s0, s1);
+}
+
+inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(s0, s1);
+}
+
+inline __m256i VaddlHi16(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+  return _mm256_add_epi32(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+  return _mm_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) {
+  const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) {
+  const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+  return _mm256_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(src0, s1);
+}
+
+inline __m256i VaddwLo16(const __m256i src0, const __m256i src1) {
+  const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+  return _mm256_add_epi32(src0, s1);
+}
+
+inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_add_epi32(src0, s1);
+}
+
+inline __m256i VaddwHi16(const __m256i src0, const __m256i src1) {
+  const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+  return _mm256_add_epi32(src0, s1);
+}
+
+inline __m256i VmullNLo8(const __m256i src0, const int src1) {
+  const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m256i VmullNHi8(const __m256i src0, const int src1) {
+  const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullLo16(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+  return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullHi16(const __m256i src0, const __m256i src1) {
+  const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+  const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+  return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m256i VrshrS32(const __m256i src0, const int src1) {
+  const __m256i sum =
+      _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+  return _mm256_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+  return _mm_srli_epi32(sum, src1);
+}
+
+inline __m256i VrshrU32(const __m256i src0, const int src1) {
+  const __m256i sum =
+      _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+  return _mm256_srli_epi32(sum, src1);
+}
+
+inline __m128i SquareLo8(const __m128i src) {
+  const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+  return _mm_mullo_epi16(s, s);
+}
+
+inline __m256i SquareLo8(const __m256i src) {
+  const __m256i s = _mm256_unpacklo_epi8(src, _mm256_setzero_si256());
+  return _mm256_mullo_epi16(s, s);
+}
+
+inline __m128i SquareHi8(const __m128i src) {
+  const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+  return _mm_mullo_epi16(s, s);
+}
+
+inline __m256i SquareHi8(const __m256i src) {
+  const __m256i s = _mm256_unpackhi_epi8(src, _mm256_setzero_si256());
+  return _mm256_mullo_epi16(s, s);
+}
+
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+  dst[0] = src;
+  dst[1] = _mm_srli_si128(src, 1);
+  dst[2] = _mm_srli_si128(src, 2);
+}
+
+inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) {
+  dst[0] = _mm256_alignr_epi8(src[1], src[0], 0);
+  dst[1] = _mm256_alignr_epi8(src[1], src[0], 1);
+  dst[2] = _mm256_alignr_epi8(src[1], src[0], 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_16(const __m256i src[2], __m256i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm256_alignr_epi8(src[1], src[0], 2);
+  dst[2] = _mm256_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+  dst[0] = src;
+  dst[1] = _mm_srli_si128(src, 1);
+  dst[2] = _mm_srli_si128(src, 2);
+  dst[3] = _mm_srli_si128(src, 3);
+  dst[4] = _mm_srli_si128(src, 4);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_16(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+  dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m256i src[2], __m256i dst[5]) {
+  Prepare3_16(src, dst);
+  dst[3] = _mm256_alignr_epi8(src[1], src[0], 6);
+  dst[4] = _mm256_alignr_epi8(src[1], src[0], 8);
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi16(src0, src1);
+  return _mm_add_epi16(sum, src2);
+}
+
+inline __m256i Sum3_16(const __m256i src0, const __m256i src1,
+                       const __m256i src2) {
+  const __m256i sum = _mm256_add_epi16(src0, src1);
+  return _mm256_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_16(const __m256i src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi32(src0, src1);
+  return _mm_add_epi32(sum, src2);
+}
+
+inline __m256i Sum3_32(const __m256i src0, const __m256i src1,
+                       const __m256i src2) {
+  const __m256i sum = _mm256_add_epi32(src0, src1);
+  return _mm256_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+  const __m128i sum = VaddlLo8(src[0], src[1]);
+  return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WLo16(const __m256i src[3]) {
+  const __m256i sum = VaddlLo8(src[0], src[1]);
+  return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WHi16(const __m256i src[3]) {
+  const __m256i sum = VaddlHi8(src[0], src[1]);
+  return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+  const __m128i sum = VaddlLo16(src[0], src[1]);
+  return VaddwLo16(sum, src[2]);
+}
+
+inline __m256i Sum3WLo32(const __m256i src[3]) {
+  const __m256i sum = VaddlLo16(src[0], src[1]);
+  return VaddwLo16(sum, src[2]);
+}
+
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+  const __m128i sum = VaddlHi16(src[0], src[1]);
+  return VaddwHi16(sum, src[2]);
+}
+
+inline __m256i Sum3WHi32(const __m256i src[3]) {
+  const __m256i sum = VaddlHi16(src[0], src[1]);
+  return VaddwHi16(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+  const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+  const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+  const __m128i sum = _mm_add_epi16(sum01, sum23);
+  return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m256i Sum5_16(const __m256i src[5]) {
+  const __m256i sum01 = _mm256_add_epi16(src[0], src[1]);
+  const __m256i sum23 = _mm256_add_epi16(src[2], src[3]);
+  const __m256i sum = _mm256_add_epi16(sum01, sum23);
+  return _mm256_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+                       const __m128i* const src2, const __m128i* const src3,
+                       const __m128i* const src4) {
+  const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+  const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+  const __m128i sum = _mm_add_epi32(sum01, sum23);
+  return _mm_add_epi32(sum, *src4);
+}
+
+inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1,
+                       const __m256i* const src2, const __m256i* const src3,
+                       const __m256i* const src4) {
+  const __m256i sum01 = _mm256_add_epi32(*src0, *src1);
+  const __m256i sum23 = _mm256_add_epi32(*src2, *src3);
+  const __m256i sum = _mm256_add_epi32(sum01, sum23);
+  return _mm256_add_epi32(sum, *src4);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
+  const __m128i sum01 = VaddlLo8(src[0], src[1]);
+  const __m128i sum23 = VaddlLo8(src[2], src[3]);
+  const __m128i sum = _mm_add_epi16(sum01, sum23);
+  return VaddwLo8(sum, src[4]);
+}
+
+inline __m256i Sum5WLo16(const __m256i src[5]) {
+  const __m256i sum01 = VaddlLo8(src[0], src[1]);
+  const __m256i sum23 = VaddlLo8(src[2], src[3]);
+  const __m256i sum = _mm256_add_epi16(sum01, sum23);
+  return VaddwLo8(sum, src[4]);
+}
+
+inline __m256i Sum5WHi16(const __m256i src[5]) {
+  const __m256i sum01 = VaddlHi8(src[0], src[1]);
+  const __m256i sum23 = VaddlHi8(src[2], src[3]);
+  const __m256i sum = _mm256_add_epi16(sum01, sum23);
+  return VaddwHi8(sum, src[4]);
+}
+
+inline __m128i Sum3Horizontal(const __m128i src) {
+  __m128i s[3];
+  Prepare3Lo8(src, s);
+  return Sum3WLo16(s);
+}
+
+inline void Sum3Horizontal(const uint8_t* const src,
+                           const ptrdiff_t over_read_in_bytes, __m256i dst[2]) {
+  __m256i s[3];
+  s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+  s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+  s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+  dst[0] = Sum3WLo16(s);
+  dst[1] = Sum3WHi16(s);
+}
+
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum3WLo32(s);
+  dst[1] = Sum3WHi32(s);
+}
+
+inline void Sum3WHorizontal(const __m256i src[2], __m256i dst[2]) {
+  __m256i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum3WLo32(s);
+  dst[1] = Sum3WHi32(s);
+}
+
+inline __m128i Sum5Horizontal(const __m128i src) {
+  __m128i s[5];
+  Prepare5Lo8(src, s);
+  return Sum5WLo16(s);
+}
+
+inline void Sum5Horizontal(const uint8_t* const src,
+                           const ptrdiff_t over_read_in_bytes,
+                           __m256i* const dst0, __m256i* const dst1) {
+  __m256i s[5];
+  s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+  s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+  s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+  s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3);
+  s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4);
+  *dst0 = Sum5WLo16(s);
+  *dst1 = Sum5WHi16(s);
+}
+
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+  const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+  const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+  dst[0] = VaddwLo16(sum0123_lo, s[4]);
+  const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+  const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+  const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+  dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+inline void Sum5WHorizontal(const __m256i src[2], __m256i dst[2]) {
+  __m256i s[5];
+  Prepare5_16(src, s);
+  const __m256i sum01_lo = VaddlLo16(s[0], s[1]);
+  const __m256i sum23_lo = VaddlLo16(s[2], s[3]);
+  const __m256i sum0123_lo = _mm256_add_epi32(sum01_lo, sum23_lo);
+  dst[0] = VaddwLo16(sum0123_lo, s[4]);
+  const __m256i sum01_hi = VaddlHi16(s[0], s[1]);
+  const __m256i sum23_hi = VaddlHi16(s[2], s[3]);
+  const __m256i sum0123_hi = _mm256_add_epi32(sum01_hi, sum23_hi);
+  dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+                     __m128i* const row_sq5) {
+  const __m128i sum04 = VaddlLo16(src[0], src[4]);
+  *row_sq3 = Sum3WLo32(src + 1);
+  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m256i src[5], __m256i* const row_sq3,
+                     __m256i* const row_sq5) {
+  const __m256i sum04 = VaddlLo16(src[0], src[4]);
+  *row_sq3 = Sum3WLo32(src + 1);
+  *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+                     __m128i* const row_sq5) {
+  const __m128i sum04 = VaddlHi16(src[0], src[4]);
+  *row_sq3 = Sum3WHi32(src + 1);
+  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m256i src[5], __m256i* const row_sq3,
+                     __m256i* const row_sq5) {
+  const __m256i sum04 = VaddlHi16(src[0], src[4]);
+  *row_sq3 = Sum3WHi32(src + 1);
+  *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m128i src, __m128i* const row3,
+                     __m128i* const row5) {
+  __m128i s[5];
+  Prepare5Lo8(src, s);
+  const __m128i sum04 = VaddlLo8(s[0], s[4]);
+  *row3 = Sum3WLo16(s + 1);
+  *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal(const uint8_t* const src,
+                          const ptrdiff_t over_read_in_bytes,
+                          __m256i* const row3_0, __m256i* const row3_1,
+                          __m256i* const row5_0, __m256i* const row5_1) {
+  __m256i s[5];
+  s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+  s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+  s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+  s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3);
+  s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4);
+  const __m256i sum04_lo = VaddlLo8(s[0], s[4]);
+  const __m256i sum04_hi = VaddlHi8(s[0], s[4]);
+  *row3_0 = Sum3WLo16(s + 1);
+  *row3_1 = Sum3WHi16(s + 1);
+  *row5_0 = _mm256_add_epi16(sum04_lo, *row3_0);
+  *row5_1 = _mm256_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+                          __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+                          __m128i* const row_sq5_1) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+  SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
+inline void SumHorizontal(const __m256i src[2], __m256i* const row_sq3_0,
+                          __m256i* const row_sq3_1, __m256i* const row_sq5_0,
+                          __m256i* const row_sq5_1) {
+  __m256i s[5];
+  Prepare5_16(src, s);
+  SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+  SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
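+// The Sum343 helpers compute 3 * x[0] + 4 * x[1] + 3 * x[2] on widened values.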
+inline __m256i Sum343Lo(const __m256i ma3[3]) {
+  const __m256i sum = Sum3WLo16(ma3);
+  const __m256i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343Hi(const __m256i ma3[3]) {
+  const __m256i sum = Sum3WHi16(ma3);
+  const __m256i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343WLo(const __m256i src[3]) {
+  const __m256i sum = Sum3WLo32(src);
+  const __m256i sum3 = Sum3_32(sum, sum, sum);
+  return VaddwLo16(sum3, src[1]);
+}
+
+inline __m256i Sum343WHi(const __m256i src[3]) {
+  const __m256i sum = Sum3WHi32(src);
+  const __m256i sum3 = Sum3_32(sum, sum, sum);
+  return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m256i src[2], __m256i dst[2]) {
+  __m256i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum343WLo(s);
+  dst[1] = Sum343WHi(s);
+}
+
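+// The Sum565 helpers compute 5 * x[0] + 6 * x[1] + 5 * x[2] on widened values.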
+inline __m256i Sum565Lo(const __m256i src[3]) {
+  const __m256i sum = Sum3WLo16(src);
+  const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+  const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+  return VaddwLo8(sum5, src[1]);
+}
+
+inline __m256i Sum565Hi(const __m256i src[3]) {
+  const __m256i sum = Sum3WHi16(src);
+  const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+  const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+  return VaddwHi8(sum5, src[1]);
+}
+
+inline __m256i Sum565WLo(const __m256i src[3]) {
+  const __m256i sum = Sum3WLo32(src);
+  const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+  const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+  return VaddwLo16(sum5, src[1]);
+}
+
+inline __m256i Sum565WHi(const __m256i src[3]) {
+  const __m256i sum = Sum3WHi32(src);
+  const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+  const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+  return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m256i src[2], __m256i dst[2]) {
+  __m256i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum565WLo(s);
+  dst[1] = Sum565WHi(s);
+}
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  int y = 2;
+  do {
+    const __m128i s0 =
+        LoadUnaligned16Msan(src, kOverreadInBytesPass1_128 - width);
+    __m128i sq_128[2], s3, s5, sq3[2], sq5[2];
+    __m256i sq[3];
+    sq_128[0] = SquareLo8(s0);
+    sq_128[1] = SquareHi8(s0);
+    SumHorizontalLo(s0, &s3, &s5);
+    StoreAligned16(sum3, s3);
+    StoreAligned16(sum5, s5);
+    SumHorizontal(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]);
+    StoreAligned32U32(square_sum3, sq3);
+    StoreAligned32U32(square_sum5, sq5);
+    src += 8;
+    sum3 += 8;
+    sum5 += 8;
+    square_sum3 += 8;
+    square_sum5 += 8;
+    sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m256i row3[2], row5[2], row_sq3[2], row_sq5[2];
+      const __m256i s = LoadUnaligned32Msan(
+          src + 8, sum_width - x + 16 + kOverreadInBytesPass1_256 - width);
+      sq[1] = SquareLo8(s);
+      sq[2] = SquareHi8(s);
+      sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+      SumHorizontal(src, sum_width - x + 8 + kOverreadInBytesPass1_256 - width,
+                    &row3[0], &row3[1], &row5[0], &row5[1]);
+      StoreAligned64(sum3, row3);
+      StoreAligned64(sum5, row5);
+      SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+      StoreAligned64(square_sum3 + 0, row_sq3);
+      StoreAligned64(square_sum5 + 0, row_sq5);
+      SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+      StoreAligned64(square_sum3 + 16, row_sq3);
+      StoreAligned64(square_sum5 + 16, row_sq5);
+      sq[0] = sq[2];
+      src += 32;
+      sum3 += 32;
+      sum5 += 32;
+      square_sum3 += 32;
+      square_sum5 += 32;
+      x -= 32;
+    } while (x != 0);
+    src += src_stride - sum_width - 8;
+    sum3 += sum_stride - sum_width - 8;
+    sum5 += sum_stride - sum_width - 8;
+    square_sum3 += sum_stride - sum_width - 8;
+    square_sum5 += sum_stride - sum_width - 8;
+  } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
+  static_assert(size == 3 || size == 5, "");
+  int kOverreadInBytes_128, kOverreadInBytes_256;
+  if (size == 3) {
+    kOverreadInBytes_128 = kOverreadInBytesPass2_128;
+    kOverreadInBytes_256 = kOverreadInBytesPass2_256;
+  } else {
+    kOverreadInBytes_128 = kOverreadInBytesPass1_128;
+    kOverreadInBytes_256 = kOverreadInBytesPass1_256;
+  }
+  int y = 2;
+  do {
+    const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytes_128 - width);
+    __m128i ss, sq_128[2], sqs[2];
+    __m256i sq[3];
+    sq_128[0] = SquareLo8(s);
+    sq_128[1] = SquareHi8(s);
+    if (size == 3) {
+      ss = Sum3Horizontal(s);
+      Sum3WHorizontal(sq_128, sqs);
+    } else {
+      ss = Sum5Horizontal(s);
+      Sum5WHorizontal(sq_128, sqs);
+    }
+    StoreAligned16(sums, ss);
+    StoreAligned32U32(square_sums, sqs);
+    src += 8;
+    sums += 8;
+    square_sums += 8;
+    sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m256i row[2], row_sq[4];
+      const __m256i s = LoadUnaligned32Msan(
+          src + 8, sum_width - x + 16 + kOverreadInBytes_256 - width);
+      sq[1] = SquareLo8(s);
+      sq[2] = SquareHi8(s);
+      sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+      if (size == 3) {
+        Sum3Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+                       row);
+        Sum3WHorizontal(sq + 0, row_sq + 0);
+        Sum3WHorizontal(sq + 1, row_sq + 2);
+      } else {
+        Sum5Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+                       &row[0], &row[1]);
+        Sum5WHorizontal(sq + 0, row_sq + 0);
+        Sum5WHorizontal(sq + 1, row_sq + 2);
+      }
+      StoreAligned64(sums, row);
+      StoreAligned64(square_sums + 0, row_sq + 0);
+      StoreAligned64(square_sums + 16, row_sq + 2);
+      sq[0] = sq[2];
+      src += 32;
+      sums += 32;
+      square_sums += 32;
+      x -= 32;
+    } while (x != 0);
+    src += src_stride - sum_width - 8;
+    sums += sum_stride - sum_width - 8;
+    square_sums += sum_stride - sum_width - 8;
+  } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
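+  // n == 9:  sum_sq * 9  = sum_sq + (sum_sq << 3)
+  // n == 25: sum_sq * 25 = sum_sq + (sum_sq << 3) + (sum_sq << 4)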
+  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+  const __m128i sub = _mm_sub_epi32(axn, dxd);
+  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+  const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+  const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+  const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+  return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m256i dxd = _mm256_madd_epi16(sum, sum);
+  // _mm256_mullo_epi32() has high latency. Using shifts and additions instead.
+  // Some compilers could do this for us but we make this explicit.
+  // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
+  __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+  const __m256i sub = _mm256_sub_epi32(axn, dxd);
+  const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+  const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  const __m256i sum_lo = _mm256_unpacklo_epi16(sum, _mm256_setzero_si256());
+  const __m256i sum_hi = _mm256_unpackhi_epi16(sum, _mm256_setzero_si256());
+  const __m256i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+  const __m256i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+  return _mm256_packus_epi32(z0, z1);
+}
+
+inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
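+  // Multiplying by |one_over_n_quarter| keeps the multiplier within the signed
+  // 8-bit operand of _mm_maddubs_epi16(); shifting by 2 fewer bits below
+  // compensates, so the result equals multiplying by |one_over_n|.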
+  // |ma| is in range [0, 255].
+  const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+  const __m128i m0 = VmullLo16(m, sum);
+  const __m128i m1 = VmullHi16(m, sum);
+  const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+  return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline __m256i CalculateB5(const __m256i sum, const __m256i ma) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
+  const __m256i m =
+      _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter));
+  const __m256i m0 = VmullLo16(m, sum);
+  const __m256i m1 = VmullHi16(m, sum);
+  const __m256i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  const __m256i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+  return _mm256_packus_epi32(b_lo, b_hi);
+}
+
+inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m128i m0 = VmullLo16(ma, sum);
+  const __m128i m1 = VmullHi16(ma, sum);
+  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+  const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+  const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+  return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline __m256i CalculateB3(const __m256i sum, const __m256i ma) {
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m256i m0 = VmullLo16(ma, sum);
+  const __m256i m1 = VmullHi16(ma, sum);
+  const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+  const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+  const __m256i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+  const __m256i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+  return _mm256_packus_epi32(b_lo, b_hi);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+                                  const uint32_t scale, __m256i* const sum,
+                                  __m256i* const index) {
+  __m256i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+                                  const uint32_t scale, __m256i* const sum,
+                                  __m256i* const index) {
+  __m256i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+                               __m128i* const ma, __m128i* const b) {
+  static_assert(n == 9 || n == 25, "");
+  const __m128i idx = _mm_packus_epi16(index, index);
+  // The values are not actually stored and reloaded; the compiler keeps them
+  // in a 64-bit general-purpose register, which is faster than using
+  // _mm_extract_epi8().
+  uint8_t temp[8];
+  StoreLo8(temp, idx);
+  *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
+}
+
+// Repeat the first 48 elements in kSgrMaLookup with a period of 16.
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+    15,  14,  13, 13, 12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,
+    15,  14,  13, 13, 12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,
+    8,   8,   7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5,
+    8,   8,   7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5};
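+// _mm256_shuffle_epi8() shuffles within each 128-bit lane, so each 16-entry
+// sub-table is duplicated for both lanes to read the same values.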
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction, or from the sign bit of the index.
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+  __m256i mask;
+  mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+  mask = _mm256_or_si256(mask, index);
+  return _mm256_shuffle_epi8(table, mask);
+}
+
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+                           const int threshold) {
+  const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+  const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
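+  // |offset| is -1 (0xff) where |index| > |threshold|, so the add below
+  // decrements |value| by 1 for those elements.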
+  return _mm256_add_epi8(value, offset);
+}
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+                                  __m256i ma[3], __m256i b[2]) {
+  static_assert(n == 9 || n == 25, "");
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+  const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+  const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+  const __m256i indices = _mm256_packus_epi16(index[0], index[1]);
+  __m256i idx, mas;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+  // The shuffles below produce 0 for all elements whose indices are not less
+  // than 48; those elements are handled afterwards.
+  // Get shuffle results for indices in range [0, 15].
+  mas = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+  const __m256i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  mas = _mm256_or_si256(mas, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+  const __m256i res2 = ShuffleIndex(c2, idx);
+  mas = _mm256_or_si256(mas, res2);
+
+  // For elements whose indices are larger than 47, since their values seldom
+  // change as the index increases, we use comparison and arithmetic
+  // operations to calculate their values.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
+  mas = AdjustValue(mas, idx, 55);   // 55 is the last index whose value is 5.
+  mas = AdjustValue(mas, idx, 72);   // 72 is the last index whose value is 4.
+  mas = AdjustValue(mas, idx, 101);  // 101 is the last index whose value is 3.
+  mas = AdjustValue(mas, idx, 169);  // 169 is the last index whose value is 2.
+  mas = AdjustValue(mas, idx, 254);  // 254 is the last index whose value is 1.
+
+  ma[2] = _mm256_permute4x64_epi64(mas, 0x93);     // 32-39 8-15 16-23 24-31
+  ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc);  //  0-7  8-15 16-23 24-31
+  ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21);
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
+  const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
+  if (n == 9) {
+    b[0] = CalculateB3(sum[0], maq0);
+    b[1] = CalculateB3(sum[1], maq1);
+  } else {
+    b[0] = CalculateB5(sum[0], maq0);
+    b[1] = CalculateB5(sum[1], maq1);
+  }
+}
+
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i* const b) {
+  __m128i sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i* const b) {
+  __m128i sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m256i b3[2], const ptrdiff_t x,
+                         __m256i sum_b343[2], __m256i sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  __m256i b[3], sum_b111[2];
+  Prepare3_16(b3, b);
+  sum_b111[0] = Sum3WLo32(b);
+  sum_b111[1] = Sum3WHi32(b);
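+  // |sum_b444| = 4 * (b[0] + b[1] + b[2]);
+  // |sum_b343| = 3 * b[0] + 4 * b[1] + 3 * b[2].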
+  sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2);
+  sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2);
+  StoreAligned64(b444 + x, sum_b444);
+  sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]);
+  sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]);
+  sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+  sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+  StoreAligned64(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i* const sum_ma444, __m256i sum_b343[2],
+                           __m256i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m256i sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+  StoreAligned32(ma444 + x, *sum_ma444);
+  const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  StoreAligned32(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i* const sum_ma444, __m256i sum_b343[2],
+                           __m256i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m256i sum_ma111 = Sum3WHi16(ma3);
+  *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+  StoreAligned32(ma444 + x, *sum_ma444);
+  const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+  StoreAligned32(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma444, sum_b444[2];
+  Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, __m256i* const sum_ma343,
+                           __m256i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma444, sum_b444[2];
+  Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma343, sum_b343[2];
+  Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m256i sum_ma343, sum_b343[2];
+  Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    const __m128i s[2][3], const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s5[2][5], sq5[5][2];
+  sq[0][1] = SquareHi8(s[0][0]);
+  sq[1][1] = SquareHi8(s[1][0]);
+  s5[0][3] = Sum5Horizontal(s[0][0]);
+  StoreAligned16(sum5[3], s5[0][3]);
+  s5[0][4] = Sum5Horizontal(s[1][0]);
+  StoreAligned16(sum5[4], s5[0][4]);
+  Sum5WHorizontal(sq[0], sq5[3]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  Sum5WHorizontal(sq[1], sq5[4]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x3U16(sum5, 0, s5[0]);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const uint8_t* const src0, const uint8_t* const src1,
+    const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width,
+    const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m256i sq[2][3], __m256i ma[3],
+    __m256i b[3]) {
+  const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+  const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+  __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+  sq[0][1] = SquareLo8(s0);
+  sq[0][2] = SquareHi8(s0);
+  sq[1][1] = SquareLo8(s1);
+  sq[1][2] = SquareHi8(s1);
+  sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+  sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+  Sum5Horizontal(src0, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+  Sum5Horizontal(src1, over_read_in_bytes, &s5[0][4], &s5[1][4]);
+  StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+  StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+  Sum5WHorizontal(sq[0], sq5[3]);
+  StoreAligned64(square_sum5[3] + x, sq5[3]);
+  Sum5WHorizontal(sq[1], sq5[4]);
+  StoreAligned64(square_sum5[4] + x, sq5[4]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+  Sum5WHorizontal(sq[0] + 1, sq5[3]);
+  StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+  Sum5WHorizontal(sq[1] + 1, sq5[4]);
+  StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+  CalculateIntermediate<25>(sum, index, ma, b + 1);
+  b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+    const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s5[5], sq5[5][2];
+  sq[1] = SquareHi8(s);
+  s5[3] = s5[4] = Sum5Horizontal(s);
+  Sum5WHorizontal(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+    const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale,
+    const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+    __m256i sq[3], __m256i ma[3], __m256i b[3]) {
+  const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+  __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+  sq[1] = SquareLo8(s);
+  sq[2] = SquareHi8(s);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  Sum5Horizontal(src, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+  s5[0][4] = s5[0][3];
+  s5[1][4] = s5[1][3];
+  Sum5WHorizontal(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+  Sum5WHorizontal(sq + 1, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+  CalculateIntermediate<25>(sum, index, ma, b + 1);
+  b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+    const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s3[3], sq3[3][2];
+  sq[1] = SquareHi8(s);
+  s3[2] = Sum3Horizontal(s);
+  StoreAligned16(sum3[2], s3[2]);
+  Sum3WHorizontal(sq, sq3[2]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[3],
+    __m256i ma[3], __m256i b[3]) {
+  const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+  __m256i s3[4], sq3[3][2], sum[2], index[2];
+  sq[1] = SquareLo8(s);
+  sq[2] = SquareHi8(s);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  Sum3Horizontal(src, over_read_in_bytes, s3 + 2);
+  StoreAligned64(sum3[2] + x, s3 + 2);
+  Sum3WHorizontal(sq + 0, sq3[2]);
+  StoreAligned64(square_sum3[2] + x, sq3[2]);
+  LoadAligned32x2U16(sum3, x, s3);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+  Sum3WHorizontal(sq + 1, sq3[2]);
+  StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+  CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+  CalculateIntermediate<9>(sum, index, ma, b + 1);
+  b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+    const __m128i s[2], const uint16_t scales[2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i ma3[2],
+    __m128i b3[2], __m128i* const ma5, __m128i* const b5) {
+  __m128i s3[4], s5[5], sq3[4][2], sq5[5][2];
+  sq[0][1] = SquareHi8(s[0]);
+  sq[1][1] = SquareHi8(s[1]);
+  SumHorizontalLo(s[0], &s3[2], &s5[3]);
+  SumHorizontalLo(s[1], &s3[3], &s5[4]);
+  StoreAligned16(sum3[2], s3[2]);
+  StoreAligned16(sum3[3], s3[3]);
+  StoreAligned16(sum5[3], s5[3]);
+  StoreAligned16(sum5[4], s5[4]);
+  SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3], sq3[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  // Note: the SSE4_1 version calls CalculateIntermediate() instead of the
+  // slow LookupIntermediate() when calculating 16 intermediate data points.
+  // However, when compiled for AVX2 that approach generates even slower code,
+  // so we keep using CalculateIntermediate3() here.
+  CalculateIntermediate3(s3 + 0, sq3 + 0, scales[1], &ma3[0], &b3[0]);
+  CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], &ma3[1], &b3[1]);
+  CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1,
+    const ptrdiff_t over_read_in_bytes, const ptrdiff_t x,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, __m256i sq[2][3], __m256i ma3[2][3],
+    __m256i b3[2][5], __m256i ma5[3], __m256i b5[5]) {
+  const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+  const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+  __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2], index_3[2][2],
+      sum_5[2], index_5[2];
+  sq[0][1] = SquareLo8(s0);
+  sq[0][2] = SquareHi8(s0);
+  sq[1][1] = SquareLo8(s1);
+  sq[1][2] = SquareHi8(s1);
+  sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+  sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+  SumHorizontal(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+                &s5[1][3]);
+  SumHorizontal(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4],
+                &s5[1][4]);
+  StoreAligned32(sum3[2] + x + 0, s3[0][2]);
+  StoreAligned32(sum3[2] + x + 16, s3[1][2]);
+  StoreAligned32(sum3[3] + x + 0, s3[0][3]);
+  StoreAligned32(sum3[3] + x + 16, s3[1][3]);
+  StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+  StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+  SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned64(square_sum3[2] + x, sq3[2]);
+  StoreAligned64(square_sum5[3] + x, sq5[3]);
+  StoreAligned64(square_sum3[3] + x, sq3[3]);
+  StoreAligned64(square_sum5[4] + x, sq5[4]);
+  LoadAligned32x2U16(sum3, x, s3[0]);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]);
+  CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0],
+                        &index_3[1][0]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+  SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+  StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+  StoreAligned64(square_sum3[3] + x + 16, sq3[3]);
+  StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]);
+  CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1],
+                        &index_3[1][1]);
+  CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], b3[0] + 1);
+  CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], b3[1] + 1);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+  CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+  b3[0][0] = _mm256_permute2x128_si256(b3[0][0], b3[0][2], 0x21);
+  b3[1][0] = _mm256_permute2x128_si256(b3[1][0], b3[1][2], 0x21);
+  b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+    const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+    __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+  __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+  sq[1] = SquareHi8(s);
+  SumHorizontalLo(s, &s3[2], &s5[3]);
+  SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  s5[4] = s5[3];
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+    const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2],
+    const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+    const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+    __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5],
+    __m256i b5[5]) {
+  const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+  __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2],
+      sum_5[2], index_5[2];
+  sq[1] = SquareLo8(s0);
+  sq[2] = SquareHi8(s0);
+  sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+  SumHorizontal(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+                &s5[1][3]);
+  SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned32x2U16(sum3, x, s3[0]);
+  LoadAligned64x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]);
+  LoadAligned32x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned64x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+  SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+  LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]);
+  CalculateIntermediate<9>(sum_3, index_3, ma3, b3 + 1);
+  LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+  CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+  b3[0] = _mm256_permute2x128_si256(b3[0], b3[2], 0x21);
+  b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+                                    const uint8_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  __m128i ma0, b0, s[2][3], sq_128[2][2];
+  __m256i mas[3], sq[2][3], bs[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+  sq_128[0][0] = SquareLo8(s[0][0]);
+  sq_128[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+  sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+  sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma5[3], ma[2], b[4];
+    BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+                         x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+                         x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned64(ma565, ma);
+    Sum565W(bs + 0, b + 0);
+    Sum565W(bs + 1, b + 2);
+    StoreAligned64(b565, b + 0);
+    StoreAligned64(b565 + 16, b + 2);
+    sq[0][0] = sq[0][2];
+    sq[1][0] = sq[1][2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint8_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+    uint32_t* b444) {
+  const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width);
+  __m128i ma0, sq_128[2], b0;
+  __m256i mas[3], sq[3], bs[3];
+  sq_128[0] = SquareLo8(s);
+  BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+  sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma3[3];
+    BoxFilterPreProcess3(src + x + 8, x + 8 + kOverreadInBytesPass2_256 - width,
+                         x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    Prepare3_8(mas, ma3);
+    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
+      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444Hi(ma3, bs + 1, 16, ma343, ma444, b343, b444);
+      ma444 += 32;
+      b444 += 32;
+    } else {
+      __m256i ma[2], b[4];
+      ma[0] = Sum343Lo(ma3);
+      ma[1] = Sum343Hi(ma3);
+      StoreAligned64(ma343, ma);
+      Sum343W(bs + 0, b + 0);
+      Sum343W(bs + 1, b + 2);
+      StoreAligned64(b343 + 0, b + 0);
+      StoreAligned64(b343 + 16, b + 2);
+    }
+    sq[0] = sq[2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    ma343 += 32;
+    b343 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+    uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+    uint32_t* b565) {
+  __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+  __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+  sq_128[0][0] = SquareLo8(s[0]);
+  sq_128[1][0] = SquareLo8(s[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+                        ma3_128, b3_128, &ma5_0, &b5_0);
+  sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+  sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+  ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+  ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+  ma5[0] = SetrM128i(ma5_0, ma5_0);
+  b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+  b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+  b5[0] = SetrM128i(b5_0, b5_0);
+
+  int x = 0;
+  do {
+    __m256i ma[2], b[4], ma3x[3], ma5x[3];
+    BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+                        x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+                        scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+                        sq, ma3, b3, ma5, b5);
+    Prepare3_8(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned64(ma343[0] + x, ma);
+    Sum343W(b3[0], b);
+    StoreAligned64(b343[0] + x, b);
+    Sum565W(b5, b);
+    StoreAligned64(b565, b);
+    Prepare3_8(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444, b343[1], b444);
+    Prepare3_8(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned64(ma565, ma);
+    Sum343W(b3[0] + 1, b);
+    StoreAligned64(b343[0] + x + 16, b);
+    Sum565W(b5 + 1, b);
+    StoreAligned64(b565 + 16, b);
+    sq[0][0] = sq[0][2];
+    sq[1][0] = sq[1][2];
+    ma3[0][0] = ma3[0][2];
+    ma3[1][0] = ma3[1][2];
+    ma5[0] = ma5[2];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 65088 * 32 = 2082816 (21 bits)
+  // v: b - ma * 255 (22 bits)
+  const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (13 bits)
+  return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+                                       const __m256i b[2]) {
+  const __m256i ma_x_src_lo = VmullLo16(ma, src);
+  const __m256i ma_x_src_hi = VmullHi16(ma, src);
+  const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+  const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return _mm256_packs_epi32(dst_lo, dst_hi);  // 13 bits
+}
+
+inline __m256i CalculateFilteredOutputPass1(const __m256i src,
+                                            const __m256i ma[2],
+                                            const __m256i b[2][2]) {
+  const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+  __m256i b_sum[2];
+  b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+  b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src,
+                                            const __m256i ma[3],
+                                            const __m256i b[3][2]) {
+  const __m256i ma_sum = Sum3_16(ma);
+  __m256i b_sum[2];
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+  const __m256i v_lo =
+      VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m256i v_hi =
+      VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+  const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+  return _mm256_add_epi16(src, vv);
+}
+
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+                                          const __m256i filter[2], const int w0,
+                                          const int w2) {
+  __m256i v[2];
+  const __m256i w0_w2 =
+      _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
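+  // Interleaving |filter[0]| and |filter[1]| lets _mm256_madd_epi16() compute
+  // w0 * filter[0] + w2 * filter[1] per pixel in a single instruction.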
+  const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+  const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+  v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+  v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+  return SelfGuidedFinal(src, v);
+}
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+                                          const __m256i filter, const int w0) {
+  // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+  __m256i v[2];
+  v[0] = VmullNLo8(filter, w0);
+  v[1] = VmullNHi8(filter, w0);
+  return SelfGuidedFinal(src, v);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+    uint32_t* const b565[2], uint8_t* const dst) {
+  __m128i ma0, b0, s[2][3], sq_128[2][2];
+  __m256i mas[3], sq[2][3], bs[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+  sq_128[0][0] = SquareLo8(s[0][0]);
+  sq_128[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+  sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+  sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma[3], ma5[3], b[2][2][2];
+    BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+                         x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+                         x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    ma[2] = Sum565Hi(ma5);
+    StoreAligned64(ma565[1] + x, ma + 1);
+    Sum565W(bs + 0, b[0][1]);
+    Sum565W(bs + 1, b[1][1]);
+    StoreAligned64(b565[1] + x + 0, b[0][1]);
+    StoreAligned64(b565[1] + x + 16, b[1][1]);
+    const __m256i sr0 = LoadUnaligned32(src + x);
+    const __m256i sr1 = LoadUnaligned32(src + stride + x);
+    const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+    const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+    ma[0] = LoadAligned32(ma565[0] + x);
+    LoadAligned64(b565[0] + x, b[0][0]);
+    const __m256i p00 = CalculateFilteredOutputPass1(sr0_lo, ma, b[0]);
+    const __m256i p01 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[0][1]);
+    const __m256i d00 = SelfGuidedSingleMultiplier(sr0_lo, p00, w0);
+    const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p01, w0);
+    const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+    const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+    ma[1] = LoadAligned32(ma565[0] + x + 16);
+    LoadAligned64(b565[0] + x + 16, b[1][0]);
+    const __m256i p10 = CalculateFilteredOutputPass1(sr0_hi, ma + 1, b[1]);
+    const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[2], b[1][1]);
+    const __m256i d01 = SelfGuidedSingleMultiplier(sr0_hi, p10, w0);
+    const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+    StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+    sq[0][0] = sq[0][2];
+    sq[1][0] = sq[1][2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+    uint32_t* b565, uint8_t* const dst) {
+  const __m128i s0 =
+      LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  __m128i ma0, b0, sq_128[2];
+  __m256i mas[3], sq[3], bs[3];
+  sq_128[0] = SquareLo8(s0);
+  BoxFilterPreProcess5LastRowLo(s0, scale, sum5, square_sum5, sq_128, &ma0,
+                                &b0);
+  sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma[3], ma5[3], b[2][2];
+    BoxFilterPreProcess5LastRow(
+        src0 + x + 8, x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+        x + 8, scale, sum5, square_sum5, sq, mas, bs);
+    Prepare3_8(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    ma[2] = Sum565Hi(ma5);
+    Sum565W(bs + 0, b[1]);
+    const __m256i sr = LoadUnaligned32(src + x);
+    const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+    const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+    ma[0] = LoadAligned32(ma565);
+    LoadAligned64(b565 + 0, b[0]);
+    const __m256i p0 = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    ma[1] = LoadAligned32(ma565 + 16);
+    LoadAligned64(b565 + 16, b[0]);
+    Sum565W(bs + 1, b[1]);
+    const __m256i p1 = CalculateFilteredOutputPass1(sr_hi, ma + 1, b);
+    const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+    sq[0] = sq[2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    ma565 += 32;
+    b565 += 32;
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+    uint32_t* const b444[2], uint8_t* const dst) {
+  const __m128i s0 =
+      LoadUnaligned16Msan(src0, kOverreadInBytesPass2_128 - width);
+  __m128i ma0, b0, sq_128[2];
+  __m256i mas[3], sq[3], bs[3];
+  sq_128[0] = SquareLo8(s0);
+  BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+  sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+  mas[0] = SetrM128i(ma0, ma0);
+  bs[0] = SetrM128i(b0, b0);
+
+  int x = 0;
+  do {
+    __m256i ma[4], b[4][2], ma3[3];
+    BoxFilterPreProcess3(src0 + x + 8,
+                         x + 8 + kOverreadInBytesPass2_256 - width, x + 8,
+                         sum_width, scale, sum3, square_sum3, sq, mas, bs);
+    Prepare3_8(mas, ma3);
+    Store343_444Lo(ma3, bs + 0, x + 0, &ma[2], b[2], ma343[2], ma444[1],
+                   b343[2], b444[1]);
+    Store343_444Hi(ma3, bs + 1, x + 16, &ma[3], b[3], ma343[2], ma444[1],
+                   b343[2], b444[1]);
+    const __m256i sr = LoadUnaligned32(src + x);
+    const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+    const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+    ma[0] = LoadAligned32(ma343[0] + x);
+    ma[1] = LoadAligned32(ma444[0] + x);
+    LoadAligned64(b343[0] + x, b[0]);
+    LoadAligned64(b444[0] + x, b[1]);
+    const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    ma[1] = LoadAligned32(ma343[0] + x + 16);
+    ma[2] = LoadAligned32(ma444[0] + x + 16);
+    LoadAligned64(b343[0] + x + 16, b[1]);
+    LoadAligned64(b444[0] + x + 16, b[2]);
+    const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1);
+    const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+    sq[0] = sq[2];
+    mas[0] = mas[2];
+    bs[0] = bs[2];
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4],
+    uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+    uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+  __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+  __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+  sq_128[0][0] = SquareLo8(s[0]);
+  sq_128[1][0] = SquareLo8(s[1]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+                        ma3_128, b3_128, &ma5_0, &b5_0);
+  sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+  sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+  ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+  ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+  ma5[0] = SetrM128i(ma5_0, ma5_0);
+  b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+  b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+  b5[0] = SetrM128i(b5_0, b5_0);
+
+  int x = 0;
+  do {
+    __m256i ma[3][3], mat[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+    BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+                        x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+                        scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+                        sq, ma3, b3, ma5, b5);
+    Prepare3_8(ma3[0], ma3x[0]);
+    Prepare3_8(ma3[1], ma3x[1]);
+    Prepare3_8(ma5, ma5x);
+    Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+                   ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+                   b343[3], b444[2]);
+    ma[0][1] = Sum565Lo(ma5x);
+    ma[0][2] = Sum565Hi(ma5x);
+    mat[0][1] = ma[0][2];
+    StoreAligned64(ma565[1] + x, ma[0] + 1);
+    Sum565W(b5, b[0][1]);
+    StoreAligned64(b565[1] + x, b[0][1]);
+    const __m256i sr0 = LoadUnaligned32(src + x);
+    const __m256i sr1 = LoadUnaligned32(src + stride + x);
+    const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+    const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+    ma[0][0] = LoadAligned32(ma565[0] + x);
+    LoadAligned64(b565[0] + x, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+    ma[1][0] = LoadAligned32(ma343[0] + x);
+    ma[1][1] = LoadAligned32(ma444[0] + x);
+    LoadAligned64(b343[0] + x, b[1][0]);
+    LoadAligned64(b444[0] + x, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+    const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+    ma[2][0] = LoadAligned32(ma343[1] + x);
+    LoadAligned64(b343[1] + x, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+    const __m256i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+    Sum565W(b5 + 1, b[0][1]);
+    StoreAligned64(b565[1] + x + 16, b[0][1]);
+    Store343_444Hi(ma3x[0], b3[0] + 1, x + 16, &mat[1][2], &mat[2][1], b[1][2],
+                   b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Hi(ma3x[1], b3[1] + 1, x + 16, &mat[2][2], b[2][2], ma343[3],
+                   ma444[2], b343[3], b444[2]);
+    const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+    const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+    mat[0][0] = LoadAligned32(ma565[0] + x + 16);
+    LoadAligned64(b565[0] + x + 16, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_hi, mat[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_hi, mat[0][1], b[0][1]);
+    mat[1][0] = LoadAligned32(ma343[0] + x + 16);
+    mat[1][1] = LoadAligned32(ma444[0] + x + 16);
+    LoadAligned64(b343[0] + x + 16, b[1][0]);
+    LoadAligned64(b444[0] + x + 16, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], b[1]);
+    const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+    mat[2][0] = LoadAligned32(ma343[1] + x + 16);
+    LoadAligned64(b343[1] + x + 16, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], b[2]);
+    const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+    StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+    sq[0][0] = sq[0][2];
+    sq[1][0] = sq[1][2];
+    ma3[0][0] = ma3[0][2];
+    ma3[1][0] = ma3[1][2];
+    ma5[0] = ma5[2];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    x += 32;
+  } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+    const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+    uint8_t* const dst) {
+  const __m128i s0 =
+      LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+  __m128i ma3_0, ma5_0, b3_0, b5_0, sq_128[2];
+  __m256i ma3[3], ma5[3], sq[3], b3[3], b5[3];
+  sq_128[0] = SquareLo8(s0);
+  BoxFilterPreProcessLastRowLo(s0, scales, sum3, sum5, square_sum3, square_sum5,
+                               sq_128, &ma3_0, &ma5_0, &b3_0, &b5_0);
+  sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+  ma3[0] = SetrM128i(ma3_0, ma3_0);
+  ma5[0] = SetrM128i(ma5_0, ma5_0);
+  b3[0] = SetrM128i(b3_0, b3_0);
+  b5[0] = SetrM128i(b5_0, b5_0);
+
+  int x = 0;
+  do {
+    __m256i ma[3], mat[3], b[3][2], p[2], ma3x[3], ma5x[3];
+    BoxFilterPreProcessLastRow(src0 + x + 8,
+                               x + 8 + kOverreadInBytesPass1_256 - width,
+                               sum_width, x + 8, scales, sum3, sum5,
+                               square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+    Prepare3_8(ma3, ma3x);
+    Prepare3_8(ma5, ma5x);
+    ma[1] = Sum565Lo(ma5x);
+    Sum565W(b5, b[1]);
+    ma[2] = Sum343Lo(ma3x);
+    Sum343W(b3, b[2]);
+    const __m256i sr = LoadUnaligned32(src + x);
+    const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+    ma[0] = LoadAligned32(ma565 + x);
+    LoadAligned64(b565 + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    ma[0] = LoadAligned32(ma343 + x);
+    ma[1] = LoadAligned32(ma444 + x);
+    LoadAligned64(b343 + x, b[0]);
+    LoadAligned64(b444 + x, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+    mat[1] = Sum565Hi(ma5x);
+    Sum565W(b5 + 1, b[1]);
+    mat[2] = Sum343Hi(ma3x);
+    Sum343W(b3 + 1, b[2]);
+    const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+    mat[0] = LoadAligned32(ma565 + x + 16);
+    LoadAligned64(b565 + x + 16, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_hi, mat, b);
+    mat[0] = LoadAligned32(ma343 + x + 16);
+    mat[1] = LoadAligned32(ma444 + x + 16);
+    LoadAligned64(b343 + x + 16, b[0]);
+    LoadAligned64(b444 + x + 16, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_hi, mat, b);
+    const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+    StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+    sq[0] = sq[2];
+    ma3[0] = ma3[2];
+    ma5[0] = ma5[2];
+    b3[0] = b3[2];
+    b5[0] = b5[2];
+    x += 32;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint8_t* src,
+    const ptrdiff_t stride, const uint8_t* const top_border,
+    const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
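+  // As in the self-guided filter formulation, w0, w1 and w2 sum to
+  // 1 << kSgrProjPrecisionBits by construction, so the source pixel and the
+  // two pass outputs are blended with weights that sum to unity after the
+  // final precision shift.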
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3 + kSumOffset;
+  square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0],
+         sum5[1], square_sum3[0], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, sum_width, ma343, ma444[0], ma565[0],
+                         b343, b444[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+              ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+              b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+                     b444[0], b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src, const ptrdiff_t stride,
+                                  const uint8_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride,
+            sum5[1], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+                          ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5 + kSumOffset;
+  square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   sum_width, scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
+                          b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src, const ptrdiff_t stride,
+                                  const uint8_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+  const auto temp_stride = Align<ptrdiff_t>(width, 32);
+  const auto sum_width = temp_stride + 8;
+  const auto sum_stride = temp_stride + 32;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3 + kSumOffset;
+  square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, temp_stride,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint8_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
+
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
+}
+
+// If |width| is not a multiple of 32, up to 31 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it will
+// not be part of the visible frame.
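+// For example, with |width| == 33 the x loops above run for x == 0 and x == 32
+// and write pixels [0, 63] of each row, i.e. 31 pixels past |width|.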
+void SelfGuidedFilter_AVX2(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const ptrdiff_t stride, const void* const top_border,
+    const ptrdiff_t top_border_stride, const void* const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
+  const int index = restoration_info.sgr_proj_info.index;
+  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
+  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* top = static_cast<const uint8_t*>(top_border);
+  const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+  auto* const dst = static_cast<uint8_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+  if (radius_pass_1 == 0) {
+    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+    // following assertion.
+    assert(radius_pass_0 != 0);
+    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+                          top_border_stride, bottom - 3, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else if (radius_pass_0 == 0) {
+    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+                          top_border_stride, bottom - 2, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
+  } else {
+    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+                     top_border_stride, bottom - 3, bottom_border_stride, width,
+                     height, sgr_buffer, dst);
+  }
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_AVX2(WienerFilter)
+  dsp->loop_restorations[0] = WienerFilter_AVX2;
+#endif
+#if DSP_ENABLED_8BPP_AVX2(SelfGuidedFilter)
+  dsp->loop_restorations[1] = SelfGuidedFilter_AVX2;
+#endif
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void LoopRestorationInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#else   // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_AVX2() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_TARGETING_AVX2
diff --git a/libgav1/src/dsp/x86/loop_restoration_avx2.h b/libgav1/src/dsp/x86/loop_restoration_avx2.h
new file mode 100644
index 0000000..2c3534a
--- /dev/null
+++ b/libgav1/src/dsp/x86/loop_restoration_avx2.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations; see the defines below for specifics.
+// These functions are not thread-safe.
+void LoopRestorationInit_AVX2();
+void LoopRestorationInit10bpp_AVX2();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+// If avx2 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the avx2 implementation should be
+// used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
+#endif
+
+#endif  // LIBGAV1_TARGETING_AVX2
+
+#endif  // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
diff --git a/libgav1/src/dsp/x86/loop_restoration_sse4.cc b/libgav1/src/dsp/x86/loop_restoration_sse4.cc
index 34f4ae8..273bcc8 100644
--- a/libgav1/src/dsp/x86/loop_restoration_sse4.cc
+++ b/libgav1/src/dsp/x86/loop_restoration_sse4.cc
@@ -15,9 +15,10 @@
 #include "src/dsp/loop_restoration.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 #include <smmintrin.h>
 
+#include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
@@ -35,194 +36,170 @@
 namespace low_bitdepth {
 namespace {
 
-inline void WienerHorizontalTap7Kernel(const __m128i s[2],
-                                       const __m128i filter[4],
-                                       int16_t* const wiener_buffer) {
-  const int limit =
-      (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
-  const int offset =
+inline void WienerHorizontalClip(const __m128i s[2], const __m128i s_3x128,
+                                 int16_t* const wiener_buffer) {
+  constexpr int offset =
       1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+  constexpr int limit =
+      (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
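+  // Assuming the 8bpp constants kWienerFilterBits == 7 and
+  // kInterRoundBitsHorizontal == 3, |offset| is 2048 and |limit| is 8191,
+  // i.e. the result is kept within 13 bits.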
   const __m128i offsets = _mm_set1_epi16(-offset);
   const __m128i limits = _mm_set1_epi16(limit - offset);
-  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsHorizontal - 1));
-  const auto s01 = _mm_alignr_epi8(s[1], s[0], 1);
-  const auto s23 = _mm_alignr_epi8(s[1], s[0], 5);
-  const auto s45 = _mm_alignr_epi8(s[1], s[0], 9);
-  const auto s67 = _mm_alignr_epi8(s[1], s[0], 13);
-  const __m128i madd01 = _mm_maddubs_epi16(s01, filter[0]);
-  const __m128i madd23 = _mm_maddubs_epi16(s23, filter[1]);
-  const __m128i madd45 = _mm_maddubs_epi16(s45, filter[2]);
-  const __m128i madd67 = _mm_maddubs_epi16(s67, filter[3]);
-  const __m128i madd0123 = _mm_add_epi16(madd01, madd23);
-  const __m128i madd4567 = _mm_add_epi16(madd45, madd67);
-  // The sum range here is [-128 * 255, 90 * 255].
-  const __m128i madd = _mm_add_epi16(madd0123, madd4567);
-  const __m128i sum = _mm_add_epi16(madd, round);
+  // The sum range here is [-128 * 255 + 4, 90 * 255 + 4].
+  const __m128i sum = _mm_add_epi16(s[0], s[1]);
   const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
-  // Calculate scaled down offset correction, and add to sum here to prevent
-  // signed 16 bit outranging.
-  const __m128i s_3x128 =
-      _mm_slli_epi16(_mm_srli_epi16(s23, 8), 7 - kInterRoundBitsHorizontal);
+  // Add back scaled down offset correction.
   const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128);
   const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets);
   const __m128i d1 = _mm_min_epi16(d0, limits);
   StoreAligned16(wiener_buffer, d1);
 }
 
-inline void WienerHorizontalTap5Kernel(const __m128i s[2],
+inline void WienerHorizontalTap7Kernel(const __m128i s[4],
+                                       const __m128i filter[4],
+                                       int16_t* const wiener_buffer) {
+  __m128i madds[4];
+  madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+  madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+  madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
+  madds[3] = _mm_maddubs_epi16(s[3], filter[3]);
+  madds[0] = _mm_add_epi16(madds[0], madds[2]);
+  madds[1] = _mm_add_epi16(madds[1], madds[3]);
+  const __m128i s_3x128 =
+      _mm_slli_epi16(_mm_srli_epi16(s[1], 8), 7 - kInterRoundBitsHorizontal);
+  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m128i s[5],
                                        const __m128i filter[3],
                                        int16_t* const wiener_buffer) {
-  const int limit =
-      (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
-  const int offset =
-      1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
-  const __m128i offsets = _mm_set1_epi16(-offset);
-  const __m128i limits = _mm_set1_epi16(limit - offset);
-  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsHorizontal - 1));
-  const auto s01 = _mm_alignr_epi8(s[1], s[0], 1);
-  const auto s23 = _mm_alignr_epi8(s[1], s[0], 5);
-  const auto s45 = _mm_alignr_epi8(s[1], s[0], 9);
-  const __m128i madd01 = _mm_maddubs_epi16(s01, filter[0]);
-  const __m128i madd23 = _mm_maddubs_epi16(s23, filter[1]);
-  const __m128i madd45 = _mm_maddubs_epi16(s45, filter[2]);
-  const __m128i madd0123 = _mm_add_epi16(madd01, madd23);
-  // The sum range here is [-128 * 255, 90 * 255].
-  const __m128i madd = _mm_add_epi16(madd0123, madd45);
-  const __m128i sum = _mm_add_epi16(madd, round);
-  const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
-  // Calculate scaled down offset correction, and add to sum here to prevent
-  // signed 16 bit outranging.
+  __m128i madds[3];
+  madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+  madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+  madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
+  madds[0] = _mm_add_epi16(madds[0], madds[2]);
   const __m128i s_3x128 =
-      _mm_srli_epi16(_mm_slli_epi16(s23, 8), kInterRoundBitsHorizontal + 1);
-  const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128);
-  const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets);
-  const __m128i d1 = _mm_min_epi16(d0, limits);
-  StoreAligned16(wiener_buffer, d1);
+      _mm_srli_epi16(_mm_slli_epi16(s[1], 8), kInterRoundBitsHorizontal + 1);
+  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
 }
 
 inline void WienerHorizontalTap3Kernel(const __m128i s[2],
                                        const __m128i filter[2],
                                        int16_t* const wiener_buffer) {
-  const int limit =
-      (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
-  const int offset =
-      1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
-  const __m128i offsets = _mm_set1_epi16(-offset);
-  const __m128i limits = _mm_set1_epi16(limit - offset);
-  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsHorizontal - 1));
-  const auto s01 = _mm_alignr_epi8(s[1], s[0], 1);
-  const auto s23 = _mm_alignr_epi8(s[1], s[0], 5);
-  const __m128i madd01 = _mm_maddubs_epi16(s01, filter[0]);
-  const __m128i madd23 = _mm_maddubs_epi16(s23, filter[1]);
-  // The sum range here is [-128 * 255, 90 * 255].
-  const __m128i madd = _mm_add_epi16(madd01, madd23);
-  const __m128i sum = _mm_add_epi16(madd, round);
-  const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
-  // Calculate scaled down offset correction, and add to sum here to prevent
-  // signed 16 bit outranging.
+  __m128i madds[2];
+  madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+  madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
   const __m128i s_3x128 =
-      _mm_slli_epi16(_mm_srli_epi16(s01, 8), 7 - kInterRoundBitsHorizontal);
-  const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128);
-  const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets);
-  const __m128i d1 = _mm_min_epi16(d0, limits);
-  StoreAligned16(wiener_buffer, d1);
+      _mm_slli_epi16(_mm_srli_epi16(s[0], 8), 7 - kInterRoundBitsHorizontal);
+  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
 }
 
+// Loading all and unpacking is about 7% faster than using _mm_alignr_epi8().
 inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
                                  const ptrdiff_t width, const int height,
+                                 const int coefficient0,
                                  const __m128i coefficients,
                                  int16_t** const wiener_buffer) {
+  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
   __m128i filter[4];
   filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0200));
   filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
   filter[2] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0204));
-  filter[3] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x8000));
-  int y = height;
-  do {
-    const __m128i s0 = LoadUnaligned16(src);
-    __m128i ss[4];
-    ss[0] = _mm_unpacklo_epi8(s0, s0);
-    ss[1] = _mm_unpackhi_epi8(s0, s0);
+  filter[3] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient0));
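+  // |filter[3]| packs |coefficient0| in the low byte and 1 in the high byte.
+  // Since ss[3] below interleaves s[6] with |round|, _mm_maddubs_epi16()
+  // computes s[6] * coefficient0 + round in a single step (taps 0 and 6 share
+  // the same coefficient), replacing the separate rounding add.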
+  for (int y = height; y != 0; --y) {
     ptrdiff_t x = 0;
     do {
-      const __m128i s1 = LoadUnaligned16(src + x + 16);
-      ss[2] = _mm_unpacklo_epi8(s1, s1);
-      ss[3] = _mm_unpackhi_epi8(s1, s1);
-      WienerHorizontalTap7Kernel(ss + 0, filter, *wiener_buffer + x + 0);
-      WienerHorizontalTap7Kernel(ss + 1, filter, *wiener_buffer + x + 8);
-      ss[0] = ss[2];
-      ss[1] = ss[3];
+      __m128i s[7], ss[4];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      s[3] = LoadUnaligned16(src + x + 3);
+      s[4] = LoadUnaligned16(src + x + 4);
+      s[5] = LoadUnaligned16(src + x + 5);
+      s[6] = LoadUnaligned16(src + x + 6);
+      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+      ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+      ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+      ss[3] = _mm_unpacklo_epi8(s[6], round);
+      WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 0);
+      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+      ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+      ss[2] = _mm_unpackhi_epi8(s[4], s[5]);
+      ss[3] = _mm_unpackhi_epi8(s[6], round);
+      WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 8);
       x += 16;
     } while (x < width);
     src += src_stride;
     *wiener_buffer += width;
-  } while (--y != 0);
+  }
 }
 
 inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
                                  const ptrdiff_t width, const int height,
+                                 const int coefficient1,
                                  const __m128i coefficients,
                                  int16_t** const wiener_buffer) {
+  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
   __m128i filter[3];
   filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0402));
   filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0406));
-  filter[2] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x8002));
-  int y = height;
-  do {
-    const __m128i s0 = LoadUnaligned16(src);
-    __m128i ss[4];
-    ss[0] = _mm_unpacklo_epi8(s0, s0);
-    ss[1] = _mm_unpackhi_epi8(s0, s0);
+  filter[2] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient1));
+  for (int y = height; y != 0; --y) {
     ptrdiff_t x = 0;
     do {
-      const __m128i s1 = LoadUnaligned16(src + x + 16);
-      ss[2] = _mm_unpacklo_epi8(s1, s1);
-      ss[3] = _mm_unpackhi_epi8(s1, s1);
-      WienerHorizontalTap5Kernel(ss + 0, filter, *wiener_buffer + x + 0);
-      WienerHorizontalTap5Kernel(ss + 1, filter, *wiener_buffer + x + 8);
-      ss[0] = ss[2];
-      ss[1] = ss[3];
+      __m128i s[5], ss[3];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      s[3] = LoadUnaligned16(src + x + 3);
+      s[4] = LoadUnaligned16(src + x + 4);
+      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+      ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+      ss[2] = _mm_unpacklo_epi8(s[4], round);
+      WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 0);
+      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+      ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+      ss[2] = _mm_unpackhi_epi8(s[4], round);
+      WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 8);
       x += 16;
     } while (x < width);
     src += src_stride;
     *wiener_buffer += width;
-  } while (--y != 0);
+  }
 }
 
 inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
                                  const ptrdiff_t width, const int height,
+                                 const int coefficient2,
                                  const __m128i coefficients,
                                  int16_t** const wiener_buffer) {
+  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
   __m128i filter[2];
   filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
-  filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x8004));
-  int y = height;
-  do {
-    const __m128i s0 = LoadUnaligned16(src);
-    __m128i ss[4];
-    ss[0] = _mm_unpacklo_epi8(s0, s0);
-    ss[1] = _mm_unpackhi_epi8(s0, s0);
+  filter[1] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient2));
+  for (int y = height; y != 0; --y) {
     ptrdiff_t x = 0;
     do {
-      const __m128i s1 = LoadUnaligned16(src + x + 16);
-      ss[2] = _mm_unpacklo_epi8(s1, s1);
-      ss[3] = _mm_unpackhi_epi8(s1, s1);
-      WienerHorizontalTap3Kernel(ss + 0, filter, *wiener_buffer + x + 0);
-      WienerHorizontalTap3Kernel(ss + 1, filter, *wiener_buffer + x + 8);
-      ss[0] = ss[2];
-      ss[1] = ss[3];
+      __m128i s[3], ss[2];
+      s[0] = LoadUnaligned16(src + x + 0);
+      s[1] = LoadUnaligned16(src + x + 1);
+      s[2] = LoadUnaligned16(src + x + 2);
+      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+      ss[1] = _mm_unpacklo_epi8(s[2], round);
+      WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 0);
+      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+      ss[1] = _mm_unpackhi_epi8(s[2], round);
+      WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 8);
       x += 16;
     } while (x < width);
     src += src_stride;
     *wiener_buffer += width;
-  } while (--y != 0);
+  }
 }
 
 inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
                                  const ptrdiff_t width, const int height,
                                  int16_t** const wiener_buffer) {
-  int y = height;
-  do {
+  for (int y = height; y != 0; --y) {
     ptrdiff_t x = 0;
     do {
       const __m128i s = LoadUnaligned16(src + x);
@@ -236,7 +213,7 @@
     } while (x < width);
     src += src_stride;
     *wiener_buffer += width;
-  } while (--y != 0);
+  }
 }
 
 inline __m128i WienerVertical7(const __m128i a[2], const __m128i filter[2]) {
@@ -504,19 +481,19 @@
   }
 }
 
-void WienerFilter_SSE4_1(const void* const source, void* const dest,
-                         const RestorationUnitInfo& restoration_info,
-                         const ptrdiff_t source_stride,
-                         const ptrdiff_t dest_stride, const int width,
-                         const int height, RestorationBuffer* const buffer) {
-  constexpr int kCenterTap = kWienerFilterTaps / 2;
+void WienerFilter_SSE4_1(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const ptrdiff_t stride, const void* const top_border,
+    const ptrdiff_t top_border_stride, const void* const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
   const int16_t* const number_leading_zero_coefficients =
       restoration_info.wiener_info.number_leading_zero_coefficients;
   const int number_rows_to_skip = std::max(
       static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
       1);
   const ptrdiff_t wiener_stride = Align(width, 16);
-  int16_t* const wiener_buffer_vertical = buffer->wiener_buffer;
+  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
   // The values are saturated to 13 bits before storing.
   int16_t* wiener_buffer_horizontal =
       wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
@@ -525,31 +502,61 @@
   // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
   const int height_horizontal =
       height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
-  const auto* const src = static_cast<const uint8_t*>(source) -
-                          (kCenterTap - number_rows_to_skip) * source_stride;
-  const __m128i c =
-      LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+  const int height_extra = (height_horizontal - height) >> 1;
+  assert(height_extra <= 2);
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* const top = static_cast<const uint8_t*>(top_border);
+  const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+  const int16_t* const filter_horizontal =
+      restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
+  const __m128i c = LoadLo8(filter_horizontal);
   // In order to keep the horizontal pass intermediate values within 16 bits we
   // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
   const __m128i coefficients_horizontal =
       _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
   if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
-    WienerHorizontalTap7(src - 3, source_stride, wiener_stride,
-                         height_horizontal, coefficients_horizontal,
+    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal[0], coefficients_horizontal,
                          &wiener_buffer_horizontal);
+    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+                         filter_horizontal[0], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal[0],
+                         coefficients_horizontal, &wiener_buffer_horizontal);
   } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
-    WienerHorizontalTap5(src - 2, source_stride, wiener_stride,
-                         height_horizontal, coefficients_horizontal,
+    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal[1], coefficients_horizontal,
                          &wiener_buffer_horizontal);
+    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+                         filter_horizontal[1], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal[1],
+                         coefficients_horizontal, &wiener_buffer_horizontal);
   } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
     // The maximum over-reads happen here.
-    WienerHorizontalTap3(src - 1, source_stride, wiener_stride,
-                         height_horizontal, coefficients_horizontal,
+    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+                         top_border_stride, wiener_stride, height_extra,
+                         filter_horizontal[2], coefficients_horizontal,
                          &wiener_buffer_horizontal);
+    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+                         filter_horizontal[2], coefficients_horizontal,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+                         height_extra, filter_horizontal[2],
+                         coefficients_horizontal, &wiener_buffer_horizontal);
   } else {
     assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
-    WienerHorizontalTap1(src, source_stride, wiener_stride, height_horizontal,
+    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+                         top_border_stride, wiener_stride, height_extra,
                          &wiener_buffer_horizontal);
+    WienerHorizontalTap1(src, stride, wiener_stride, height,
+                         &wiener_buffer_horizontal);
+    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+                         height_extra, &wiener_buffer_horizontal);
   }
 
   // vertical filtering.
@@ -563,27 +570,114 @@
     // the top and bottom row of |wiener_buffer| accordingly.
     memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
            sizeof(*wiener_buffer_horizontal) * wiener_stride);
-    memcpy(buffer->wiener_buffer, buffer->wiener_buffer + wiener_stride,
-           sizeof(*buffer->wiener_buffer) * wiener_stride);
+    memcpy(restoration_buffer->wiener_buffer,
+           restoration_buffer->wiener_buffer + wiener_stride,
+           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
     WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
-                       filter_vertical, dst, dest_stride);
+                       filter_vertical, dst, stride);
   } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
     WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
-                       height, filter_vertical + 1, dst, dest_stride);
+                       height, filter_vertical + 1, dst, stride);
   } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
     WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
-                       wiener_stride, height, filter_vertical + 2, dst,
-                       dest_stride);
+                       wiener_stride, height, filter_vertical + 2, dst, stride);
   } else {
     assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
     WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
-                       wiener_stride, height, dst, dest_stride);
+                       wiener_stride, height, dst, stride);
   }
 }
 
 //------------------------------------------------------------------------------
 // SGR
 
+// SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 10;
+constexpr int kOverreadInBytesPass2 = 12;
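+// E.g. when width is a multiple of 16 the formula gives 16 - 0 - 2 * 3 == 10
+// bytes for Pass 1 and 16 - 0 - 2 * 2 == 12 bytes for Pass 2, the worst cases
+// encoded by the constants above.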
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[2]) {
+  dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+  dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3]) {
+  dst[0] = LoadAligned16(src[0] + x);
+  dst[1] = LoadAligned16(src[1] + x);
+  dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[3]) {
+  dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+  dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+  dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+  dst[0] = LoadAligned16(src + 0);
+  dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+                                 const ptrdiff_t border, __m128i dst[2]) {
+  dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
+  dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+                               __m128i dst[2][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[2][2]) {
+  LoadAligned32U32Msan(src[0], x, border, dst[0]);
+  LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+                               __m128i dst[3][2]) {
+  LoadAligned32U32(src[0] + x, dst[0]);
+  LoadAligned32U32(src[1] + x, dst[1]);
+  LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+                                   const ptrdiff_t x, const ptrdiff_t border,
+                                   __m128i dst[3][2]) {
+  LoadAligned32U32Msan(src[0], x, border, dst[0]);
+  LoadAligned32U32Msan(src[1], x, border, dst[1]);
+  LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
+  StoreAligned16(dst + 0, src[0]);
+  StoreAligned16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+  StoreAligned16(dst + 0, src[0]);
+  StoreAligned16(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
+  StoreAligned32U32(dst + 0, src + 0);
+  StoreAligned32U32(dst + 8, src + 2);
+}
+
 // Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
 // functions. Some compilers may generate super inefficient code and the whole
 // decoder could be 15% slower.
@@ -632,24 +726,6 @@
   return _mm_add_epi32(src0, s1);
 }
 
-// Using VgetLane16() can save a sign extension instruction.
-template <int n>
-inline int VgetLane16(const __m128i src) {
-  return _mm_extract_epi16(src, n);
-}
-
-inline __m128i VmullLo8(const __m128i src0, const __m128i src1) {
-  const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
-  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
-  return _mm_mullo_epi16(s0, s1);
-}
-
-inline __m128i VmullHi8(const __m128i src0, const __m128i src1) {
-  const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
-  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
-  return _mm_mullo_epi16(s0, s1);
-}
-
 inline __m128i VmullNLo8(const __m128i src0, const int src1) {
   const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
   return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
@@ -682,131 +758,97 @@
   return _mm_srli_epi32(sum, src1);
 }
 
-template <int n>
-inline __m128i CalcAxN(const __m128i a) {
-  static_assert(n == 9 || n == 25, "");
-  // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
-  // Some compilers could do this for us but we make this explicit.
-  // return _mm_mullo_epi32(a, _mm_set1_epi32(n));
-  const __m128i ax9 = _mm_add_epi32(a, _mm_slli_epi32(a, 3));
-  if (n == 9) return ax9;
-  if (n == 25) return _mm_add_epi32(ax9, _mm_slli_epi32(a, 4));
+inline __m128i SquareLo8(const __m128i src) {
+  const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+  return _mm_mullo_epi16(s, s);
 }
 
-template <int n>
-inline __m128i CalculateMa(const __m128i sum_sq, const __m128i sum,
-                           const uint32_t s) {
-  // a = |sum_sq|
-  // d = |sum|
-  // p = (a * n < d * d) ? 0 : a * n - d * d;
-  const __m128i dxd = _mm_madd_epi16(sum, sum);
-  const __m128i axn = CalcAxN<n>(sum_sq);
-  const __m128i sub = _mm_sub_epi32(axn, dxd);
-  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
-
-  // z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
-  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(s));
-  return VrshrU32(pxs, kSgrProjScaleBits);
+inline __m128i SquareHi8(const __m128i src) {
+  const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+  return _mm_mullo_epi16(s, s);
 }
 
-// b = ma * b * one_over_n
-// |ma| = [0, 255]
-// |sum| is a box sum with radius 1 or 2.
-// For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
-// For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
-// |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
-// When radius is 2 |n| is 25. |one_over_n| is 164.
-// When radius is 1 |n| is 9. |one_over_n| is 455.
-// |kSgrProjReciprocalBits| is 12.
-// Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
-// Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
-inline __m128i CalculateIntermediate4(const __m128i ma, const __m128i sum,
-                                      const uint32_t one_over_n) {
-  const __m128i maq = _mm_unpacklo_epi8(ma, _mm_setzero_si128());
-  const __m128i s = _mm_unpackhi_epi16(maq, _mm_setzero_si128());
-  const __m128i m = _mm_madd_epi16(s, sum);
-  const __m128i b = _mm_mullo_epi32(m, _mm_set1_epi32(one_over_n));
-  const __m128i truncate_u32 = VrshrU32(b, kSgrProjReciprocalBits);
-  return _mm_packus_epi32(truncate_u32, truncate_u32);
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+  dst[0] = src;
+  dst[1] = _mm_srli_si128(src, 1);
+  dst[2] = _mm_srli_si128(src, 2);
 }
 
-inline __m128i CalculateIntermediate8(const __m128i ma, const __m128i sum,
-                                      const uint32_t one_over_n) {
-  const __m128i maq = _mm_unpackhi_epi8(ma, _mm_setzero_si128());
-  const __m128i m0 = VmullLo16(maq, sum);
-  const __m128i m1 = VmullHi16(maq, sum);
-  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
-  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
-  const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
-  const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
-  return _mm_packus_epi32(b_lo, b_hi);
+template <int offset>
+inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
 }
 
-inline __m128i Sum3_16(const __m128i left, const __m128i middle,
-                       const __m128i right) {
-  const __m128i sum = _mm_add_epi16(left, middle);
-  return _mm_add_epi16(sum, right);
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+  dst[0] = src[0];
+  dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
 }
 
-inline __m128i Sum3_32(const __m128i left, const __m128i middle,
-                       const __m128i right) {
-  const __m128i sum = _mm_add_epi32(left, middle);
-  return _mm_add_epi32(sum, right);
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+  dst[0] = src;
+  dst[1] = _mm_srli_si128(src, 1);
+  dst[2] = _mm_srli_si128(src, 2);
+  dst[3] = _mm_srli_si128(src, 3);
+  dst[4] = _mm_srli_si128(src, 4);
 }
 
-inline __m128i Sum3W_16(const __m128i left, const __m128i middle,
-                        const __m128i right) {
-  const __m128i sum = VaddlLo8(left, middle);
-  return VaddwLo8(sum, right);
+template <int offset>
+inline void Prepare5_8(const __m128i src[2], __m128i dst[5]) {
+  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], offset + 3);
+  dst[4] = _mm_alignr_epi8(src[1], src[0], offset + 4);
 }
 
-inline __m128i Sum3WLo_16(const __m128i src[3]) {
-  return Sum3W_16(src[0], src[1], src[2]);
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+  Prepare3_16(src, dst);
+  dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+  dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
 }
 
-inline __m128i Sum3WHi_16(const __m128i src[3]) {
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi16(src0, src1);
+  return _mm_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+  return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+                       const __m128i src2) {
+  const __m128i sum = _mm_add_epi32(src0, src1);
+  return _mm_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+  const __m128i sum = VaddlLo8(src[0], src[1]);
+  return VaddwLo8(sum, src[2]);
+}
+
+inline __m128i Sum3WHi16(const __m128i src[3]) {
   const __m128i sum = VaddlHi8(src[0], src[1]);
   return VaddwHi8(sum, src[2]);
 }
 
-inline __m128i Sum3WLo_32(const __m128i left, const __m128i middle,
-                          const __m128i right) {
-  const __m128i sum = VaddlLo16(left, middle);
-  return VaddwLo16(sum, right);
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+  const __m128i sum = VaddlLo16(src[0], src[1]);
+  return VaddwLo16(sum, src[2]);
 }
 
-inline __m128i Sum3WHi_32(const __m128i left, const __m128i middle,
-                          const __m128i right) {
-  const __m128i sum = VaddlHi16(left, middle);
-  return VaddwHi16(sum, right);
-}
-
-inline __m128i* Sum3W_16x2(const __m128i src[3], __m128i sum[2]) {
-  sum[0] = Sum3WLo_16(src);
-  sum[1] = Sum3WHi_16(src);
-  return sum;
-}
-
-inline __m128i* Sum3W(const __m128i src[3], __m128i sum[2]) {
-  sum[0] = Sum3WLo_32(src[0], src[1], src[2]);
-  sum[1] = Sum3WHi_32(src[0], src[1], src[2]);
-  return sum;
-}
-
-template <int index>
-inline __m128i Sum3WLo(const __m128i src[3][2]) {
-  return Sum3WLo_32(src[0][index], src[1][index], src[2][index]);
-}
-
-inline __m128i Sum3WHi(const __m128i src[3][2]) {
-  return Sum3WHi_32(src[0][0], src[1][0], src[2][0]);
-}
-
-inline __m128i* Sum3W(const __m128i src[3][2], __m128i sum[3]) {
-  sum[0] = Sum3WLo<0>(src);
-  sum[1] = Sum3WHi(src);
-  sum[2] = Sum3WLo<1>(src);
-  return sum;
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+  const __m128i sum = VaddlHi16(src[0], src[1]);
+  return VaddwHi16(sum, src[2]);
 }
 
 inline __m128i Sum5_16(const __m128i src[5]) {
@@ -816,323 +858,993 @@
   return _mm_add_epi16(sum, src[4]);
 }
 
-inline __m128i Sum5_32(const __m128i src[5]) {
-  const __m128i sum01 = _mm_add_epi32(src[0], src[1]);
-  const __m128i sum23 = _mm_add_epi32(src[2], src[3]);
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+                       const __m128i* const src2, const __m128i* const src3,
+                       const __m128i* const src4) {
+  const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+  const __m128i sum23 = _mm_add_epi32(*src2, *src3);
   const __m128i sum = _mm_add_epi32(sum01, sum23);
-  return _mm_add_epi32(sum, src[4]);
+  return _mm_add_epi32(sum, *src4);
 }
 
-inline __m128i Sum5WLo_16(const __m128i src[5]) {
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
   const __m128i sum01 = VaddlLo8(src[0], src[1]);
   const __m128i sum23 = VaddlLo8(src[2], src[3]);
   const __m128i sum = _mm_add_epi16(sum01, sum23);
   return VaddwLo8(sum, src[4]);
 }
 
-inline __m128i Sum5WHi_16(const __m128i src[5]) {
+inline __m128i Sum5WHi16(const __m128i src[5]) {
   const __m128i sum01 = VaddlHi8(src[0], src[1]);
   const __m128i sum23 = VaddlHi8(src[2], src[3]);
   const __m128i sum = _mm_add_epi16(sum01, sum23);
   return VaddwHi8(sum, src[4]);
 }
 
-inline __m128i Sum5WLo_32(const __m128i src[5]) {
-  const __m128i sum01 = VaddlLo16(src[0], src[1]);
-  const __m128i sum23 = VaddlLo16(src[2], src[3]);
-  const __m128i sum0123 = _mm_add_epi32(sum01, sum23);
-  return VaddwLo16(sum0123, src[4]);
-}
-
-inline __m128i Sum5WHi_32(const __m128i src[5]) {
-  const __m128i sum01 = VaddlHi16(src[0], src[1]);
-  const __m128i sum23 = VaddlHi16(src[2], src[3]);
-  const __m128i sum0123 = _mm_add_epi32(sum01, sum23);
-  return VaddwHi16(sum0123, src[4]);
-}
-
-inline __m128i* Sum5W_16D(const __m128i src[5], __m128i sum[2]) {
-  sum[0] = Sum5WLo_16(src);
-  sum[1] = Sum5WHi_16(src);
-  return sum;
-}
-
-inline __m128i* Sum5W_32x2(const __m128i src[5], __m128i sum[2]) {
-  sum[0] = Sum5WLo_32(src);
-  sum[1] = Sum5WHi_32(src);
-  return sum;
-}
-
-template <int index>
-inline __m128i Sum5WLo(const __m128i src[5][2]) {
-  __m128i s[5];
-  s[0] = src[0][index];
-  s[1] = src[1][index];
-  s[2] = src[2][index];
-  s[3] = src[3][index];
-  s[4] = src[4][index];
-  return Sum5WLo_32(s);
-}
-
-inline __m128i Sum5WHi(const __m128i src[5][2]) {
-  __m128i s[5];
-  s[0] = src[0][0];
-  s[1] = src[1][0];
-  s[2] = src[2][0];
-  s[3] = src[3][0];
-  s[4] = src[4][0];
-  return Sum5WHi_32(s);
-}
-
-inline __m128i* Sum5W_32x3(const __m128i src[5][2], __m128i sum[3]) {
-  sum[0] = Sum5WLo<0>(src);
-  sum[1] = Sum5WHi(src);
-  sum[2] = Sum5WLo<1>(src);
-  return sum;
-}
-
 inline __m128i Sum3Horizontal(const __m128i src) {
-  const auto left = src;
-  const auto middle = _mm_srli_si128(src, 2);
-  const auto right = _mm_srli_si128(src, 4);
-  return Sum3_16(left, middle, right);
+  __m128i s[3];
+  Prepare3Lo8(src, s);
+  return Sum3WLo16(s);
 }
 
-inline __m128i Sum3Horizontal_32(const __m128i src[2]) {
-  const auto left = src[0];
-  const auto middle = _mm_alignr_epi8(src[1], src[0], 4);
-  const auto right = _mm_alignr_epi8(src[1], src[0], 8);
-  return Sum3_32(left, middle, right);
+template <int offset>
+inline void Sum3Horizontal(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_8<offset>(src, s);
+  dst[0] = Sum3WLo16(s);
+  dst[1] = Sum3WHi16(s);
 }
 
-inline __m128i Sum3HorizontalOffset1(const __m128i src) {
-  const auto left = _mm_srli_si128(src, 2);
-  const auto middle = _mm_srli_si128(src, 4);
-  const auto right = _mm_srli_si128(src, 6);
-  return Sum3_16(left, middle, right);
-}
-
-inline __m128i Sum3HorizontalOffset1_16(const __m128i src[2]) {
-  const auto left = _mm_alignr_epi8(src[1], src[0], 2);
-  const auto middle = _mm_alignr_epi8(src[1], src[0], 4);
-  const auto right = _mm_alignr_epi8(src[1], src[0], 6);
-  return Sum3_16(left, middle, right);
-}
-
-inline __m128i Sum3HorizontalOffset1_32(const __m128i src[2]) {
-  const auto left = _mm_alignr_epi8(src[1], src[0], 4);
-  const auto middle = _mm_alignr_epi8(src[1], src[0], 8);
-  const auto right = _mm_alignr_epi8(src[1], src[0], 12);
-  return Sum3_32(left, middle, right);
-}
-
-inline void Sum3HorizontalOffset1_32x2(const __m128i src[3], __m128i sum[2]) {
-  sum[0] = Sum3HorizontalOffset1_32(src + 0);
-  sum[1] = Sum3HorizontalOffset1_32(src + 1);
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum3WLo32(s);
+  dst[1] = Sum3WHi32(s);
 }
 
 inline __m128i Sum5Horizontal(const __m128i src) {
   __m128i s[5];
-  s[0] = src;
-  s[1] = _mm_srli_si128(src, 2);
-  s[2] = _mm_srli_si128(src, 4);
-  s[3] = _mm_srli_si128(src, 6);
-  s[4] = _mm_srli_si128(src, 8);
-  return Sum5_16(s);
+  Prepare5Lo8(src, s);
+  return Sum5WLo16(s);
 }
 
-inline __m128i Sum5Horizontal_16(const __m128i src[2]) {
+template <int offset>
+inline void Sum5Horizontal(const __m128i src[2], __m128i* const dst0,
+                           __m128i* const dst1) {
   __m128i s[5];
-  s[0] = src[0];
-  s[1] = _mm_alignr_epi8(src[1], src[0], 2);
-  s[2] = _mm_alignr_epi8(src[1], src[0], 4);
-  s[3] = _mm_alignr_epi8(src[1], src[0], 6);
-  s[4] = _mm_alignr_epi8(src[1], src[0], 8);
-  return Sum5_16(s);
+  Prepare5_8<offset>(src, s);
+  *dst0 = Sum5WLo16(s);
+  *dst1 = Sum5WHi16(s);
 }
 
-inline __m128i Sum5Horizontal_32(const __m128i src[2]) {
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
   __m128i s[5];
-  s[0] = src[0];
-  s[1] = _mm_alignr_epi8(src[1], src[0], 4);
-  s[2] = _mm_alignr_epi8(src[1], src[0], 8);
-  s[3] = _mm_alignr_epi8(src[1], src[0], 12);
-  s[4] = src[1];
-  return Sum5_32(s);
+  Prepare5_16(src, s);
+  const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+  const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+  const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+  dst[0] = VaddwLo16(sum0123_lo, s[4]);
+  const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+  const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+  const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+  dst[1] = VaddwHi16(sum0123_hi, s[4]);
 }
 
-inline __m128i* Sum5Horizontal_32x2(const __m128i src[3], __m128i sum[2]) {
+void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+                     __m128i* const row_sq5) {
+  const __m128i sum04 = VaddlLo16(src[0], src[4]);
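+  // The 5-tap sum reuses the 3-tap sum of the middle elements:
+  // row_sq5 = src[0] + src[4] + row_sq3.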
+  *row_sq3 = Sum3WLo32(src + 1);
+  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+                     __m128i* const row_sq5) {
+  const __m128i sum04 = VaddlHi16(src[0], src[4]);
+  *row_sq3 = Sum3WHi32(src + 1);
+  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m128i src, __m128i* const row3,
+                     __m128i* const row5) {
   __m128i s[5];
-  s[0] = src[0];
-  s[1] = _mm_alignr_epi8(src[1], src[0], 4);
-  s[2] = _mm_alignr_epi8(src[1], src[0], 8);
-  s[3] = _mm_alignr_epi8(src[1], src[0], 12);
-  s[4] = src[1];
-  sum[0] = Sum5_32(s);
-  s[0] = src[1];
-  s[1] = _mm_alignr_epi8(src[2], src[1], 4);
-  s[2] = _mm_alignr_epi8(src[2], src[1], 8);
-  s[3] = _mm_alignr_epi8(src[2], src[1], 12);
-  s[4] = src[2];
-  sum[1] = Sum5_32(s);
-  return sum;
+  Prepare5Lo8(src, s);
+  const __m128i sum04 = VaddlLo8(s[0], s[4]);
+  *row3 = Sum3WLo16(s + 1);
+  *row5 = _mm_add_epi16(sum04, *row3);
 }
 
-template <int size, int offset>
-inline void BoxFilterPreProcess4(const __m128i* const row,
-                                 const __m128i* const row_sq, const uint32_t s,
-                                 uint16_t* const dst) {
-  static_assert(size == 3 || size == 5, "");
-  static_assert(offset == 0 || offset == 1, "");
-  // Number of elements in the box being summed.
-  constexpr uint32_t n = size * size;
-  constexpr uint32_t one_over_n =
-      ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
-  __m128i sum, sum_sq;
-  if (size == 3) {
-    __m128i temp32[2];
-    if (offset == 0) {
-      sum = Sum3Horizontal(Sum3WLo_16(row));
-      sum_sq = Sum3Horizontal_32(Sum3W(row_sq, temp32));
-    } else {
-      sum = Sum3HorizontalOffset1(Sum3WLo_16(row));
-      sum_sq = Sum3HorizontalOffset1_32(Sum3W(row_sq, temp32));
-    }
-  }
-  if (size == 5) {
-    __m128i temp[2];
-    sum = Sum5Horizontal(Sum5WLo_16(row));
-    sum_sq = Sum5Horizontal_32(Sum5W_32x2(row_sq, temp));
-  }
-  const __m128i sum_32 = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
-  const __m128i z0 = CalculateMa<n>(sum_sq, sum_32, s);
-  const __m128i z1 = _mm_packus_epi32(z0, z0);
-  const __m128i z = _mm_min_epu16(z1, _mm_set1_epi16(255));
-  __m128i ma = _mm_setzero_si128();
-  ma = _mm_insert_epi8(ma, kSgrMaLookup[VgetLane16<0>(z)], 4);
-  ma = _mm_insert_epi8(ma, kSgrMaLookup[VgetLane16<1>(z)], 5);
-  ma = _mm_insert_epi8(ma, kSgrMaLookup[VgetLane16<2>(z)], 6);
-  ma = _mm_insert_epi8(ma, kSgrMaLookup[VgetLane16<3>(z)], 7);
-  const __m128i b = CalculateIntermediate4(ma, sum_32, one_over_n);
-  const __m128i ma_b = _mm_unpacklo_epi64(ma, b);
-  StoreAligned16(dst, ma_b);
+template <int offset>
+void SumHorizontal(const __m128i src[2], __m128i* const row3_0,
+                   __m128i* const row3_1, __m128i* const row5_0,
+                   __m128i* const row5_1) {
+  __m128i s[5];
+  Prepare5_8<offset>(src, s);
+  const __m128i sum04_lo = VaddlLo8(s[0], s[4]);
+  const __m128i sum04_hi = VaddlHi8(s[0], s[4]);
+  *row3_0 = Sum3WLo16(s + 1);
+  *row3_1 = Sum3WHi16(s + 1);
+  *row5_0 = _mm_add_epi16(sum04_lo, *row3_0);
+  *row5_1 = _mm_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+                          __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+                          __m128i* const row_sq5_1) {
+  __m128i s[5];
+  Prepare5_16(src, s);
+  SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+  SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
+inline __m128i Sum343Lo(const __m128i ma3[3]) {
+  const __m128i sum = Sum3WLo16(ma3);
+  const __m128i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343Hi(const __m128i ma3[3]) {
+  const __m128i sum = Sum3WHi16(ma3);
+  const __m128i sum3 = Sum3_16(sum, sum, sum);
+  return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343WLo(const __m128i src[3]) {
+  const __m128i sum = Sum3WLo32(src);
+  const __m128i sum3 = Sum3_32(sum, sum, sum);
+  return VaddwLo16(sum3, src[1]);
+}
+
+inline __m128i Sum343WHi(const __m128i src[3]) {
+  const __m128i sum = Sum3WHi32(src);
+  const __m128i sum3 = Sum3_32(sum, sum, sum);
+  return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum343WLo(s);
+  dst[1] = Sum343WHi(s);
+}
+
+inline __m128i Sum565Lo(const __m128i src[3]) {
+  const __m128i sum = Sum3WLo16(src);
+  const __m128i sum4 = _mm_slli_epi16(sum, 2);
+  const __m128i sum5 = _mm_add_epi16(sum4, sum);
+  return VaddwLo8(sum5, src[1]);
+}
+
+inline __m128i Sum565Hi(const __m128i src[3]) {
+  const __m128i sum = Sum3WHi16(src);
+  const __m128i sum4 = _mm_slli_epi16(sum, 2);
+  const __m128i sum5 = _mm_add_epi16(sum4, sum);
+  return VaddwHi8(sum5, src[1]);
+}
+
+inline __m128i Sum565WLo(const __m128i src[3]) {
+  const __m128i sum = Sum3WLo32(src);
+  const __m128i sum4 = _mm_slli_epi32(sum, 2);
+  const __m128i sum5 = _mm_add_epi32(sum4, sum);
+  return VaddwLo16(sum5, src[1]);
+}
+
+inline __m128i Sum565WHi(const __m128i src[3]) {
+  const __m128i sum = Sum3WHi32(src);
+  const __m128i sum4 = _mm_slli_epi32(sum, 2);
+  const __m128i sum5 = _mm_add_epi32(sum4, sum);
+  return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m128i src[2], __m128i dst[2]) {
+  __m128i s[3];
+  Prepare3_16(src, s);
+  dst[0] = Sum565WLo(s);
+  dst[1] = Sum565WHi(s);
+}
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+                   uint32_t* square_sum3, uint32_t* square_sum5) {
+  int y = 2;
+  do {
+    __m128i s[2], sq[3];
+    s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass1 - width);
+    sq[0] = SquareLo8(s[0]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
+      x -= 16;
+      src += 16;
+      s[1] = LoadUnaligned16Msan(src,
+                                 sum_width - x + kOverreadInBytesPass1 - width);
+      sq[1] = SquareHi8(s[0]);
+      sq[2] = SquareLo8(s[1]);
+      SumHorizontal<0>(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+      StoreAligned32U16(sum3, row3);
+      StoreAligned32U16(sum5, row5);
+      SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 0, row_sq3);
+      StoreAligned32U32(square_sum5 + 0, row_sq5);
+      SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+      StoreAligned32U32(square_sum3 + 8, row_sq3);
+      StoreAligned32U32(square_sum5 + 8, row_sq5);
+      s[0] = s[1];
+      sq[0] = sq[2];
+      sum3 += 16;
+      sum5 += 16;
+      square_sum3 += 16;
+      square_sum5 += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sum3 += sum_stride - sum_width;
+    sum5 += sum_stride - sum_width;
+    square_sum3 += sum_stride - sum_width;
+    square_sum5 += sum_stride - sum_width;
+  } while (--y != 0);
 }
 
 template <int size>
-inline void BoxFilterPreProcess8(const __m128i* const row,
-                                 const __m128i row_sq[][2], const uint32_t s,
-                                 __m128i* const ma, __m128i* const b,
-                                 uint16_t* const dst) {
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+                   const ptrdiff_t width, const ptrdiff_t sum_stride,
+                   const ptrdiff_t sum_width, uint16_t* sums,
+                   uint32_t* square_sums) {
   static_assert(size == 3 || size == 5, "");
-  // Number of elements in the box being summed.
-  constexpr uint32_t n = size * size;
-  constexpr uint32_t one_over_n =
-      ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
-  __m128i sum, sum_sq[2];
-  if (size == 3) {
-    __m128i temp16[2], temp32[3];
-    sum = Sum3HorizontalOffset1_16(Sum3W_16x2(row, temp16));
-    Sum3HorizontalOffset1_32x2(Sum3W(row_sq, temp32), sum_sq);
-  }
-  if (size == 5) {
-    __m128i temp16[2], temp32[3];
-    sum = Sum5Horizontal_16(Sum5W_16D(row, temp16));
-    Sum5Horizontal_32x2(Sum5W_32x3(row_sq, temp32), sum_sq);
-  }
+  constexpr int kOverreadInBytes =
+      (size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2;
+  int y = 2;
+  do {
+    __m128i s[2], sq[3];
+    s[0] = LoadUnaligned16Msan(src, kOverreadInBytes - width);
+    sq[0] = SquareLo8(s[0]);
+    ptrdiff_t x = sum_width;
+    do {
+      __m128i row[2], row_sq[4];
+      x -= 16;
+      src += 16;
+      s[1] = LoadUnaligned16Msan(src, sum_width - x + kOverreadInBytes - width);
+      sq[1] = SquareHi8(s[0]);
+      sq[2] = SquareLo8(s[1]);
+      if (size == 3) {
+        Sum3Horizontal<0>(s, row);
+        Sum3WHorizontal(sq + 0, row_sq + 0);
+        Sum3WHorizontal(sq + 1, row_sq + 2);
+      } else {
+        Sum5Horizontal<0>(s, &row[0], &row[1]);
+        Sum5WHorizontal(sq + 0, row_sq + 0);
+        Sum5WHorizontal(sq + 1, row_sq + 2);
+      }
+      StoreAligned32U16(sums, row);
+      StoreAligned64U32(square_sums, row_sq);
+      s[0] = s[1];
+      sq[0] = sq[2];
+      sums += 16;
+      square_sums += 16;
+    } while (x != 0);
+    src += src_stride - sum_width;
+    sums += sum_stride - sum_width;
+    square_sums += sum_stride - sum_width;
+  } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
+  // a = |sum_sq|
+  // d = |sum|
+  // p = (a * n < d * d) ? 0 : a * n - d * d;
+  const __m128i dxd = _mm_madd_epi16(sum, sum);
+  // _mm_mullo_epi32() has high latency, so shifts and additions are used
+  // instead. Some compilers could do this for us, but we make it explicit.
+  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
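+  // n == 9:  sum_sq + (sum_sq << 3) == 9 * sum_sq.
+  // n == 25: adding (sum_sq << 4) on top gives 25 * sum_sq.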
+  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+  const __m128i sub = _mm_sub_epi32(axn, dxd);
+  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+  return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+                           const uint32_t scale) {
+  static_assert(n == 9 || n == 25, "");
   const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
-  const __m128i z0 = CalculateMa<n>(sum_sq[0], sum_lo, s);
   const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
-  const __m128i z1 = CalculateMa<n>(sum_sq[1], sum_hi, s);
-  const __m128i z01 = _mm_packus_epi32(z0, z1);
-  const __m128i z = _mm_min_epu16(z01, _mm_set1_epi16(255));
-  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<0>(z)], 8);
-  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<1>(z)], 9);
-  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<2>(z)], 10);
-  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<3>(z)], 11);
-  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<4>(z)], 12);
-  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<5>(z)], 13);
-  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<6>(z)], 14);
-  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<7>(z)], 15);
-  *b = CalculateIntermediate8(*ma, sum, one_over_n);
-  const __m128i ma_b = _mm_unpackhi_epi64(*ma, *b);
-  StoreAligned16(dst, ma_b);
+  const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+  const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+  return _mm_packus_epi32(z0, z1);
 }
 
-inline void Prepare3_8(const __m128i src, __m128i* const left,
-                       __m128i* const middle, __m128i* const right) {
-  *left = _mm_srli_si128(src, 5);
-  *middle = _mm_srli_si128(src, 6);
-  *right = _mm_srli_si128(src, 7);
+inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
+  // one_over_n == 164.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+  // one_over_n_quarter == 41.
+  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+  static_assert(one_over_n == one_over_n_quarter << 2, "");
+  // |ma| is in range [0, 255].
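+  // _mm_maddubs_epi16() takes a signed 8-bit second operand, so 164 is split
+  // into 41 << 2; the shift by kSgrProjReciprocalBits - 2 below restores the
+  // factor of 4. Since |ma| fits in the low byte of each 16-bit lane, the
+  // multiply-add reduces to ma * 41 per lane.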
+  const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+  const __m128i m0 = VmullLo16(m, sum);
+  const __m128i m1 = VmullHi16(m, sum);
+  const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+  const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+  return _mm_packus_epi32(b_lo, b_hi);
 }
 
-inline void Prepare3_16(const __m128i src[2], __m128i* const left,
-                        __m128i* const middle, __m128i* const right) {
-  *left = _mm_alignr_epi8(src[1], src[0], 10);
-  *middle = _mm_alignr_epi8(src[1], src[0], 12);
-  *right = _mm_alignr_epi8(src[1], src[0], 14);
+inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
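+  // Unlike CalculateB5(), |one_over_n| (455) does not fit in an 8-bit operand
+  // of _mm_maddubs_epi16(), so the multiplication is done in 32 bits with
+  // _mm_mullo_epi32().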
+  // one_over_n == 455.
+  constexpr uint32_t one_over_n =
+      ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+  const __m128i m0 = VmullLo16(ma, sum);
+  const __m128i m1 = VmullHi16(ma, sum);
+  const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+  const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+  const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+  const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+  return _mm_packus_epi32(b_lo, b_hi);
 }
 
-inline __m128i Sum343(const __m128i src) {
-  __m128i left, middle, right;
-  Prepare3_8(src, &left, &middle, &right);
-  const auto sum = Sum3W_16(left, middle, right);
-  const auto sum3 = Sum3_16(sum, sum, sum);
-  return VaddwLo8(sum3, middle);
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum5_16(s5);
+  Sum5_32(sq5, sum_sq);
+  *index = CalculateMa<25>(*sum, sum_sq, scale);
 }
 
-inline void Sum343_444(const __m128i src, __m128i* const sum343,
-                       __m128i* const sum444) {
-  __m128i left, middle, right;
-  Prepare3_8(src, &left, &middle, &right);
-  const auto sum111 = Sum3W_16(left, middle, right);
-  *sum444 = _mm_slli_epi16(sum111, 2);
-  const __m128i sum333 = _mm_sub_epi16(*sum444, sum111);
-  *sum343 = VaddwLo8(sum333, middle);
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+                                  const uint32_t scale, __m128i* const sum,
+                                  __m128i* const index) {
+  __m128i sum_sq[2];
+  *sum = Sum3_16(s3);
+  Sum3_32(sq3, sum_sq);
+  *index = CalculateMa<9>(*sum, sum_sq, scale);
 }
 
-inline __m128i* Sum343W(const __m128i src[2], __m128i d[2]) {
-  __m128i left, middle, right;
-  Prepare3_16(src, &left, &middle, &right);
-  d[0] = Sum3WLo_32(left, middle, right);
-  d[1] = Sum3WHi_32(left, middle, right);
-  d[0] = Sum3_32(d[0], d[0], d[0]);
-  d[1] = Sum3_32(d[1], d[1], d[1]);
-  d[0] = VaddwLo16(d[0], middle);
-  d[1] = VaddwHi16(d[1], middle);
-  return d;
+template <int n, int offset>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+                               __m128i* const ma, __m128i* const b) {
+  static_assert(n == 9 || n == 25, "");
+  static_assert(offset == 0 || offset == 8, "");
+  const __m128i idx = _mm_packus_epi16(index, index);
+  // In practice the values are not stored and reloaded; the compiler keeps
+  // them in a 64-bit general-purpose register, which is faster than using
+  // _mm_extract_epi8().
+  uint8_t temp[8];
+  StoreLo8(temp, idx);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
+  *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  __m128i maq;
+  if (offset == 0) {
+    maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  } else {
+    maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  }
+  *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
 }
 
-inline void Sum343_444W(const __m128i src[2], __m128i sum343[2],
-                        __m128i sum444[2]) {
-  __m128i left, middle, right, sum111[2];
-  Prepare3_16(src, &left, &middle, &right);
-  sum111[0] = Sum3WLo_32(left, middle, right);
-  sum111[1] = Sum3WHi_32(left, middle, right);
-  sum444[0] = _mm_slli_epi32(sum111[0], 2);
-  sum444[1] = _mm_slli_epi32(sum111[1], 2);
-  sum343[0] = _mm_sub_epi32(sum444[0], sum111[0]);
-  sum343[1] = _mm_sub_epi32(sum444[1], sum111[1]);
-  sum343[0] = VaddwLo16(sum343[0], middle);
-  sum343[1] = VaddwHi16(sum343[1], middle);
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// so that the shuffle returns 0 for those lanes. The most significant bit 1
+// comes either from the comparison instruction or from the sign bit of the
+// index.
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+  __m128i mask;
+  mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+  mask = _mm_or_si128(mask, index);
+  return _mm_shuffle_epi8(table, mask);
 }
 
-inline __m128i Sum565(const __m128i src) {
-  __m128i left, middle, right;
-  Prepare3_8(src, &left, &middle, &right);
-  const auto sum = Sum3W_16(left, middle, right);
-  const auto sum4 = _mm_slli_epi16(sum, 2);
-  const auto sum5 = _mm_add_epi16(sum4, sum);
-  return VaddwLo8(sum5, middle);
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+                           const int threshold) {
+  const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+  const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
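+  // |offset| is -1 (0xff) where index > thresholds and 0 elsewhere, so the
+  // add below decrements |value| by 1 in exactly those lanes.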
+  return _mm_add_epi8(value, offset);
 }
 
-inline __m128i Sum565W(const __m128i src) {
-  const auto left = _mm_srli_si128(src, 2);
-  const auto middle = _mm_srli_si128(src, 4);
-  const auto right = _mm_srli_si128(src, 6);
-  const auto sum = Sum3WLo_32(left, middle, right);
-  const auto sum4 = _mm_slli_epi32(sum, 2);
-  const auto sum5 = _mm_add_epi32(sum4, sum);
-  return VaddwLo16(sum5, middle);
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i* const ma, __m128i* const b0,
+                                  __m128i* const b1) {
+  // Use table lookup to read elements whose indices are less than 48.
+  const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+  const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+  const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+  const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+  __m128i idx;
+  // Clip idx to 127 to apply signed comparison instructions.
+  idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+  // Elements whose indices are 48 or larger produce 0 from all of the shuffles
+  // below; they are handled afterwards.
+  // Get shuffle results for indices in range [0, 15].
+  *ma = ShuffleIndex(c0, idx);
+  // Get shuffle results for indices in range [16, 31].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res1 = ShuffleIndex(c1, idx);
+  // Use OR instruction to combine shuffle results together.
+  *ma = _mm_or_si128(*ma, res1);
+  // Get shuffle results for indices in range [32, 47].
+  // Subtract 16 to utilize the sign bit of the index.
+  idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+  const __m128i res2 = ShuffleIndex(c2, idx);
+  *ma = _mm_or_si128(*ma, res2);
+
+  // For elements whose indices are larger than 47, the table values change
+  // only rarely as the index increases, so we compute them with comparisons
+  // and additions instead of table lookups.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (with value 0) are set to 5.
+  *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
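+  // Net effect: lanes with indices in [48, 55] keep 5, (55, 72] become 4,
+  // (72, 101] become 3, (101, 169] become 2, (169, 254] become 1, and 255
+  // becomes 0, matching kSgrMaLookup for indices larger than 47.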
+
+  // b = ma * b * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+  // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+  // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+  const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+  *b0 = CalculateB3(sum[0], maq0);
+  const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+  *b1 = CalculateB3(sum[1], maq1);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+                                  __m128i ma[2], __m128i b[2]) {
+  __m128i mas;
+  CalculateIntermediate(sum, index, &mas, &b[0], &b[1]);
+  ma[0] = _mm_unpacklo_epi64(ma[0], mas);
+  ma[1] = _mm_srli_si128(mas, 8);
+}
+
+// Note: replacing the slow LookupIntermediate() with CalculateIntermediate()
+// when calculating 16 intermediate data points was tried, but the compiler
+// generated even slower code.
+template <int offset>
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i* const b) {
+  static_assert(offset == 0 || offset == 8, "");
+  __m128i sum, index;
+  CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+  LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+                                   const uint32_t scale, __m128i* const ma,
+                                   __m128i* const b) {
+  __m128i sum, index;
+  CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+  LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m128i b3[2], const ptrdiff_t x,
+                         __m128i sum_b343[2], __m128i sum_b444[2],
+                         uint32_t* const b343, uint32_t* const b444) {
+  __m128i b[3], sum_b111[2];
+  Prepare3_16(b3, b);
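+  // |sum_b444| is 4 * (b[0] + b[1] + b[2]) and |sum_b343| is
+  // 3 * (b[0] + b[1] + b[2]) + b[1], i.e. the 3-4-3 weighting.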
+  sum_b111[0] = Sum3WLo32(b);
+  sum_b111[1] = Sum3WHi32(b);
+  sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
+  sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
+  StoreAligned32U32(b444 + x, sum_b444);
+  sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
+  sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
+  sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+  sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+  StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i* const sum_ma444, __m128i sum_b343[2],
+                           __m128i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m128i sum_ma111 = Sum3WLo16(ma3);
+  *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+  StoreAligned16(ma444 + x, *sum_ma444);
+  const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+  StoreAligned16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i* const sum_ma444, __m128i sum_b343[2],
+                           __m128i sum_b444[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  const __m128i sum_ma111 = Sum3WHi16(ma3);
+  *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+  StoreAligned16(ma444 + x, *sum_ma444);
+  const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+  *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+  StoreAligned16(ma343 + x, *sum_ma343);
+  Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma444, sum_b444[2];
+  Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, __m128i* const sum_ma343,
+                           __m128i sum_b343[2], uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma444, sum_b444[2];
+  Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+                 ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma343, sum_b343[2];
+  Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+                           const ptrdiff_t x, uint16_t* const ma343,
+                           uint16_t* const ma444, uint32_t* const b343,
+                           uint32_t* const b444) {
+  __m128i sum_ma343, sum_b343[2];
+  Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+    const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s5[2][5], sq5[5][2];
+  sq[0][1] = SquareHi8(s[0][0]);
+  sq[1][1] = SquareHi8(s[1][0]);
+  s5[0][3] = Sum5Horizontal(s[0][0]);
+  StoreAligned16(sum5[3], s5[0][3]);
+  s5[0][4] = Sum5Horizontal(s[1][0]);
+  StoreAligned16(sum5[4], s5[0][4]);
+  Sum5WHorizontal(sq[0], sq5[3]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  Sum5WHorizontal(sq[1], sq5[4]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x3U16(sum5, 0, s5[0]);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+    const __m128i s[2][2], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint32_t scale, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma[2],
+    __m128i b[3]) {
+  __m128i s5[2][5], sq5[5][2];
+  sq[0][2] = SquareLo8(s[0][1]);
+  sq[1][2] = SquareLo8(s[1][1]);
+  Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+  StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+  Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+  StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+  Sum5WHorizontal(sq[0] + 1, sq5[3]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  Sum5WHorizontal(sq[1] + 1, sq5[4]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
+
+  sq[0][3] = SquareHi8(s[0][1]);
+  sq[1][3] = SquareHi8(s[1][1]);
+  Sum5WHorizontal(sq[0] + 2, sq5[3]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  Sum5WHorizontal(sq[1] + 2, sq5[4]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+    const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s5[5], sq5[5][2];
+  sq[1] = SquareHi8(s);
+  s5[3] = s5[4] = Sum5Horizontal(s);
+  Sum5WHorizontal(sq, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+    const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint32_t scale, const uint16_t* const sum5[5],
+    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma[2],
+    __m128i b[3]) {
+  __m128i s5[2][5], sq5[5][2];
+  sq[2] = SquareLo8(s[1]);
+  Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
+  s5[0][4] = s5[0][3];
+  s5[1][4] = s5[1][3];
+  Sum5WHorizontal(sq + 1, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
+
+  sq[3] = SquareHi8(s[1]);
+  Sum5WHorizontal(sq + 2, sq5[3]);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+    const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+    __m128i* const b) {
+  __m128i s3[3], sq3[3][2];
+  sq[1] = SquareHi8(s);
+  s3[2] = Sum3Horizontal(s);
+  StoreAligned16(sum3[2], s3[2]);
+  Sum3WHorizontal(sq, sq3[2]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+    const __m128i s[2], const ptrdiff_t x, const ptrdiff_t sum_width,
+    const uint32_t scale, uint16_t* const sum3[3],
+    uint32_t* const square_sum3[3], __m128i sq[4], __m128i ma[2],
+    __m128i b[3]) {
+  __m128i s3[4], sq3[3][2], sum[2], index[2];
+  sq[2] = SquareLo8(s[1]);
+  Sum3Horizontal<8>(s, s3 + 2);
+  StoreAligned32U16(sum3[2] + x, s3 + 2);
+  Sum3WHorizontal(sq + 1, sq3[2]);
+  StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+  LoadAligned16x2U16(sum3, x, s3);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+  sq[3] = SquareHi8(s[1]);
+  Sum3WHorizontal(sq + 2, sq3[2]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma, b + 1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+    const __m128i s[2][2], const uint16_t scales[2], uint16_t* const sum3[4],
+    uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+    uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma3[2][2],
+    __m128i b3[2][3], __m128i* const ma5, __m128i* const b5) {
+  __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+  sq[0][1] = SquareHi8(s[0][0]);
+  sq[1][1] = SquareHi8(s[1][0]);
+  SumHorizontalLo(s[0][0], &s3[2], &s5[3]);
+  SumHorizontalLo(s[1][0], &s3[3], &s5[4]);
+  StoreAligned16(sum3[2], s3[2]);
+  StoreAligned16(sum3[3], s3[3]);
+  StoreAligned16(sum5[3], s5[3]);
+  StoreAligned16(sum5[4], s5[4]);
+  SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2], sq3[2]);
+  StoreAligned32U32(square_sum5[3], sq5[3]);
+  SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3], sq3[3]);
+  StoreAligned32U32(square_sum5[4], sq5[4]);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  LoadAligned16x3U16(sum5, 0, s5);
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+  CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, &ma3[0][0], &b3[0][0], &b3[1][0]);
+  ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+    const __m128i s[2][2], const ptrdiff_t x, const uint16_t scales[2],
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, __m128i sq[2][4], __m128i ma3[2][2],
+    __m128i b3[2][3], __m128i ma5[2], __m128i b5[3]) {
+  __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
+  SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  StoreAligned16(sum3[2] + x + 0, s3[0][2]);
+  StoreAligned16(sum3[2] + x + 8, s3[1][2]);
+  StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+  StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+  SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+  StoreAligned16(sum3[3] + x + 0, s3[0][3]);
+  StoreAligned16(sum3[3] + x + 8, s3[1][3]);
+  StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+  StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+  sq[0][2] = SquareLo8(s[0][1]);
+  sq[1][2] = SquareLo8(s[1][1]);
+  SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+  StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+  SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+  StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+  CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+                        &index[1][0]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[1]);
+
+  sq[0][3] = SquareHi8(s[0][1]);
+  sq[1][3] = SquareHi8(s[1][1]);
+  SumHorizontal(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+  StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+  SumHorizontal(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+  StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+  StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+  CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+                        &index[1][1]);
+  CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 1);
+  CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 1);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+    const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+    __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+  __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+  sq[1] = SquareHi8(s);
+  SumHorizontalLo(s, &s3[2], &s5[3]);
+  SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, 0, s5);
+  s5[4] = s5[3];
+  LoadAligned32x3U32(square_sum5, 0, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+  LoadAligned16x2U16(sum3, 0, s3);
+  LoadAligned32x2U32(square_sum3, 0, sq3);
+  CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+    const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+    const uint16_t scales[2], const uint16_t* const sum3[4],
+    const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+    const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma3[2],
+    __m128i ma5[2], __m128i b3[3], __m128i b5[3]) {
+  __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
+  sq[2] = SquareLo8(s[1]);
+  SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+  SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16(sum5, x, s5[0]);
+  s5[0][4] = s5[0][3];
+  LoadAligned32x3U32(square_sum5, x, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 1);
+  LoadAligned16x2U16(sum3, x, s3[0]);
+  LoadAligned32x2U32(square_sum3, x, sq3);
+  CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+  sq[3] = SquareHi8(s[1]);
+  SumHorizontal(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+  LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+  s5[1][4] = s5[1][3];
+  LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+  sq5[4][0] = sq5[3][0];
+  sq5[4][1] = sq5[3][1];
+  CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 2);
+  LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+  LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+  CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+  CalculateIntermediate(sum, index, ma3, b3 + 1);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+                                    const uint8_t* const src1, const int width,
+                                    const uint32_t scale,
+                                    uint16_t* const sum5[5],
+                                    uint32_t* const square_sum5[5],
+                                    const ptrdiff_t sum_width, uint16_t* ma565,
+                                    uint32_t* b565) {
+  __m128i s[2][2], mas[2], sq[2][4], bs[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    __m128i ma5[3], ma[2], b[4];
+    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[0] = Sum565Lo(ma5);
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned32U16(ma565, ma);
+    Sum565W(bs + 0, b + 0);
+    Sum565W(bs + 1, b + 2);
+    StoreAligned64U32(b565, b);
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+    const uint8_t* const src, const int width, const uint32_t scale,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+    uint32_t* b444) {
+  __m128i s[2], mas[2], sq[4], bs[3];
+  s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass2 - width);
+  sq[0] = SquareLo8(s[0]);
+  BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    s[1] = LoadUnaligned16Msan(src + x + 16,
+                               x + 16 + kOverreadInBytesPass2 - width);
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    __m128i ma3[3];
+    Prepare3_8<0>(mas, ma3);
+    if (calculate444) {  // NOLINT(readability-simplify-boolean-expr)
+      Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+      Store343_444Hi(ma3, bs + 1, 8, ma343, ma444, b343, b444);
+      ma444 += 16;
+      b444 += 16;
+    } else {
+      __m128i ma[2], b[4];
+      ma[0] = Sum343Lo(ma3);
+      ma[1] = Sum343Hi(ma3);
+      StoreAligned32U16(ma343, ma);
+      Sum343W(bs + 0, b + 0);
+      Sum343W(bs + 1, b + 2);
+      StoreAligned64U32(b343, b);
+    }
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma343 += 16;
+    b343 += 16;
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+    const uint8_t* const src0, const uint8_t* const src1, const int width,
+    const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+    uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+    uint32_t* b565) {
+  __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], &b5[0]);
+
+  int x = 0;
+  do {
+    __m128i ma[2], b[4], ma3x[3], ma5x[3];
+    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+
+    Prepare3_8<0>(ma3[0], ma3x);
+    ma[0] = Sum343Lo(ma3x);
+    ma[1] = Sum343Hi(ma3x);
+    StoreAligned32U16(ma343[0] + x, ma);
+    Sum343W(b3[0] + 0, b + 0);
+    Sum343W(b3[0] + 1, b + 2);
+    StoreAligned64U32(b343[0] + x, b);
+    Sum565W(b5 + 0, b + 0);
+    Sum565W(b5 + 1, b + 2);
+    StoreAligned64U32(b565, b);
+    Prepare3_8<0>(ma3[1], ma3x);
+    Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+    Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[0] = Sum565Lo(ma5x);
+    ma[1] = Sum565Hi(ma5x);
+    StoreAligned32U16(ma565, ma);
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
+  } while (x < width);
 }
 
 template <int shift>
@@ -1149,635 +1861,699 @@
 }
 
 template <int shift>
-inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i a,
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
                                        const __m128i b[2]) {
-  const __m128i src_u16 = _mm_unpacklo_epi8(src, _mm_setzero_si128());
-  const __m128i ma_x_src_lo = VmullLo16(a, src_u16);
-  const __m128i ma_x_src_hi = VmullHi16(a, src_u16);
+  const __m128i ma_x_src_lo = VmullLo16(ma, src);
+  const __m128i ma_x_src_hi = VmullHi16(ma, src);
   const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
   const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
   return _mm_packs_epi32(dst_lo, dst_hi);  // 13 bits
 }
 
-inline __m128i BoxFilterPass1(const __m128i src_u8, const __m128i ma,
-                              const __m128i b[2], __m128i ma565[2],
-                              __m128i b565[2][2]) {
+inline __m128i CalculateFilteredOutputPass1(const __m128i src,
+                                            const __m128i ma[2],
+                                            const __m128i b[2][2]) {
+  const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
   __m128i b_sum[2];
-  ma565[1] = Sum565(ma);
-  b565[1][0] = Sum565W(_mm_alignr_epi8(b[1], b[0], 8));
-  b565[1][1] = Sum565W(b[1]);
-  __m128i ma_sum = _mm_add_epi16(ma565[0], ma565[1]);
-  b_sum[0] = _mm_add_epi32(b565[0][0], b565[1][0]);
-  b_sum[1] = _mm_add_epi32(b565[0][1], b565[1][1]);
-  return CalculateFilteredOutput<5>(src_u8, ma_sum, b_sum);  // 13 bits
+  b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+  b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
 }
 
-inline __m128i BoxFilterPass2(const __m128i src_u8, const __m128i ma,
-                              const __m128i b[2], __m128i ma343[4],
-                              __m128i ma444[3], __m128i b343[4][2],
-                              __m128i b444[3][2]) {
+inline __m128i CalculateFilteredOutputPass2(const __m128i src,
+                                            const __m128i ma[3],
+                                            const __m128i b[3][2]) {
+  const __m128i ma_sum = Sum3_16(ma);
   __m128i b_sum[2];
-  Sum343_444(ma, &ma343[2], &ma444[1]);
-  __m128i ma_sum = Sum3_16(ma343[0], ma444[0], ma343[2]);
-  Sum343_444W(b, b343[2], b444[1]);
-  b_sum[0] = Sum3_32(b343[0][0], b444[0][0], b343[2][0]);
-  b_sum[1] = Sum3_32(b343[0][1], b444[0][1], b343[2][1]);
-  return CalculateFilteredOutput<5>(src_u8, ma_sum, b_sum);  // 13 bits
+  Sum3_32(b, b_sum);
+  return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
 }
 
-inline void SelfGuidedFinal(const __m128i src, const __m128i v[2],
-                            uint8_t* const dst) {
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
   const __m128i v_lo =
       VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
   const __m128i v_hi =
       VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
   const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
-  const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
-  const __m128i d = _mm_add_epi16(s, vv);
-  StoreLo8(dst, _mm_packus_epi16(d, d));
+  return _mm_add_epi16(src, vv);
 }
 
-inline void SelfGuidedDoubleMultiplier(const __m128i src,
-                                       const __m128i filter[2], const int w0,
-                                       const int w2, uint8_t* const dst) {
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+                                          const __m128i filter[2], const int w0,
+                                          const int w2) {
   __m128i v[2];
   const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
   const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
   const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
   v[0] = _mm_madd_epi16(w0_w2, f_lo);
   v[1] = _mm_madd_epi16(w0_w2, f_hi);
-  SelfGuidedFinal(src, v, dst);
+  return SelfGuidedFinal(src, v);
 }
 
-inline void SelfGuidedSingleMultiplier(const __m128i src, const __m128i filter,
-                                       const int w0, uint8_t* const dst) {
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+                                          const __m128i filter, const int w0) {
   // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
   __m128i v[2];
   v[0] = VmullNLo8(filter, w0);
   v[1] = VmullNHi8(filter, w0);
-  SelfGuidedFinal(src, v, dst);
+  return SelfGuidedFinal(src, v);
 }
 
-inline void BoxFilterProcess(const uint8_t* const src,
-                             const ptrdiff_t src_stride,
-                             const RestorationUnitInfo& restoration_info,
-                             const int width, const int height,
-                             const uint16_t scale[2], uint16_t* const temp,
-                             uint8_t* const dst, const ptrdiff_t dst_stride) {
-  // We have combined PreProcess and Process for the first pass by storing
-  // intermediate values in the |ma| region. The values stored are one
-  // vertical column of interleaved |ma| and |b| values and consume 8 *
-  // |height| values. This is |height| and not |height| * 2 because PreProcess
-  // only generates output for every other row. When processing the next column
-  // we write the new scratch values right after reading the previously saved
-  // ones.
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+    uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+    const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+    uint32_t* const b565[2], uint8_t* const dst) {
+  __m128i s[2][2], mas[2], sq[2][4], bs[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
 
-  // The PreProcess phase calculates a 5x5 box sum for every other row
-  //
-  // PreProcess and Process have been combined into the same step. We need 12
-  // input values to generate 8 output values for PreProcess:
-  // 0 1 2 3 4 5 6 7 8 9 10 11
-  // 2 = 0 + 1 + 2 +  3 +  4
-  // 3 = 1 + 2 + 3 +  4 +  5
-  // 4 = 2 + 3 + 4 +  5 +  6
-  // 5 = 3 + 4 + 5 +  6 +  7
-  // 6 = 4 + 5 + 6 +  7 +  8
-  // 7 = 5 + 6 + 7 +  8 +  9
-  // 8 = 6 + 7 + 8 +  9 + 10
-  // 9 = 7 + 8 + 9 + 10 + 11
-  //
-  // and then we need 10 input values to generate 8 output values for Process:
-  // 0 1 2 3 4 5 6 7 8 9
-  // 1 = 0 + 1 + 2
-  // 2 = 1 + 2 + 3
-  // 3 = 2 + 3 + 4
-  // 4 = 3 + 4 + 5
-  // 5 = 4 + 5 + 6
-  // 6 = 5 + 6 + 7
-  // 7 = 6 + 7 + 8
-  // 8 = 7 + 8 + 9
-  //
-  // To avoid re-calculating PreProcess values over and over again we will do a
-  // single column of 8 output values and store the second half of them
-  // interleaved in |temp|. The first half is not stored, since it is used
-  // immediately and becomes useless for the next column. Next we will start the
-  // second column. When 2 rows have been calculated we can calculate Process
-  // and output the results.
-
-  // Calculate and store a single column. Scope so we can re-use the variable
-  // names for the next step.
-  uint16_t* ab_ptr = temp;
-  const uint8_t* const src_pre_process = src - 2 * src_stride;
-  // Calculate intermediate results, including two-pixel border, for example, if
-  // unit size is 64x64, we calculate 68x68 pixels.
-  {
-    const uint8_t* column = src_pre_process - 4;
-    __m128i row[5], row_sq[5];
-    row[0] = row[1] = LoadLo8(column);
-    column += src_stride;
-    row[2] = LoadLo8(column);
-    row_sq[0] = row_sq[1] = VmullLo8(row[1], row[1]);
-    row_sq[2] = VmullLo8(row[2], row[2]);
-
-    int y = (height + 2) >> 1;
-    do {
-      column += src_stride;
-      row[3] = LoadLo8(column);
-      column += src_stride;
-      row[4] = LoadLo8(column);
-      row_sq[3] = VmullLo8(row[3], row[3]);
-      row_sq[4] = VmullLo8(row[4], row[4]);
-      BoxFilterPreProcess4<5, 1>(row + 0, row_sq + 0, scale[0], ab_ptr + 0);
-      BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, scale[1], ab_ptr + 8);
-      BoxFilterPreProcess4<3, 1>(row + 2, row_sq + 2, scale[1], ab_ptr + 16);
-      row[0] = row[2];
-      row[1] = row[3];
-      row[2] = row[4];
-      row_sq[0] = row_sq[2];
-      row_sq[1] = row_sq[3];
-      row_sq[2] = row_sq[4];
-      ab_ptr += 24;
-    } while (--y != 0);
-
-    if ((height & 1) != 0) {
-      column += src_stride;
-      row[3] = row[4] = LoadLo8(column);
-      row_sq[3] = row_sq[4] = VmullLo8(row[3], row[3]);
-      BoxFilterPreProcess4<5, 1>(row + 0, row_sq + 0, scale[0], ab_ptr + 0);
-      BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, scale[1], ab_ptr + 8);
-    }
-  }
-
-  const int w0 = restoration_info.sgr_proj_info.multiplier[0];
-  const int w1 = restoration_info.sgr_proj_info.multiplier[1];
-  const int w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
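+  // Each loop iteration filters a 16-pixel span of two source rows: the low
+  // and high 8-pixel halves are processed separately and packed back to bytes
+  // before storing to |dst| and |dst + stride|.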
   int x = 0;
   do {
-    // |src_pre_process| is X but we already processed the first column of 4
-    // values so we want to start at Y and increment from there.
-    // X s s s Y s s
-    // s s s s s s s
-    // s s i i i i i
-    // s s i o o o o
-    // s s i o o o o
+    __m128i ma[2], ma5[3], b[2][2], sr[2], p[2];
+    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+                         bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    StoreAligned16(ma565[1] + x, ma[1]);
+    Sum565W(bs, b[1]);
+    StoreAligned32U32(b565[1] + x, b[1]);
+    sr[0] = LoadAligned16(src + x);
+    sr[1] = LoadAligned16(src + stride + x);
+    const __m128i sr0_lo = _mm_unpacklo_epi8(sr[0], _mm_setzero_si128());
+    const __m128i sr1_lo = _mm_unpacklo_epi8(sr[1], _mm_setzero_si128());
+    ma[0] = LoadAligned16(ma565[0] + x);
+    LoadAligned32U32(b565[0] + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+    const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+    const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
 
-    // Seed the loop with one line of output. Then, inside the loop, for each
-    // iteration we can output one even row and one odd row and carry the new
-    // line to the next iteration. In the diagram below 'i' values are
-    // intermediary values from the first step and '-' values are empty.
-    // iiii
-    // ---- > even row
-    // iiii - odd row
-    // ---- > even row
-    // iiii
-    __m128i ma[2], b[2][2], ma565[2], ma343[4], ma444[3];
-    __m128i b565[2][2], b343[4][2], b444[3][2];
-    ab_ptr = temp;
-    ma[0] = b[0][0] = LoadAligned16(ab_ptr);
-    ma[1] = b[1][0] = LoadAligned16(ab_ptr + 8);
-    const uint8_t* column = src_pre_process + x;
-    __m128i row[5], row_sq[5][2];
-    // Need |width| + 3 pixels, but we read max(|x|) + 16 pixels.
-    // Mask max(|x|) + 13 - |width| extra pixels.
-    row[0] = row[1] = LoadUnaligned16Msan(column, x + 13 - width);
-    column += src_stride;
-    row[2] = LoadUnaligned16Msan(column, x + 13 - width);
-    column += src_stride;
-    row[3] = LoadUnaligned16Msan(column, x + 13 - width);
-    column += src_stride;
-    row[4] = LoadUnaligned16Msan(column, x + 13 - width);
-    row_sq[0][0] = row_sq[1][0] = VmullLo8(row[1], row[1]);
-    row_sq[0][1] = row_sq[1][1] = VmullHi8(row[1], row[1]);
-    row_sq[2][0] = VmullLo8(row[2], row[2]);
-    row_sq[2][1] = VmullHi8(row[2], row[2]);
-    row_sq[3][0] = VmullLo8(row[3], row[3]);
-    row_sq[3][1] = VmullHi8(row[3], row[3]);
-    row_sq[4][0] = VmullLo8(row[4], row[4]);
-    row_sq[4][1] = VmullHi8(row[4], row[4]);
-    BoxFilterPreProcess8<5>(row, row_sq, scale[0], &ma[0], &b[0][1], ab_ptr);
-    BoxFilterPreProcess8<3>(row + 1, row_sq + 1, scale[1], &ma[1], &b[1][1],
-                            ab_ptr + 8);
-
-    // Pass 1 Process. These are the only values we need to propagate between
-    // rows.
-    ma565[0] = Sum565(ma[0]);
-    b565[0][0] = Sum565W(_mm_alignr_epi8(b[0][1], b[0][0], 8));
-    b565[0][1] = Sum565W(b[0][1]);
-    ma343[0] = Sum343(ma[1]);
-    Sum343W(b[1], b343[0]);
-    ma[1] = b[1][0] = LoadAligned16(ab_ptr + 16);
-    BoxFilterPreProcess8<3>(row + 2, row_sq + 2, scale[1], &ma[1], &b[1][1],
-                            ab_ptr + 16);
-    Sum343_444(ma[1], &ma343[1], &ma444[0]);
-    Sum343_444W(b[1], b343[1], b444[0]);
-
-    uint8_t* dst_ptr = dst + x;
-    // Calculate one output line. Add in the line from the previous pass and
-    // output one even row. Sum the new line and output the odd row. Carry the
-    // new row into the next pass.
-    for (int y = height >> 1; y != 0; --y) {
-      ab_ptr += 24;
-      ma[0] = b[0][0] = LoadAligned16(ab_ptr);
-      ma[1] = b[1][0] = LoadAligned16(ab_ptr + 8);
-      row[0] = row[2];
-      row[1] = row[3];
-      row[2] = row[4];
-      row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1];
-      row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1];
-      row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1];
-      column += src_stride;
-      row[3] = LoadUnaligned16Msan(column, x + 13 - width);
-      column += src_stride;
-      row[4] = LoadUnaligned16Msan(column, x + 13 - width);
-      row_sq[3][0] = VmullLo8(row[3], row[3]);
-      row_sq[3][1] = VmullHi8(row[3], row[3]);
-      row_sq[4][0] = VmullLo8(row[4], row[4]);
-      row_sq[4][1] = VmullHi8(row[4], row[4]);
-      BoxFilterPreProcess8<5>(row, row_sq, scale[0], &ma[0], &b[0][1], ab_ptr);
-      BoxFilterPreProcess8<3>(row + 1, row_sq + 1, scale[1], &ma[1], &b[1][1],
-                              ab_ptr + 8);
-      __m128i p[2];
-      p[0] = BoxFilterPass1(row[1], ma[0], b[0], ma565, b565);
-      p[1] = BoxFilterPass2(row[1], ma[1], b[1], ma343, ma444, b343, b444);
-      SelfGuidedDoubleMultiplier(row[1], p, w0, w2, dst_ptr);
-      dst_ptr += dst_stride;
-      p[0] = CalculateFilteredOutput<4>(row[2], ma565[1], b565[1]);
-      ma[1] = b[1][0] = LoadAligned16(ab_ptr + 16);
-      BoxFilterPreProcess8<3>(row + 2, row_sq + 2, scale[1], &ma[1], &b[1][1],
-                              ab_ptr + 16);
-      p[1] = BoxFilterPass2(row[2], ma[1], b[1], ma343 + 1, ma444 + 1, b343 + 1,
-                            b444 + 1);
-      SelfGuidedDoubleMultiplier(row[2], p, w0, w2, dst_ptr);
-      dst_ptr += dst_stride;
-      ma565[0] = ma565[1];
-      b565[0][0] = b565[1][0], b565[0][1] = b565[1][1];
-      ma343[0] = ma343[2];
-      ma343[1] = ma343[3];
-      ma444[0] = ma444[2];
-      b343[0][0] = b343[2][0], b343[0][1] = b343[2][1];
-      b343[1][0] = b343[3][0], b343[1][1] = b343[3][1];
-      b444[0][0] = b444[2][0], b444[0][1] = b444[2][1];
-    }
-
-    if ((height & 1) != 0) {
-      ab_ptr += 24;
-      ma[0] = b[0][0] = LoadAligned16(ab_ptr);
-      ma[1] = b[1][0] = LoadAligned16(ab_ptr + 8);
-      row[0] = row[2];
-      row[1] = row[3];
-      row[2] = row[4];
-      row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1];
-      row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1];
-      row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1];
-      column += src_stride;
-      row[3] = row[4] = LoadUnaligned16Msan(column, x + 13 - width);
-      row_sq[3][0] = row_sq[4][0] = VmullLo8(row[3], row[3]);
-      row_sq[3][1] = row_sq[4][1] = VmullHi8(row[3], row[3]);
-      BoxFilterPreProcess8<5>(row, row_sq, scale[0], &ma[0], &b[0][1], ab_ptr);
-      BoxFilterPreProcess8<3>(row + 1, row_sq + 1, scale[1], &ma[1], &b[1][1],
-                              ab_ptr + 8);
-      __m128i p[2];
-      p[0] = BoxFilterPass1(row[1], ma[0], b[0], ma565, b565);
-      p[1] = BoxFilterPass2(row[1], ma[1], b[1], ma343, ma444, b343, b444);
-      SelfGuidedDoubleMultiplier(row[1], p, w0, w2, dst_ptr);
-    }
-    x += 8;
+    ma[1] = Sum565Hi(ma5);
+    StoreAligned16(ma565[1] + x + 8, ma[1]);
+    Sum565W(bs + 1, b[1]);
+    StoreAligned32U32(b565[1] + x + 8, b[1]);
+    const __m128i sr0_hi = _mm_unpackhi_epi8(sr[0], _mm_setzero_si128());
+    const __m128i sr1_hi = _mm_unpackhi_epi8(sr[1], _mm_setzero_si128());
+    ma[0] = LoadAligned16(ma565[0] + x + 8);
+    LoadAligned32U32(b565[0] + x + 8, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+    p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+    const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+    StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
+    const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+    StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    x += 16;
   } while (x < width);
 }
 
-inline void BoxFilterProcessPass1(const uint8_t* const src,
-                                  const ptrdiff_t src_stride,
-                                  const RestorationUnitInfo& restoration_info,
-                                  const int width, const int height,
-                                  const uint32_t scale, uint16_t* const temp,
-                                  uint8_t* const dst,
-                                  const ptrdiff_t dst_stride) {
-  // We have combined PreProcess and Process for the first pass by storing
-  // intermediate values in the |ma| region. The values stored are one
-  // vertical column of interleaved |ma| and |b| values and consume 8 *
-  // |height| values. This is |height| and not |height| * 2 because PreProcess
-  // only generates output for every other row. When processing the next column
-  // we write the new scratch values right after reading the previously saved
-  // ones.
+inline void BoxFilterPass1LastRow(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+    uint32_t* b565, uint8_t* const dst) {
+  __m128i s[2], mas[2], sq[4], bs[3];
+  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  sq[0] = SquareLo8(s[0]);
+  BoxFilterPreProcess5LastRowLo(s[0], scale, sum5, square_sum5, sq, &mas[0],
+                                &bs[0]);
 
-  // The PreProcess phase calculates a 5x5 box sum for every other row
-  //
-  // PreProcess and Process have been combined into the same step. We need 12
-  // input values to generate 8 output values for PreProcess:
-  // 0 1 2 3 4 5 6 7 8 9 10 11
-  // 2 = 0 + 1 + 2 +  3 +  4
-  // 3 = 1 + 2 + 3 +  4 +  5
-  // 4 = 2 + 3 + 4 +  5 +  6
-  // 5 = 3 + 4 + 5 +  6 +  7
-  // 6 = 4 + 5 + 6 +  7 +  8
-  // 7 = 5 + 6 + 7 +  8 +  9
-  // 8 = 6 + 7 + 8 +  9 + 10
-  // 9 = 7 + 8 + 9 + 10 + 11
-  //
-  // and then we need 10 input values to generate 8 output values for Process:
-  // 0 1 2 3 4 5 6 7 8 9
-  // 1 = 0 + 1 + 2
-  // 2 = 1 + 2 + 3
-  // 3 = 2 + 3 + 4
-  // 4 = 3 + 4 + 5
-  // 5 = 4 + 5 + 6
-  // 6 = 5 + 6 + 7
-  // 7 = 6 + 7 + 8
-  // 8 = 7 + 8 + 9
-  //
-  // To avoid re-calculating PreProcess values over and over again we will do a
-  // single column of 8 output values and store the second half of them
-  // interleaved in |temp|. The first half is not stored, since it is used
-  // immediately and becomes useless for the next column. Next we will start the
-  // second column. When 2 rows have been calculated we can calculate Process
-  // and output the results.
-
-  // Calculate and store a single column. Scope so we can re-use the variable
-  // names for the next step.
-  uint16_t* ab_ptr = temp;
-  const uint8_t* const src_pre_process = src - 2 * src_stride;
-  // Calculate intermediate results, including two-pixel border, for example, if
-  // unit size is 64x64, we calculate 68x68 pixels.
-  {
-    const uint8_t* column = src_pre_process - 4;
-    __m128i row[5], row_sq[5];
-    row[0] = row[1] = LoadLo8(column);
-    column += src_stride;
-    row[2] = LoadLo8(column);
-    row_sq[0] = row_sq[1] = VmullLo8(row[1], row[1]);
-    row_sq[2] = VmullLo8(row[2], row[2]);
-
-    int y = (height + 2) >> 1;
-    do {
-      column += src_stride;
-      row[3] = LoadLo8(column);
-      column += src_stride;
-      row[4] = LoadLo8(column);
-      row_sq[3] = VmullLo8(row[3], row[3]);
-      row_sq[4] = VmullLo8(row[4], row[4]);
-      BoxFilterPreProcess4<5, 1>(row, row_sq, scale, ab_ptr);
-      row[0] = row[2];
-      row[1] = row[3];
-      row[2] = row[4];
-      row_sq[0] = row_sq[2];
-      row_sq[1] = row_sq[3];
-      row_sq[2] = row_sq[4];
-      ab_ptr += 8;
-    } while (--y != 0);
-
-    if ((height & 1) != 0) {
-      column += src_stride;
-      row[3] = row[4] = LoadLo8(column);
-      row_sq[3] = row_sq[4] = VmullLo8(row[3], row[3]);
-      BoxFilterPreProcess4<5, 1>(row, row_sq, scale, ab_ptr);
-    }
-  }
-
-  const int w0 = restoration_info.sgr_proj_info.multiplier[0];
   int x = 0;
   do {
-    // |src_pre_process| is X but we already processed the first column of 4
-    // values so we want to start at Y and increment from there.
-    // X s s s Y s s
-    // s s s s s s s
-    // s s i i i i i
-    // s s i o o o o
-    // s s i o o o o
+    __m128i ma[2], ma5[3], b[2][2];
+    s[1] = LoadUnaligned16Msan(src0 + x + 16,
+                               x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+                                sq, mas, bs);
+    Prepare3_8<0>(mas, ma5);
+    ma[1] = Sum565Lo(ma5);
+    Sum565W(bs, b[1]);
+    ma[0] = LoadAligned16(ma565);
+    LoadAligned32U32(b565, b[0]);
+    const __m128i sr = LoadAligned16(src + x);
+    const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+    __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
 
-    // Seed the loop with one line of output. Then, inside the loop, for each
-    // iteration we can output one even row and one odd row and carry the new
-    // line to the next iteration. In the diagram below 'i' values are
-    // intermediary values from the first step and '-' values are empty.
-    // iiii
-    // ---- > even row
-    // iiii - odd row
-    // ---- > even row
-    // iiii
-    __m128i ma[2], b[2], ma565[2], b565[2][2];
-    ab_ptr = temp;
-    ma[0] = b[0] = LoadAligned16(ab_ptr);
-    const uint8_t* column = src_pre_process + x;
-    __m128i row[5], row_sq[5][2];
-    // Need |width| + 3 pixels, but we read max(|x|) + 16 pixels.
-    // Mask max(|x|) + 13 - |width| extra pixels.
-    row[0] = row[1] = LoadUnaligned16Msan(column, x + 13 - width);
-    column += src_stride;
-    row[2] = LoadUnaligned16Msan(column, x + 13 - width);
-    column += src_stride;
-    row[3] = LoadUnaligned16Msan(column, x + 13 - width);
-    column += src_stride;
-    row[4] = LoadUnaligned16Msan(column, x + 13 - width);
-    row_sq[0][0] = row_sq[1][0] = VmullLo8(row[1], row[1]);
-    row_sq[0][1] = row_sq[1][1] = VmullHi8(row[1], row[1]);
-    row_sq[2][0] = VmullLo8(row[2], row[2]);
-    row_sq[2][1] = VmullHi8(row[2], row[2]);
-    row_sq[3][0] = VmullLo8(row[3], row[3]);
-    row_sq[3][1] = VmullHi8(row[3], row[3]);
-    row_sq[4][0] = VmullLo8(row[4], row[4]);
-    row_sq[4][1] = VmullHi8(row[4], row[4]);
-    BoxFilterPreProcess8<5>(row, row_sq, scale, &ma[0], &b[1], ab_ptr);
-
-    // Pass 1 Process. These are the only values we need to propagate between
-    // rows.
-    ma565[0] = Sum565(ma[0]);
-    b565[0][0] = Sum565W(_mm_alignr_epi8(b[1], b[0], 8));
-    b565[0][1] = Sum565W(b[1]);
-    uint8_t* dst_ptr = dst + x;
-    // Calculate one output line. Add in the line from the previous pass and
-    // output one even row. Sum the new line and output the odd row. Carry the
-    // new row into the next pass.
-    for (int y = height >> 1; y != 0; --y) {
-      ab_ptr += 8;
-      ma[0] = b[0] = LoadAligned16(ab_ptr);
-      row[0] = row[2];
-      row[1] = row[3];
-      row[2] = row[4];
-      row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1];
-      row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1];
-      row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1];
-      column += src_stride;
-      row[3] = LoadUnaligned16Msan(column, x + 13 - width);
-      column += src_stride;
-      row[4] = LoadUnaligned16Msan(column, x + 13 - width);
-      row_sq[3][0] = VmullLo8(row[3], row[3]);
-      row_sq[3][1] = VmullHi8(row[3], row[3]);
-      row_sq[4][0] = VmullLo8(row[4], row[4]);
-      row_sq[4][1] = VmullHi8(row[4], row[4]);
-      BoxFilterPreProcess8<5>(row, row_sq, scale, &ma[0], &b[1], ab_ptr);
-      const __m128i p0 = BoxFilterPass1(row[1], ma[0], b, ma565, b565);
-      SelfGuidedSingleMultiplier(row[1], p0, w0, dst_ptr);
-      dst_ptr += dst_stride;
-      const __m128i p1 = CalculateFilteredOutput<4>(row[2], ma565[1], b565[1]);
-      SelfGuidedSingleMultiplier(row[2], p1, w0, dst_ptr);
-      dst_ptr += dst_stride;
-      ma565[0] = ma565[1];
-      b565[0][0] = b565[1][0], b565[0][1] = b565[1][1];
-    }
-
-    if ((height & 1) != 0) {
-      ab_ptr += 8;
-      ma[0] = b[0] = LoadAligned16(ab_ptr);
-      row[0] = row[2];
-      row[1] = row[3];
-      row[2] = row[4];
-      row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1];
-      row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1];
-      row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1];
-      column += src_stride;
-      row[3] = row[4] = LoadUnaligned16Msan(column, x + 13 - width);
-      row_sq[3][0] = row_sq[4][0] = VmullLo8(row[3], row[3]);
-      row_sq[3][1] = row_sq[4][1] = VmullHi8(row[3], row[3]);
-      BoxFilterPreProcess8<5>(row, row_sq, scale, &ma[0], &b[1], ab_ptr);
-      const __m128i p0 = BoxFilterPass1(row[1], ma[0], b, ma565, b565);
-      SelfGuidedSingleMultiplier(row[1], p0, w0, dst_ptr);
-    }
-    x += 8;
+    ma[1] = Sum565Hi(ma5);
+    Sum565W(bs + 1, b[1]);
+    ma[0] = LoadAligned16(ma565 + 8);
+    LoadAligned32U32(b565 + 8, b[0]);
+    const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+    p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+    const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+    StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    ma565 += 16;
+    b565 += 16;
+    x += 16;
   } while (x < width);
 }
 
-inline void BoxFilterProcessPass2(const uint8_t* src,
-                                  const ptrdiff_t src_stride,
-                                  const RestorationUnitInfo& restoration_info,
-                                  const int width, const int height,
-                                  const uint32_t scale, uint16_t* const temp,
-                                  uint8_t* const dst,
-                                  const ptrdiff_t dst_stride) {
-  // Calculate intermediate results, including one-pixel border, for example, if
-  // unit size is 64x64, we calculate 66x66 pixels.
-  // Because of the vectors this calculates start in blocks of 4 so we actually
-  // get 68 values.
-  uint16_t* ab_ptr = temp;
-  const uint8_t* const src_pre_process = src - 2 * src_stride;
-  {
-    const uint8_t* column = src_pre_process - 3;
-    __m128i row[3], row_sq[3];
-    row[0] = LoadLo8(column);
-    column += src_stride;
-    row[1] = LoadLo8(column);
-    row_sq[0] = VmullLo8(row[0], row[0]);
-    row_sq[1] = VmullLo8(row[1], row[1]);
-    int y = height + 2;
-    do {
-      column += src_stride;
-      row[2] = LoadLo8(column);
-      row_sq[2] = VmullLo8(row[2], row[2]);
-      BoxFilterPreProcess4<3, 0>(row, row_sq, scale, ab_ptr);
-      row[0] = row[1];
-      row[1] = row[2];
-      row_sq[0] = row_sq[1];
-      row_sq[1] = row_sq[2];
-      ab_ptr += 8;
-    } while (--y != 0);
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+    uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+    uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+    uint32_t* const b444[2], uint8_t* const dst) {
+  __m128i s[2], mas[2], sq[4], bs[3];
+  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass2 - width);
+  sq[0] = SquareLo8(s[0]);
+  BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+  int x = 0;
+  do {
+    s[1] = LoadUnaligned16Msan(src0 + x + 16,
+                               x + 16 + kOverreadInBytesPass2 - width);
+    BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+                         bs);
+    __m128i ma[3], b[3][2], ma3[3];
+    Prepare3_8<0>(mas, ma3);
+    Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+                   b444[1]);
+    const __m128i sr = LoadAligned16(src + x);
+    const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+    ma[0] = LoadAligned16(ma343[0] + x);
+    ma[1] = LoadAligned16(ma444[0] + x);
+    LoadAligned32U32(b343[0] + x, b[0]);
+    LoadAligned32U32(b444[0] + x, b[1]);
+    const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+    Store343_444Hi(ma3, bs + 1, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+                   b343[2], b444[1]);
+    const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+    ma[0] = LoadAligned16(ma343[0] + x + 8);
+    ma[1] = LoadAligned16(ma444[0] + x + 8);
+    LoadAligned32U32(b343[0] + x + 8, b[0]);
+    LoadAligned32U32(b444[0] + x + 8, b[1]);
+    const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+    const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+    const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+    StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+    s[0] = s[1];
+    sq[1] = sq[3];
+    mas[0] = mas[1];
+    bs[0] = bs[2];
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+    const uint8_t* const src, const uint8_t* const src0,
+    const uint8_t* const src1, const ptrdiff_t stride, const int width,
+    const uint16_t scales[2], const int16_t w0, const int16_t w2,
+    uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    const ptrdiff_t sum_width, uint16_t* const ma343[4],
+    uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+    uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+  __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+  s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+  sq[0][0] = SquareLo8(s[0][0]);
+  sq[1][0] = SquareLo8(s[1][0]);
+  BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+                        ma3, b3, &ma5[0], &b5[0]);
+
+  int x = 0;
+  do {
+    __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+    s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+                                  x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+                        sum_width, sq, ma3, b3, ma5, b5);
+    Prepare3_8<0>(ma3[0], ma3x[0]);
+    Prepare3_8<0>(ma3[1], ma3x[1]);
+    Prepare3_8<0>(ma5, ma5x);
+    Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+                   ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+                   b343[3], b444[2]);
+    ma[0][1] = Sum565Lo(ma5x);
+    StoreAligned16(ma565[1] + x, ma[0][1]);
+    Sum565W(b5, b[0][1]);
+    StoreAligned32U32(b565[1] + x, b[0][1]);
+    const __m128i sr0 = LoadAligned16(src + x);
+    const __m128i sr1 = LoadAligned16(src + stride + x);
+    const __m128i sr0_lo = _mm_unpacklo_epi8(sr0, _mm_setzero_si128());
+    const __m128i sr1_lo = _mm_unpacklo_epi8(sr1, _mm_setzero_si128());
+    ma[0][0] = LoadAligned16(ma565[0] + x);
+    LoadAligned32U32(b565[0] + x, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+    ma[1][0] = LoadAligned16(ma343[0] + x);
+    ma[1][1] = LoadAligned16(ma444[0] + x);
+    LoadAligned32U32(b343[0] + x, b[1][0]);
+    LoadAligned32U32(b444[0] + x, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+    const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+    ma[2][0] = LoadAligned16(ma343[1] + x);
+    LoadAligned32U32(b343[1] + x, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+    const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+    Store343_444Hi(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+                   b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+    Store343_444Hi(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], b[2][2], ma343[3],
+                   ma444[2], b343[3], b444[2]);
+    ma[0][1] = Sum565Hi(ma5x);
+    StoreAligned16(ma565[1] + x + 8, ma[0][1]);
+    Sum565W(b5 + 1, b[0][1]);
+    StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+    const __m128i sr0_hi = _mm_unpackhi_epi8(sr0, _mm_setzero_si128());
+    const __m128i sr1_hi = _mm_unpackhi_epi8(sr1, _mm_setzero_si128());
+    ma[0][0] = LoadAligned16(ma565[0] + x + 8);
+    LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+    p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+    p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+    ma[1][0] = LoadAligned16(ma343[0] + x + 8);
+    ma[1][1] = LoadAligned16(ma444[0] + x + 8);
+    LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+    LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+    p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+    const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+    StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
+    ma[2][0] = LoadAligned16(ma343[1] + x + 8);
+    LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+    p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+    const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+    StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
+    s[0][0] = s[0][1];
+    s[1][0] = s[1][1];
+    sq[0][1] = sq[0][3];
+    sq[1][1] = sq[1][3];
+    ma3[0][0] = ma3[0][1];
+    ma3[1][0] = ma3[1][1];
+    ma5[0] = ma5[1];
+    b3[0][0] = b3[0][2];
+    b3[1][0] = b3[1][2];
+    b5[0] = b5[2];
+    x += 16;
+  } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+    const uint8_t* const src, const uint8_t* const src0, const int width,
+    const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+    const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+    uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+    uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+    uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+    uint8_t* const dst) {
+  __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2];
+  s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+  sq[0] = SquareLo8(s[0]);
+  BoxFilterPreProcessLastRowLo(s[0], scales, sum3, sum5, square_sum3,
+                               square_sum5, sq, &ma3[0], &ma5[0], &b3[0],
+                               &b5[0]);
+
+  int x = 0;
+  do {
+    __m128i ma3x[3], ma5x[3], p[2];
+    s[1] = LoadUnaligned16Msan(src0 + x + 16,
+                               x + 16 + kOverreadInBytesPass1 - width);
+    BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+                               square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+    Prepare3_8<0>(ma3, ma3x);
+    Prepare3_8<0>(ma5, ma5x);
+    ma[1] = Sum565Lo(ma5x);
+    Sum565W(b5, b[1]);
+    ma[2] = Sum343Lo(ma3x);
+    Sum343W(b3, b[2]);
+    const __m128i sr = LoadAligned16(src + x);
+    const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+    ma[0] = LoadAligned16(ma565 + x);
+    LoadAligned32U32(b565 + x, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+    ma[0] = LoadAligned16(ma343 + x);
+    ma[1] = LoadAligned16(ma444 + x);
+    LoadAligned32U32(b343 + x, b[0]);
+    LoadAligned32U32(b444 + x, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+    const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+    ma[1] = Sum565Hi(ma5x);
+    Sum565W(b5 + 1, b[1]);
+    ma[2] = Sum343Hi(ma3x);
+    Sum343W(b3 + 1, b[2]);
+    const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+    ma[0] = LoadAligned16(ma565 + x + 8);
+    LoadAligned32U32(b565 + x + 8, b[0]);
+    p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+    ma[0] = LoadAligned16(ma343 + x + 8);
+    ma[1] = LoadAligned16(ma444 + x + 8);
+    LoadAligned32U32(b343 + x + 8, b[0]);
+    LoadAligned32U32(b444 + x + 8, b[1]);
+    p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+    const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+    StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+    s[0] = s[1];
+    sq[1] = sq[3];
+    ma3[0] = ma3[1];
+    ma5[0] = ma5[1];
+    b3[0] = b3[2];
+    b5[0] = b5[2];
+    x += 16;
+  } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+    const RestorationUnitInfo& restoration_info, const uint8_t* src,
+    const ptrdiff_t stride, const uint8_t* const top_border,
+    const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+  uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+  uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 3; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  b444[0] = sgr_buffer->b444;
+  for (int i = 1; i <= 2; ++i) {
+    ma444[i] = ma444[i - 1] + temp_stride;
+    b444[i] = b444[i - 1] + temp_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scales[0] != 0);
+  assert(scales[1] != 0);
+  BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+         sum5[1], square_sum3[0], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+                         square_sum5, sum_width, ma343, ma444[0], ma565[0],
+                         b343, b444[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
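+  // |sum3|, |sum5| and the |ma*|/|b*| arrays are rings of row pointers into
+  // |sgr_buffer|; the Circulate*PointersBy2 and swap calls below rotate them
+  // as the filter advances two source rows per iteration.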
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate4PointersBy2<uint16_t>(sum3);
+    Circulate4PointersBy2<uint32_t>(square_sum3);
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+              scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+              ma343, ma444, ma565, b343, b444, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    Circulate4PointersBy2<uint16_t>(ma343);
+    Circulate4PointersBy2<uint32_t>(b343);
+    std::swap(ma444[0], ma444[2]);
+    std::swap(b444[0], b444[2]);
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
   }
 
+  Circulate4PointersBy2<uint16_t>(sum3);
+  Circulate4PointersBy2<uint32_t>(square_sum3);
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+              square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+              b444, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      Circulate4PointersBy2<uint16_t>(sum3);
+      Circulate4PointersBy2<uint32_t>(square_sum3);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+      Circulate4PointersBy2<uint16_t>(ma343);
+      Circulate4PointersBy2<uint32_t>(b343);
+      std::swap(ma444[0], ma444[2]);
+      std::swap(b444[0], b444[2]);
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+    }
+    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+                     b444[0], b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src, const ptrdiff_t stride,
+                                  const uint8_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
+  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+  uint16_t *sum5[5], *ma565[2];
+  uint32_t *square_sum5[5], *b565[2];
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+  for (int i = 1; i <= 4; ++i) {
+    sum5[i] = sum5[i - 1] + sum_stride;
+    square_sum5[i] = square_sum5[i - 1] + sum_stride;
+  }
+  ma565[0] = sgr_buffer->ma565;
+  ma565[1] = ma565[0] + temp_stride;
+  b565[0] = sgr_buffer->b565;
+  b565[1] = b565[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum5[1], square_sum5[1]);
+  sum5[0] = sum5[1];
+  square_sum5[0] = square_sum5[1];
+  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+                          ma565[0], b565[0]);
+  sum5[0] = sgr_buffer->sum5;
+  square_sum5[0] = sgr_buffer->square_sum5;
+
+  for (int y = (height >> 1) - 1; y > 0; --y) {
+    Circulate5PointersBy2<uint16_t>(sum5);
+    Circulate5PointersBy2<uint32_t>(square_sum5);
+    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+    src += 2 * stride;
+    dst += 2 * stride;
+    std::swap(ma565[0], ma565[1]);
+    std::swap(b565[0], b565[1]);
+  }
+
+  Circulate5PointersBy2<uint16_t>(sum5);
+  Circulate5PointersBy2<uint32_t>(square_sum5);
+  if ((height & 1) == 0 || height > 1) {
+    const uint8_t* sr[2];
+    if ((height & 1) == 0) {
+      sr[0] = bottom_border;
+      sr[1] = bottom_border + bottom_border_stride;
+    } else {
+      sr[0] = src + 2 * stride;
+      sr[1] = bottom_border;
+    }
+    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+                   sum_width, scale, w0, ma565, b565, dst);
+  }
+  if ((height & 1) != 0) {
+    src += 3;
+    if (height > 1) {
+      src += 2 * stride;
+      dst += 2 * stride;
+      std::swap(ma565[0], ma565[1]);
+      std::swap(b565[0], b565[1]);
+      Circulate5PointersBy2<uint16_t>(sum5);
+      Circulate5PointersBy2<uint32_t>(square_sum5);
+    }
+    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
+                          b565[0], dst);
+  }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+                                  const uint8_t* src, const ptrdiff_t stride,
+                                  const uint8_t* const top_border,
+                                  const ptrdiff_t top_border_stride,
+                                  const uint8_t* bottom_border,
+                                  const ptrdiff_t bottom_border_stride,
+                                  const int width, const int height,
+                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
   assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
-  const int w1 = restoration_info.sgr_proj_info.multiplier[1];
-  const int w0 = (1 << kSgrProjPrecisionBits) - w1;
-  int x = 0;
-  do {
-    ab_ptr = temp;
-    __m128i ma, b[2], ma343[3], ma444[2], b343[3][2], b444[2][2];
-    ma = b[0] = LoadAligned16(ab_ptr);
-    const uint8_t* column = src_pre_process + x;
-    __m128i row[3], row_sq[3][2];
-    // Need |width| + 2 pixels, but we read max(|x|) + 16 pixels.
-    // Mask max(|x|) + 14 - |width| extra pixels.
-    row[0] = LoadUnaligned16Msan(column, x + 14 - width);
-    column += src_stride;
-    row[1] = LoadUnaligned16Msan(column, x + 14 - width);
-    column += src_stride;
-    row[2] = LoadUnaligned16Msan(column, x + 14 - width);
-    row_sq[0][0] = VmullLo8(row[0], row[0]);
-    row_sq[0][1] = VmullHi8(row[0], row[0]);
-    row_sq[1][0] = VmullLo8(row[1], row[1]);
-    row_sq[1][1] = VmullHi8(row[1], row[1]);
-    row_sq[2][0] = VmullLo8(row[2], row[2]);
-    row_sq[2][1] = VmullHi8(row[2], row[2]);
-    BoxFilterPreProcess8<3>(row, row_sq, scale, &ma, &b[1], ab_ptr);
-    ma343[0] = Sum343(ma);
-    Sum343W(b, b343[0]);
-    ab_ptr += 8;
-    ma = b[0] = LoadAligned16(ab_ptr);
-    row[0] = row[1];
-    row[1] = row[2];
-    row_sq[0][0] = row_sq[1][0], row_sq[0][1] = row_sq[1][1];
-    row_sq[1][0] = row_sq[2][0], row_sq[1][1] = row_sq[2][1];
-    column += src_stride;
-    row[2] = LoadUnaligned16Msan(column, x + 14 - width);
-    row_sq[2][0] = VmullLo8(row[2], row[2]);
-    row_sq[2][1] = VmullHi8(row[2], row[2]);
-    BoxFilterPreProcess8<3>(row, row_sq, scale, &ma, &b[1], ab_ptr);
-    Sum343_444(ma, &ma343[1], &ma444[0]);
-    Sum343_444W(b, b343[1], b444[0]);
+  const auto temp_stride = Align<ptrdiff_t>(width, 16);
+  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+  const auto sum_stride = temp_stride + 16;
+  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
+  uint16_t *sum3[3], *ma343[3], *ma444[2];
+  uint32_t *square_sum3[3], *b343[3], *b444[2];
+  sum3[0] = sgr_buffer->sum3;
+  square_sum3[0] = sgr_buffer->square_sum3;
+  ma343[0] = sgr_buffer->ma343;
+  b343[0] = sgr_buffer->b343;
+  for (int i = 1; i <= 2; ++i) {
+    sum3[i] = sum3[i - 1] + sum_stride;
+    square_sum3[i] = square_sum3[i - 1] + sum_stride;
+    ma343[i] = ma343[i - 1] + temp_stride;
+    b343[i] = b343[i - 1] + temp_stride;
+  }
+  ma444[0] = sgr_buffer->ma444;
+  ma444[1] = ma444[0] + temp_stride;
+  b444[0] = sgr_buffer->b444;
+  b444[1] = b444[0] + temp_stride;
+  assert(scale != 0);
+  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+            sum3[0], square_sum3[0]);
+  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+                                 sum_width, ma343[0], nullptr, b343[0],
+                                 nullptr);
+  Circulate3PointersBy1<uint16_t>(sum3);
+  Circulate3PointersBy1<uint32_t>(square_sum3);
+  const uint8_t* s;
+  if (height > 1) {
+    s = src + stride;
+  } else {
+    s = bottom_border;
+    bottom_border += bottom_border_stride;
+  }
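+  // With a single source row the second preprocess row is taken from the
+  // bottom border, which is then advanced so the output loop below continues
+  // with the next border row.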
+  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+                                ma343[1], ma444[0], b343[1], b444[0]);
 
-    uint8_t* dst_ptr = dst + x;
-    int y = height;
-    do {
-      ab_ptr += 8;
-      ma = b[0] = LoadAligned16(ab_ptr);
-      row[0] = row[1];
-      row[1] = row[2];
-      row_sq[0][0] = row_sq[1][0], row_sq[0][1] = row_sq[1][1];
-      row_sq[1][0] = row_sq[2][0], row_sq[1][1] = row_sq[2][1];
-      column += src_stride;
-      row[2] = LoadUnaligned16Msan(column, x + 14 - width);
-      row_sq[2][0] = VmullLo8(row[2], row[2]);
-      row_sq[2][1] = VmullHi8(row[2], row[2]);
-      BoxFilterPreProcess8<3>(row, row_sq, scale, &ma, &b[1], ab_ptr);
-      const __m128i p = BoxFilterPass2(row[0], ma, b, ma343, ma444, b343, b444);
-      SelfGuidedSingleMultiplier(row[0], p, w0, dst_ptr);
-      ma343[0] = ma343[1];
-      ma343[1] = ma343[2];
-      ma444[0] = ma444[1];
-      b343[0][0] = b343[1][0], b343[0][1] = b343[1][1];
-      b343[1][0] = b343[2][0], b343[1][1] = b343[2][1];
-      b444[0][0] = b444[1][0], b444[0][1] = b444[1][1];
-      dst_ptr += dst_stride;
-    } while (--y != 0);
-    x += 8;
-  } while (x < width);
+  for (int y = height - 2; y > 0; --y) {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  }
+
+  int y = std::min(height, 2);
+  src += 2;
+  do {
+    Circulate3PointersBy1<uint16_t>(sum3);
+    Circulate3PointersBy1<uint32_t>(square_sum3);
+    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+                   square_sum3, ma343, ma444, b343, b444, dst);
+    src += stride;
+    dst += stride;
+    bottom_border += bottom_border_stride;
+    Circulate3PointersBy1<uint16_t>(ma343);
+    Circulate3PointersBy1<uint32_t>(b343);
+    std::swap(ma444[0], ma444[1]);
+    std::swap(b444[0], b444[1]);
+  } while (--y != 0);
 }
 
-// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in
-// the end of each row. It is safe to overwrite the output as it will not be
+// If |width| is not a multiple of 16, up to 15 more pixels are written to |dest|
+// at the end of each row. It is safe to overwrite the output as it will not be
 // part of the visible frame.
-void SelfGuidedFilter_SSE4_1(const void* const source, void* const dest,
-                             const RestorationUnitInfo& restoration_info,
-                             const ptrdiff_t source_stride,
-                             const ptrdiff_t dest_stride, const int width,
-                             const int height,
-                             RestorationBuffer* const buffer) {
+void SelfGuidedFilter_SSE4_1(
+    const RestorationUnitInfo& restoration_info, const void* const source,
+    const ptrdiff_t stride, const void* const top_border,
+    const ptrdiff_t top_border_stride, const void* const bottom_border,
+    const ptrdiff_t bottom_border_stride, const int width, const int height,
+    RestorationBuffer* const restoration_buffer, void* const dest) {
   const int index = restoration_info.sgr_proj_info.index;
   const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
   const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
-  const auto* src = static_cast<const uint8_t*>(source);
-  auto* dst = static_cast<uint8_t*>(dest);
+  const auto* const src = static_cast<const uint8_t*>(source);
+  const auto* top = static_cast<const uint8_t*>(top_border);
+  const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+  auto* const dst = static_cast<uint8_t*>(dest);
+  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
   if (radius_pass_1 == 0) {
     // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
     // following assertion.
     assert(radius_pass_0 != 0);
-    BoxFilterProcessPass1(src, source_stride, restoration_info, width, height,
-                          kSgrScaleParameter[index][0],
-                          buffer->sgr_buffer.temp_buffer, dst, dest_stride);
+    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+                          top_border_stride, bottom - 3, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
   } else if (radius_pass_0 == 0) {
-    BoxFilterProcessPass2(src, source_stride, restoration_info, width, height,
-                          kSgrScaleParameter[index][1],
-                          buffer->sgr_buffer.temp_buffer, dst, dest_stride);
+    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+                          top_border_stride, bottom - 2, bottom_border_stride,
+                          width, height, sgr_buffer, dst);
   } else {
-    BoxFilterProcess(src, source_stride, restoration_info, width, height,
-                     kSgrScaleParameter[index], buffer->sgr_buffer.temp_buffer,
-                     dst, dest_stride);
+    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+                     top_border_stride, bottom - 3, bottom_border_stride, width,
+                     height, sgr_buffer, dst);
   }
 }
 
 void Init8bpp() {
   Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);
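+  // The static_cast<void> casts below silence unused variable/function
+  // warnings when a kernel is not enabled for this configuration.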
+  static_cast<void>(dsp);
 #if DSP_ENABLED_8BPP_SSE4_1(WienerFilter)
   dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+  static_cast<void>(WienerFilter_SSE4_1);
 #endif
 #if DSP_ENABLED_8BPP_SSE4_1(SelfGuidedFilter)
   dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+  static_cast<void>(SelfGuidedFilter_SSE4_1);
 #endif
 }
 
@@ -1789,7 +2565,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 namespace libgav1 {
 namespace dsp {
 
@@ -1797,4 +2573,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/loop_restoration_sse4.h b/libgav1/src/dsp/x86/loop_restoration_sse4.h
index e11f35a..00df3af 100644
--- a/libgav1/src/dsp/x86/loop_restoration_sse4.h
+++ b/libgav1/src/dsp/x86/loop_restoration_sse4.h
@@ -24,15 +24,16 @@
 namespace dsp {
 
 // Initializes Dsp::loop_restorations, see the defines below for specifics.
-// This function is not thread-safe.
+// These functions are not thread-safe.
 void LoopRestorationInit_SSE4_1();
+void LoopRestorationInit10bpp_SSE4_1();
 
 }  // namespace dsp
 }  // namespace libgav1
 
 // If sse4 is enabled and the baseline isn't set due to a higher level of
 // optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #ifndef LIBGAV1_Dsp8bpp_WienerFilter
 #define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_SSE4_1
@@ -42,6 +43,14 @@
 #define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
 #endif
 
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
diff --git a/libgav1/src/dsp/x86/mask_blend_sse4.cc b/libgav1/src/dsp/x86/mask_blend_sse4.cc
index 76d3811..2e836af 100644
--- a/libgav1/src/dsp/x86/mask_blend_sse4.cc
+++ b/libgav1/src/dsp/x86/mask_blend_sse4.cc
@@ -15,7 +15,7 @@
 #include "src/dsp/mask_blend.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <smmintrin.h>
 
@@ -121,10 +121,8 @@
                                   const __m128i pred_mask_0,
                                   const __m128i pred_mask_1, uint8_t* dst,
                                   const ptrdiff_t dst_stride) {
-  const __m128i pred_val_0_lo = LoadLo8(pred_0);
-  const __m128i pred_val_0 = LoadHi8(pred_val_0_lo, pred_0 + 4);
-  const __m128i pred_val_1_lo = LoadLo8(pred_1);
-  const __m128i pred_val_1 = LoadHi8(pred_val_1_lo, pred_1 + 4);
+  const __m128i pred_val_0 = LoadAligned16(pred_0);
+  const __m128i pred_val_1 = LoadAligned16(pred_1);
   const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
   const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
   const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
@@ -286,8 +284,7 @@
                                                 const __m128i pred_mask_1) {
   const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
 
-  __m128i pred_val_0 = Load4(pred_0);
-  pred_val_0 = _mm_or_si128(_mm_slli_si128(Load4(pred_0 + 4), 4), pred_val_0);
+  const __m128i pred_val_0 = LoadLo8(pred_0);
   // TODO(b/150326556): One load.
   __m128i pred_val_1 = Load4(pred_1);
   pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4),
@@ -433,12 +430,515 @@
 }  // namespace
 }  // namespace low_bitdepth
 
-void MaskBlendInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kMax10bppSample = (1 << 10) - 1;
+constexpr int kMaskInverse = 64;
+constexpr int kRoundBitsMaskBlend = 4;
+
+inline __m128i RightShiftWithRoundingZero_U16(const __m128i v_val_d, int bits,
+                                              const __m128i zero) {
+  // Shift out all but the last bit.
+  const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
+  // Avg with zero will shift by 1 and round.
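+  // _mm_avg_epu16(a, 0) computes (a + 1) >> 1, so the overall result matches
+  // a rounded right shift by |bits|, e.g. (v + 2) >> 2 per lane for bits == 2.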
+  return _mm_avg_epu16(v_tmp_d, zero);
+}
+
+inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
+                                               const __m128i shift) {
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift);
+  return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride,
+                          const __m128i zero) {
+  if (subsampling_x == 1) {
+    if (subsampling_y == 0) {
+      const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
+      const __m128i mask_val_1 =
+          _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
+      __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+      return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
+    }
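+    // 2x subsampling in both directions: sum the four covered mask values
+    // (rows via _mm_adds_epu8, columns via _mm_maddubs_epi16) and round the
+    // average with a shift by 2.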
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i mask_val_0 =
+        LoadHi8(LoadLo8(mask), mask + (mask_stride << 1));
+    const __m128i mask_val_1 = LoadHi8(LoadLo8(mask + mask_stride),
+                                       mask + (mask_stride << 1) + mask_stride);
+    const __m128i add = _mm_adds_epu8(mask_val_0, mask_val_1);
+    const __m128i subsampled_mask = _mm_maddubs_epi16(add, one);
+    return RightShiftWithRoundingZero_U16(subsampled_mask, 2, zero);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const __m128i mask_val_0 = Load4(mask);
+  const __m128i mask_val_1 = Load4(mask + mask_stride);
+  return _mm_cvtepu8_epi16(
+      _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride,
+                        const __m128i zero) {
+  if (subsampling_x == 1) {
+    if (subsampling_y == 0) {
+      const __m128i row_vals = LoadUnaligned16(mask);
+      const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+      const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+      __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+      return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
+    }
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i mask_val_0 = LoadUnaligned16(mask);
+    const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
+    const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
+    const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+    return RightShiftWithRoundingZero_U16(mask_0, 2, zero);
+  }
+  assert(subsampling_y == 0 && subsampling_x == 0);
+  const __m128i mask_val = LoadLo8(mask);
+  return _mm_cvtepu8_epi16(mask_val);
+}
+
+inline void WriteMaskBlendLine10bpp4x2_SSE4_1(
+    const uint16_t* pred_0, const uint16_t* pred_1,
+    const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0,
+    const __m128i& pred_mask_1, const __m128i& offset, const __m128i& max,
+    const __m128i& shift4, uint16_t* dst, const ptrdiff_t dst_stride) {
+  const __m128i pred_val_0 = LoadUnaligned16(pred_0);
+  const __m128i pred_val_1 = LoadHi8(LoadLo8(pred_1), pred_1 + pred_stride_1);
+
+  // int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6;
+  const __m128i compound_pred_lo_0 = _mm_mullo_epi16(pred_val_0, pred_mask_0);
+  const __m128i compound_pred_hi_0 = _mm_mulhi_epu16(pred_val_0, pred_mask_0);
+  const __m128i compound_pred_lo_1 = _mm_mullo_epi16(pred_val_1, pred_mask_1);
+  const __m128i compound_pred_hi_1 = _mm_mulhi_epu16(pred_val_1, pred_mask_1);
+  const __m128i pack0_lo =
+      _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0);
+  const __m128i pack0_hi =
+      _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0);
+  const __m128i pack1_lo =
+      _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1);
+  const __m128i pack1_hi =
+      _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1);
+  const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo);
+  const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi);
+  // res -= (bitdepth == 8) ? 0 : kCompoundOffset;
+  const __m128i sub_0 =
+      _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset);
+  const __m128i sub_1 =
+      _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset);
+
+  // dst[x] = static_cast<Pixel>(
+  //     Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+  //           (1 << kBitdepth10) - 1));
+  const __m128i shift_0 =
+      RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4);
+  const __m128i shift_1 =
+      RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4);
+  const __m128i result = _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max);
+  StoreLo8(dst, result);
+  StoreHi8(dst + dst_stride, result);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* pred_0,
+                                     const uint16_t* pred_1,
+                                     const ptrdiff_t pred_stride_1,
+                                     const uint8_t* mask,
+                                     const ptrdiff_t mask_stride, uint16_t* dst,
+                                     const ptrdiff_t dst_stride) {
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+  const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+  const __m128i max = _mm_set1_epi16(kMax10bppSample);
+  __m128i pred_mask_0 =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+  __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
+                                    pred_mask_1, offset, max, shift4, dst,
+                                    dst_stride);
+  pred_0 += 4 << 1;
+  pred_1 += pred_stride_1 << 1;
+  mask += mask_stride << (1 + subsampling_y);
+  dst += dst_stride << 1;
+
+  pred_mask_0 =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+  pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
+                                    pred_mask_1, offset, max, shift4, dst,
+                                    dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0,
+                                     const uint16_t* pred_1,
+                                     const ptrdiff_t pred_stride_1,
+                                     const uint8_t* const mask_ptr,
+                                     const ptrdiff_t mask_stride,
+                                     const int height, uint16_t* dst,
+                                     const ptrdiff_t dst_stride) {
+  const uint8_t* mask = mask_ptr;
+  if (height == 4) {
+    MaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i zero = _mm_setzero_si128();
+  const uint8_t pred0_stride2 = 4 << 1;
+  const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
+  const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
+  const ptrdiff_t dst_stride2 = dst_stride << 1;
+  const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+  const __m128i max = _mm_set1_epi16(kMax10bppSample);
+  const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+  int y = height;
+  do {
+    __m128i pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+    WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1, offset, max,
+                                      shift4, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1, offset, max,
+                                      shift4, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1, offset, max,
+                                      shift4, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                      pred_mask_0, pred_mask_1, offset, max,
+                                      shift4, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+    y -= 8;
+  } while (y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp_SSE4_1(const void* prediction_0,
+                                  const void* prediction_1,
+                                  const ptrdiff_t prediction_stride_1,
+                                  const uint8_t* const mask_ptr,
+                                  const ptrdiff_t mask_stride, const int width,
+                                  const int height, void* dest,
+                                  const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  const ptrdiff_t pred_stride_0 = width;
+  const ptrdiff_t pred_stride_1 = prediction_stride_1;
+  if (width == 4) {
+    MaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst,
+        dst_stride);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i zero = _mm_setzero_si128();
+  const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
+  const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+  const __m128i max = _mm_set1_epi16(kMax10bppSample);
+  const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+          mask + (x << subsampling_x), mask_stride, zero);
+      const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
+      const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
+      // 64 - mask
+      const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+      const __m128i compound_pred_lo_0 =
+          _mm_mullo_epi16(pred_val_0, pred_mask_0);
+      const __m128i compound_pred_hi_0 =
+          _mm_mulhi_epu16(pred_val_0, pred_mask_0);
+      const __m128i compound_pred_lo_1 =
+          _mm_mullo_epi16(pred_val_1, pred_mask_1);
+      const __m128i compound_pred_hi_1 =
+          _mm_mulhi_epu16(pred_val_1, pred_mask_1);
+      const __m128i pack0_lo =
+          _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0);
+      const __m128i pack0_hi =
+          _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0);
+      const __m128i pack1_lo =
+          _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1);
+      const __m128i pack1_hi =
+          _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1);
+      const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo);
+      const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi);
+
+      const __m128i sub_0 =
+          _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset);
+      const __m128i sub_1 =
+          _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset);
+      const __m128i shift_0 =
+          RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4);
+      const __m128i shift_1 =
+          RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4);
+      const __m128i result =
+          _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max);
+      StoreUnaligned16(dst + x, result);
+      x += 8;
+    } while (x < width);
+    dst += dst_stride;
+    pred_0 += pred_stride_0;
+    pred_1 += pred_stride_1;
+    mask += mask_stride_ss;
+  } while (--y != 0);
+}
+
+inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(
+    const uint16_t* prediction_0, const uint16_t* prediction_1,
+    const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0,
+    const __m128i& pred_mask_1, const __m128i& shift6, uint16_t* dst,
+    const ptrdiff_t dst_stride) {
+  const __m128i pred_val_0 = LoadUnaligned16(prediction_0);
+  const __m128i pred_val_1 =
+      LoadHi8(LoadLo8(prediction_1), prediction_1 + pred_stride_1);
+
+  const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0);
+  const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0);
+  const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+  const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+  const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0);
+  const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1);
+  const __m128i shift_0 =
+      RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6);
+  const __m128i shift_1 =
+      RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6);
+  const __m128i res = _mm_packus_epi32(shift_0, shift_1);
+  StoreLo8(dst, res);
+  StoreHi8(dst + dst_stride, res);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
+    const uint16_t* pred_0, const uint16_t* pred_1,
+    const ptrdiff_t pred_stride_1, const uint8_t* mask,
+    const ptrdiff_t mask_stride, uint16_t* dst, const ptrdiff_t dst_stride) {
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i pred_mask_0 =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+  __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                              pred_mask_0, pred_mask_1, shift6,
+                                              dst, dst_stride);
+  pred_0 += 4 << 1;
+  pred_1 += pred_stride_1 << 1;
+  mask += mask_stride << (1 + subsampling_y);
+  dst += dst_stride << 1;
+
+  pred_mask_0 =
+      GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+  pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+  InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                              pred_mask_0, pred_mask_1, shift6,
+                                              dst, dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0,
+                                               const uint16_t* pred_1,
+                                               const ptrdiff_t pred_stride_1,
+                                               const uint8_t* const mask_ptr,
+                                               const ptrdiff_t mask_stride,
+                                               const int height, uint16_t* dst,
+                                               const ptrdiff_t dst_stride) {
+  const uint8_t* mask = mask_ptr;
+  if (height == 4) {
+    InterIntraMaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+  const uint8_t pred0_stride2 = 4 << 1;
+  const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
+  const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
+  const ptrdiff_t dst_stride2 = dst_stride << 1;
+  int y = height;
+  do {
+    __m128i pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                                pred_mask_0, pred_mask_1,
+                                                shift6, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                                pred_mask_0, pred_mask_1,
+                                                shift6, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                                pred_mask_0, pred_mask_1,
+                                                shift6, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+
+    pred_mask_0 =
+        GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+    pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+    InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+                                                pred_mask_0, pred_mask_1,
+                                                shift6, dst, dst_stride);
+    pred_0 += pred0_stride2;
+    pred_1 += pred1_stride2;
+    mask += mask_stride2;
+    dst += dst_stride2;
+    y -= 8;
+  } while (y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp_SSE4_1(
+    const void* prediction_0, const void* prediction_1,
+    const ptrdiff_t prediction_stride_1, const uint8_t* const mask_ptr,
+    const ptrdiff_t mask_stride, const int width, const int height, void* dest,
+    const ptrdiff_t dest_stride) {
+  auto* dst = static_cast<uint16_t*>(dest);
+  const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  const ptrdiff_t pred_stride_0 = width;
+  const ptrdiff_t pred_stride_1 = prediction_stride_1;
+  if (width == 4) {
+    InterIntraMaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
+        pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst,
+        dst_stride);
+    return;
+  }
+  const uint8_t* mask = mask_ptr;
+  const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+  const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+  const __m128i zero = _mm_setzero_si128();
+  const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
+  int y = height;
+  do {
+    int x = 0;
+    do {
+      const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+          mask + (x << subsampling_x), mask_stride, zero);
+      const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
+      const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
+      // 64 - mask
+      const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+      const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0);
+      const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0);
+      const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+      const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+      const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0);
+      const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1);
+      const __m128i shift_0 =
+          RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6);
+      const __m128i shift_1 =
+          RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6);
+      StoreUnaligned16(dst + x, _mm_packus_epi32(shift_0, shift_1));
+      x += 8;
+    } while (x < width);
+    dst += dst_stride;
+    pred_0 += pred_stride_0;
+    pred_1 += pred_stride_1;
+    mask += mask_stride_ss;
+  } while (--y != 0);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend444)
+  dsp->mask_blend[0][0] = MaskBlend10bpp_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend422)
+  dsp->mask_blend[1][0] = MaskBlend10bpp_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend420)
+  dsp->mask_blend[2][0] = MaskBlend10bpp_SSE4_1<1, 1>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra444)
+  dsp->mask_blend[0][1] = InterIntraMaskBlend10bpp_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra422)
+  dsp->mask_blend[1][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra420)
+  dsp->mask_blend[2][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 1>;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void MaskBlendInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 
 namespace libgav1 {
 namespace dsp {
@@ -447,4 +947,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
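
The new 10bpp MaskBlend path above follows the per-pixel computation spelled out in its comments: blend the two compound predictions with weights that sum to 64, remove the compound offset, round back down to pixel range, and clip. A scalar sketch of that computation and of the 4:2:0 mask subsampling done by GetMask4x2/GetMask8 (not part of the patch; the function names below are illustrative, while RightShiftWithRounding, Clip3 and kCompoundOffset are the existing libgav1 helpers and constants referenced by the code above):

    // One compound pixel: res = (m * p0 + (64 - m) * p1) >> 6, compound
    // offset removed, rounded by kRoundBitsMaskBlend (4), clipped to 10 bits.
    inline uint16_t MaskBlendPixel10bpp(uint16_t pred_0, uint16_t pred_1,
                                        int mask_value) {
      int res = (mask_value * pred_0 + (64 - mask_value) * pred_1) >> 6;
      res -= kCompoundOffset;
      return static_cast<uint16_t>(
          Clip3(RightShiftWithRounding(res, 4), 0, (1 << 10) - 1));
    }

    // 4:2:0 mask value: rounded average of the 2x2 block of mask bytes,
    // matching the adds/maddubs/avg sequence in GetMask4x2 and GetMask8.
    inline int SubsampledMask420(const uint8_t* mask, ptrdiff_t mask_stride,
                                 int x) {
      const int sum = mask[2 * x] + mask[2 * x + 1] +
                      mask[mask_stride + 2 * x] + mask[mask_stride + 2 * x + 1];
      return RightShiftWithRounding(sum, 2);
    }

The inter-intra variant differs only in the final step: the madd on interleaved terms computes mask * pred_1 + (64 - mask) * pred_0, rounded by 6, with no compound offset and no clip beyond the unsigned pack.
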
diff --git a/libgav1/src/dsp/x86/mask_blend_sse4.h b/libgav1/src/dsp/x86/mask_blend_sse4.h
index cfd5e9a..4a95f0c 100644
--- a/libgav1/src/dsp/x86/mask_blend_sse4.h
+++ b/libgav1/src/dsp/x86/mask_blend_sse4.h
@@ -29,13 +29,56 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend444
 #define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend422
 #define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend420
 #define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444
 #define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422
 #define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
 #define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_SSE4_1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend444
+#define LIBGAV1_Dsp10bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend422
+#define LIBGAV1_Dsp10bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend420
+#define LIBGAV1_Dsp10bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra444
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra422
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra420
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
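
The #ifndef guards added around each of these defines implement the precedence rule spelled out in obmc_sse4.h below: if a higher level of optimization has already claimed an entry, the SSE4 define is skipped and the dispatch keeps the wider-ISA function. A minimal sketch of that ordering (the AVX2 line is purely illustrative; only the guard pattern comes from this header):

    // Hypothetical wider-ISA header, processed first:
    #define LIBGAV1_Dsp10bpp_MaskBlend444 LIBGAV1_CPU_AVX2

    // mask_blend_sse4.h then leaves the entry untouched:
    #ifndef LIBGAV1_Dsp10bpp_MaskBlend444
    #define LIBGAV1_Dsp10bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1  // skipped
    #endif
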
diff --git a/libgav1/src/dsp/x86/motion_field_projection_sse4.cc b/libgav1/src/dsp/x86/motion_field_projection_sse4.cc
index 1875198..e3f2cce 100644
--- a/libgav1/src/dsp/x86/motion_field_projection_sse4.cc
+++ b/libgav1/src/dsp/x86/motion_field_projection_sse4.cc
@@ -15,7 +15,7 @@
 #include "src/dsp/motion_field_projection.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <smmintrin.h>
 
@@ -139,9 +139,9 @@
   const ptrdiff_t offset =
       static_cast<int16_t>(_mm_extract_epi16(position, idx));
   if ((idx & 3) == 0) {
-    dst_mv[offset].mv32 = _mm_cvtsi128_si32(mv);
+    dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_cvtsi128_si32(mv));
   } else {
-    dst_mv[offset].mv32 = _mm_extract_epi32(mv, idx & 3);
+    dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_extract_epi32(mv, idx & 3));
   }
   dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx);
 }
@@ -386,7 +386,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 namespace libgav1 {
 namespace dsp {
 
@@ -394,4 +394,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/motion_field_projection_sse4.h b/libgav1/src/dsp/x86/motion_field_projection_sse4.h
index 7828de5..c05422c 100644
--- a/libgav1/src/dsp/x86/motion_field_projection_sse4.h
+++ b/libgav1/src/dsp/x86/motion_field_projection_sse4.h
@@ -30,8 +30,12 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel
 #define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_SSE4_1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
diff --git a/libgav1/src/dsp/x86/motion_vector_search_sse4.cc b/libgav1/src/dsp/x86/motion_vector_search_sse4.cc
index e49be12..7f5f035 100644
--- a/libgav1/src/dsp/x86/motion_vector_search_sse4.cc
+++ b/libgav1/src/dsp/x86/motion_vector_search_sse4.cc
@@ -15,7 +15,7 @@
 #include "src/dsp/motion_vector_search.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <smmintrin.h>
 
@@ -251,7 +251,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 namespace libgav1 {
 namespace dsp {
 
@@ -259,4 +259,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/motion_vector_search_sse4.h b/libgav1/src/dsp/x86/motion_vector_search_sse4.h
index b8b0412..d65b392 100644
--- a/libgav1/src/dsp/x86/motion_vector_search_sse4.h
+++ b/libgav1/src/dsp/x86/motion_vector_search_sse4.h
@@ -30,8 +30,12 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionVectorSearch
 #define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_SSE4_1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
diff --git a/libgav1/src/dsp/x86/obmc_sse4.cc b/libgav1/src/dsp/x86/obmc_sse4.cc
index a1be5ef..c34a7f7 100644
--- a/libgav1/src/dsp/x86/obmc_sse4.cc
+++ b/libgav1/src/dsp/x86/obmc_sse4.cc
@@ -15,7 +15,7 @@
 #include "src/dsp/obmc.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <xmmintrin.h>
 
@@ -31,6 +31,7 @@
 
 namespace libgav1 {
 namespace dsp {
+namespace low_bitdepth {
 namespace {
 
 #include "src/dsp/obmc.inc"
@@ -311,13 +312,295 @@
 }
 
 }  // namespace
+}  // namespace low_bitdepth
 
-void ObmcInit_SSE4_1() { Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+constexpr int kRoundBitsObmcBlend = 6;
+
+inline void OverlapBlendFromLeft2xH_SSE4_1(
+    uint16_t* const prediction, const ptrdiff_t pred_stride, const int height,
+    const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) {
+  uint16_t* pred = prediction;
+  const uint16_t* obmc_pred = obmc_prediction;
+  const ptrdiff_t pred_stride2 = pred_stride << 1;
+  const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+  const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+  const __m128i mask_val = _mm_shufflelo_epi16(Load2(kObmcMask), 0x00);
+  // 64 - mask.
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  const __m128i masks =
+      _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val));
+  int y = height;
+  do {
+    const __m128i pred_val = Load4x2(pred, pred + pred_stride);
+    const __m128i obmc_pred_val =
+        Load4x2(obmc_pred, obmc_pred + obmc_pred_stride);
+    const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+    const __m128i result = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend);
+    const __m128i packed_result = _mm_packus_epi32(result, result);
+    Store4(pred, packed_result);
+    Store4(pred + pred_stride, _mm_srli_si128(packed_result, 4));
+    pred += pred_stride2;
+    obmc_pred += obmc_pred_stride2;
+    y -= 2;
+  } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft4xH_SSE4_1(
+    uint16_t* const prediction, const ptrdiff_t pred_stride, const int height,
+    const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) {
+  uint16_t* pred = prediction;
+  const uint16_t* obmc_pred = obmc_prediction;
+  const ptrdiff_t pred_stride2 = pred_stride << 1;
+  const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+  const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+  const __m128i mask_val = Load4(kObmcMask + 2);
+  // 64 - mask.
+  const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+  const __m128i masks =
+      _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val));
+  int y = height;
+  do {
+    const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+    const __m128i obmc_pred_val =
+        LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+    const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+    const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+    const __m128i result_lo = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms_lo, masks), kRoundBitsObmcBlend);
+    const __m128i result_hi = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms_hi, masks), kRoundBitsObmcBlend);
+    const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+    StoreLo8(pred, packed_result);
+    StoreHi8(pred + pred_stride, packed_result);
+    pred += pred_stride2;
+    obmc_pred += obmc_pred_stride2;
+    y -= 2;
+  } while (y != 0);
+}
+
+void OverlapBlendFromLeft10bpp_SSE4_1(void* const prediction,
+                                      const ptrdiff_t prediction_stride,
+                                      const int width, const int height,
+                                      const void* const obmc_prediction,
+                                      const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint16_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+  const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
+  const ptrdiff_t obmc_pred_stride =
+      obmc_prediction_stride / sizeof(obmc_pred[0]);
+
+  if (width == 2) {
+    OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+                                   obmc_pred_stride);
+    return;
+  }
+  if (width == 4) {
+    OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+                                   obmc_pred_stride);
+    return;
+  }
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const uint8_t* mask = kObmcMask + width - 2;
+  int x = 0;
+  do {
+    pred = static_cast<uint16_t*>(prediction) + x;
+    obmc_pred = static_cast<const uint16_t*>(obmc_prediction) + x;
+    const __m128i mask_val = LoadLo8(mask + x);
+    // 64 - mask
+    const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+    const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+    const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+    const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+    int y = height;
+    do {
+      const __m128i pred_val = LoadUnaligned16(pred);
+      const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+      const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+      const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+      const __m128i result_lo = RightShiftWithRounding_U32(
+          _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+      const __m128i result_hi = RightShiftWithRounding_U32(
+          _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+      StoreUnaligned16(pred, _mm_packus_epi32(result_lo, result_hi));
+
+      pred += pred_stride;
+      obmc_pred += obmc_pred_stride;
+    } while (--y != 0);
+    x += 8;
+  } while (x < width);
+}
+
+inline void OverlapBlendFromTop2xH_SSE4_1(uint16_t* const prediction,
+                                          const ptrdiff_t pred_stride,
+                                          const int height,
+                                          const uint16_t* const obmc_prediction,
+                                          const ptrdiff_t obmc_pred_stride) {
+  uint16_t* pred = prediction;
+  const uint16_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+  const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
+  const uint8_t* mask = kObmcMask + height - 2;
+  const int compute_height =
+      height - (height >> 2);  // compute_height based on 8-bit opt
+  const ptrdiff_t pred_stride2 = pred_stride << 1;
+  const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+  int y = 0;
+  do {
+    // First mask in the first half, second mask in the second half.
+    const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler);
+    const __m128i masks =
+        _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+    const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+    const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+
+    const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+    const __m128i obmc_pred_val =
+        LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+    const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
+    const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
+    const __m128i result_lo = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+    const __m128i result_hi = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+    const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+
+    Store4(pred, packed_result);
+    Store4(pred + pred_stride, _mm_srli_si128(packed_result, 8));
+    pred += pred_stride2;
+    obmc_pred += obmc_pred_stride2;
+    y += 2;
+  } while (y < compute_height);
+}
+
+inline void OverlapBlendFromTop4xH_SSE4_1(uint16_t* const prediction,
+                                          const ptrdiff_t pred_stride,
+                                          const int height,
+                                          const uint16_t* const obmc_prediction,
+                                          const ptrdiff_t obmc_pred_stride) {
+  uint16_t* pred = prediction;
+  const uint16_t* obmc_pred = obmc_prediction;
+  const __m128i mask_inverter = _mm_set1_epi16(64);
+  const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+  const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
+  const uint8_t* mask = kObmcMask + height - 2;
+  const int compute_height = height - (height >> 2);
+  const ptrdiff_t pred_stride2 = pred_stride << 1;
+  const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+  int y = 0;
+  do {
+    // First mask in the first half, second mask in the second half.
+    const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler);
+    const __m128i masks =
+        _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+    const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+    const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+
+    const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+    const __m128i obmc_pred_val =
+        LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+    const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
+    const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
+    const __m128i result_lo = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+    const __m128i result_hi = RightShiftWithRounding_U32(
+        _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+    const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+
+    StoreLo8(pred, packed_result);
+    StoreHi8(pred + pred_stride, packed_result);
+    pred += pred_stride2;
+    obmc_pred += obmc_pred_stride2;
+    y += 2;
+  } while (y < compute_height);
+}
+
+void OverlapBlendFromTop10bpp_SSE4_1(void* const prediction,
+                                     const ptrdiff_t prediction_stride,
+                                     const int width, const int height,
+                                     const void* const obmc_prediction,
+                                     const ptrdiff_t obmc_prediction_stride) {
+  auto* pred = static_cast<uint16_t*>(prediction);
+  const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+  const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
+  const ptrdiff_t obmc_pred_stride =
+      obmc_prediction_stride / sizeof(obmc_pred[0]);
+
+  if (width == 2) {
+    OverlapBlendFromTop2xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+                                  obmc_pred_stride);
+    return;
+  }
+  if (width == 4) {
+    OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+                                  obmc_pred_stride);
+    return;
+  }
+
+  const __m128i mask_inverter = _mm_set1_epi8(64);
+  const int compute_height = height - (height >> 2);
+  const uint8_t* mask = kObmcMask + height - 2;
+  pred = static_cast<uint16_t*>(prediction);
+  obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+  int y = 0;
+  do {
+    const __m128i mask_val = _mm_set1_epi8(mask[y]);
+    // 64 - mask
+    const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+    const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+    const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+    const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+    int x = 0;
+    do {
+      const __m128i pred_val = LoadUnaligned16(pred + x);
+      const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
+      const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+      const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+      const __m128i result_lo = RightShiftWithRounding_U32(
+          _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+      const __m128i result_hi = RightShiftWithRounding_U32(
+          _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+      StoreUnaligned16(pred + x, _mm_packus_epi32(result_lo, result_hi));
+      x += 8;
+    } while (x < width);
+    pred += pred_stride;
+    obmc_pred += obmc_pred_stride;
+  } while (++y < compute_height);
+}
+
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(ObmcVertical)
+  dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop10bpp_SSE4_1;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(ObmcHorizontal)
+  dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft10bpp_SSE4_1;
+#endif
+}
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void ObmcInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+}
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 
 namespace libgav1 {
 namespace dsp {
@@ -326,4 +609,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
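
The 10bpp OBMC blends above use the same weighting as the 8bpp code: each pixel is replaced by a 64-weight average of the existing prediction and the overlap prediction, with the weight taken from kObmcMask (indexed by column for the from-left case and by row for the from-top case). Because the result is a convex combination of in-range pixels, no clip beyond the unsigned pack is needed. A scalar sketch of the from-top case (not part of the patch; the function name is illustrative, kObmcMask and RightShiftWithRounding are the existing libgav1 table and helper):

    inline void OverlapBlendFromTop10bpp_C(uint16_t* pred, ptrdiff_t pred_stride,
                                           int width, int height,
                                           const uint16_t* obmc_pred,
                                           ptrdiff_t obmc_pred_stride) {
      const uint8_t* mask = kObmcMask + height - 2;
      // Mirrors compute_height = height - (height >> 2) in the SSE4 code; the
      // remaining rows are left untouched.
      const int compute_height = height - (height >> 2);
      for (int y = 0; y < compute_height; ++y) {
        const int m = mask[y];
        for (int x = 0; x < width; ++x) {
          pred[x] = static_cast<uint16_t>(
              RightShiftWithRounding(m * pred[x] + (64 - m) * obmc_pred[x], 6));
        }
        pred += pred_stride;
        obmc_pred += obmc_pred_stride;
      }
    }
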
diff --git a/libgav1/src/dsp/x86/obmc_sse4.h b/libgav1/src/dsp/x86/obmc_sse4.h
index 03669ad..448d2cf 100644
--- a/libgav1/src/dsp/x86/obmc_sse4.h
+++ b/libgav1/src/dsp/x86/obmc_sse4.h
@@ -31,13 +31,19 @@
 
 // If sse4 is enabled and the baseline isn't set due to a higher level of
 // optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 #ifndef LIBGAV1_Dsp8bpp_ObmcVertical
 #define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_SSE4_1
 #endif
 #ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal
 #define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
 #endif
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#ifndef LIBGAV1_Dsp10bpp_ObmcVertical
+#define LIBGAV1_Dsp10bpp_ObmcVertical LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcHorizontal
+#define LIBGAV1_Dsp10bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
diff --git a/libgav1/src/dsp/x86/super_res_sse4.cc b/libgav1/src/dsp/x86/super_res_sse4.cc
index 050bcc4..85d05bc 100644
--- a/libgav1/src/dsp/x86/super_res_sse4.cc
+++ b/libgav1/src/dsp/x86/super_res_sse4.cc
@@ -15,13 +15,15 @@
 #include "src/dsp/super_res.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <smmintrin.h>
 
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
 #include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
 #include "src/utils/constants.h"
 
 namespace libgav1 {
@@ -30,12 +32,153 @@
 namespace {
 
 // Upscale_Filter as defined in AV1 Section 7.16
+// Stored negated so the largest tap (128) fits in int8_t.
+alignas(16) const int8_t
+    kNegativeUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
+        {0, 0, 0, -128, 0, 0, 0, 0},       {0, 0, 1, -128, -2, 1, 0, 0},
+        {0, -1, 3, -127, -4, 2, -1, 0},    {0, -1, 4, -127, -6, 3, -1, 0},
+        {0, -2, 6, -126, -8, 3, -1, 0},    {0, -2, 7, -125, -11, 4, -1, 0},
+        {1, -2, 8, -125, -13, 5, -2, 0},   {1, -3, 9, -124, -15, 6, -2, 0},
+        {1, -3, 10, -123, -18, 6, -2, 1},  {1, -3, 11, -122, -20, 7, -3, 1},
+        {1, -4, 12, -121, -22, 8, -3, 1},  {1, -4, 13, -120, -25, 9, -3, 1},
+        {1, -4, 14, -118, -28, 9, -3, 1},  {1, -4, 15, -117, -30, 10, -4, 1},
+        {1, -5, 16, -116, -32, 11, -4, 1}, {1, -5, 16, -114, -35, 12, -4, 1},
+        {1, -5, 17, -112, -38, 12, -4, 1}, {1, -5, 18, -111, -40, 13, -5, 1},
+        {1, -5, 18, -109, -43, 14, -5, 1}, {1, -6, 19, -107, -45, 14, -5, 1},
+        {1, -6, 19, -105, -48, 15, -5, 1}, {1, -6, 19, -103, -51, 16, -5, 1},
+        {1, -6, 20, -101, -53, 16, -6, 1}, {1, -6, 20, -99, -56, 17, -6, 1},
+        {1, -6, 20, -97, -58, 17, -6, 1},  {1, -6, 20, -95, -61, 18, -6, 1},
+        {2, -7, 20, -93, -64, 18, -6, 2},  {2, -7, 20, -91, -66, 19, -6, 1},
+        {2, -7, 20, -88, -69, 19, -6, 1},  {2, -7, 20, -86, -71, 19, -6, 1},
+        {2, -7, 20, -84, -74, 20, -7, 2},  {2, -7, 20, -81, -76, 20, -7, 1},
+        {2, -7, 20, -79, -79, 20, -7, 2},  {1, -7, 20, -76, -81, 20, -7, 2},
+        {2, -7, 20, -74, -84, 20, -7, 2},  {1, -6, 19, -71, -86, 20, -7, 2},
+        {1, -6, 19, -69, -88, 20, -7, 2},  {1, -6, 19, -66, -91, 20, -7, 2},
+        {2, -6, 18, -64, -93, 20, -7, 2},  {1, -6, 18, -61, -95, 20, -6, 1},
+        {1, -6, 17, -58, -97, 20, -6, 1},  {1, -6, 17, -56, -99, 20, -6, 1},
+        {1, -6, 16, -53, -101, 20, -6, 1}, {1, -5, 16, -51, -103, 19, -6, 1},
+        {1, -5, 15, -48, -105, 19, -6, 1}, {1, -5, 14, -45, -107, 19, -6, 1},
+        {1, -5, 14, -43, -109, 18, -5, 1}, {1, -5, 13, -40, -111, 18, -5, 1},
+        {1, -4, 12, -38, -112, 17, -5, 1}, {1, -4, 12, -35, -114, 16, -5, 1},
+        {1, -4, 11, -32, -116, 16, -5, 1}, {1, -4, 10, -30, -117, 15, -4, 1},
+        {1, -3, 9, -28, -118, 14, -4, 1},  {1, -3, 9, -25, -120, 13, -4, 1},
+        {1, -3, 8, -22, -121, 12, -4, 1},  {1, -3, 7, -20, -122, 11, -3, 1},
+        {1, -2, 6, -18, -123, 10, -3, 1},  {0, -2, 6, -15, -124, 9, -3, 1},
+        {0, -2, 5, -13, -125, 8, -2, 1},   {0, -1, 4, -11, -125, 7, -2, 0},
+        {0, -1, 3, -8, -126, 6, -2, 0},    {0, -1, 3, -6, -127, 4, -1, 0},
+        {0, -1, 2, -4, -127, 3, -1, 0},    {0, 0, 1, -2, -128, 1, 0, 0},
+};
+
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+                                 const int initial_subpixel_x, const int step,
+                                 void* const coefficients) {
+  auto* dst = static_cast<uint8_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 4);
+  do {
+    for (int i = 0; i < 8; ++i, dst += 16) {
+      int remainder = subpixel_x & kSuperResScaleMask;
+      __m128i filter =
+          LoadLo8(kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+      subpixel_x += step;
+      remainder = subpixel_x & kSuperResScaleMask;
+      filter = LoadHi8(filter,
+                       kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+      subpixel_x += step;
+      StoreAligned16(dst, filter);
+    }
+  } while (--x != 0);
+}
+
+void SuperRes_SSE4_1(const void* const coefficients, void* const source,
+                     const ptrdiff_t source_stride, const int height,
+                     const int downscaled_width, const int upscaled_width,
+                     const int initial_subpixel_x, const int step,
+                     void* const dest, const ptrdiff_t dest_stride) {
+  auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint8_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint8_t*>(coefficients);
+    uint8_t* dst_ptr = dst;
+    ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                        kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+    int subpixel_x = initial_subpixel_x;
+    // The code below computes up to 15 extra upscaled pixels, which over-read
+    // up to 15 downscaled pixels at the end of each row.
+    // kSuperResHorizontalPadding keeps this from causing segmentation faults
+    // or threading issues.
+    int x = RightShiftWithCeiling(upscaled_width, 4);
+    do {
+      __m128i weighted_src[8];
+      for (int i = 0; i < 8; ++i, filter += 16) {
+        // TODO(b/178652672): Remove Msan loads when hadd bug is resolved.
+        // It's fine to write uninitialized bytes outside the frame, but the
+        // inside-frame pixels are incorrectly labeled uninitialized if
+        // uninitialized values go through the hadd intrinsics.
+        // |src| is offset 4 pixels to the left, and there are 4 extended border
+        // pixels, so a difference of 0 from |downscaled_width| indicates 8 good
+        // bytes. A difference of 1 indicates 7 good bytes.
+        const int msan_bytes_lo =
+            (subpixel_x >> kSuperResScaleBits) - downscaled_width;
+        __m128i s =
+            LoadLo8Msan(&src[subpixel_x >> kSuperResScaleBits], msan_bytes_lo);
+        subpixel_x += step;
+        const int msan_bytes_hi =
+            (subpixel_x >> kSuperResScaleBits) - downscaled_width;
+        s = LoadHi8Msan(s, &src[subpixel_x >> kSuperResScaleBits],
+                        msan_bytes_hi);
+        subpixel_x += step;
+        const __m128i f = LoadAligned16(filter);
+        weighted_src[i] = _mm_maddubs_epi16(s, f);
+      }
+
+      __m128i a[4];
+      a[0] = _mm_hadd_epi16(weighted_src[0], weighted_src[1]);
+      a[1] = _mm_hadd_epi16(weighted_src[2], weighted_src[3]);
+      a[2] = _mm_hadd_epi16(weighted_src[4], weighted_src[5]);
+      a[3] = _mm_hadd_epi16(weighted_src[6], weighted_src[7]);
+      Transpose2x16_U16(a, a);
+      a[0] = _mm_adds_epi16(a[0], a[1]);
+      a[1] = _mm_adds_epi16(a[2], a[3]);
+      const __m128i rounding = _mm_set1_epi16(1 << (kFilterBits - 1));
+      a[0] = _mm_subs_epi16(rounding, a[0]);
+      a[1] = _mm_subs_epi16(rounding, a[1]);
+      a[0] = _mm_srai_epi16(a[0], kFilterBits);
+      a[1] = _mm_srai_epi16(a[1], kFilterBits);
+      StoreAligned16(dst_ptr, _mm_packus_epi16(a[0], a[1]));
+      dst_ptr += 16;
+    } while (--x != 0);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
+}
+
+void Init8bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+#if DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients)
+  dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+#endif  // DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients)
+#if DSP_ENABLED_8BPP_SSE4_1(SuperRes)
+  dsp->super_res = SuperRes_SSE4_1;
+#endif  // DSP_ENABLED_8BPP_SSE4_1(SuperRes)
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Upscale_Filter as defined in AV1 Section 7.16
 alignas(16) const int16_t
     kUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
-        {-0, 0, -0, 128, 0, -0, 0, -0},    {-0, 0, -1, 128, 2, -1, 0, -0},
-        {-0, 1, -3, 127, 4, -2, 1, -0},    {-0, 1, -4, 127, 6, -3, 1, -0},
-        {-0, 2, -6, 126, 8, -3, 1, -0},    {-0, 2, -7, 125, 11, -4, 1, -0},
-        {-1, 2, -8, 125, 13, -5, 2, -0},   {-1, 3, -9, 124, 15, -6, 2, -0},
+        {0, 0, 0, 128, 0, 0, 0, 0},        {0, 0, -1, 128, 2, -1, 0, 0},
+        {0, 1, -3, 127, 4, -2, 1, 0},      {0, 1, -4, 127, 6, -3, 1, 0},
+        {0, 2, -6, 126, 8, -3, 1, 0},      {0, 2, -7, 125, 11, -4, 1, 0},
+        {-1, 2, -8, 125, 13, -5, 2, 0},    {-1, 3, -9, 124, 15, -6, 2, 0},
         {-1, 3, -10, 123, 18, -6, 2, -1},  {-1, 3, -11, 122, 20, -7, 3, -1},
         {-1, 4, -12, 121, 22, -8, 3, -1},  {-1, 4, -13, 120, 25, -9, 3, -1},
         {-1, 4, -14, 118, 28, -9, 3, -1},  {-1, 4, -15, 117, 30, -10, 4, -1},
@@ -60,91 +203,111 @@
         {-1, 4, -11, 32, 116, -16, 5, -1}, {-1, 4, -10, 30, 117, -15, 4, -1},
         {-1, 3, -9, 28, 118, -14, 4, -1},  {-1, 3, -9, 25, 120, -13, 4, -1},
         {-1, 3, -8, 22, 121, -12, 4, -1},  {-1, 3, -7, 20, 122, -11, 3, -1},
-        {-1, 2, -6, 18, 123, -10, 3, -1},  {-0, 2, -6, 15, 124, -9, 3, -1},
-        {-0, 2, -5, 13, 125, -8, 2, -1},   {-0, 1, -4, 11, 125, -7, 2, -0},
-        {-0, 1, -3, 8, 126, -6, 2, -0},    {-0, 1, -3, 6, 127, -4, 1, -0},
-        {-0, 1, -2, 4, 127, -3, 1, -0},    {-0, 0, -1, 2, 128, -1, 0, -0},
+        {-1, 2, -6, 18, 123, -10, 3, -1},  {0, 2, -6, 15, 124, -9, 3, -1},
+        {0, 2, -5, 13, 125, -8, 2, -1},    {0, 1, -4, 11, 125, -7, 2, 0},
+        {0, 1, -3, 8, 126, -6, 2, 0},      {0, 1, -3, 6, 127, -4, 1, 0},
+        {0, 1, -2, 4, 127, -3, 1, 0},      {0, 0, -1, 2, 128, -1, 0, 0},
 };
 
-inline void ComputeSuperRes4(const uint8_t* src, uint8_t* dst_x, int step,
-                             int* p) {
-  __m128i weighted_src[4];
-  for (int i = 0; i < 4; ++i, *p += step) {
-    const __m128i src_x = LoadLo8(&src[*p >> kSuperResScaleBits]);
-    const int remainder = *p & kSuperResScaleMask;
-    const __m128i filter =
-        LoadUnaligned16(kUpscaleFilter[remainder >> kSuperResExtraBits]);
-    weighted_src[i] = _mm_madd_epi16(_mm_cvtepu8_epi16(src_x), filter);
-  }
-
-  // Pairwise add is chosen in favor of transpose and add because of the
-  // ability to take advantage of madd.
-  const __m128i res0 = _mm_hadd_epi32(weighted_src[0], weighted_src[1]);
-  const __m128i res1 = _mm_hadd_epi32(weighted_src[2], weighted_src[3]);
-  const __m128i result0 = _mm_hadd_epi32(res0, res1);
-  const __m128i result = _mm_packus_epi32(
-      RightShiftWithRounding_S32(result0, kFilterBits), result0);
-  Store4(dst_x, _mm_packus_epi16(result, result));
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+                                 const int initial_subpixel_x, const int step,
+                                 void* const coefficients) {
+  auto* dst = static_cast<uint16_t*>(coefficients);
+  int subpixel_x = initial_subpixel_x;
+  int x = RightShiftWithCeiling(upscaled_width, 3);
+  do {
+    for (int i = 0; i < 8; ++i, dst += 8) {
+      int remainder = subpixel_x & kSuperResScaleMask;
+      __m128i filter =
+          LoadAligned16(kUpscaleFilter[remainder >> kSuperResExtraBits]);
+      subpixel_x += step;
+      StoreAligned16(dst, filter);
+    }
+  } while (--x != 0);
 }
 
-inline void ComputeSuperRes8(const uint8_t* src, uint8_t* dst_x, int step,
-                             int* p) {
-  __m128i weighted_src[8];
-  for (int i = 0; i < 8; ++i, *p += step) {
-    const __m128i src_x = LoadLo8(&src[*p >> kSuperResScaleBits]);
-    const int remainder = *p & kSuperResScaleMask;
-    const __m128i filter =
-        LoadUnaligned16(kUpscaleFilter[remainder >> kSuperResExtraBits]);
-    weighted_src[i] = _mm_madd_epi16(_mm_cvtepu8_epi16(src_x), filter);
-  }
+template <int bitdepth>
+void SuperRes_SSE4_1(const void* const coefficients, void* const source,
+                     const ptrdiff_t source_stride, const int height,
+                     const int downscaled_width, const int upscaled_width,
+                     const int initial_subpixel_x, const int step,
+                     void* const dest, const ptrdiff_t dest_stride) {
+  auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+  auto* dst = static_cast<uint16_t*>(dest);
+  int y = height;
+  do {
+    const auto* filter = static_cast<const uint16_t*>(coefficients);
+    uint16_t* dst_ptr = dst;
+    ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+                         kSuperResHorizontalBorder, kSuperResHorizontalPadding);
+    int subpixel_x = initial_subpixel_x;
+    // The code below computes up to 7 extra upscaled pixels, which over-read
+    // up to 7 downscaled pixels at the end of each row.
+    // kSuperResHorizontalPadding accounts for this.
+    int x = RightShiftWithCeiling(upscaled_width, 3);
+    do {
+      __m128i weighted_src[8];
+      for (int i = 0; i < 8; ++i, filter += 8) {
+        const __m128i s =
+            LoadUnaligned16(&src[subpixel_x >> kSuperResScaleBits]);
+        subpixel_x += step;
+        const __m128i f = LoadAligned16(filter);
+        weighted_src[i] = _mm_madd_epi16(s, f);
+      }
 
-  // Pairwise add is chosen in favor of transpose and add because of the
-  // ability to take advantage of madd.
-  const __m128i res0 = _mm_hadd_epi32(weighted_src[0], weighted_src[1]);
-  const __m128i res1 = _mm_hadd_epi32(weighted_src[2], weighted_src[3]);
-  const __m128i res2 = _mm_hadd_epi32(weighted_src[4], weighted_src[5]);
-  const __m128i res3 = _mm_hadd_epi32(weighted_src[6], weighted_src[7]);
-  const __m128i result0 = _mm_hadd_epi32(res0, res1);
-  const __m128i result1 = _mm_hadd_epi32(res2, res3);
-  const __m128i result =
-      _mm_packus_epi32(RightShiftWithRounding_S32(result0, kFilterBits),
-                       RightShiftWithRounding_S32(result1, kFilterBits));
-  StoreLo8(dst_x, _mm_packus_epi16(result, result));
+      __m128i a[4];
+      a[0] = _mm_hadd_epi32(weighted_src[0], weighted_src[1]);
+      a[1] = _mm_hadd_epi32(weighted_src[2], weighted_src[3]);
+      a[2] = _mm_hadd_epi32(weighted_src[4], weighted_src[5]);
+      a[3] = _mm_hadd_epi32(weighted_src[6], weighted_src[7]);
+
+      a[0] = _mm_hadd_epi32(a[0], a[1]);
+      a[1] = _mm_hadd_epi32(a[2], a[3]);
+      a[0] = RightShiftWithRounding_S32(a[0], kFilterBits);
+      a[1] = RightShiftWithRounding_S32(a[1], kFilterBits);
+
+      // Clip the values at (1 << bd) - 1
+      const __m128i clipped_16 = _mm_min_epi16(
+          _mm_packus_epi32(a[0], a[1]), _mm_set1_epi16((1 << bitdepth) - 1));
+      StoreAligned16(dst_ptr, clipped_16);
+      dst_ptr += 8;
+    } while (--x != 0);
+    src += source_stride;
+    dst += dest_stride;
+  } while (--y != 0);
 }
 
-void ComputeSuperRes_SSE4_1(const void* source, const int upscaled_width,
-                            const int initial_subpixel_x, const int step,
-                            void* const dest) {
-  const auto* src = static_cast<const uint8_t*>(source);
-  auto* dst = static_cast<uint8_t*>(dest);
-  src -= kSuperResFilterTaps >> 1;
-
-  int p = initial_subpixel_x;
-  int x = 0;
-  for (; x < (upscaled_width & ~7); x += 8) {
-    ComputeSuperRes8(src, &dst[x], step, &p);
-  }
-  // The below code can overwrite at most 3 bytes and overread at most 7.
-  // kSuperResHorizontalBorder accounts for this.
-  for (; x < upscaled_width; x += 4) {
-    ComputeSuperRes4(src, &dst[x], step, &p);
-  }
-}
-
-void Init8bpp() {
-  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
-  dsp->super_res_row = ComputeSuperRes_SSE4_1;
+void Init10bpp() {
+  Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(SuperResCoefficients)
+  dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+#else
+  static_cast<void>(SuperResCoefficients_SSE4_1);
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(SuperRes)
+  dsp->super_res = SuperRes_SSE4_1<10>;
+#else
+  static_cast<void>(SuperRes_SSE4_1);
+#endif
 }
 
 }  // namespace
-}  // namespace low_bitdepth
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
 
-void SuperResInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+void SuperResInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 
 namespace libgav1 {
 namespace dsp {
@@ -153,4 +316,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/super_res_sse4.h b/libgav1/src/dsp/x86/super_res_sse4.h
index 5673ca5..07a7ef4 100644
--- a/libgav1/src/dsp/x86/super_res_sse4.h
+++ b/libgav1/src/dsp/x86/super_res_sse4.h
@@ -29,8 +29,22 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_SuperResCoefficients
+#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SuperRes
 #define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_SSE4_1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SuperResCoefficients
+#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SuperRes
+#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
diff --git a/libgav1/src/dsp/x86/transpose_sse4.h b/libgav1/src/dsp/x86/transpose_sse4.h
index cd61c92..9726495 100644
--- a/libgav1/src/dsp/x86/transpose_sse4.h
+++ b/libgav1/src/dsp/x86/transpose_sse4.h
@@ -20,12 +20,46 @@
 #include "src/utils/compiler_attributes.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 #include <emmintrin.h>
 
 namespace libgav1 {
 namespace dsp {
 
+LIBGAV1_ALWAYS_INLINE void Transpose2x16_U16(const __m128i* const in,
+                                             __m128i* const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]:  00 01 10 11  20 21 30 31
+  // in[1]:  40 41 50 51  60 61 70 71
+  // in[2]:  80 81 90 91  a0 a1 b0 b1
+  // in[3]:  c0 c1 d0 d1  e0 e1 f0 f1
+  // to:
+  // a0:     00 40 01 41  10 50 11 51
+  // a1:     20 60 21 61  30 70 31 71
+  // a2:     80 c0 81 c1  90 d0 91 d1
+  // a3:     a0 e0 a1 e1  b0 f0 b1 f1
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i a2 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a3 = _mm_unpackhi_epi16(in[2], in[3]);
+  // b0:     00 20 40 60  01 21 41 61
+  // b1:     10 30 50 70  11 31 51 71
+  // b2:     80 a0 c0 e0  81 a1 c1 e1
+  // b3:     90 b0 d0 f0  91 b1 d1 f1
+  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+  // out[0]: 00 10 20 30  40 50 60 70
+  // out[1]: 01 11 21 31  41 51 61 71
+  // out[2]: 80 90 a0 b0  c0 d0 e0 f0
+  // out[3]: 81 91 a1 b1  c1 d1 e1 f1
+  out[0] = _mm_unpacklo_epi16(b0, b1);
+  out[1] = _mm_unpackhi_epi16(b0, b1);
+  out[2] = _mm_unpacklo_epi16(b2, b3);
+  out[3] = _mm_unpackhi_epi16(b2, b3);
+}
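
In scalar terms, the unpack sequence above treats its four input vectors as a 16x2 tile of uint16_t (rows 0..f, columns 0..1) and emits it column-major, keeping rows 0..7 and rows 8..f in separate output pairs. A minimal reference model, for illustration only and not part of the patch:

  void Transpose2x16Reference(const uint16_t in[16][2], uint16_t out[4][8]) {
    for (int r = 0; r < 16; ++r) {
      for (int c = 0; c < 2; ++c) {
        // out[0]/out[1] hold columns 0/1 of rows 0..7; out[2]/out[3] hold
        // columns 0/1 of rows 8..15.
        out[(r >> 3) * 2 + c][r & 7] = in[r][c];
      }
    }
  }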
+
 LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) {
   // Unpack 8 bit elements. Goes from:
   // in[0]: 00 01 02 03
@@ -269,5 +303,5 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
 #endif  // LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
diff --git a/libgav1/src/dsp/x86/warp_sse4.cc b/libgav1/src/dsp/x86/warp_sse4.cc
index 4c9e716..9ddfeac 100644
--- a/libgav1/src/dsp/x86/warp_sse4.cc
+++ b/libgav1/src/dsp/x86/warp_sse4.cc
@@ -15,7 +15,7 @@
 #include "src/dsp/warp.h"
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <smmintrin.h>
 
@@ -513,7 +513,7 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 
 namespace libgav1 {
 namespace dsp {
@@ -522,4 +522,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/warp_sse4.h b/libgav1/src/dsp/x86/warp_sse4.h
index 51fbf43..a2dc5ca 100644
--- a/libgav1/src/dsp/x86/warp_sse4.h
+++ b/libgav1/src/dsp/x86/warp_sse4.h
@@ -29,9 +29,16 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_Warp
 #define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WarpCompound
 #define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_SSE4_1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif
+
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
diff --git a/libgav1/src/dsp/x86/weight_mask_sse4.cc b/libgav1/src/dsp/x86/weight_mask_sse4.cc
index 9d9d9c4..08a1739 100644
--- a/libgav1/src/dsp/x86/weight_mask_sse4.cc
+++ b/libgav1/src/dsp/x86/weight_mask_sse4.cc
@@ -16,7 +16,7 @@
 
 #include "src/utils/cpu.h"
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
 
 #include <smmintrin.h>
 
@@ -36,47 +36,65 @@
 
 constexpr int kRoundingBits8bpp = 4;
 
-template <bool mask_is_inverse>
-inline void WeightMask8_SSE4(const int16_t* prediction_0,
-                             const int16_t* prediction_1, uint8_t* mask) {
-  const __m128i pred_0 = LoadAligned16(prediction_0);
-  const __m128i pred_1 = LoadAligned16(prediction_1);
-  const __m128i difference = RightShiftWithRounding_U16(
-      _mm_abs_epi16(_mm_sub_epi16(pred_0, pred_1)), kRoundingBits8bpp);
-  const __m128i scaled_difference = _mm_srli_epi16(difference, 4);
+template <bool mask_is_inverse, bool is_store_16>
+inline void WeightMask16_SSE4(const int16_t* prediction_0,
+                              const int16_t* prediction_1, uint8_t* mask,
+                              ptrdiff_t mask_stride) {
+  const __m128i pred_00 = LoadAligned16(prediction_0);
+  const __m128i pred_10 = LoadAligned16(prediction_1);
+  const __m128i difference_0 = RightShiftWithRounding_U16(
+      _mm_abs_epi16(_mm_sub_epi16(pred_00, pred_10)), kRoundingBits8bpp);
+  const __m128i scaled_difference_0 = _mm_srli_epi16(difference_0, 4);
+
+  const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+  const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+  const __m128i difference_1 = RightShiftWithRounding_U16(
+      _mm_abs_epi16(_mm_sub_epi16(pred_01, pred_11)), kRoundingBits8bpp);
+  const __m128i scaled_difference_1 = _mm_srli_epi16(difference_1, 4);
+
   const __m128i difference_offset = _mm_set1_epi8(38);
   const __m128i adjusted_difference =
-      _mm_adds_epu8(_mm_packus_epi16(scaled_difference, scaled_difference),
+      _mm_adds_epu8(_mm_packus_epi16(scaled_difference_0, scaled_difference_1),
                     difference_offset);
   const __m128i mask_ceiling = _mm_set1_epi8(64);
   const __m128i mask_value = _mm_min_epi8(adjusted_difference, mask_ceiling);
   if (mask_is_inverse) {
     const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value);
-    StoreLo8(mask, inverted_mask_value);
+    if (is_store_16) {
+      StoreAligned16(mask, inverted_mask_value);
+    } else {
+      StoreLo8(mask, inverted_mask_value);
+      StoreHi8(mask + mask_stride, inverted_mask_value);
+    }
   } else {
-    StoreLo8(mask, mask_value);
+    if (is_store_16) {
+      StoreAligned16(mask, mask_value);
+    } else {
+      StoreLo8(mask, mask_value);
+      StoreHi8(mask + mask_stride, mask_value);
+    }
   }
 }
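
The per-pixel rule implemented above can be written as a scalar sketch. WeightMaskPixel is a hypothetical helper for illustration only; the vector code evaluates 16 pixels per call and, for 8-wide blocks, splits the 16 results across two rows via StoreLo8/StoreHi8.

  uint8_t WeightMaskPixel(int16_t pred_0, int16_t pred_1,
                          bool mask_is_inverse) {
    const int diff = pred_0 - pred_1;
    // Drop the compound prediction precision bits (4 for 8bpp), scale the
    // absolute difference down by 16, offset by 38 and cap at the ceiling 64.
    const int rounded =
        RightShiftWithRounding(diff < 0 ? -diff : diff, kRoundingBits8bpp);
    const int mask_value = std::min(64, (rounded >> 4) + 38);
    return static_cast<uint8_t>(mask_is_inverse ? 64 - mask_value : mask_value);
  }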
 
-#define WEIGHT8_WITHOUT_STRIDE \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask)
+#define WEIGHT8_PAIR_WITHOUT_STRIDE \
+  WeightMask16_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride)
 
-#define WEIGHT8_AND_STRIDE \
-  WEIGHT8_WITHOUT_STRIDE;  \
-  pred_0 += 8;             \
-  pred_1 += 8;             \
-  mask += mask_stride
+#define WEIGHT8_PAIR_AND_STRIDE \
+  WEIGHT8_PAIR_WITHOUT_STRIDE;  \
+  pred_0 += 8 << 1;             \
+  pred_1 += 8 << 1;             \
+  mask += mask_stride << 1
 
 template <bool mask_is_inverse>
 void WeightMask8x8_SSE4(const void* prediction_0, const void* prediction_1,
                         uint8_t* mask, ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
-  int y = 0;
-  do {
-    WEIGHT8_AND_STRIDE;
-  } while (++y < 7);
-  WEIGHT8_WITHOUT_STRIDE;
+
+  WEIGHT8_PAIR_AND_STRIDE;
+  WEIGHT8_PAIR_AND_STRIDE;
+  WEIGHT8_PAIR_AND_STRIDE;
+  WEIGHT8_PAIR_WITHOUT_STRIDE;
 }
 
 template <bool mask_is_inverse>
@@ -84,13 +102,13 @@
                          uint8_t* mask, ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
-  int y3 = 0;
+  int y3 = 3;
   do {
-    WEIGHT8_AND_STRIDE;
-    WEIGHT8_AND_STRIDE;
-    WEIGHT8_AND_STRIDE;
-  } while (++y3 < 5);
-  WEIGHT8_WITHOUT_STRIDE;
+    WEIGHT8_PAIR_AND_STRIDE;
+    WEIGHT8_PAIR_AND_STRIDE;
+  } while (--y3 != 0);
+  WEIGHT8_PAIR_AND_STRIDE;
+  WEIGHT8_PAIR_WITHOUT_STRIDE;
 }
 
 template <bool mask_is_inverse>
@@ -98,21 +116,17 @@
                          uint8_t* mask, ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
-  int y5 = 0;
+  int y5 = 5;
   do {
-    WEIGHT8_AND_STRIDE;
-    WEIGHT8_AND_STRIDE;
-    WEIGHT8_AND_STRIDE;
-    WEIGHT8_AND_STRIDE;
-    WEIGHT8_AND_STRIDE;
-  } while (++y5 < 6);
-  WEIGHT8_AND_STRIDE;
-  WEIGHT8_WITHOUT_STRIDE;
+    WEIGHT8_PAIR_AND_STRIDE;
+    WEIGHT8_PAIR_AND_STRIDE;
+    WEIGHT8_PAIR_AND_STRIDE;
+  } while (--y5 != 0);
+  WEIGHT8_PAIR_WITHOUT_STRIDE;
 }
 
-#define WEIGHT16_WITHOUT_STRIDE                            \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8)
+#define WEIGHT16_WITHOUT_STRIDE \
+  WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride)
 
 #define WEIGHT16_AND_STRIDE \
   WEIGHT16_WITHOUT_STRIDE;  \
@@ -125,10 +139,10 @@
                          uint8_t* mask, ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
-  int y = 0;
+  int y = 7;
   do {
     WEIGHT16_AND_STRIDE;
-  } while (++y < 7);
+  } while (--y != 0);
   WEIGHT16_WITHOUT_STRIDE;
 }
 
@@ -137,12 +151,12 @@
                           uint8_t* mask, ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
-  int y3 = 0;
+  int y3 = 5;
   do {
     WEIGHT16_AND_STRIDE;
     WEIGHT16_AND_STRIDE;
     WEIGHT16_AND_STRIDE;
-  } while (++y3 < 5);
+  } while (--y3 != 0);
   WEIGHT16_WITHOUT_STRIDE;
 }
 
@@ -151,14 +165,14 @@
                           uint8_t* mask, ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
-  int y5 = 0;
+  int y5 = 6;
   do {
     WEIGHT16_AND_STRIDE;
     WEIGHT16_AND_STRIDE;
     WEIGHT16_AND_STRIDE;
     WEIGHT16_AND_STRIDE;
     WEIGHT16_AND_STRIDE;
-  } while (++y5 < 6);
+  } while (--y5 != 0);
   WEIGHT16_AND_STRIDE;
   WEIGHT16_WITHOUT_STRIDE;
 }
@@ -168,20 +182,19 @@
                           uint8_t* mask, ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
-  int y3 = 0;
+  int y3 = 21;
   do {
     WEIGHT16_AND_STRIDE;
     WEIGHT16_AND_STRIDE;
     WEIGHT16_AND_STRIDE;
-  } while (++y3 < 21);
+  } while (--y3 != 0);
   WEIGHT16_WITHOUT_STRIDE;
 }
 
-#define WEIGHT32_WITHOUT_STRIDE                                           \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask);                \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8);    \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24)
+#define WEIGHT32_WITHOUT_STRIDE                                                \
+  WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \
+  WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16,           \
+                                           mask + 16, mask_stride)
 
 #define WEIGHT32_AND_STRIDE \
   WEIGHT32_WITHOUT_STRIDE;  \
@@ -209,12 +222,12 @@
                           uint8_t* mask, ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
-  int y3 = 0;
+  int y3 = 5;
   do {
     WEIGHT32_AND_STRIDE;
     WEIGHT32_AND_STRIDE;
     WEIGHT32_AND_STRIDE;
-  } while (++y3 < 5);
+  } while (--y3 != 0);
   WEIGHT32_WITHOUT_STRIDE;
 }
 
@@ -223,14 +236,14 @@
                           uint8_t* mask, ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
-  int y5 = 0;
+  int y5 = 6;
   do {
     WEIGHT32_AND_STRIDE;
     WEIGHT32_AND_STRIDE;
     WEIGHT32_AND_STRIDE;
     WEIGHT32_AND_STRIDE;
     WEIGHT32_AND_STRIDE;
-  } while (++y5 < 6);
+  } while (--y5 != 0);
   WEIGHT32_AND_STRIDE;
   WEIGHT32_WITHOUT_STRIDE;
 }
@@ -240,24 +253,23 @@
                           uint8_t* mask, ptrdiff_t mask_stride) {
   const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
   const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
-  int y3 = 0;
+  int y3 = 21;
   do {
     WEIGHT32_AND_STRIDE;
     WEIGHT32_AND_STRIDE;
     WEIGHT32_AND_STRIDE;
-  } while (++y3 < 21);
+  } while (--y3 != 0);
   WEIGHT32_WITHOUT_STRIDE;
 }
 
-#define WEIGHT64_WITHOUT_STRIDE                                           \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask);                \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8);    \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24); \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 32, pred_1 + 32, mask + 32); \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 40, pred_1 + 40, mask + 40); \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 48, pred_1 + 48, mask + 48); \
-  WeightMask8_SSE4<mask_is_inverse>(pred_0 + 56, pred_1 + 56, mask + 56)
+#define WEIGHT64_WITHOUT_STRIDE                                                \
+  WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \
+  WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16,           \
+                                           mask + 16, mask_stride);            \
+  WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32,           \
+                                           mask + 32, mask_stride);            \
+  WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48,           \
+                                           mask + 48, mask_stride)
 
 #define WEIGHT64_AND_STRIDE \
   WEIGHT64_WITHOUT_STRIDE;  \
@@ -447,12 +459,491 @@
 }  // namespace
 }  // namespace low_bitdepth
 
-void WeightMaskInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kRoundingBits10bpp = 6;
+constexpr int kScaledDiffShift = 4;
+
+template <bool mask_is_inverse, bool is_store_16>
+inline void WeightMask16_10bpp_SSE4(const uint16_t* prediction_0,
+                                    const uint16_t* prediction_1, uint8_t* mask,
+                                    ptrdiff_t mask_stride) {
+  const __m128i diff_offset = _mm_set1_epi8(38);
+  const __m128i mask_ceiling = _mm_set1_epi8(64);
+  const __m128i zero = _mm_setzero_si128();
+
+  // Range of prediction: [3988, 61532].
+  const __m128i pred_00 = LoadAligned16(prediction_0);
+  const __m128i pred_10 = LoadAligned16(prediction_1);
+  const __m128i pred_lo_00 = _mm_cvtepu16_epi32(pred_00);
+  const __m128i pred_lo_10 = _mm_cvtepu16_epi32(pred_10);
+  const __m128i diff_lo_0 = RightShiftWithRounding_U32(
+      _mm_abs_epi32(_mm_sub_epi32(pred_lo_00, pred_lo_10)), kRoundingBits10bpp);
+
+  const __m128i pred_hi_00 = _mm_unpackhi_epi16(pred_00, zero);
+  const __m128i pred_hi_10 = _mm_unpackhi_epi16(pred_10, zero);
+  const __m128i diff_hi_0 = RightShiftWithRounding_U32(
+      _mm_abs_epi32(_mm_sub_epi32(pred_hi_00, pred_hi_10)), kRoundingBits10bpp);
+
+  const __m128i diff_0 = _mm_packus_epi32(diff_lo_0, diff_hi_0);
+  const __m128i scaled_diff_0 = _mm_srli_epi16(diff_0, kScaledDiffShift);
+
+  const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+  const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+  const __m128i pred_lo_01 = _mm_cvtepu16_epi32(pred_01);
+  const __m128i pred_lo_11 = _mm_cvtepu16_epi32(pred_11);
+  const __m128i diff_lo_1 = RightShiftWithRounding_U32(
+      _mm_abs_epi32(_mm_sub_epi32(pred_lo_01, pred_lo_11)), kRoundingBits10bpp);
+
+  const __m128i pred_hi_01 = _mm_unpackhi_epi16(pred_01, zero);
+  const __m128i pred_hi_11 = _mm_unpackhi_epi16(pred_11, zero);
+  const __m128i diff_hi_1 = RightShiftWithRounding_U32(
+      _mm_abs_epi32(_mm_sub_epi32(pred_hi_01, pred_hi_11)), kRoundingBits10bpp);
+
+  const __m128i diff_1 = _mm_packus_epi32(diff_lo_1, diff_hi_1);
+  const __m128i scaled_diff_1 = _mm_srli_epi16(diff_1, kScaledDiffShift);
+
+  const __m128i adjusted_diff = _mm_adds_epu8(
+      _mm_packus_epi16(scaled_diff_0, scaled_diff_1), diff_offset);
+  const __m128i mask_value = _mm_min_epi8(adjusted_diff, mask_ceiling);
+
+  if (mask_is_inverse) {
+    const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value);
+    if (is_store_16) {
+      StoreAligned16(mask, inverted_mask_value);
+    } else {
+      StoreLo8(mask, inverted_mask_value);
+      StoreHi8(mask + mask_stride, inverted_mask_value);
+    }
+  } else {
+    if (is_store_16) {
+      StoreAligned16(mask, mask_value);
+    } else {
+      StoreLo8(mask, mask_value);
+      StoreHi8(mask + mask_stride, mask_value);
+    }
+  }
+}
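
The 10bpp variant applies the same rule, but the predictions are unsigned values that can approach 61532 (see the range comment above), so the difference is formed in 32 bits before rounding by kRoundingBits10bpp and scaling by kScaledDiffShift. A scalar sketch under the same caveats as the 8bpp one:

  uint8_t WeightMaskPixel10bpp(uint16_t pred_0, uint16_t pred_1,
                               bool mask_is_inverse) {
    const int diff = static_cast<int>(pred_0) - static_cast<int>(pred_1);
    const int rounded = RightShiftWithRounding(diff < 0 ? -diff : diff,
                                               kRoundingBits10bpp);
    const int mask_value = std::min(64, (rounded >> kScaledDiffShift) + 38);
    return static_cast<uint8_t>(mask_is_inverse ? 64 - mask_value : mask_value);
  }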
+
+#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP                               \
+  WeightMask16_10bpp_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, \
+                                                  mask_stride)
+
+#define WEIGHT8_PAIR_AND_STRIDE_10BPP \
+  WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;  \
+  pred_0 += 8 << 1;                   \
+  pred_1 += 8 << 1;                   \
+  mask += mask_stride << 1
+
+template <bool mask_is_inverse>
+void WeightMask8x8_10bpp_SSE4(const void* prediction_0,
+                              const void* prediction_1, uint8_t* mask,
+                              ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+
+  WEIGHT8_PAIR_AND_STRIDE_10BPP;
+  WEIGHT8_PAIR_AND_STRIDE_10BPP;
+  WEIGHT8_PAIR_AND_STRIDE_10BPP;
+  WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x16_10bpp_SSE4(const void* prediction_0,
+                               const void* prediction_1, uint8_t* mask,
+                               ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 3;
+  do {
+    WEIGHT8_PAIR_AND_STRIDE_10BPP;
+    WEIGHT8_PAIR_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT8_PAIR_AND_STRIDE_10BPP;
+  WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x32_10bpp_SSE4(const void* prediction_0,
+                               const void* prediction_1, uint8_t* mask,
+                               ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y5 = 5;
+  do {
+    WEIGHT8_PAIR_AND_STRIDE_10BPP;
+    WEIGHT8_PAIR_AND_STRIDE_10BPP;
+    WEIGHT8_PAIR_AND_STRIDE_10BPP;
+  } while (--y5 != 0);
+  WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE_10BPP                                  \
+  WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
+                                                 mask_stride)
+
+#define WEIGHT16_AND_STRIDE_10BPP \
+  WEIGHT16_WITHOUT_STRIDE_10BPP;  \
+  pred_0 += 16;                   \
+  pred_1 += 16;                   \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask16x8_10bpp_SSE4(const void* prediction_0,
+                               const void* prediction_1, uint8_t* mask,
+                               ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y = 7;
+  do {
+    WEIGHT16_AND_STRIDE_10BPP;
+  } while (--y != 0);
+  WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x16_10bpp_SSE4(const void* prediction_0,
+                                const void* prediction_1, uint8_t* mask,
+                                ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 5;
+  do {
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x32_10bpp_SSE4(const void* prediction_0,
+                                const void* prediction_1, uint8_t* mask,
+                                ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y5 = 6;
+  do {
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+  } while (--y5 != 0);
+  WEIGHT16_AND_STRIDE_10BPP;
+  WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x64_10bpp_SSE4(const void* prediction_0,
+                                const void* prediction_1, uint8_t* mask,
+                                ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 21;
+  do {
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+    WEIGHT16_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE_10BPP                                      \
+  WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask,     \
+                                                 mask_stride);             \
+  WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+                                                 mask + 16, mask_stride)
+
+#define WEIGHT32_AND_STRIDE_10BPP \
+  WEIGHT32_WITHOUT_STRIDE_10BPP;  \
+  pred_0 += 32;                   \
+  pred_1 += 32;                   \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask32x8_10bpp_SSE4(const void* prediction_0,
+                               const void* prediction_1, uint8_t* mask,
+                               ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x16_10bpp_SSE4(const void* prediction_0,
+                                const void* prediction_1, uint8_t* mask,
+                                ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 5;
+  do {
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x32_10bpp_SSE4(const void* prediction_0,
+                                const void* prediction_1, uint8_t* mask,
+                                ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y5 = 6;
+  do {
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+  } while (--y5 != 0);
+  WEIGHT32_AND_STRIDE_10BPP;
+  WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x64_10bpp_SSE4(const void* prediction_0,
+                                const void* prediction_1, uint8_t* mask,
+                                ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 21;
+  do {
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+    WEIGHT32_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE_10BPP                                      \
+  WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask,     \
+                                                 mask_stride);             \
+  WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+                                                 mask + 16, mask_stride);  \
+  WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+                                                 mask + 32, mask_stride);  \
+  WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+                                                 mask + 48, mask_stride)
+
+#define WEIGHT64_AND_STRIDE_10BPP \
+  WEIGHT64_WITHOUT_STRIDE_10BPP;  \
+  pred_0 += 64;                   \
+  pred_1 += 64;                   \
+  mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask64x16_10bpp_SSE4(const void* prediction_0,
+                                const void* prediction_1, uint8_t* mask,
+                                ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 5;
+  do {
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x32_10bpp_SSE4(const void* prediction_0,
+                                const void* prediction_1, uint8_t* mask,
+                                ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y5 = 6;
+  do {
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+  } while (--y5 != 0);
+  WEIGHT64_AND_STRIDE_10BPP;
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x64_10bpp_SSE4(const void* prediction_0,
+                                const void* prediction_1, uint8_t* mask,
+                                ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 21;
+  do {
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x128_10bpp_SSE4(const void* prediction_0,
+                                 const void* prediction_1, uint8_t* mask,
+                                 ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 42;
+  do {
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+    WEIGHT64_AND_STRIDE_10BPP;
+  } while (--y3 != 0);
+  WEIGHT64_AND_STRIDE_10BPP;
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x64_10bpp_SSE4(const void* prediction_0,
+                                 const void* prediction_1, uint8_t* mask,
+                                 ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 21;
+  const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+  do {
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+  } while (--y3 != 0);
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x128_10bpp_SSE4(const void* prediction_0,
+                                  const void* prediction_1, uint8_t* mask,
+                                  ptrdiff_t mask_stride) {
+  const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+  int y3 = 42;
+  const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+  do {
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += 64;
+    WEIGHT64_WITHOUT_STRIDE_10BPP;
+    pred_0 += 64;
+    pred_1 += 64;
+    mask += adjusted_mask_stride;
+  } while (--y3 != 0);
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += adjusted_mask_stride;
+
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+  pred_0 += 64;
+  pred_1 += 64;
+  mask += 64;
+  WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \
+  dsp->weight_mask[w_index][h_index][0] =                       \
+      WeightMask##width##x##height##_10bpp_SSE4<0>;             \
+  dsp->weight_mask[w_index][h_index][1] =                       \
+      WeightMask##width##x##height##_10bpp_SSE4<1>
+void Init10bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+  assert(dsp != nullptr);
+  INIT_WEIGHT_MASK_10BPP(8, 8, 0, 0);
+  INIT_WEIGHT_MASK_10BPP(8, 16, 0, 1);
+  INIT_WEIGHT_MASK_10BPP(8, 32, 0, 2);
+  INIT_WEIGHT_MASK_10BPP(16, 8, 1, 0);
+  INIT_WEIGHT_MASK_10BPP(16, 16, 1, 1);
+  INIT_WEIGHT_MASK_10BPP(16, 32, 1, 2);
+  INIT_WEIGHT_MASK_10BPP(16, 64, 1, 3);
+  INIT_WEIGHT_MASK_10BPP(32, 8, 2, 0);
+  INIT_WEIGHT_MASK_10BPP(32, 16, 2, 1);
+  INIT_WEIGHT_MASK_10BPP(32, 32, 2, 2);
+  INIT_WEIGHT_MASK_10BPP(32, 64, 2, 3);
+  INIT_WEIGHT_MASK_10BPP(64, 16, 3, 1);
+  INIT_WEIGHT_MASK_10BPP(64, 32, 3, 2);
+  INIT_WEIGHT_MASK_10BPP(64, 64, 3, 3);
+  INIT_WEIGHT_MASK_10BPP(64, 128, 3, 4);
+  INIT_WEIGHT_MASK_10BPP(128, 64, 4, 3);
+  INIT_WEIGHT_MASK_10BPP(128, 128, 4, 4);
+}
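
The (w_index, h_index) arguments above follow log2(dimension) - 3. Assuming FloorLog2 keeps its usual libgav1 meaning, a lookup for a given block size would be (illustration only):

  const int w_index = FloorLog2(width) - 3;   // 8 -> 0, 16 -> 1, ..., 128 -> 4
  const int h_index = FloorLog2(height) - 3;
  // [0] selects the regular mask generator, [1] the inverse-mask generator.
  const auto mask_fn = dsp->weight_mask[w_index][h_index][0];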
+
+}  // namespace
+}  // namespace high_bitdepth
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+void WeightMaskInit_SSE4_1() {
+  low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  high_bitdepth::Init10bpp();
+#endif
+}
 
 }  // namespace dsp
 }  // namespace libgav1
 
-#else  // !LIBGAV1_ENABLE_SSE4_1
+#else   // !LIBGAV1_TARGETING_SSE4_1
 
 namespace libgav1 {
 namespace dsp {
@@ -461,4 +952,4 @@
 
 }  // namespace dsp
 }  // namespace libgav1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif  // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/weight_mask_sse4.h b/libgav1/src/dsp/x86/weight_mask_sse4.h
index 841dd5a..e5d9d70 100644
--- a/libgav1/src/dsp/x86/weight_mask_sse4.h
+++ b/libgav1/src/dsp/x86/weight_mask_sse4.h
@@ -29,24 +29,143 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8
 #define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16
 #define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32
 #define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8
 #define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16
 #define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32
 #define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64
 #define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8
 #define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16
 #define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32
 #define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64
 #define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16
 #define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32
 #define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64
 #define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128
 #define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64
 #define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128
 #define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1
-#endif  // LIBGAV1_ENABLE_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x8
+#define LIBGAV1_Dsp10bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x16
+#define LIBGAV1_Dsp10bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x32
+#define LIBGAV1_Dsp10bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x8
+#define LIBGAV1_Dsp10bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x16
+#define LIBGAV1_Dsp10bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x32
+#define LIBGAV1_Dsp10bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x64
+#define LIBGAV1_Dsp10bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x8
+#define LIBGAV1_Dsp10bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x16
+#define LIBGAV1_Dsp10bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x32
+#define LIBGAV1_Dsp10bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x64
+#define LIBGAV1_Dsp10bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x16
+#define LIBGAV1_Dsp10bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x32
+#define LIBGAV1_Dsp10bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x64
+#define LIBGAV1_Dsp10bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x128
+#define LIBGAV1_Dsp10bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x64
+#define LIBGAV1_Dsp10bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x128
+#define LIBGAV1_Dsp10bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1
+#endif
+#endif  // LIBGAV1_TARGETING_SSE4_1
 
 #endif  // LIBGAV1_SRC_DSP_ARM_WEIGHT_MASK_SSE4_H_
diff --git a/libgav1/src/film_grain.cc b/libgav1/src/film_grain.cc
index 15ae956..dac37b5 100644
--- a/libgav1/src/film_grain.cc
+++ b/libgav1/src/film_grain.cc
@@ -433,7 +433,7 @@
   if (!is_monochrome_) {
     noise_buffer_size += 2 * max_luma_num *
                          (kNoiseStripeHeight >> subsampling_y_) *
-                         RightShiftWithRounding(width_, subsampling_x_);
+                         SubsampledValue(width_, subsampling_x_);
   }
   noise_buffer_.reset(new (std::nothrow) GrainType[noise_buffer_size]);
   if (noise_buffer_ == nullptr) return false;
@@ -444,18 +444,16 @@
     noise_buffer += max_luma_num * kNoiseStripeHeight * width_;
   }
   if (!is_monochrome_) {
-    noise_stripes_[kPlaneU].Reset(
-        max_luma_num,
-        (kNoiseStripeHeight >> subsampling_y_) *
-            RightShiftWithRounding(width_, subsampling_x_),
-        noise_buffer);
+    noise_stripes_[kPlaneU].Reset(max_luma_num,
+                                  (kNoiseStripeHeight >> subsampling_y_) *
+                                      SubsampledValue(width_, subsampling_x_),
+                                  noise_buffer);
     noise_buffer += max_luma_num * (kNoiseStripeHeight >> subsampling_y_) *
-                    RightShiftWithRounding(width_, subsampling_x_);
-    noise_stripes_[kPlaneV].Reset(
-        max_luma_num,
-        (kNoiseStripeHeight >> subsampling_y_) *
-            RightShiftWithRounding(width_, subsampling_x_),
-        noise_buffer);
+                    SubsampledValue(width_, subsampling_x_);
+    noise_stripes_[kPlaneV].Reset(max_luma_num,
+                                  (kNoiseStripeHeight >> subsampling_y_) *
+                                      SubsampledValue(width_, subsampling_x_),
+                                  noise_buffer);
   }
   return true;
 }
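
The switch from RightShiftWithRounding to SubsampledValue above is a naming clarification for the chroma dimensions; assuming SubsampledValue keeps its usual libgav1 definition, both expressions agree for subsampling factors of 0 and 1. A minimal sketch of that assumed definition:

  // Assumed definition, for illustration only: the subsampled size rounds up.
  inline int SubsampledValueSketch(int value, int subsampling) {
    return (value + subsampling) >> subsampling;  // e.g. (1921, 1) -> 961
  }
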
@@ -715,8 +713,8 @@
       planes_to_blend[num_planes++] = kPlaneU;
       planes_to_blend[num_planes++] = kPlaneV;
     } else {
-      const int height_uv = RightShiftWithRounding(height_, subsampling_y_);
-      const int width_uv = RightShiftWithRounding(width_, subsampling_x_);
+      const int height_uv = SubsampledValue(height_, subsampling_y_);
+      const int width_uv = SubsampledValue(width_, subsampling_x_);
 
       // Noise is applied according to a lookup table defined by piecewise
       // linear "points." If the lookup table is empty, that corresponds to
diff --git a/libgav1/src/film_grain.h b/libgav1/src/film_grain.h
index 6757214..b588f6d 100644
--- a/libgav1/src/film_grain.h
+++ b/libgav1/src/film_grain.h
@@ -178,8 +178,8 @@
   //
   // noise_stripes_[kPlaneU][luma_num] or noise_stripes_[kPlaneV][luma_num]
   // is an array that has (34 >> subsampling_y_) rows and
-  // RightShiftWithRounding(width_, subsampling_x_) columns and contains noise
-  // for the chroma components.
+  // SubsampledValue(width_, subsampling_x_) columns and contains noise for the
+  // chroma components.
   Array2DView<GrainType> noise_stripes_[kMaxPlanes];
   // Owns the memory that the elements of noise_stripes_ point to.
   std::unique_ptr<GrainType[]> noise_buffer_;
diff --git a/libgav1/src/frame_scratch_buffer.h b/libgav1/src/frame_scratch_buffer.h
index 1d6a1f4..90c3bb8 100644
--- a/libgav1/src/frame_scratch_buffer.h
+++ b/libgav1/src/frame_scratch_buffer.h
@@ -54,20 +54,18 @@
   TemporalMotionField motion_field;
   SymbolDecoderContext symbol_decoder_context;
   std::unique_ptr<ResidualBufferPool> residual_buffer_pool;
-  // threaded_window_buffer will be subdivided by PostFilter into windows of
-  // width 512 pixels. Each row in the window is filtered by a worker thread.
-  // To avoid false sharing, each 512-pixel row processed by one thread should
-  // not share a cache line with a row processed by another thread. So we align
-  // threaded_window_buffer to the cache line size. In addition, it is faster to
-  // memcpy from an aligned buffer.
-  AlignedDynamicBuffer<uint8_t, kCacheLineSize> threaded_window_buffer;
+  // Buffer used to store the cdef borders. This buffer will store 4 rows for
+  // every 64x64 block (4 rows for every 32x32 for chroma with subsampling). The
+  // indices of the rows that are stored are specified in |kCdefBorderRows|.
+  YuvBuffer cdef_border;
+  AlignedDynamicBuffer<uint8_t, 16> superres_coefficients[kNumPlaneTypes];
   // Buffer used to temporarily store the input row for applying SuperRes.
-  AlignedDynamicBuffer<uint8_t, 16> superres_line_buffer;
-  // Buffer used to store the deblocked pixels that are necessary for loop
-  // restoration. This buffer will store 4 rows for every 64x64 block (4 rows
-  // for every 32x32 for chroma with subsampling). The indices of the rows that
-  // are stored are specified in |kDeblockedRowsForLoopRestoration|.
-  YuvBuffer deblock_buffer;
+  YuvBuffer superres_line_buffer;
+  // Buffer used to store the loop restoration borders. This buffer will store 4
+  // rows for every 64x64 block (4 rows for every 32x32 for chroma with
+  // subsampling). The indices of the rows that are stored are specified in
+  // |kLoopRestorationBorderRows|.
+  YuvBuffer loop_restoration_border;
   // The size of this dynamic buffer is |tile_rows|.
   DynamicBuffer<IntraPredictionBuffer> intra_prediction_buffers;
   TileScratchBufferPool tile_scratch_buffer_pool;
diff --git a/libgav1/src/gav1/decoder_settings.h b/libgav1/src/gav1/decoder_settings.h
index ab22a4d..7ee487f 100644
--- a/libgav1/src/gav1/decoder_settings.h
+++ b/libgav1/src/gav1/decoder_settings.h
@@ -62,7 +62,8 @@
   Libgav1GetFrameBufferCallback get_frame_buffer;
   // Release frame buffer callback.
   Libgav1ReleaseFrameBufferCallback release_frame_buffer;
-  // Release input frame buffer callback.
+  // Release input frame buffer callback. This callback must be set when
+  // |frame_parallel| is true.
   Libgav1ReleaseInputBufferCallback release_input_buffer;
   // Passed as the private_data argument to the callbacks.
   void* callback_private_data;
@@ -117,7 +118,8 @@
   GetFrameBufferCallback get_frame_buffer = nullptr;
   // Release frame buffer callback.
   ReleaseFrameBufferCallback release_frame_buffer = nullptr;
-  // Release input frame buffer callback.
+  // Release input frame buffer callback. This callback must be set when
+  // |frame_parallel| is true.
   ReleaseInputBufferCallback release_input_buffer = nullptr;
   // Passed as the private_data argument to the callbacks.
   void* callback_private_data = nullptr;
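
A minimal usage sketch of the constraint documented above; MyReleaseInputBuffer and my_context are hypothetical names. When frame parallel decoding is enabled, the input buffer release callback must also be provided.

  libgav1::DecoderSettings settings;
  settings.frame_parallel = true;
  // Required in frame parallel mode so the decoder can hand retained input
  // buffers back to the caller once it is done with them.
  settings.release_input_buffer = MyReleaseInputBuffer;
  settings.callback_private_data = &my_context;
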
diff --git a/libgav1/src/gav1/symbol_visibility.h b/libgav1/src/gav1/symbol_visibility.h
index ad7498c..116a514 100644
--- a/libgav1/src/gav1/symbol_visibility.h
+++ b/libgav1/src/gav1/symbol_visibility.h
@@ -58,6 +58,11 @@
 //
 // Much of the above information and more can be found at
 // https://gcc.gnu.org/wiki/Visibility
+//
+// NOTE: A third-party build system for libgav1 can add -DLIBGAV1_PUBLIC= to the
+// compiler command line to override the definition of LIBGAV1_PUBLIC in this
+// header. This can be used to create a libgav1 static library that will not
+// export any symbols when it is linked into a shared library.
 
 #if !defined(LIBGAV1_PUBLIC)
 #if defined(_WIN32)
@@ -76,7 +81,7 @@
 #else
 #define LIBGAV1_PUBLIC
 #endif  // defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
-#else
+#else   // !defined(_WIN32)
 #if defined(__GNUC__) && __GNUC__ >= 4
 #define LIBGAV1_PUBLIC __attribute__((visibility("default")))
 #else
diff --git a/libgav1/src/gav1/version.h b/libgav1/src/gav1/version.h
index e78e9a7..c018928 100644
--- a/libgav1/src/gav1/version.h
+++ b/libgav1/src/gav1/version.h
@@ -24,7 +24,7 @@
 
 #define LIBGAV1_MAJOR_VERSION 0
 #define LIBGAV1_MINOR_VERSION 16
-#define LIBGAV1_PATCH_VERSION 0
+#define LIBGAV1_PATCH_VERSION 3
 
 #define LIBGAV1_VERSION                                           \
   ((LIBGAV1_MAJOR_VERSION << 16) | (LIBGAV1_MINOR_VERSION << 8) | \
diff --git a/libgav1/src/loop_restoration_info.cc b/libgav1/src/loop_restoration_info.cc
index 3830836..2dba57d 100644
--- a/libgav1/src/loop_restoration_info.cc
+++ b/libgav1/src/loop_restoration_info.cc
@@ -70,18 +70,16 @@
       continue;
     }
     plane_needs_filtering_[plane] = true;
-    const int plane_width = (plane == kPlaneY)
-                                ? width
-                                : RightShiftWithRounding(width, subsampling_x_);
+    const int plane_width =
+        (plane == kPlaneY) ? width : SubsampledValue(width, subsampling_x_);
     const int plane_height =
-        (plane == kPlaneY) ? height
-                           : RightShiftWithRounding(height, subsampling_y_);
-    num_horizontal_units_[plane] = std::max(
-        1, (plane_width + DivideBy2(loop_restoration_->unit_size[plane])) /
-               loop_restoration_->unit_size[plane]);
+        (plane == kPlaneY) ? height : SubsampledValue(height, subsampling_y_);
+    num_horizontal_units_[plane] =
+        std::max(1, RightShiftWithRounding(
+                        plane_width, loop_restoration_->unit_size_log2[plane]));
     num_vertical_units_[plane] = std::max(
-        1, (plane_height + DivideBy2(loop_restoration_->unit_size[plane])) /
-               loop_restoration_->unit_size[plane]);
+        1, RightShiftWithRounding(plane_height,
+                                  loop_restoration_->unit_size_log2[plane]));
     num_units_[plane] =
         num_horizontal_units_[plane] * num_vertical_units_[plane];
     total_num_units += num_units_[plane];
@@ -109,29 +107,25 @@
     LoopRestorationUnitInfo* const unit_info) const {
   assert(unit_info != nullptr);
   if (!plane_needs_filtering_[plane]) return false;
-  const int denominator_column =
-      is_superres_scaled
-          ? loop_restoration_->unit_size[plane] * kSuperResScaleNumerator
-          : loop_restoration_->unit_size[plane];
   const int numerator_column =
       is_superres_scaled ? superres_scale_denominator : 1;
   const int pixel_column_start =
       RowOrColumn4x4ToPixel(column4x4, plane, subsampling_x_);
   const int pixel_column_end = RowOrColumn4x4ToPixel(
       column4x4 + kNum4x4BlocksWide[block_size], plane, subsampling_x_);
-  const int unit_row = loop_restoration_->unit_size[plane];
+  const int unit_row_log2 = loop_restoration_->unit_size_log2[plane];
+  const int denominator_column_log2 =
+      unit_row_log2 + (is_superres_scaled ? 3 : 0);
   const int pixel_row_start =
       RowOrColumn4x4ToPixel(row4x4, plane, subsampling_y_);
   const int pixel_row_end = RowOrColumn4x4ToPixel(
       row4x4 + kNum4x4BlocksHigh[block_size], plane, subsampling_y_);
-  unit_info->column_start =
-      (pixel_column_start * numerator_column + denominator_column - 1) /
-      denominator_column;
-  unit_info->column_end =
-      (pixel_column_end * numerator_column + denominator_column - 1) /
-      denominator_column;
-  unit_info->row_start = (pixel_row_start + unit_row - 1) / unit_row;
-  unit_info->row_end = (pixel_row_end + unit_row - 1) / unit_row;
+  unit_info->column_start = RightShiftWithCeiling(
+      pixel_column_start * numerator_column, denominator_column_log2);
+  unit_info->column_end = RightShiftWithCeiling(
+      pixel_column_end * numerator_column, denominator_column_log2);
+  unit_info->row_start = RightShiftWithCeiling(pixel_row_start, unit_row_log2);
+  unit_info->row_end = RightShiftWithCeiling(pixel_row_end, unit_row_log2);
   unit_info->column_end =
       std::min(unit_info->column_end, num_horizontal_units_[plane]);
   unit_info->row_end = std::min(unit_info->row_end, num_vertical_units_[plane]);
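
Worked example for the unit-count change above, with illustrative values: for a 1920-pixel plane and unit_size_log2 = 8 (256-pixel restoration units), RightShiftWithRounding(1920, 8) = (1920 + 128) >> 8 = 8, which matches the old (plane_width + unit_size / 2) / unit_size form. The division can become a shift because the restoration unit size is always a power of two, which is why the header now stores unit_size_log2. The ceiling divisions for the unit boundaries simplify the same way:

  // (pixel + unit_size - 1) / unit_size with unit_size == 1 << log2 equals a
  // right shift with ceiling; values below are illustrative only.
  const int unit_size_log2 = 8;                              // 256-pixel units
  const int row_start = RightShiftWithCeiling(513, unit_size_log2);   // == 3
  const int row_end = RightShiftWithCeiling(1024, unit_size_log2);    // == 4
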
diff --git a/libgav1/src/motion_vector.cc b/libgav1/src/motion_vector.cc
index 8223f3d..fdb1875 100644
--- a/libgav1/src/motion_vector.cc
+++ b/libgav1/src/motion_vector.cc
@@ -63,16 +63,12 @@
   const ObuFrameHeader& frame_header = block.tile.frame_header();
   ReferenceFrameType reference_type = bp.reference_frame[index];
   const auto& gm = frame_header.global_motion[reference_type];
-  GlobalMotionTransformationType global_motion_type =
-      (reference_type != kReferenceFrameIntra)
-          ? gm.type
-          : kNumGlobalMotionTransformationTypes;
   if (reference_type == kReferenceFrameIntra ||
-      global_motion_type == kGlobalMotionTransformationTypeIdentity) {
+      gm.type == kGlobalMotionTransformationTypeIdentity) {
     mv->mv32 = 0;
     return;
   }
-  if (global_motion_type == kGlobalMotionTransformationTypeTranslation) {
+  if (gm.type == kGlobalMotionTransformationTypeTranslation) {
     for (int i = 0; i < 2; ++i) {
       mv->mv[i] = gm.params[i] >> (kWarpedModelPrecisionBits - 3);
     }
@@ -127,18 +123,19 @@
   *found_new_mv |= kPredictionModeNewMvMask.Contains(mv_bp.y_mode);
   *found_match = true;
   MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
-  const auto result = std::find_if(ref_mv_stack, ref_mv_stack + *num_mv_found,
+  const int num_found = *num_mv_found;
+  const auto result = std::find_if(ref_mv_stack, ref_mv_stack + num_found,
                                    [&candidate_mv](const MotionVector& ref_mv) {
                                      return ref_mv == candidate_mv;
                                    });
-  if (result != ref_mv_stack + *num_mv_found) {
+  if (result != ref_mv_stack + num_found) {
     prediction_parameters.IncreaseWeight(std::distance(ref_mv_stack, result),
                                          weight);
     return;
   }
-  if (*num_mv_found >= kMaxRefMvStackSize) return;
-  ref_mv_stack[*num_mv_found] = candidate_mv;
-  prediction_parameters.SetWeightIndexStackEntry(*num_mv_found, weight);
+  if (num_found >= kMaxRefMvStackSize) return;
+  ref_mv_stack[num_found] = candidate_mv;
+  prediction_parameters.SetWeightIndexStackEntry(num_found, weight);
   ++*num_mv_found;
 }
 
@@ -163,19 +160,20 @@
   *found_match = true;
   CompoundMotionVector* const compound_ref_mv_stack =
       prediction_parameters.compound_ref_mv_stack;
+  const int num_found = *num_mv_found;
   const auto result =
-      std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + *num_mv_found,
+      std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found,
                    [&candidate_mv](const CompoundMotionVector& ref_mv) {
                      return ref_mv == candidate_mv;
                    });
-  if (result != compound_ref_mv_stack + *num_mv_found) {
+  if (result != compound_ref_mv_stack + num_found) {
     prediction_parameters.IncreaseWeight(
         std::distance(compound_ref_mv_stack, result), weight);
     return;
   }
-  if (*num_mv_found >= kMaxRefMvStackSize) return;
-  compound_ref_mv_stack[*num_mv_found] = candidate_mv;
-  prediction_parameters.SetWeightIndexStackEntry(*num_mv_found, weight);
+  if (num_found >= kMaxRefMvStackSize) return;
+  compound_ref_mv_stack[num_found] = candidate_mv;
+  prediction_parameters.SetWeightIndexStackEntry(num_found, weight);
   ++*num_mv_found;
 }
 
@@ -305,24 +303,26 @@
     }
     CompoundMotionVector* const compound_ref_mv_stack =
         prediction_parameters->compound_ref_mv_stack;
+    int num_found = *num_mv_found;
     int index = 0;
     do {
       const CompoundMotionVector& candidate_mv = candidate_mvs[index];
-      const auto result = std::find_if(
-          compound_ref_mv_stack, compound_ref_mv_stack + *num_mv_found,
-          [&candidate_mv](const CompoundMotionVector& ref_mv) {
-            return ref_mv == candidate_mv;
-          });
-      if (result != compound_ref_mv_stack + *num_mv_found) {
+      const auto result =
+          std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found,
+                       [&candidate_mv](const CompoundMotionVector& ref_mv) {
+                         return ref_mv == candidate_mv;
+                       });
+      if (result != compound_ref_mv_stack + num_found) {
         prediction_parameters->IncreaseWeight(
             std::distance(compound_ref_mv_stack, result), 2);
         continue;
       }
-      if (*num_mv_found >= kMaxRefMvStackSize) continue;
-      compound_ref_mv_stack[*num_mv_found] = candidate_mv;
-      prediction_parameters->SetWeightIndexStackEntry(*num_mv_found, 2);
-      ++*num_mv_found;
+      if (num_found >= kMaxRefMvStackSize) continue;
+      compound_ref_mv_stack[num_found] = candidate_mv;
+      prediction_parameters->SetWeightIndexStackEntry(num_found, 2);
+      ++num_found;
     } while (++index < count);
+    *num_mv_found = num_found;
     return;
   }
   MotionVector* const ref_mv_stack = prediction_parameters->ref_mv_stack;
@@ -333,19 +333,20 @@
       *zero_mv_context = static_cast<int>(max_difference >= 16);
     }
     const MotionVector candidate_mv = {};
+    const int num_found = *num_mv_found;
     const auto result =
-        std::find_if(ref_mv_stack, ref_mv_stack + *num_mv_found,
+        std::find_if(ref_mv_stack, ref_mv_stack + num_found,
                      [&candidate_mv](const MotionVector& ref_mv) {
                        return ref_mv == candidate_mv;
                      });
-    if (result != ref_mv_stack + *num_mv_found) {
+    if (result != ref_mv_stack + num_found) {
       prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result),
                                             2 * count);
       return;
     }
-    if (*num_mv_found >= kMaxRefMvStackSize) return;
-    ref_mv_stack[*num_mv_found] = candidate_mv;
-    prediction_parameters->SetWeightIndexStackEntry(*num_mv_found, 2 * count);
+    if (num_found >= kMaxRefMvStackSize) return;
+    ref_mv_stack[num_found] = candidate_mv;
+    prediction_parameters->SetWeightIndexStackEntry(num_found, 2 * count);
     ++*num_mv_found;
     return;
   }
@@ -361,24 +362,26 @@
                  std::abs(candidate_mvs[0].mv[1] - global_mv[0].mv[1]));
     *zero_mv_context = static_cast<int>(max_difference >= 16);
   }
+  int num_found = *num_mv_found;
   int index = 0;
   do {
     const MotionVector& candidate_mv = candidate_mvs[index];
     const auto result =
-        std::find_if(ref_mv_stack, ref_mv_stack + *num_mv_found,
+        std::find_if(ref_mv_stack, ref_mv_stack + num_found,
                      [&candidate_mv](const MotionVector& ref_mv) {
                        return ref_mv == candidate_mv;
                      });
-    if (result != ref_mv_stack + *num_mv_found) {
+    if (result != ref_mv_stack + num_found) {
       prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result),
                                             2);
       continue;
     }
-    if (*num_mv_found >= kMaxRefMvStackSize) continue;
-    ref_mv_stack[*num_mv_found] = candidate_mv;
-    prediction_parameters->SetWeightIndexStackEntry(*num_mv_found, 2);
-    ++*num_mv_found;
+    if (num_found >= kMaxRefMvStackSize) continue;
+    ref_mv_stack[num_found] = candidate_mv;
+    prediction_parameters->SetWeightIndexStackEntry(num_found, 2);
+    ++num_found;
   } while (++index < count);
+  *num_mv_found = num_found;
 }
 
 // Part of 7.10.2.5.
@@ -397,9 +400,6 @@
                                        kBlock32x8, kBlock32x16, kBlock32x32);
 
 // 7.10.2.5.
-//
-// The |zero_mv_context| output parameter may be null. If |zero_mv_context| is
-// not null, the function may set |*zero_mv_context|.
 void TemporalScan(const Tile::Block& block, bool is_compound,
                   int* const zero_mv_context, int* const num_mv_found) {
   const int step_w = (block.width4x4 >= 16) ? 4 : 2;
@@ -552,6 +552,7 @@
   PredictionParameters& prediction_parameters =
       *block.bp->prediction_parameters;
   MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+  int num_found = *num_mv_found;
   for (int i = 0; i < 2; ++i) {
     const ReferenceFrameType candidate_reference_frame = bp.reference_frame[i];
     if (candidate_reference_frame <= kReferenceFrameIntra) continue;
@@ -561,15 +562,16 @@
       candidate_mv.mv[0] *= -1;
       candidate_mv.mv[1] *= -1;
     }
-    assert(*num_mv_found <= 2);
-    if ((*num_mv_found != 0 && ref_mv_stack[0] == candidate_mv) ||
-        (*num_mv_found == 2 && ref_mv_stack[1] == candidate_mv)) {
+    assert(num_found <= 2);
+    if ((num_found != 0 && ref_mv_stack[0] == candidate_mv) ||
+        (num_found == 2 && ref_mv_stack[1] == candidate_mv)) {
       continue;
     }
-    ref_mv_stack[*num_mv_found] = candidate_mv;
-    prediction_parameters.SetWeightIndexStackEntry(*num_mv_found, 0);
-    ++*num_mv_found;
+    ref_mv_stack[num_found] = candidate_mv;
+    prediction_parameters.SetWeightIndexStackEntry(num_found, 0);
+    ++num_found;
   }
+  *num_mv_found = num_found;
 }
 
 // 7.10.2.12.
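
The motion vector scans above all follow the same pattern: read |*num_mv_found| into a local |num_found| once, de-duplicate and append candidates against the local copy, and write the counter back after the loop. A minimal sketch of that pattern, using simplified hypothetical types rather than the libgav1 signatures:

    // Sketch only: caches a counter passed by pointer in a local, appends
    // de-duplicated candidates up to a fixed stack size, and writes the
    // counter back once at the end.
    #include <algorithm>

    constexpr int kMaxStackSize = 8;  // stand-in for kMaxRefMvStackSize

    void AppendUniqueCandidates(const int* candidates, int count, int* stack,
                                int* num_found_ptr) {
      int num_found = *num_found_ptr;  // read the counter once
      for (int i = 0; i < count; ++i) {
        const int candidate = candidates[i];
        // Skip candidates that are already in the stack.
        if (std::find(stack, stack + num_found, candidate) !=
            stack + num_found) {
          continue;
        }
        if (num_found >= kMaxStackSize) continue;
        stack[num_found++] = candidate;
      }
      *num_found_ptr = num_found;  // write the counter back once
    }
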
diff --git a/libgav1/src/obu_parser.cc b/libgav1/src/obu_parser.cc
index 41df909..69480d7 100644
--- a/libgav1/src/obu_parser.cc
+++ b/libgav1/src/obu_parser.cc
@@ -188,6 +188,16 @@
       color_config->color_range = kColorRangeFull;
       color_config->subsampling_x = 0;
       color_config->subsampling_y = 0;
+      // YUV 4:4:4 is only allowed in profile 1, or profile 2 with bit depth 12.
+      // See the table at the beginning of Section 6.4.1.
+      if (sequence_header->profile != kProfile1 &&
+          (sequence_header->profile != kProfile2 ||
+           color_config->bitdepth != 12)) {
+        LIBGAV1_DLOG(ERROR,
+                     "YUV 4:4:4 is not allowed in profile %d for bitdepth %d.",
+                     sequence_header->profile, color_config->bitdepth);
+        return false;
+      }
     } else {
       OBU_READ_BIT_OR_FAIL;
       color_config->color_range = static_cast<ColorRange>(scratch);
@@ -469,9 +479,13 @@
       LIBGAV1_DLOG(ERROR, "Sequence header changed in the middle of a frame.");
       return false;
     }
+    sequence_header_changed_ = true;
     decoder_state_.ClearReferenceFrames();
   }
   sequence_header_ = sequence_header;
+  if (!has_sequence_header_) {
+    sequence_header_changed_ = true;
+  }
   has_sequence_header_ = true;
   // Section 6.4.1: It is a requirement of bitstream conformance that if
   // OperatingPointIdc is equal to 0, then obu_extension_flag is equal to 0 for
@@ -499,12 +513,12 @@
     if (lower_bound_is_smaller) {
       if (reference_frame_id > decoder_state_.current_frame_id ||
           reference_frame_id < lower_bound) {
-        decoder_state_.reference_valid[i] = false;
+        decoder_state_.reference_frame[i] = nullptr;
       }
     } else {
       if (reference_frame_id > decoder_state_.current_frame_id &&
           reference_frame_id < lower_bound) {
-        decoder_state_.reference_valid[i] = false;
+        decoder_state_.reference_frame[i] = nullptr;
       }
     }
   }
@@ -611,7 +625,7 @@
     frame_header_.reference_order_hint[i] = scratch;
     if (frame_header_.reference_order_hint[i] !=
         decoder_state_.reference_order_hint[i]) {
-      decoder_state_.reference_valid[i] = false;
+      decoder_state_.reference_frame[i] = nullptr;
     }
   }
   return true;
@@ -1149,8 +1163,7 @@
         unit_shift += unit_extra_shift;
       }
     }
-    loop_restoration->unit_size[kPlaneY] =
-        kLoopRestorationTileSizeMax >> (2 - unit_shift);
+    loop_restoration->unit_size_log2[kPlaneY] = 6 + unit_shift;
     uint8_t uv_shift = 0;
     if (sequence_header_.color_config.subsampling_x != 0 &&
         sequence_header_.color_config.subsampling_y != 0 &&
@@ -1158,9 +1171,9 @@
       OBU_READ_BIT_OR_FAIL;
       uv_shift = scratch;
     }
-    loop_restoration->unit_size[kPlaneU] =
-        loop_restoration->unit_size[kPlaneV] =
-            loop_restoration->unit_size[0] >> uv_shift;
+    loop_restoration->unit_size_log2[kPlaneU] =
+        loop_restoration->unit_size_log2[kPlaneV] =
+            loop_restoration->unit_size_log2[0] - uv_shift;
   }
   return true;
 }
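
With the change above, loop restoration unit sizes are stored as log2 values: 6 + unit_shift for the Y plane (64, 128 or 256 pixels), and the chroma derivation becomes a subtraction instead of a shift. A small sketch of the equivalent arithmetic (hedged; it assumes the previous kLoopRestorationTileSizeMax was 256 and does not use the parser's actual types):

    #include <cassert>

    // Unit sizes are kept as log2 values; the size in pixels is 1 << log2.
    struct RestorationUnitSizeLog2 {
      int y;   // 6, 7 or 8 -> 64, 128 or 256 pixels
      int uv;  // equal to y, or y - 1 when uv_shift is 1
    };

    RestorationUnitSizeLog2 ComputeUnitSizeLog2(int unit_shift, int uv_shift) {
      assert(unit_shift >= 0 && unit_shift <= 2);
      assert(uv_shift == 0 || uv_shift == 1);
      RestorationUnitSizeLog2 log2;
      log2.y = 6 + unit_shift;      // previously: 256 >> (2 - unit_shift)
      log2.uv = log2.y - uv_shift;  // previously: unit_size[0] >> uv_shift
      return log2;
    }
    // e.g. unit_shift = 1, uv_shift = 1 -> Y units of 128 px, chroma units
    // of 64 px.
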
@@ -1778,10 +1791,11 @@
         // whenever display_frame_id is read, the value matches
         // RefFrameId[ frame_to_show_map_idx ] ..., and that
         // RefValid[ frame_to_show_map_idx ] is equal to 1.
+        //
+        // The current_frame_ == nullptr check below is equivalent to checking
+        // if RefValid[ frame_to_show_map_idx ] is equal to 1.
         if (frame_header_.display_frame_id !=
-                decoder_state_
-                    .reference_frame_id[frame_header_.frame_to_show] ||
-            !decoder_state_.reference_valid[frame_header_.frame_to_show]) {
+            decoder_state_.reference_frame_id[frame_header_.frame_to_show]) {
           LIBGAV1_DLOG(ERROR,
                        "Reference buffer %d has a frame id number mismatch.",
                        frame_header_.frame_to_show);
@@ -1859,8 +1873,8 @@
     }
   }
   if (frame_header_.frame_type == kFrameKey && frame_header_.show_frame) {
-    decoder_state_.reference_valid.fill(false);
     decoder_state_.reference_order_hint.fill(0);
+    decoder_state_.reference_frame.fill(nullptr);
   }
   OBU_READ_BIT_OR_FAIL;
   frame_header_.enable_cdf_update = !static_cast<bool>(scratch);
@@ -1890,27 +1904,28 @@
     frame_header_.current_frame_id = static_cast<uint16_t>(scratch);
     const int previous_frame_id = decoder_state_.current_frame_id;
     decoder_state_.current_frame_id = frame_header_.current_frame_id;
-    if ((frame_header_.frame_type != kFrameKey || !frame_header_.show_frame) &&
-        previous_frame_id >= 0) {
-      // Section 6.8.2: ..., it is a requirement of bitstream conformance
-      // that all of the following conditions are true:
-      //   * current_frame_id is not equal to PrevFrameID,
-      //   * DiffFrameID is less than 1 << ( idLen - 1 )
-      int diff_frame_id = decoder_state_.current_frame_id - previous_frame_id;
-      const int id_length_max_value = 1
-                                      << sequence_header_.frame_id_length_bits;
-      if (diff_frame_id <= 0) {
-        diff_frame_id += id_length_max_value;
+    if (frame_header_.frame_type != kFrameKey || !frame_header_.show_frame) {
+      if (previous_frame_id >= 0) {
+        // Section 6.8.2: ..., it is a requirement of bitstream conformance
+        // that all of the following conditions are true:
+        //   * current_frame_id is not equal to PrevFrameID,
+        //   * DiffFrameID is less than 1 << ( idLen - 1 )
+        int diff_frame_id = decoder_state_.current_frame_id - previous_frame_id;
+        const int id_length_max_value =
+            1 << sequence_header_.frame_id_length_bits;
+        if (diff_frame_id <= 0) {
+          diff_frame_id += id_length_max_value;
+        }
+        if (diff_frame_id >= DivideBy2(id_length_max_value)) {
+          LIBGAV1_DLOG(ERROR,
+                       "current_frame_id (%d) equals or differs too much from "
+                       "previous_frame_id (%d).",
+                       decoder_state_.current_frame_id, previous_frame_id);
+          return false;
+        }
       }
-      if (diff_frame_id >= DivideBy2(id_length_max_value)) {
-        LIBGAV1_DLOG(ERROR,
-                     "current_frame_id (%d) equals or differs too much from "
-                     "previous_frame_id (%d).",
-                     decoder_state_.current_frame_id, previous_frame_id);
-        return false;
-      }
+      MarkInvalidReferenceFrames();
     }
-    MarkInvalidReferenceFrames();
   } else {
     frame_header_.current_frame_id = 0;
     decoder_state_.current_frame_id = frame_header_.current_frame_id;
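
The reorganized check above treats frame ids as values modulo 1 << frame_id_length_bits: a non-positive difference wraps around, and a jump of half the id space or more (or no change at all) is rejected. A worked sketch of that check, with a hypothetical 4-bit id length for readability:

    #include <cstdio>

    // Sketch of the Section 6.8.2 DiffFrameID conformance check.
    bool FrameIdJumpIsValid(int current_frame_id, int previous_frame_id,
                            int frame_id_length_bits) {
      const int id_length_max_value = 1 << frame_id_length_bits;
      int diff_frame_id = current_frame_id - previous_frame_id;
      if (diff_frame_id <= 0) diff_frame_id += id_length_max_value;  // wrap
      // The id must advance, and by less than half the id space.
      return diff_frame_id < (id_length_max_value >> 1);
    }

    int main() {
      // With 4 id bits the id space is 0..15 and half the range is 8.
      std::printf("%d\n", FrameIdJumpIsValid(5, 3, 4));   // diff 2 -> valid
      std::printf("%d\n", FrameIdJumpIsValid(1, 14, 4));  // wraps to 3 -> valid
      std::printf("%d\n", FrameIdJumpIsValid(12, 3, 4));  // diff 9 -> invalid
      return 0;
    }
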
@@ -2008,15 +2023,8 @@
       // Note if support for Annex C: Error resilience behavior is added this
       // check should be omitted per C.5 Decoder consequences of processable
       // frames.
-      if (!decoder_state_.reference_valid[reference_frame_index]) {
-        LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i,
-                     reference_frame_index);
-        return false;
-      }
-      // Check if the inter frame requests a nonexistent reference, whether or
-      // not frame_refs_short_signaling is used.
       if (decoder_state_.reference_frame[reference_frame_index] == nullptr) {
-        LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not a decoded frame.", i,
+        LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i,
                      reference_frame_index);
         return false;
       }
@@ -2032,12 +2040,8 @@
         // Section 6.8.2: It is a requirement of bitstream conformance that
         // whenever expectedFrameId[ i ] is calculated, the value matches
         // RefFrameId[ ref_frame_idx[ i ] ] ...
-        //
-        // Section 6.8.2: It is a requirement of bitstream conformance that
-        // RefValid[ ref_frame_idx[ i ] ] is equal to 1, ...
         if (frame_header_.expected_frame_id[i] !=
-                decoder_state_.reference_frame_id[reference_frame_index] ||
-            !decoder_state_.reference_valid[reference_frame_index]) {
+            decoder_state_.reference_frame_id[reference_frame_index]) {
           LIBGAV1_DLOG(ERROR,
                        "Reference buffer %d has a frame id number mismatch.",
                        reference_frame_index);
@@ -2045,20 +2049,6 @@
         }
       }
     }
-    // Validate frame_header_.primary_reference_frame.
-    if (frame_header_.primary_reference_frame != kPrimaryReferenceNone) {
-      const int index =
-          frame_header_
-              .reference_frame_index[frame_header_.primary_reference_frame];
-      if (decoder_state_.reference_frame[index] == nullptr) {
-        LIBGAV1_DLOG(ERROR,
-                     "primary_ref_frame is %d but ref_frame_idx[%d] (%d) is "
-                     "not a decoded frame.",
-                     frame_header_.primary_reference_frame,
-                     frame_header_.primary_reference_frame, index);
-        return false;
-      }
-    }
     if (frame_header_.frame_size_override_flag &&
         !frame_header_.error_resilient_mode) {
       // Section 5.9.7.
@@ -2668,6 +2658,7 @@
   metadata_ = {};
   tile_buffers_.clear();
   next_tile_group_start_ = 0;
+  sequence_header_changed_ = false;
 
   bool parsed_one_full_frame = false;
   bool seen_frame_header = false;
diff --git a/libgav1/src/obu_parser.h b/libgav1/src/obu_parser.h
index 22a2396..c4619ed 100644
--- a/libgav1/src/obu_parser.h
+++ b/libgav1/src/obu_parser.h
@@ -276,6 +276,9 @@
   const ObuFrameHeader& frame_header() const { return frame_header_; }
   const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; }
   const ObuMetadata& metadata() const { return metadata_; }
+  // Returns true if the last call to ParseOneFrame() encountered a sequence
+  // header change.
+  bool sequence_header_changed() const { return sequence_header_changed_; }
 
   // Setters.
   void set_sequence_header(const ObuSequenceHeader& sequence_header) {
@@ -284,7 +287,7 @@
   }
 
   // Moves |tile_buffers_| into |tile_buffers|.
-  void MoveTileBuffer(Vector<TileBuffer>* tile_buffers) {
+  void MoveTileBuffers(Vector<TileBuffer>* tile_buffers) {
     *tile_buffers = std::move(tile_buffers_);
   }
 
@@ -362,7 +365,8 @@
   // ParseMetadata() can find the trailing bit of the OBU and either extract
   // or skip over the payload data as an opaque chunk of data.
   bool ParseMetadata(const uint8_t* data, size_t size);  // 5.8.
-  // Adds and populates the TileBuffer for each tile in the tile group.
+  // Adds and populates the TileBuffer for each tile in the tile group and
+  // updates |next_tile_group_start_|.
   bool AddTileBuffers(int start, int end, size_t total_size,
                       size_t tg_header_size, size_t bytes_consumed_so_far);
   bool ParseTileGroup(size_t size, size_t bytes_consumed_so_far);  // 5.11.1.
@@ -383,6 +387,9 @@
   int next_tile_group_start_ = 0;
   // If true, the sequence_header_ field is valid.
   bool has_sequence_header_ = false;
+  // If true, it means that the last call to ParseOneFrame() encountered a
+  // sequence header change.
+  bool sequence_header_changed_ = false;
   // If true, the obu_extension_flag syntax element in the OBU header must be
   // 0. Set to true when parsing a sequence header if OperatingPointIdc is 0.
   bool extension_disallowed_ = false;
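
The new sequence_header_changed() getter exposes the flag added above so that a caller can react after a frame is parsed. A hedged usage sketch; only the getter is taken from this change, and the surrounding function and flag are hypothetical:

    // Hypothetical caller-side check after the parser has processed a frame.
    void CheckForSequenceHeaderChange(const libgav1::ObuParser& parser,
                                      bool* needs_reinitialization) {
      // State derived from the sequence header (e.g. buffer dimensions) may
      // have to be rebuilt before decoding continues.
      if (parser.sequence_header_changed()) {
        *needs_reinitialization = true;
      }
    }
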
diff --git a/libgav1/src/post_filter.h b/libgav1/src/post_filter.h
index d300049..dfcd08e 100644
--- a/libgav1/src/post_filter.h
+++ b/libgav1/src/post_filter.h
@@ -58,7 +58,7 @@
   // The overall flow of data in this class (for both single and multi-threaded
   // cases) is as follows:
   //   -> Input: |frame_buffer_|.
-  //   -> Initialize |source_buffer_|, |cdef_buffer_| and
+  //   -> Initialize |source_buffer_|, |cdef_buffer_|, |superres_buffer_| and
   //      |loop_restoration_buffer_|.
   //   -> Deblocking:
   //      * Input: |source_buffer_|
@@ -68,9 +68,9 @@
   //      * Output: |cdef_buffer_|
   //   -> SuperRes:
   //      * Input: |cdef_buffer_|
-  //      * Output: |cdef_buffer_|
+  //      * Output: |superres_buffer_|
   //   -> Loop Restoration:
-  //      * Input: |cdef_buffer_|
+  //      * Input: |superres_buffer_|
   //      * Output: |loop_restoration_buffer_|.
   //   -> Now |frame_buffer_| contains the filtered frame.
   PostFilter(const ObuFrameHeader& frame_header,
@@ -102,18 +102,20 @@
   // Filter behavior (multi-threaded):
   // * Deblock: In-place filtering. The output is written to |source_buffer_|.
   //            If cdef and loop restoration are both on, then 4 rows (as
-  //            specified by |kDeblockedRowsForLoopRestoration|) in every 64x64
-  //            block is copied into |deblock_buffer_|.
-  // * Cdef: Filtering output is written into |threaded_window_buffer_| and then
-  //         copied into the |cdef_buffer_| (which is just |source_buffer_| with
-  //         a shift to the top-left).
-  // * SuperRes: Near in-place filtering (with an additional line buffer for
-  //             each row). The output is written to |cdef_buffer_|.
-  // * Restoration: Uses the |cdef_buffer_| and |deblock_buffer_| as the input
-  //                and the output is written into the
-  //                |threaded_window_buffer_|. It is then copied to the
-  //                |loop_restoration_buffer_| (which is just |cdef_buffer_|
-  //                with a shift to the top-left).
+  //            specified by |kLoopRestorationBorderRows|) in every 64x64 block
+  //            is copied into |loop_restoration_border_|.
+  // * Cdef: In-place filtering. Uses the |source_buffer_| and |cdef_border_| as
+  //         the input and the output is written into |cdef_buffer_| (which is
+  //         the same as |source_buffer_|).
+  // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| and
+  //             |superres_line_buffer_| as the input and the output is written
+  //             into |superres_buffer_| (which is just |cdef_buffer_| with a
+  //             shift to the top).
+  // * Restoration: Near in-place filtering.
+  //                Uses the |superres_buffer_| and |loop_restoration_border_|
+  //                as the input and the output is written into
+  //                |loop_restoration_buffer_| (which is just |superres_buffer_|
+  //                with a shift to the left).
   void ApplyFilteringThreaded();
 
   // Does the overall post processing filter for one superblock row starting at
@@ -123,17 +125,18 @@
   // Filter behavior (single-threaded):
   // * Deblock: In-place filtering. The output is written to |source_buffer_|.
   //            If cdef and loop restoration are both on, then 4 rows (as
-  //            specified by |kDeblockedRowsForLoopRestoration|) in every 64x64
-  //            block is copied into |deblock_buffer_|.
+  //            specified by |kLoopRestorationBorderRows|) in every 64x64 block
+  //            is copied into |loop_restoration_border_|.
   // * Cdef: In-place filtering. The output is written into |cdef_buffer_|
   //         (which is just |source_buffer_| with a shift to the top-left).
-  // * SuperRes: Near in-place filtering (with an additional line buffer for
-  //             each row). The output is written to |cdef_buffer_|.
-  // * Restoration: Near in-place filtering. Uses a local block of size 64x64.
-  //                Uses the |cdef_buffer_| and |deblock_buffer_| as the input
-  //                and the output is written into |loop_restoration_buffer_|
-  //                (which is just |source_buffer_| with a shift to the
-  //                top-left).
+  // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| as the input
+  //             and the output is written into |superres_buffer_| (which is
+  //             just |cdef_buffer_| with a shift to the top).
+  // * Restoration: Near in-place filtering.
+  //                Uses the |superres_buffer_| and |loop_restoration_border_|
+  //                as the input and the output is written into
+  //                |loop_restoration_buffer_| (which is just |superres_buffer_|
+  //                with a shift to the left or top-left).
   // Returns the index of the last row whose post processing is complete and can
   // be used for referencing.
   int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row,
@@ -170,25 +173,6 @@
     return DoDeblock(frame_header_, do_post_filter_mask_);
   }
 
-  // This function takes the cdef filtered buffer and the deblocked buffer to
-  // prepare a block as input for loop restoration.
-  // In striped loop restoration:
-  // The filtering needs to fetch the area of size (width + 6) x (height + 4),
-  // in which (width + 6) x height area is from upscaled frame
-  // (superres_buffer). Top 2 rows and bottom 2 rows are from deblocked frame
-  // (deblock_buffer). Special cases are: (1). when it is the top border, the
-  // top 2 rows are from cdef filtered frame. (2). when it is the bottom border,
-  // the bottom 2 rows are from cdef filtered frame. This function is called
-  // only when cdef is applied for this frame.
-  template <typename Pixel>
-  static void PrepareLoopRestorationBlock(const Pixel* src_buffer,
-                                          ptrdiff_t src_stride,
-                                          const Pixel* deblock_buffer,
-                                          ptrdiff_t deblock_stride, Pixel* dst,
-                                          ptrdiff_t dst_stride, int width,
-                                          int height, bool frame_top_border,
-                                          bool frame_bottom_border);
-
   uint8_t GetZeroDeltaDeblockFilterLevel(int segment_id, int level_index,
                                          ReferenceFrameType type,
                                          int mode_id) const {
@@ -235,36 +219,21 @@
   }
   LoopRestorationInfo* restoration_info() const { return restoration_info_; }
   uint8_t* GetBufferOffset(uint8_t* base_buffer, int stride, Plane plane,
-                           int row4x4, int column4x4) const {
-    return base_buffer +
-           RowOrColumn4x4ToPixel(row4x4, plane, subsampling_y_[plane]) *
-               stride +
-           RowOrColumn4x4ToPixel(column4x4, plane, subsampling_x_[plane]) *
-               pixel_size_;
+                           int row, int column) const {
+    return base_buffer + (row >> subsampling_y_[plane]) * stride +
+           ((column >> subsampling_x_[plane]) << pixel_size_log2_);
   }
   uint8_t* GetSourceBuffer(Plane plane, int row4x4, int column4x4) const {
     return GetBufferOffset(source_buffer_[plane], frame_buffer_.stride(plane),
-                           plane, row4x4, column4x4);
+                           plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
   }
-
-  static int GetWindowBufferWidth(const ThreadPool* const thread_pool,
-                                  const ObuFrameHeader& frame_header) {
-    return (thread_pool == nullptr) ? 0
-                                    : Align(frame_header.upscaled_width, 64);
+  uint8_t* GetCdefBuffer(Plane plane, int row4x4, int column4x4) const {
+    return GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+                           plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
   }
-
-  // For multi-threaded cdef and loop restoration, window height is the minimum
-  // of the following two quantities:
-  //  1) thread_count * 64
-  //  2) frame_height rounded up to the nearest power of 64
-  // Where 64 is the block size for cdef and loop restoration.
-  static int GetWindowBufferHeight(const ThreadPool* const thread_pool,
-                                   const ObuFrameHeader& frame_header) {
-    if (thread_pool == nullptr) return 0;
-    const int thread_count = 1 + thread_pool->num_threads();
-    const int window_height = MultiplyBy64(thread_count);
-    const int adjusted_frame_height = Align(frame_header.height, 64);
-    return std::min(adjusted_frame_height, window_height);
+  uint8_t* GetSuperResBuffer(Plane plane, int row4x4, int column4x4) const {
+    return GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane),
+                           plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
   }
 
   template <typename Pixel>
@@ -302,8 +271,13 @@
   // updated.
   void CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4,
                                       bool for_loop_restoration);
-  // Sets up the |deblock_buffer_| for loop restoration.
-  void SetupDeblockBuffer(int row4x4_start, int sb4x4);
+  // Sets up the |loop_restoration_border_| for loop restoration.
+  // This is called when there is no CDEF filter. We copy rows from
+  // |superres_buffer_| and do the line extension.
+  void SetupLoopRestorationBorder(int row4x4_start);
+  // This is called when the CDEF filter is applied. We copy rows from
+  // |source_buffer_|, apply superres and do the line extension.
+  void SetupLoopRestorationBorder(int row4x4_start, int sb4x4);
   // Returns true if we can perform border extension in loop (i.e.) without
   // waiting until the entire frame is decoded. If intra_block_copy is true, we
   // do in-loop border extension only if the upscaled_width is the same as 4 *
@@ -317,13 +291,21 @@
   template <typename Pixel>
   void CopyPlane(const Pixel* src, ptrdiff_t src_stride, int width, int height,
                  Pixel* dst, ptrdiff_t dst_stride) {
-    for (int y = 0; y < height; ++y) {
+    assert(height > 0);
+    do {
       memcpy(dst, src, width * sizeof(Pixel));
       src += src_stride;
       dst += dst_stride;
-    }
+    } while (--height != 0);
   }
 
+  // Worker function used for multi-threaded implementation of Deblocking, CDEF
+  // and Loop Restoration.
+  using WorkerFunction = void (PostFilter::*)(std::atomic<int>* row4x4_atomic);
+  // Schedules |worker| jobs to the |thread_pool_|, runs them in the calling
+  // thread and returns once all the jobs are completed.
+  void RunJobs(WorkerFunction worker);
+
   // Functions for the Deblocking filter.
 
   static int GetIndex(int row4x4) { return DivideBy4(row4x4); }
@@ -361,16 +343,25 @@
   // Applies deblock filtering for the superblock row starting at |row4x4| with
   // a height of 4*|sb4x4|.
   void ApplyDeblockFilterForOneSuperBlockRow(int row4x4, int sb4x4);
-  void DeblockFilterWorker(int jobs_per_plane, const Plane* planes,
-                           int num_planes, std::atomic<int>* job_counter,
-                           DeblockFilter deblock_filter);
-  void ApplyDeblockFilterThreaded();
+  // Worker function used for multi-threaded deblocking.
+  template <LoopFilterType loop_filter_type>
+  void DeblockFilterWorker(std::atomic<int>* row4x4_atomic);
+  static_assert(
+      std::is_same<
+          decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>),
+          WorkerFunction>::value,
+      "");
+  static_assert(
+      std::is_same<
+          decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>),
+          WorkerFunction>::value,
+      "");
 
   // Functions for the cdef filter.
 
-  uint8_t* GetCdefBufferAndStride(int start_x, int start_y, int plane,
-                                  int window_buffer_plane_size,
-                                  int* cdef_stride) const;
+  // Copies the deblocked pixels necessary for use by the multi-threaded cdef
+  // implementation into |cdef_border_|.
+  void SetupCdefBorder(int row4x4);
   // This function prepares the input source block for cdef filtering. The input
   // source block contains a 12x12 block, with the inner 8x8 as the desired
   // filter region. It pads the block if the 12x12 block includes out of frame
@@ -379,35 +370,43 @@
   template <typename Pixel>
   void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row4x4,
                         int column4x4, uint16_t* cdef_source,
-                        ptrdiff_t cdef_stride, bool y_plane);
+                        ptrdiff_t cdef_stride, bool y_plane,
+                        const uint8_t border_columns[kMaxPlanes][256],
+                        bool use_border_columns);
+  // Applies cdef for one 64x64 block.
   template <typename Pixel>
   void ApplyCdefForOneUnit(uint16_t* cdef_block, int index, int block_width4x4,
                            int block_height4x4, int row4x4_start,
-                           int column4x4_start);
+                           int column4x4_start,
+                           uint8_t border_columns[2][kMaxPlanes][256],
+                           bool use_border_columns[2][2]);
   // Helper function used by ApplyCdefForOneSuperBlockRow to avoid some code
   // duplication.
-  void ApplyCdefForOneSuperBlockRowHelper(int row4x4, int block_height4x4);
-  // Applies cdef filtering for the superblock row starting at |row4x4| with a
+  void ApplyCdefForOneSuperBlockRowHelper(
+      uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+      int row4x4, int block_height4x4);
+  // Applies CDEF filtering for the superblock row starting at |row4x4| with a
   // height of 4*|sb4x4|.
   void ApplyCdefForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row);
-  template <typename Pixel>
-  void ApplyCdefForOneRowInWindow(int row, int column);
-  template <typename Pixel>
-  void ApplyCdefThreaded();
-  void ApplyCdef();  // Sections 7.15 and 7.15.1.
+  // Worker function used for multi-threaded CDEF.
+  void ApplyCdefWorker(std::atomic<int>* row4x4_atomic);
+  static_assert(std::is_same<decltype(&PostFilter::ApplyCdefWorker),
+                             WorkerFunction>::value,
+                "");
 
   // Functions for the SuperRes filter.
 
-  // Applies super resolution for the |buffers| for |rows[plane]| rows of each
-  // plane. If |in_place| is true, the line buffer will not be used and the
-  // SuperRes output will be written to a row above the input row. If |in_place|
-  // is false, the line buffer will be used to store a copy of the input and the
-  // output will be written to the same row as the input row.
-  template <bool in_place>
-  void ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& buffers,
-                     const std::array<int, kMaxPlanes>& strides,
-                     const std::array<int, kMaxPlanes>& rows,
-                     size_t line_buffer_offset);  // Section 7.16.
+  // Applies super resolution to |src| for |rows[plane]| rows of each plane.
+  // If |line_buffer_row| is greater than or equal to 0, one more row will be
+  // processed, using the line buffer indicated by |line_buffer_row| as the
+  // source. If |dst_is_loop_restoration_border| is true, the |dst| pointers
+  // come from |loop_restoration_border_| and the strides will be populated
+  // from that buffer.
+  void ApplySuperRes(
+      const std::array<uint8_t*, kMaxPlanes>& src,
+      const std::array<int, kMaxPlanes>& rows, int line_buffer_row,
+      const std::array<uint8_t*, kMaxPlanes>& dst,
+      bool dst_is_loop_restoration_border = false);  // Section 7.16.
   // Applies SuperRes for the superblock row starting at |row4x4| with a height
   // of 4*|sb4x4|.
   void ApplySuperResForOneSuperBlockRow(int row4x4, int sb4x4,
@@ -416,22 +415,7 @@
 
   // Functions for the Loop Restoration filter.
 
-  template <typename Pixel>
-  void ApplyLoopRestorationForOneRowInWindow(
-      const Pixel* src_buffer, Plane plane, int plane_height, int plane_width,
-      int y, int x, int row, int unit_row, int current_process_unit_height,
-      int plane_unit_size, int window_width,
-      Array2DView<Pixel>* loop_restored_window);
-  // Applies loop restoration for the superblock row starting at |row4x4_start|
-  // with a height of 4*|sb4x4|.
-  template <typename Pixel>
-  void ApplyLoopRestorationSingleThread(int row4x4_start, int sb4x4);
-  void ApplyLoopRestoration(int row4x4_start, int sb4x4);
-  template <typename Pixel>
-  void ApplyLoopRestorationThreaded();
-  // Note for ApplyLoopRestoration():
-  // First, we must differentiate loop restoration processing unit from loop
-  // restoration unit.
+  // Notes about Loop Restoration:
   // (1). The loop restoration processing unit size defaults to 64x64.
   // Only when the remaining filtering area is smaller than 64x64 is the
   // processing unit size the actual area size.
@@ -460,7 +444,26 @@
   // then sizes of the first row of processing units are 64x56, 64x56, 12x56,
   // respectively. The second row is 64x64, 64x64, 12x64.
   // The third row is 64x20, 64x20, 12x20.
-  void ApplyLoopRestoration();
+
+  // |stride| is shared by |src_buffer| and |dst_buffer|.
+  template <typename Pixel>
+  void ApplyLoopRestorationForOneRow(const Pixel* src_buffer, ptrdiff_t stride,
+                                     Plane plane, int plane_height,
+                                     int plane_width, int y, int unit_row,
+                                     int current_process_unit_height,
+                                     int plane_unit_size, Pixel* dst_buffer);
+  // Applies loop restoration for the superblock row starting at |row4x4_start|
+  // with a height of 4*|sb4x4|.
+  template <typename Pixel>
+  void ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start, int sb4x4);
+  // Helper function that calls the right variant of
+  // ApplyLoopRestorationForOneSuperBlockRow based on the bitdepth.
+  void ApplyLoopRestoration(int row4x4_start, int sb4x4);
+  // Worker function used for multithreaded Loop Restoration.
+  void ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic);
+  static_assert(std::is_same<decltype(&PostFilter::ApplyLoopRestorationWorker),
+                             WorkerFunction>::value,
+                "");
 
   const ObuFrameHeader& frame_header_;
   const LoopRestoration& loop_restoration_;
@@ -473,7 +476,7 @@
   const int8_t subsampling_x_[kMaxPlanes];
   const int8_t subsampling_y_[kMaxPlanes];
   const int8_t planes_;
-  const int pixel_size_;
+  const int pixel_size_log2_;
   const uint8_t* const inner_thresh_;
   const uint8_t* const outer_thresh_;
   const bool needs_chroma_deblock_;
@@ -491,18 +494,13 @@
   } super_res_info_[kMaxPlanes];
   const Array2D<int16_t>& cdef_index_;
   const Array2D<TransformSize>& inter_transform_sizes_;
-  // Pointer to the data buffer used for multi-threaded cdef or loop
-  // restoration. The size of this buffer must be at least
-  // |window_buffer_width_| * |window_buffer_height_| * |pixel_size_|.
-  // Or |planes_| times that for multi-threaded cdef.
-  // If |thread_pool_| is nullptr, then this buffer is not used and can be
-  // nullptr as well.
-  uint8_t* const threaded_window_buffer_;
   LoopRestorationInfo* const restoration_info_;
-  // Pointer to the line buffer used by ApplySuperRes(). If SuperRes is on, then
-  // the buffer will be large enough to hold one downscaled row +
-  // 2 * kSuperResHorizontalBorder + kSuperResHorizontalPadding.
-  uint8_t* const superres_line_buffer_;
+  uint8_t* const superres_coefficients_[kNumPlaneTypes];
+  // Line buffer used by multi-threaded ApplySuperRes().
+  // In the multi-threaded case, this buffer will store the last downscaled row
+  // input of each thread to avoid overwrites by the first upscaled row output
+  // of the thread below it.
+  YuvBuffer& superres_line_buffer_;
   const BlockParametersHolder& block_parameters_;
   // Frame buffer to hold cdef filtered frame.
   YuvBuffer cdef_filtered_buffer_;
@@ -520,24 +518,27 @@
   // A view into |frame_buffer_| that points to the output of the Loop Restored
   // planes (to facilitate in-place Loop Restoration).
   uint8_t* loop_restoration_buffer_[kMaxPlanes];
-  // Buffer used to store the deblocked pixels that are necessary for loop
+  YuvBuffer& cdef_border_;
+  // Buffer used to store the border pixels that are necessary for loop
   // restoration. This buffer will store 4 rows for every 64x64 block (4 rows
   // for every 32x32 for chroma with subsampling). The indices of the rows that
-  // are stored are specified in |kDeblockedRowsForLoopRestoration|. First 4
-  // rows of this buffer are never populated and never used.
-  // This buffer is used only when both Cdef and Loop Restoration are on.
-  YuvBuffer& deblock_buffer_;
+  // are stored are specified in |kLoopRestorationBorderRows|. First 4 rows of
+  // this buffer are never populated and never used.
+  // This buffer is used only when both of the following conditions are true:
+  //   (1). Loop Restoration is on.
+  //   (2). Cdef is on, or multi-threading is enabled for the post filter.
+  YuvBuffer& loop_restoration_border_;
   const uint8_t do_post_filter_mask_;
   ThreadPool* const thread_pool_;
-  const int window_buffer_width_;
-  const int window_buffer_height_;
 
   // Tracks the progress of the post filters.
   int progress_row_ = -1;
 
   // A block buffer to hold the input that is converted to uint16_t before
-  // cdef filtering. Only used in single threaded case.
-  uint16_t cdef_block_[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 3];
+  // cdef filtering. Only used in the single-threaded case. The Y plane is
+  // processed separately. The U and V planes are processed together, so it
+  // is sufficient for this buffer to accommodate 2 planes at a time.
+  uint16_t cdef_block_[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
 
   template <int bitdepth, typename Pixel>
   friend class PostFilterSuperResTest;
@@ -551,11 +552,6 @@
                                                       ptrdiff_t stride,
                                                       int left, int right,
                                                       int top, int bottom);
-extern template void PostFilter::PrepareLoopRestorationBlock<uint8_t>(
-    const uint8_t* src_buffer, ptrdiff_t src_stride,
-    const uint8_t* deblock_buffer, ptrdiff_t deblock_stride, uint8_t* dst,
-    ptrdiff_t dst_stride, const int width, const int height,
-    const bool frame_top_border, const bool frame_bottom_border);
 
 #if LIBGAV1_MAX_BITDEPTH >= 10
 extern template void PostFilter::ExtendFrame<uint16_t>(uint16_t* frame_start,
@@ -563,11 +559,6 @@
                                                        ptrdiff_t stride,
                                                        int left, int right,
                                                        int top, int bottom);
-extern template void PostFilter::PrepareLoopRestorationBlock<uint16_t>(
-    const uint16_t* src_buffer, ptrdiff_t src_stride,
-    const uint16_t* deblock_buffer, ptrdiff_t deblock_stride, uint16_t* dst,
-    ptrdiff_t dst_stride, const int width, const int height,
-    const bool frame_top_border, const bool frame_bottom_border);
 #endif
 
 }  // namespace libgav1
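
GetBufferOffset() above now works directly in pixels: the row and column are subsampled per plane, and the byte offset uses a shift by |pixel_size_log2_| instead of a multiply by |pixel_size_|. A standalone sketch of the addressing math (hedged; it assumes pixel_size_log2 is 0 for 8-bit frames and 1 for 10/12-bit frames stored as uint16_t):

    #include <cstddef>
    #include <cstdint>

    // Returns the byte address of the pixel at (row, column), where row and
    // column are in luma pixel units and the plane may be subsampled.
    uint8_t* PlanePixel(uint8_t* base, ptrdiff_t stride, int row, int column,
                        int subsampling_x, int subsampling_y,
                        int pixel_size_log2) {
      return base + (row >> subsampling_y) * stride +
             (static_cast<ptrdiff_t>(column >> subsampling_x)
              << pixel_size_log2);
    }
    // Example: for a 4:2:0 chroma plane of a 10-bit frame (subsampling 1/1,
    // pixel_size_log2 1), luma position (row 64, column 128) maps to byte
    // offset 32 * stride + 128.
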
diff --git a/libgav1/src/post_filter/cdef.cc b/libgav1/src/post_filter/cdef.cc
index 9b6bb00..f32b0a0 100644
--- a/libgav1/src/post_filter/cdef.cc
+++ b/libgav1/src/post_filter/cdef.cc
@@ -26,16 +26,20 @@
     {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}},
     {{7, 0, 2, 4, 5, 6, 6, 6}, {0, 1, 2, 3, 4, 5, 6, 7}}};
 
+constexpr int kCdefBorderRows[2][4] = {{0, 1, 62, 63}, {0, 1, 30, 31}};
+
 template <typename Pixel>
 void CopyRowForCdef(const Pixel* src, int block_width, int unit_width,
                     bool is_frame_left, bool is_frame_right,
-                    uint16_t* const dst) {
+                    uint16_t* const dst, const Pixel* left_border = nullptr) {
   if (sizeof(src[0]) == sizeof(dst[0])) {
     if (is_frame_left) {
       Memset(dst - kCdefBorder, kCdefLargeValue, kCdefBorder);
-    } else {
+    } else if (left_border == nullptr) {
       memcpy(dst - kCdefBorder, src - kCdefBorder,
              kCdefBorder * sizeof(dst[0]));
+    } else {
+      memcpy(dst - kCdefBorder, left_border, kCdefBorder * sizeof(dst[0]));
     }
     memcpy(dst, src, block_width * sizeof(dst[0]));
     if (is_frame_right) {
@@ -47,8 +51,18 @@
     }
     return;
   }
-  for (int x = -kCdefBorder; x < 0; ++x) {
-    dst[x] = is_frame_left ? static_cast<uint16_t>(kCdefLargeValue) : src[x];
+  if (is_frame_left) {
+    for (int x = -kCdefBorder; x < 0; ++x) {
+      dst[x] = static_cast<uint16_t>(kCdefLargeValue);
+    }
+  } else if (left_border == nullptr) {
+    for (int x = -kCdefBorder; x < 0; ++x) {
+      dst[x] = src[x];
+    }
+  } else {
+    for (int x = -kCdefBorder; x < 0; ++x) {
+      dst[x] = left_border[x + kCdefBorder];
+    }
   }
   for (int x = 0; x < block_width; ++x) {
     dst[x] = src[x];
@@ -72,38 +86,48 @@
 
 }  // namespace
 
-uint8_t* PostFilter::GetCdefBufferAndStride(const int start_x,
-                                            const int start_y, const int plane,
-                                            const int window_buffer_plane_size,
-                                            int* cdef_stride) const {
-  if (thread_pool_ != nullptr) {
-    // write output to threaded_window_buffer.
-    *cdef_stride = window_buffer_width_ * pixel_size_;
-    const int column_window =
-        start_x % (window_buffer_width_ >> subsampling_x_[plane]);
-    const int row_window =
-        start_y % (window_buffer_height_ >> subsampling_y_[plane]);
-    return threaded_window_buffer_ + plane * window_buffer_plane_size +
-           row_window * (*cdef_stride) + column_window * pixel_size_;
-  }
-  // write output to |cdef_buffer_|.
-  *cdef_stride = frame_buffer_.stride(plane);
-  return cdef_buffer_[plane] + start_y * (*cdef_stride) + start_x * pixel_size_;
+void PostFilter::SetupCdefBorder(int row4x4) {
+  assert(row4x4 >= 0);
+  assert(DoCdef());
+  int plane = kPlaneY;
+  do {
+    const ptrdiff_t src_stride = frame_buffer_.stride(plane);
+    const ptrdiff_t dst_stride = cdef_border_.stride(plane);
+    const int row_offset = DivideBy4(row4x4);
+    const int num_pixels = SubsampledValue(
+        MultiplyBy4(frame_header_.columns4x4), subsampling_x_[plane]);
+    const int row_width = num_pixels << pixel_size_log2_;
+    const int plane_height = SubsampledValue(MultiplyBy4(frame_header_.rows4x4),
+                                             subsampling_y_[plane]);
+    for (int i = 0; i < 4; ++i) {
+      const int row = kCdefBorderRows[subsampling_y_[plane]][i];
+      const int absolute_row =
+          (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+      if (absolute_row >= plane_height) break;
+      const uint8_t* src =
+          GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+          row * src_stride;
+      uint8_t* dst = cdef_border_.data(plane) + dst_stride * (row_offset + i);
+      memcpy(dst, src, row_width);
+    }
+  } while (++plane < planes_);
 }
 
 template <typename Pixel>
 void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4,
                                   int row4x4, int column4x4,
                                   uint16_t* cdef_source, ptrdiff_t cdef_stride,
-                                  const bool y_plane) {
+                                  const bool y_plane,
+                                  const uint8_t border_columns[kMaxPlanes][256],
+                                  bool use_border_columns) {
   assert(y_plane || planes_ == kMaxPlanes);
   const int max_planes = y_plane ? 1 : kMaxPlanes;
   const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU];
   const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU];
   const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
   const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
-  const int plane_width = RightShiftWithRounding(width_, subsampling_x);
-  const int plane_height = RightShiftWithRounding(height_, subsampling_y);
+  const int plane_width = SubsampledValue(width_, subsampling_x);
+  const int plane_height = SubsampledValue(height_, subsampling_y);
   const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
   const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
   // unit_width, unit_height are the same as block_width, block_height unless
@@ -117,20 +141,33 @@
   const bool is_frame_top = row4x4 == 0;
   const bool is_frame_bottom = start_y + block_height >= plane_height;
   const int y_offset = is_frame_top ? 0 : kCdefBorder;
+  const int cdef_border_row_offset = DivideBy4(row4x4) - (is_frame_top ? 0 : 2);
 
   for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) {
-    uint16_t* cdef_src = cdef_source + plane * kCdefUnitSizeWithBorders *
+    uint16_t* cdef_src = cdef_source + static_cast<int>(plane == kPlaneV) *
+                                           kCdefUnitSizeWithBorders *
                                            kCdefUnitSizeWithBorders;
     const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
     const Pixel* src_buffer =
         reinterpret_cast<const Pixel*>(source_buffer_[plane]) +
         (start_y - y_offset) * src_stride + start_x;
+    const int cdef_border_stride = cdef_border_.stride(plane) / sizeof(Pixel);
+    const Pixel* cdef_border =
+        (thread_pool_ == nullptr)
+            ? nullptr
+            : reinterpret_cast<const Pixel*>(cdef_border_.data(plane)) +
+                  cdef_border_row_offset * cdef_border_stride + start_x;
 
     // All the copying code will use negative indices for populating the left
     // border. So the starting point is set to kCdefBorder.
     cdef_src += kCdefBorder;
 
-    // Copy the top 2 rows.
+    // Copy the top 2 rows as follows:
+    // If is_frame_top is true, both rows are set to kCdefLargeValue.
+    // Otherwise:
+    //   If multi-threaded filtering is off, the rows are copied from
+    //   |src_buffer|.
+    //   Otherwise, the rows are copied from |cdef_border|.
     if (is_frame_top) {
       for (int y = 0; y < kCdefBorder; ++y) {
         Memset(cdef_src - kCdefBorder, kCdefLargeValue,
@@ -138,24 +175,63 @@
         cdef_src += cdef_stride;
       }
     } else {
+      const Pixel* top_border =
+          (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+      const int top_border_stride =
+          (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
       for (int y = 0; y < kCdefBorder; ++y) {
-        CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
+        CopyRowForCdef(top_border, block_width, unit_width, is_frame_left,
                        is_frame_right, cdef_src);
-        src_buffer += src_stride;
+        top_border += top_border_stride;
         cdef_src += cdef_stride;
+        // We need to increment |src_buffer| and |cdef_border| in this loop to
+        // set them up for the subsequent loops below.
+        src_buffer += src_stride;
+        cdef_border += cdef_border_stride;
       }
     }
 
-    // Copy the body.
+    // Copy the body as follows:
+    // If multi-threaded filtering is off or if is_frame_bottom is true, all the
+    // rows are copied from |src_buffer|.
+    // Otherwise, the first |block_height|-kCdefBorder rows are copied from
+    // |src_buffer| and the last kCdefBorder rows are copied from |cdef_border|.
     int y = block_height;
+    const int y_threshold =
+        (thread_pool_ == nullptr || is_frame_bottom) ? 0 : kCdefBorder;
+    const Pixel* left_border =
+        (thread_pool_ == nullptr || !use_border_columns)
+            ? nullptr
+            : reinterpret_cast<const Pixel*>(border_columns[plane]);
     do {
       CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
-                     is_frame_right, cdef_src);
+                     is_frame_right, cdef_src, left_border);
       cdef_src += cdef_stride;
       src_buffer += src_stride;
-    } while (--y != 0);
+      if (left_border != nullptr) left_border += kCdefBorder;
+    } while (--y != y_threshold);
 
-    // Copy the bottom 2 rows.
+    if (y > 0) {
+      assert(y == kCdefBorder);
+      // |cdef_border| now points to the top 2 rows of the current block. For
+      // the next loop, we need it to point to the bottom 2 rows of the
+      // current block. So increment it by 2 rows.
+      cdef_border += MultiplyBy2(cdef_border_stride);
+      for (int i = 0; i < kCdefBorder; ++i) {
+        CopyRowForCdef(cdef_border, block_width, unit_width, is_frame_left,
+                       is_frame_right, cdef_src);
+        cdef_src += cdef_stride;
+        cdef_border += cdef_border_stride;
+      }
+    }
+
+    // Copy the bottom 2 rows as follows:
+    // If is_frame_bottom is true, both rows are set to kCdefLargeValue.
+    // Otherwise:
+    //   If multi-threaded filtering is off, the rows are copied from
+    //   |src_buffer|.
+    //   Otherwise, the rows are copied from |cdef_border|.
+    y = 0;
     if (is_frame_bottom) {
       do {
         Memset(cdef_src - kCdefBorder, kCdefLargeValue,
@@ -163,10 +239,14 @@
         cdef_src += cdef_stride;
       } while (++y < kCdefBorder + unit_height - block_height);
     } else {
+      const Pixel* bottom_border =
+          (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+      const int bottom_border_stride =
+          (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
       do {
-        CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
+        CopyRowForCdef(bottom_border, block_width, unit_width, is_frame_left,
                        is_frame_right, cdef_src);
-        src_buffer += src_stride;
+        bottom_border += bottom_border_stride;
         cdef_src += cdef_stride;
       } while (++y < kCdefBorder + unit_height - block_height);
     }
@@ -178,54 +258,91 @@
                                      const int block_width4x4,
                                      const int block_height4x4,
                                      const int row4x4_start,
-                                     const int column4x4_start) {
+                                     const int column4x4_start,
+                                     uint8_t border_columns[2][kMaxPlanes][256],
+                                     bool use_border_columns[2][2]) {
   // Cdef operates in 8x8 blocks (4x4 for chroma with subsampling).
   static constexpr int kStep = 8;
   static constexpr int kStep4x4 = 2;
 
-  const int window_buffer_plane_size =
-      window_buffer_width_ * window_buffer_height_ * sizeof(Pixel);
   int cdef_buffer_row_base_stride[kMaxPlanes];
-  int cdef_buffer_stride[kMaxPlanes];
   uint8_t* cdef_buffer_row_base[kMaxPlanes];
   int src_buffer_row_base_stride[kMaxPlanes];
   const uint8_t* src_buffer_row_base[kMaxPlanes];
+  const uint16_t* cdef_src_row_base[kMaxPlanes];
+  int cdef_src_row_base_stride[kMaxPlanes];
   int column_step[kMaxPlanes];
-  assert(planes_ >= 1);
-  for (int plane = kPlaneY; plane < planes_; ++plane) {
-    const int start_y = MultiplyBy4(row4x4_start) >> subsampling_y_[plane];
-    const int start_x = MultiplyBy4(column4x4_start) >> subsampling_x_[plane];
-    cdef_buffer_row_base[plane] = GetCdefBufferAndStride(
-        start_x, start_y, plane, window_buffer_plane_size,
-        &cdef_buffer_stride[plane]);
+  assert(planes_ == kMaxPlanesMonochrome || planes_ == kMaxPlanes);
+  int plane = kPlaneY;
+  do {
+    cdef_buffer_row_base[plane] =
+        GetCdefBuffer(static_cast<Plane>(plane), row4x4_start, column4x4_start);
     cdef_buffer_row_base_stride[plane] =
-        cdef_buffer_stride[plane] * (kStep >> subsampling_y_[plane]);
-    src_buffer_row_base[plane] = source_buffer_[plane] +
-                                 start_y * frame_buffer_.stride(plane) +
-                                 start_x * sizeof(Pixel);
+        frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+    src_buffer_row_base[plane] = GetSourceBuffer(static_cast<Plane>(plane),
+                                                 row4x4_start, column4x4_start);
     src_buffer_row_base_stride[plane] =
         frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+    cdef_src_row_base[plane] =
+        cdef_block +
+        static_cast<int>(plane == kPlaneV) * kCdefUnitSizeWithBorders *
+            kCdefUnitSizeWithBorders +
+        kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
+    cdef_src_row_base_stride[plane] =
+        kCdefUnitSizeWithBorders * (kStep >> subsampling_y_[plane]);
     column_step[plane] = (kStep >> subsampling_x_[plane]) * sizeof(Pixel);
-  }
+  } while (++plane < planes_);
+
+  // |border_columns| contains two buffers. In each call to this function, one
+  // of them is the "destination" for the current call and the other is the
+  // "source" (it was the "destination" of the previous call). src_index is
+  // used to read the borders that were backed up in the previous call, and
+  // dst_index is used to back up the borders that the next call will use.
+  const int border_columns_src_index = DivideBy16(column4x4_start) & 1;
+  const int border_columns_dst_index = border_columns_src_index ^ 1;
 
   if (index == -1) {
-    for (int plane = kPlaneY; plane < planes_; ++plane) {
-      CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
-                 cdef_buffer_row_base[plane], cdef_buffer_stride[plane],
-                 MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
-                 MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
-                 sizeof(Pixel));
+    if (thread_pool_ == nullptr) {
+      int plane = kPlaneY;
+      do {
+        CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+                   MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+                   sizeof(Pixel));
+      } while (++plane < planes_);
     }
+    use_border_columns[border_columns_dst_index][0] = false;
+    use_border_columns[border_columns_dst_index][1] = false;
     return;
   }
 
-  PrepareCdefBlock<Pixel>(block_width4x4, block_height4x4, row4x4_start,
-                          column4x4_start, cdef_block, kCdefUnitSizeWithBorders,
-                          true);
+  const bool is_frame_right =
+      MultiplyBy4(column4x4_start) + MultiplyBy4(block_width4x4) >= width_;
+  if (!is_frame_right && thread_pool_ != nullptr) {
+    // Backup the last 2 columns for use in the next iteration.
+    use_border_columns[border_columns_dst_index][0] = true;
+    const uint8_t* src_line =
+        GetSourceBuffer(kPlaneY, row4x4_start,
+                        column4x4_start + block_width4x4) -
+        kCdefBorder * sizeof(Pixel);
+    CopyPixels(src_line, frame_buffer_.stride(kPlaneY),
+               border_columns[border_columns_dst_index][kPlaneY],
+               kCdefBorder * sizeof(Pixel), kCdefBorder,
+               MultiplyBy4(block_height4x4), sizeof(Pixel));
+  }
+
+  PrepareCdefBlock<Pixel>(
+      block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+      cdef_block, kCdefUnitSizeWithBorders, true,
+      (border_columns != nullptr) ? border_columns[border_columns_src_index]
+                                  : nullptr,
+      use_border_columns[border_columns_src_index][0]);
 
   // Stored direction used during the u/v pass.  If bit 3 is set, then block is
   // a skip.
-  int direction_y[8 * 8];
+  uint8_t direction_y[8 * 8];
   int y_index = 0;
 
   const uint8_t y_primary_strength =
@@ -248,14 +365,16 @@
   do {
     uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY];
     const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY];
+    const uint16_t* cdef_src_base = cdef_src_row_base[kPlaneY];
     BlockParameters* const* bp0 = bp_row0_base;
     BlockParameters* const* bp1 = bp_row1_base;
     int column4x4 = column4x4_start;
     do {
       const int block_width = kStep;
       const int block_height = kStep;
-      const int cdef_stride = cdef_buffer_stride[kPlaneY];
+      const int cdef_stride = frame_buffer_.stride(kPlaneY);
       uint8_t* const cdef_buffer = cdef_buffer_base;
+      const uint16_t* const cdef_src = cdef_src_base;
       const int src_stride = frame_buffer_.stride(kPlaneY);
       const uint8_t* const src_buffer = src_buffer_base;
 
@@ -264,16 +383,39 @@
 
       if (skip) {  // No cdef filtering.
         direction_y[y_index] = kCdefSkip;
-        CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
-                   block_width, block_height, sizeof(Pixel));
+        if (thread_pool_ == nullptr) {
+          CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+                     block_width, block_height, sizeof(Pixel));
+        }
       } else {
         // Zero out residual skip flag.
         direction_y[y_index] = 0;
 
         int variance = 0;
         if (compute_direction_and_variance) {
-          dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
-                              &variance);
+          if (thread_pool_ == nullptr ||
+              row4x4 + kStep4x4 < row4x4_start + block_height4x4) {
+            dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
+                                &variance);
+          } else if (sizeof(Pixel) == 2) {
+            dsp_.cdef_direction(cdef_src, kCdefUnitSizeWithBorders * 2,
+                                &direction_y[y_index], &variance);
+          } else {
+            // If we are in the last row4x4 for this unit, then the last two
+            // input rows have to come from |cdef_border_|. Since we already
+            // have |cdef_src| populated correctly, use that as the input
+            // for the direction process.
+            uint8_t direction_src[8][8];
+            const uint16_t* cdef_src_line = cdef_src;
+            for (auto& direction_src_line : direction_src) {
+              for (int i = 0; i < 8; ++i) {
+                direction_src_line[i] = cdef_src_line[i];
+              }
+              cdef_src_line += kCdefUnitSizeWithBorders;
+            }
+            dsp_.cdef_direction(direction_src, 8, &direction_y[y_index],
+                                &variance);
+          }
         }
         const int direction =
             (y_primary_strength == 0) ? 0 : direction_y[y_index];
@@ -283,16 +425,12 @@
             (variance != 0)
                 ? (y_primary_strength * (4 + variance_strength) + 8) >> 4
                 : 0;
-
         if ((primary_strength | y_secondary_strength) == 0) {
-          CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
-                     block_width, block_height, sizeof(Pixel));
+          if (thread_pool_ == nullptr) {
+            CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+                       block_width, block_height, sizeof(Pixel));
+          }
         } else {
-          uint16_t* cdef_src =
-              cdef_block + kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
-          cdef_src +=
-              (MultiplyBy4(row4x4 - row4x4_start)) * kCdefUnitSizeWithBorders +
-              (MultiplyBy4(column4x4 - column4x4_start));
           const int strength_index =
               y_strength_index | (static_cast<int>(primary_strength == 0) << 1);
           dsp_.cdef_filters[1][strength_index](
@@ -303,6 +441,7 @@
       }
       cdef_buffer_base += column_step[kPlaneY];
       src_buffer_base += column_step[kPlaneY];
+      cdef_src_base += column_step[kPlaneY] / sizeof(Pixel);
 
       bp0 += kStep4x4;
       bp1 += kStep4x4;
@@ -312,6 +451,7 @@
 
     cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY];
     src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY];
+    cdef_src_row_base[kPlaneY] += cdef_src_row_base_stride[kPlaneY];
     bp_row0_base += bp_stride;
     bp_row1_base += bp_stride;
     row4x4 += kStep4x4;
@@ -327,19 +467,41 @@
       frame_header_.cdef.uv_secondary_strength[index];
 
   if ((uv_primary_strength | uv_secondary_strength) == 0) {
-    for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
-      CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
-                 cdef_buffer_row_base[plane], cdef_buffer_stride[plane],
-                 MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
-                 MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
-                 sizeof(Pixel));
+    if (thread_pool_ == nullptr) {
+      for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+        CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+                   MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+                   MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+                   sizeof(Pixel));
+      }
     }
+    use_border_columns[border_columns_dst_index][1] = false;
     return;
   }
 
-  PrepareCdefBlock<Pixel>(block_width4x4, block_height4x4, row4x4_start,
-                          column4x4_start, cdef_block, kCdefUnitSizeWithBorders,
-                          false);
+  if (!is_frame_right && thread_pool_ != nullptr) {
+    use_border_columns[border_columns_dst_index][1] = true;
+    for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+      // Back up the last 2 columns for use in the next iteration.
+      const uint8_t* src_line =
+          GetSourceBuffer(static_cast<Plane>(plane), row4x4_start,
+                          column4x4_start + block_width4x4) -
+          kCdefBorder * sizeof(Pixel);
+      CopyPixels(src_line, frame_buffer_.stride(plane),
+                 border_columns[border_columns_dst_index][plane],
+                 kCdefBorder * sizeof(Pixel), kCdefBorder,
+                 MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+                 sizeof(Pixel));
+    }
+  }
+
+  PrepareCdefBlock<Pixel>(
+      block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+      cdef_block, kCdefUnitSizeWithBorders, false,
+      (border_columns != nullptr) ? border_columns[border_columns_src_index]
+                                  : nullptr,
+      use_border_columns[border_columns_src_index][1]);
 
   // uv_strength_index is 0 for both primary and secondary strengths being
   // non-zero, 1 for primary only, 2 for secondary only.
@@ -357,18 +519,22 @@
     do {
       uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane];
       const uint8_t* src_buffer_base = src_buffer_row_base[plane];
+      const uint16_t* cdef_src_base = cdef_src_row_base[plane];
       int column4x4 = column4x4_start;
       do {
-        const int cdef_stride = cdef_buffer_stride[plane];
+        const int cdef_stride = frame_buffer_.stride(plane);
         uint8_t* const cdef_buffer = cdef_buffer_base;
         const int src_stride = frame_buffer_.stride(plane);
         const uint8_t* const src_buffer = src_buffer_base;
+        const uint16_t* const cdef_src = cdef_src_base;
         const bool skip = (direction_y[y_index] & kCdefSkip) != 0;
         int dual_cdef = 0;
 
         if (skip) {  // No cdef filtering.
-          CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
-                     block_width, block_height, sizeof(Pixel));
+          if (thread_pool_ == nullptr) {
+            CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+                       block_width, block_height, sizeof(Pixel));
+          }
         } else {
           // Make sure block pair is not out of bounds.
           if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) {
@@ -396,13 +562,6 @@
             }
           }
 
-          uint16_t* cdef_src = cdef_block + plane * kCdefUnitSizeWithBorders *
-                                                kCdefUnitSizeWithBorders;
-          cdef_src += kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
-          cdef_src +=
-              (MultiplyBy4(row4x4 - row4x4_start) >> subsampling_y) *
-                  kCdefUnitSizeWithBorders +
-              (MultiplyBy4(column4x4 - column4x4_start) >> subsampling_x);
           // Block width is 8 if either dual_cdef is true or subsampling_x == 0.
           const int width_index = dual_cdef | (subsampling_x ^ 1);
           dsp_.cdef_filters[width_index][uv_strength_index](
@@ -415,19 +574,23 @@
         // so adjust the pointers and indexes for 2 blocks.
         cdef_buffer_base += column_step[plane] << dual_cdef;
         src_buffer_base += column_step[plane] << dual_cdef;
+        cdef_src_base += (column_step[plane] / sizeof(Pixel)) << dual_cdef;
         column4x4 += kStep4x4 << dual_cdef;
         y_index += 1 << dual_cdef;
       } while (column4x4 < column4x4_start + block_width4x4);
 
       cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane];
       src_buffer_row_base[plane] += src_buffer_row_base_stride[plane];
+      cdef_src_row_base[plane] += cdef_src_row_base_stride[plane];
       row4x4 += kStep4x4;
     } while (row4x4 < row4x4_start + block_height4x4);
   }
 }
 
-void PostFilter::ApplyCdefForOneSuperBlockRowHelper(int row4x4,
-                                                    int block_height4x4) {
+void PostFilter::ApplyCdefForOneSuperBlockRowHelper(
+    uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+    int row4x4, int block_height4x4) {
+  bool use_border_columns[2][2] = {};
   for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
        column4x4 += kStep64x64) {
     const int index = cdef_index_[DivideBy16(row4x4)][DivideBy16(column4x4)];
@@ -436,13 +599,15 @@
 
 #if LIBGAV1_MAX_BITDEPTH >= 10
     if (bitdepth_ >= 10) {
-      ApplyCdefForOneUnit<uint16_t>(cdef_block_, index, block_width4x4,
-                                    block_height4x4, row4x4, column4x4);
+      ApplyCdefForOneUnit<uint16_t>(cdef_block, index, block_width4x4,
+                                    block_height4x4, row4x4, column4x4,
+                                    border_columns, use_border_columns);
       continue;
     }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
-    ApplyCdefForOneUnit<uint8_t>(cdef_block_, index, block_width4x4,
-                                 block_height4x4, row4x4, column4x4);
+    ApplyCdefForOneUnit<uint8_t>(cdef_block, index, block_width4x4,
+                                 block_height4x4, row4x4, column4x4,
+                                 border_columns, use_border_columns);
   }
 }
 
@@ -461,7 +626,7 @@
     // first iteration (y == 0).
     if (row4x4 > 0 && (!is_last_row || y == 0)) {
       assert(row4x4 >= 16);
-      ApplyCdefForOneSuperBlockRowHelper(row4x4 - 2, 2);
+      ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4 - 2, 2);
     }
 
     // Apply cdef for the current superblock row. If this is the last superblock
@@ -471,101 +636,25 @@
         std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
     const int height4x4 = block_height4x4 - (is_last_row ? 0 : 2);
     if (height4x4 > 0) {
-      ApplyCdefForOneSuperBlockRowHelper(row4x4, height4x4);
+      ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4,
+                                         height4x4);
     }
   }
 }
 
-template <typename Pixel>
-void PostFilter::ApplyCdefForOneRowInWindow(const int row4x4,
-                                            const int column4x4_start) {
-  uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 3];
-
-  for (int column4x4_64x64 = 0;
-       column4x4_64x64 < std::min(DivideBy4(window_buffer_width_),
-                                  frame_header_.columns4x4 - column4x4_start);
-       column4x4_64x64 += kStep64x64) {
-    const int column4x4 = column4x4_start + column4x4_64x64;
-    const int index = cdef_index_[DivideBy16(row4x4)][DivideBy16(column4x4)];
-    const int block_width4x4 =
-        std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
+void PostFilter::ApplyCdefWorker(std::atomic<int>* row4x4_atomic) {
+  int row4x4;
+  uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
+  // Each border_column buffer has to store 64 rows and 2 columns for each
+  // plane. For 10bit, that is 64*2*2 = 256 bytes.
+  alignas(kMaxAlignment) uint8_t border_columns[2][kMaxPlanes][256];
+  while ((row4x4 = row4x4_atomic->fetch_add(
+              kStep64x64, std::memory_order_relaxed)) < frame_header_.rows4x4) {
     const int block_height4x4 =
         std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
-
-    ApplyCdefForOneUnit<Pixel>(cdef_block, index, block_width4x4,
-                               block_height4x4, row4x4, column4x4);
+    ApplyCdefForOneSuperBlockRowHelper(cdef_block, border_columns, row4x4,
+                                       block_height4x4);
   }
 }
 
-// Each thread processes one row inside the window.
-// Y, U, V planes are processed together inside one thread.
-template <typename Pixel>
-void PostFilter::ApplyCdefThreaded() {
-  assert((window_buffer_height_ & 63) == 0);
-  const int num_workers = thread_pool_->num_threads();
-  const int window_buffer_plane_size =
-      window_buffer_width_ * window_buffer_height_;
-  const int window_buffer_height4x4 = DivideBy4(window_buffer_height_);
-  for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
-       row4x4 += window_buffer_height4x4) {
-    const int actual_window_height4x4 =
-        std::min(window_buffer_height4x4, frame_header_.rows4x4 - row4x4);
-    const int vertical_units_per_window =
-        DivideBy16(actual_window_height4x4 + 15);
-    for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
-         column4x4 += DivideBy4(window_buffer_width_)) {
-      const int jobs_for_threadpool =
-          vertical_units_per_window * num_workers / (num_workers + 1);
-      BlockingCounter pending_jobs(jobs_for_threadpool);
-      int job_count = 0;
-      for (int row64x64 = 0; row64x64 < actual_window_height4x4;
-           row64x64 += kStep64x64) {
-        if (job_count < jobs_for_threadpool) {
-          thread_pool_->Schedule(
-              [this, row4x4, column4x4, row64x64, &pending_jobs]() {
-                ApplyCdefForOneRowInWindow<Pixel>(row4x4 + row64x64, column4x4);
-                pending_jobs.Decrement();
-              });
-        } else {
-          ApplyCdefForOneRowInWindow<Pixel>(row4x4 + row64x64, column4x4);
-        }
-        ++job_count;
-      }
-      pending_jobs.Wait();
-
-      // Copy |threaded_window_buffer_| to |cdef_buffer_|.
-      for (int plane = kPlaneY; plane < planes_; ++plane) {
-        const ptrdiff_t src_stride =
-            frame_buffer_.stride(plane) / sizeof(Pixel);
-        const int plane_row = MultiplyBy4(row4x4) >> subsampling_y_[plane];
-        const int plane_column =
-            MultiplyBy4(column4x4) >> subsampling_x_[plane];
-        int copy_width = std::min(frame_header_.columns4x4 - column4x4,
-                                  DivideBy4(window_buffer_width_));
-        copy_width = MultiplyBy4(copy_width) >> subsampling_x_[plane];
-        int copy_height =
-            std::min(frame_header_.rows4x4 - row4x4, window_buffer_height4x4);
-        copy_height = MultiplyBy4(copy_height) >> subsampling_y_[plane];
-        CopyPlane<Pixel>(
-            reinterpret_cast<const Pixel*>(threaded_window_buffer_) +
-                plane * window_buffer_plane_size,
-            window_buffer_width_, copy_width, copy_height,
-            reinterpret_cast<Pixel*>(cdef_buffer_[plane]) +
-                plane_row * src_stride + plane_column,
-            src_stride);
-      }
-    }
-  }
-}
-
-void PostFilter::ApplyCdef() {
-#if LIBGAV1_MAX_BITDEPTH >= 10
-  if (bitdepth_ >= 10) {
-    ApplyCdefThreaded<uint16_t>();
-    return;
-  }
-#endif
-  ApplyCdefThreaded<uint8_t>();
-}
-
 }  // namespace libgav1
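
Editor's note: the worker above replaces the old window-based ApplyCdefThreaded. Every thread repeatedly claims the next 64-pixel superblock row from a shared atomic counter and filters it with its own stack scratch (the cdef_block plus, per plane, the 64 rows x 2 columns x 2 bytes = 256 bytes of border_columns described in the comment). A minimal sketch of that claim loop, with hypothetical names standing in for the PostFilter members and frame geometry:

    #include <algorithm>
    #include <atomic>

    namespace {

    constexpr int kStep64x64 = 16;  // One 64-pixel superblock row == 16 4x4 rows.

    // Illustrative stand-in for PostFilter::ApplyCdefWorker(): each thread keeps
    // claiming the next superblock row until the shared counter passes the bottom
    // of the frame. Rows are independent once the borders have been saved, so no
    // other synchronization is needed.
    void CdefWorkerSketch(std::atomic<int>* row4x4_atomic, int rows4x4) {
      int row4x4;
      while ((row4x4 = row4x4_atomic->fetch_add(
                  kStep64x64, std::memory_order_relaxed)) < rows4x4) {
        const int block_height4x4 = std::min(kStep64x64, rows4x4 - row4x4);
        // Filter rows [row4x4, row4x4 + block_height4x4) with per-thread scratch.
        static_cast<void>(block_height4x4);
      }
    }

    }  // namespace

    int main() {
      std::atomic<int> row4x4(0);
      CdefWorkerSketch(&row4x4, /*rows4x4=*/68);  // e.g. a 272-pixel-tall frame.
      return 0;
    }
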
diff --git a/libgav1/src/post_filter/deblock.cc b/libgav1/src/post_filter/deblock.cc
index c4e0852..9b5ed0f 100644
--- a/libgav1/src/post_filter/deblock.cc
+++ b/libgav1/src/post_filter/deblock.cc
@@ -14,7 +14,6 @@
 #include <atomic>
 
 #include "src/post_filter.h"
-#include "src/utils/blocking_counter.h"
 
 namespace libgav1 {
 namespace {
@@ -261,7 +260,7 @@
       kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeVertical];
   const int filter_id_v =
       kDeblockFilterLevelIndex[kPlaneV][kLoopFilterTypeVertical];
-  const BlockParameters* bp_prev = *(bp_ptr - (1 << subsampling_x));
+  const BlockParameters* bp_prev = *(bp_ptr - (ptrdiff_t{1} << subsampling_x));
 
   if (bp == bp_prev) {
     // Not a border.
@@ -299,7 +298,7 @@
 void PostFilter::HorizontalDeblockFilter(int row4x4_start,
                                          int column4x4_start) {
   const int column_step = 1;
-  const size_t src_step = MultiplyBy4(pixel_size_);
+  const int src_step = 4 << pixel_size_log2_;
   const ptrdiff_t src_stride = frame_buffer_.stride(kPlaneY);
   uint8_t* src = GetSourceBuffer(kPlaneY, row4x4_start, column4x4_start);
   int row_step;
@@ -383,6 +382,7 @@
   BlockParameters* const* bp_row_base =
       block_parameters_.Address(row4x4_start, column4x4_start);
   const int bp_stride = block_parameters_.columns4x4();
+  const int column_step_shift = pixel_size_log2_;
   for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit &&
                        MultiplyBy4(row4x4_start + row4x4) < height_;
        ++row4x4, src += row_stride, bp_row_base += bp_stride) {
@@ -400,7 +400,7 @@
             src_row, src_stride, outer_thresh_[level], inner_thresh_[level],
             HevThresh(level));
       }
-      src_row += column_step * pixel_size_;
+      src_row += column_step << column_step_shift;
       column_step = DivideBy4(column_step);
     }
   }
@@ -424,7 +424,7 @@
     BlockParameters* const* bp_row_base = block_parameters_.Address(
         GetDeblockPosition(row4x4_start, subsampling_y),
         GetDeblockPosition(column4x4_start, subsampling_x));
-    const int bp_stride = block_parameters_.columns4x4() * row_step;
+    const int bp_stride = block_parameters_.columns4x4() << subsampling_y;
     for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit &&
                          MultiplyBy4(row4x4_start + row4x4) < height_;
          row4x4 += row_step, src_u += row_stride_u, src_v += row_stride_v,
@@ -450,8 +450,8 @@
               src_row_v, src_stride_v, outer_thresh_[level_v],
               inner_thresh_[level_v], HevThresh(level_v));
         }
-        src_row_u += column_step * pixel_size_;
-        src_row_v += column_step * pixel_size_;
+        src_row_u += column_step << column_step_shift;
+        src_row_v += column_step << column_step_shift;
         column_step = DivideBy4(column_step << subsampling_x);
       }
     }
@@ -481,67 +481,23 @@
   }
 }
 
-void PostFilter::DeblockFilterWorker(int jobs_per_plane,
-                                     const Plane* /*planes*/,
-                                     int /*num_planes*/,
-                                     std::atomic<int>* job_counter,
-                                     DeblockFilter deblock_filter) {
-  const int total_jobs = jobs_per_plane;
-  int job_index;
-  while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) <
-         total_jobs) {
-    const int row_unit = job_index % jobs_per_plane;
-    const int row4x4 = row_unit * kNum4x4InLoopFilterUnit;
+template <LoopFilterType loop_filter_type>
+void PostFilter::DeblockFilterWorker(std::atomic<int>* row4x4_atomic) {
+  int row4x4;
+  while ((row4x4 = row4x4_atomic->fetch_add(kNum4x4InLoopFilterUnit,
+                                            std::memory_order_relaxed)) <
+         frame_header_.rows4x4) {
     for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
          column4x4 += kNum4x4InLoopFilterUnit) {
-      (this->*deblock_filter)(row4x4, column4x4);
+      (this->*deblock_filter_func_[loop_filter_type])(row4x4, column4x4);
     }
   }
 }
 
-void PostFilter::ApplyDeblockFilterThreaded() {
-  const int jobs_per_plane = DivideBy16(frame_header_.rows4x4 + 15);
-  const int num_workers = thread_pool_->num_threads();
-  std::array<Plane, kMaxPlanes> planes;
-  planes[0] = kPlaneY;
-  int num_planes = 1;
-  for (int plane = kPlaneU; plane < planes_; ++plane) {
-    if (frame_header_.loop_filter.level[plane + 1] != 0) {
-      planes[num_planes++] = static_cast<Plane>(plane);
-    }
-  }
-  // The vertical filters are not dependent on each other. So simply schedule
-  // them for all possible rows.
-  //
-  // The horizontal filter for a row/column depends on the vertical filter being
-  // finished for the blocks to the top and to the right. To work around
-  // this synchronization, we simply wait for the vertical filter to finish for
-  // all rows. Now, the horizontal filters can also be scheduled
-  // unconditionally similar to the vertical filters.
-  //
-  // The only synchronization involved is to know when the each directional
-  // filter is complete for the entire frame.
-  for (const auto& type :
-       {kLoopFilterTypeVertical, kLoopFilterTypeHorizontal}) {
-    const DeblockFilter deblock_filter = deblock_filter_func_[type];
-    std::atomic<int> job_counter(0);
-    BlockingCounter pending_workers(num_workers);
-    for (int i = 0; i < num_workers; ++i) {
-      thread_pool_->Schedule([this, jobs_per_plane, &planes, num_planes,
-                              &job_counter, deblock_filter,
-                              &pending_workers]() {
-        DeblockFilterWorker(jobs_per_plane, planes.data(), num_planes,
-                            &job_counter, deblock_filter);
-        pending_workers.Decrement();
-      });
-    }
-    // Run the jobs on the current thread.
-    DeblockFilterWorker(jobs_per_plane, planes.data(), num_planes, &job_counter,
-                        deblock_filter);
-    // Wait for the threadpool jobs to finish.
-    pending_workers.Wait();
-  }
-}
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>(
+    std::atomic<int>* row4x4_atomic);
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>(
+    std::atomic<int>* row4x4_atomic);
 
 void PostFilter::ApplyDeblockFilter(LoopFilterType loop_filter_type,
                                     int row4x4_start, int column4x4_start,
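
Editor's note: the templated DeblockFilterWorker keeps the same claim-a-row pattern, with the filter direction fixed at compile time and dispatched through the deblock_filter_func_ member-pointer table; ApplyFilteringThreaded still runs the vertical pass to completion before starting the horizontal one. A compilable sketch of that shape (the class, table, and constants are simplified stand-ins, not the real PostFilter interface):

    #include <atomic>

    namespace {

    enum LoopFilterType : int {
      kLoopFilterTypeVertical,
      kLoopFilterTypeHorizontal,
      kNumLoopFilterTypes
    };

    constexpr int kNum4x4InLoopFilterUnit = 16;

    class DeblockSketch {
     public:
      DeblockSketch(int rows4x4, int columns4x4)
          : rows4x4_(rows4x4), columns4x4_(columns4x4) {}

      // Rows are claimed from a shared atomic counter; the direction is a
      // template parameter, so the member-pointer lookup is fixed per
      // instantiation.
      template <LoopFilterType type>
      void Worker(std::atomic<int>* row4x4_atomic) {
        int row4x4;
        while ((row4x4 = row4x4_atomic->fetch_add(
                    kNum4x4InLoopFilterUnit, std::memory_order_relaxed)) <
               rows4x4_) {
          for (int column4x4 = 0; column4x4 < columns4x4_;
               column4x4 += kNum4x4InLoopFilterUnit) {
            (this->*deblock_filter_func_[type])(row4x4, column4x4);
          }
        }
      }

     private:
      void VerticalFilter(int /*row4x4*/, int /*column4x4*/) {}
      void HorizontalFilter(int /*row4x4*/, int /*column4x4*/) {}

      using DeblockFilter = void (DeblockSketch::*)(int, int);
      const DeblockFilter deblock_filter_func_[kNumLoopFilterTypes] = {
          &DeblockSketch::VerticalFilter, &DeblockSketch::HorizontalFilter};

      const int rows4x4_;
      const int columns4x4_;
    };

    }  // namespace

    int main() {
      DeblockSketch deblock(/*rows4x4=*/68, /*columns4x4=*/120);
      std::atomic<int> row4x4(0);
      deblock.Worker<kLoopFilterTypeVertical>(&row4x4);
      row4x4 = 0;
      deblock.Worker<kLoopFilterTypeHorizontal>(&row4x4);
      return 0;
    }
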
diff --git a/libgav1/src/post_filter/loop_restoration.cc b/libgav1/src/post_filter/loop_restoration.cc
index 17670b9..826ef48 100644
--- a/libgav1/src/post_filter/loop_restoration.cc
+++ b/libgav1/src/post_filter/loop_restoration.cc
@@ -15,182 +15,103 @@
 #include "src/utils/blocking_counter.h"
 
 namespace libgav1 {
-namespace {
 
 template <typename Pixel>
-void CopyTwoRows(const Pixel* src, const ptrdiff_t src_stride, Pixel** dst,
-                 const ptrdiff_t dst_stride, const int width) {
-  for (int i = 0; i < kRestorationVerticalBorder; ++i) {
-    memcpy(*dst, src, sizeof(Pixel) * width);
-    src += src_stride;
-    *dst += dst_stride;
-  }
-}
-
-}  // namespace
-
-// static
-template <typename Pixel>
-void PostFilter::PrepareLoopRestorationBlock(
-    const Pixel* src_buffer, const ptrdiff_t src_stride,
-    const Pixel* deblock_buffer, const ptrdiff_t deblock_stride, Pixel* dst,
-    const ptrdiff_t dst_stride, const int width, const int height,
-    const bool frame_top_border, const bool frame_bottom_border) {
-  src_buffer -=
-      kRestorationVerticalBorder * src_stride + kRestorationHorizontalBorder;
-  deblock_buffer -= kRestorationHorizontalBorder;
-  int h = height;
-  // Top 2 rows.
-  if (frame_top_border) {
-    h += kRestorationVerticalBorder;
-  } else {
-    CopyTwoRows<Pixel>(deblock_buffer, deblock_stride, &dst, dst_stride,
-                       width + 2 * kRestorationHorizontalBorder);
-    src_buffer += kRestorationVerticalBorder * src_stride;
-    // If |frame_top_border| is true, then we are in the first superblock row,
-    // so in that case, do not increment |deblock_buffer| since we don't store
-    // anything from the first superblock row into |deblock_buffer|.
-    deblock_buffer += 4 * deblock_stride;
-  }
-  if (frame_bottom_border) h += kRestorationVerticalBorder;
-  // Main body.
-  do {
-    memcpy(dst, src_buffer,
-           sizeof(Pixel) * (width + 2 * kRestorationHorizontalBorder));
-    src_buffer += src_stride;
-    dst += dst_stride;
-  } while (--h != 0);
-  // Bottom 2 rows.
-  if (!frame_bottom_border) {
-    deblock_buffer += kRestorationVerticalBorder * deblock_stride;
-    CopyTwoRows<Pixel>(deblock_buffer, deblock_stride, &dst, dst_stride,
-                       width + 2 * kRestorationHorizontalBorder);
-  }
-}
-
-template void PostFilter::PrepareLoopRestorationBlock<uint8_t>(
-    const uint8_t* src_buffer, ptrdiff_t src_stride,
-    const uint8_t* deblock_buffer, ptrdiff_t deblock_stride, uint8_t* dst,
-    ptrdiff_t dst_stride, const int width, const int height,
-    const bool frame_top_border, const bool frame_bottom_border);
-
-#if LIBGAV1_MAX_BITDEPTH >= 10
-template void PostFilter::PrepareLoopRestorationBlock<uint16_t>(
-    const uint16_t* src_buffer, ptrdiff_t src_stride,
-    const uint16_t* deblock_buffer, ptrdiff_t deblock_stride, uint16_t* dst,
-    ptrdiff_t dst_stride, const int width, const int height,
-    const bool frame_top_border, const bool frame_bottom_border);
-#endif
-
-template <typename Pixel>
-void PostFilter::ApplyLoopRestorationForOneRowInWindow(
-    const Pixel* src_buffer, const Plane plane, const int plane_height,
-    const int plane_width, const int y, const int x, const int row,
+void PostFilter::ApplyLoopRestorationForOneRow(
+    const Pixel* src_buffer, const ptrdiff_t stride, const Plane plane,
+    const int plane_height, const int plane_width, const int unit_y,
     const int unit_row, const int current_process_unit_height,
-    const int plane_unit_size, const int window_width,
-    Array2DView<Pixel>* const loop_restored_window) {
+    const int plane_unit_size, Pixel* dst_buffer) {
   const int num_horizontal_units =
       restoration_info_->num_horizontal_units(static_cast<Plane>(plane));
-  const ptrdiff_t src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
   const RestorationUnitInfo* const restoration_info =
       restoration_info_->loop_restoration_info(static_cast<Plane>(plane),
                                                unit_row * num_horizontal_units);
-  int unit_column = x / plane_unit_size;
-  src_buffer += (y + row) * src_stride + x;
+  const bool in_place = DoCdef() || thread_pool_ != nullptr;
+  const Pixel* border = nullptr;
+  ptrdiff_t border_stride = 0;
+  src_buffer += unit_y * stride;
+  if (in_place) {
+    const int border_unit_y = std::max(
+        RightShiftWithCeiling(unit_y, 4 - subsampling_y_[plane]) - 4, 0);
+    border_stride = loop_restoration_border_.stride(plane) / sizeof(Pixel);
+    border =
+        reinterpret_cast<const Pixel*>(loop_restoration_border_.data(plane)) +
+        border_unit_y * border_stride;
+  }
+  int unit_column = 0;
   int column = 0;
   do {
-    const int unit_x = x + column;
-    const int unit_y = y + row;
     const int current_process_unit_width =
-        std::min(plane_unit_size, plane_width - unit_x);
+        std::min(plane_unit_size, plane_width - column);
     const Pixel* src = src_buffer + column;
     unit_column = std::min(unit_column, num_horizontal_units - 1);
     if (restoration_info[unit_column].type == kLoopRestorationTypeNone) {
-      const ptrdiff_t dst_stride = loop_restored_window->columns();
-      Pixel* dst = &(*loop_restored_window)[row][column];
-      for (int k = 0; k < current_process_unit_height; ++k) {
-        if (DoCdef()) {
+      Pixel* dst = dst_buffer + column;
+      if (in_place) {
+        int k = current_process_unit_height;
+        do {
           memmove(dst, src, current_process_unit_width * sizeof(Pixel));
-        } else {
-          memcpy(dst, src, current_process_unit_width * sizeof(Pixel));
-        }
-        src += src_stride;
-        dst += dst_stride;
+          src += stride;
+          dst += stride;
+        } while (--k != 0);
+      } else {
+        CopyPlane(src, stride, current_process_unit_width,
+                  current_process_unit_height, dst, stride);
       }
     } else {
-      const ptrdiff_t block_buffer_stride = kRestorationUnitWidthWithBorders;
-      // The SIMD implementation of wiener filter over-reads 15 -
-      // |kRestorationHorizontalBorder| bytes, and the SIMD implementation of
-      // self-guided filter over-reads up to 7 bytes which happens when
-      // |current_process_unit_width| equals |kRestorationUnitWidth| - 7, and
-      // the radius of the first pass in sfg is 0. So add 8 extra bytes at the
-      // end of block_buffer for 8 bit.
-      Pixel
-          block_buffer[kRestorationUnitHeightWithBorders * block_buffer_stride +
-                       ((sizeof(Pixel) == 1) ? 15 - kRestorationHorizontalBorder
-                                             : 0)];
-      RestorationBuffer restoration_buffer;
-      const Pixel* source;
-      ptrdiff_t source_stride;
-      if (DoCdef()) {
-        const int deblock_buffer_units = 64 >> subsampling_y_[plane];
-        const auto* const deblock_buffer =
-            reinterpret_cast<const Pixel*>(deblock_buffer_.data(plane));
-        assert(deblock_buffer != nullptr);
-        const ptrdiff_t deblock_buffer_stride =
-            deblock_buffer_.stride(plane) / sizeof(Pixel);
-        const int deblock_unit_y =
-            std::max(MultiplyBy4(Ceil(unit_y, deblock_buffer_units)) - 4, 0);
-        const Pixel* const deblock_unit_buffer =
-            deblock_buffer + deblock_unit_y * deblock_buffer_stride + unit_x;
-        PrepareLoopRestorationBlock<Pixel>(
-            src, src_stride, deblock_unit_buffer, deblock_buffer_stride,
-            block_buffer, block_buffer_stride, current_process_unit_width,
-            current_process_unit_height, unit_y == 0,
-            unit_y + current_process_unit_height >= plane_height);
-        source = block_buffer +
-                 kRestorationVerticalBorder * block_buffer_stride +
-                 kRestorationHorizontalBorder;
-        source_stride = kRestorationUnitWidthWithBorders;
-      } else {
-        source = src;
-        source_stride = src_stride;
+      const Pixel* top_border = src - kRestorationVerticalBorder * stride;
+      ptrdiff_t top_border_stride = stride;
+      const Pixel* bottom_border = src + current_process_unit_height * stride;
+      ptrdiff_t bottom_border_stride = stride;
+      const bool frame_bottom_border =
+          (unit_y + current_process_unit_height >= plane_height);
+      if (in_place && (unit_y != 0 || !frame_bottom_border)) {
+        const Pixel* loop_restoration_border = border + column;
+        if (unit_y != 0) {
+          top_border = loop_restoration_border;
+          top_border_stride = border_stride;
+          loop_restoration_border += 4 * border_stride;
+        }
+        if (!frame_bottom_border) {
+          bottom_border = loop_restoration_border +
+                          kRestorationVerticalBorder * border_stride;
+          bottom_border_stride = border_stride;
+        }
       }
+      RestorationBuffer restoration_buffer;
       const LoopRestorationType type = restoration_info[unit_column].type;
       assert(type == kLoopRestorationTypeSgrProj ||
              type == kLoopRestorationTypeWiener);
       const dsp::LoopRestorationFunc restoration_func =
           dsp_.loop_restorations[type - 2];
-      restoration_func(source, &(*loop_restored_window)[row][column],
-                       restoration_info[unit_column], source_stride,
-                       loop_restored_window->columns(),
+      restoration_func(restoration_info[unit_column], src, stride, top_border,
+                       top_border_stride, bottom_border, bottom_border_stride,
                        current_process_unit_width, current_process_unit_height,
-                       &restoration_buffer);
+                       &restoration_buffer, dst_buffer + column);
     }
     ++unit_column;
     column += plane_unit_size;
-  } while (column < window_width);
+  } while (column < plane_width);
 }
 
 template <typename Pixel>
-void PostFilter::ApplyLoopRestorationSingleThread(const int row4x4_start,
-                                                  const int sb4x4) {
+void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(const int row4x4_start,
+                                                         const int sb4x4) {
   assert(row4x4_start >= 0);
   assert(DoRestoration());
-  for (int plane = 0; plane < planes_; ++plane) {
+  int plane = kPlaneY;
+  do {
     if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
       continue;
     }
     const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(Pixel);
     const int unit_height_offset =
         kRestorationUnitOffset >> subsampling_y_[plane];
-    const int plane_height =
-        RightShiftWithRounding(height_, subsampling_y_[plane]);
+    const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
     const int plane_width =
-        RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]);
-    const int num_vertical_units =
-        restoration_info_->num_vertical_units(static_cast<Plane>(plane));
-    const int plane_unit_size = loop_restoration_.unit_size[plane];
+        SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+    const int plane_unit_size = 1 << loop_restoration_.unit_size_log2[plane];
     const int plane_process_unit_height =
         kRestorationUnitHeight >> subsampling_y_[plane];
     int y = (row4x4_start == 0)
@@ -203,171 +124,53 @@
     for (int sb_y = 0; sb_y < sb4x4;
          sb_y += 16, y += current_process_unit_height) {
       if (y >= plane_height) break;
-      const int unit_row = std::min((y + unit_height_offset) / plane_unit_size,
-                                    num_vertical_units - 1);
+      const int unit_row = std::min(
+          (y + unit_height_offset) >> loop_restoration_.unit_size_log2[plane],
+          restoration_info_->num_vertical_units(static_cast<Plane>(plane)) - 1);
       current_process_unit_height = std::min(expected_height, plane_height - y);
       expected_height = plane_process_unit_height;
-      Array2DView<Pixel> loop_restored_window(
-          current_process_unit_height, static_cast<int>(stride),
+      ApplyLoopRestorationForOneRow<Pixel>(
+          reinterpret_cast<Pixel*>(superres_buffer_[plane]), stride,
+          static_cast<Plane>(plane), plane_height, plane_width, y, unit_row,
+          current_process_unit_height, plane_unit_size,
           reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) +
               y * stride);
-      ApplyLoopRestorationForOneRowInWindow<Pixel>(
-          reinterpret_cast<Pixel*>(superres_buffer_[plane]),
-          static_cast<Plane>(plane), plane_height, plane_width, y, 0, 0,
-          unit_row, current_process_unit_height, plane_unit_size, plane_width,
-          &loop_restored_window);
     }
-  }
-}
-
-// Multi-thread version of loop restoration, based on a moving window of size
-// |window_buffer_width_|x|window_buffer_height_|. Inside the moving window, we
-// create a filtering job for each row and each filtering job is submitted to
-// the thread pool. Each free thread takes one job from the thread pool and
-// completes filtering until all jobs are finished. This approach requires an
-// extra buffer (|threaded_window_buffer_|) to hold the filtering output, whose
-// size is the size of the window. It also needs block buffers (i.e.,
-// |block_buffer| in ApplyLoopRestorationForOneRowInWindow()) to store
-// intermediate results in loop restoration for each thread. After all units
-// inside the window are filtered, the output is written to the frame buffer.
-template <typename Pixel>
-void PostFilter::ApplyLoopRestorationThreaded() {
-  const int plane_process_unit_height[kMaxPlanes] = {
-      kRestorationUnitHeight, kRestorationUnitHeight >> subsampling_y_[kPlaneU],
-      kRestorationUnitHeight >> subsampling_y_[kPlaneV]};
-  Array2DView<Pixel> loop_restored_window;
-  if (!DoCdef()) {
-    loop_restored_window.Reset(
-        window_buffer_height_, window_buffer_width_,
-        reinterpret_cast<Pixel*>(threaded_window_buffer_));
-  }
-
-  for (int plane = kPlaneY; plane < planes_; ++plane) {
-    if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
-      continue;
-    }
-
-    const int unit_height_offset =
-        kRestorationUnitOffset >> subsampling_y_[plane];
-    auto* const src_buffer = reinterpret_cast<Pixel*>(superres_buffer_[plane]);
-    const ptrdiff_t src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
-    const int plane_unit_size = loop_restoration_.unit_size[plane];
-    const int num_vertical_units =
-        restoration_info_->num_vertical_units(static_cast<Plane>(plane));
-    const int plane_width =
-        RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]);
-    const int plane_height =
-        RightShiftWithRounding(height_, subsampling_y_[plane]);
-    PostFilter::ExtendFrame<Pixel>(
-        src_buffer, plane_width, plane_height, src_stride,
-        kRestorationHorizontalBorder, kRestorationHorizontalBorder,
-        kRestorationVerticalBorder, kRestorationVerticalBorder);
-
-    const int num_workers = thread_pool_->num_threads();
-    for (int y = 0; y < plane_height; y += window_buffer_height_) {
-      const int actual_window_height =
-          std::min(window_buffer_height_ - ((y == 0) ? unit_height_offset : 0),
-                   plane_height - y);
-      int vertical_units_per_window =
-          (actual_window_height + plane_process_unit_height[plane] - 1) /
-          plane_process_unit_height[plane];
-      if (y == 0) {
-        // The first row of loop restoration processing units is not 64x64, but
-        // 64x56 (|unit_height_offset| = 8 rows less than other restoration
-        // processing units). For u/v with subsampling, the size is halved. To
-        // compute the number of vertical units per window, we need to take a
-        // special handling for it.
-        const int height_without_first_unit =
-            actual_window_height -
-            std::min(actual_window_height,
-                     plane_process_unit_height[plane] - unit_height_offset);
-        vertical_units_per_window =
-            (height_without_first_unit + plane_process_unit_height[plane] - 1) /
-                plane_process_unit_height[plane] +
-            1;
-      }
-      const int jobs_for_threadpool =
-          vertical_units_per_window * num_workers / (num_workers + 1);
-      for (int x = 0; x < plane_width; x += window_buffer_width_) {
-        const int actual_window_width =
-            std::min(window_buffer_width_, plane_width - x);
-        assert(jobs_for_threadpool < vertical_units_per_window);
-        if (DoCdef()) {
-          loop_restored_window.Reset(
-              actual_window_height, static_cast<int>(src_stride),
-              reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) +
-                  y * src_stride + x);
-        }
-        BlockingCounter pending_jobs(jobs_for_threadpool);
-        int job_count = 0;
-        int current_process_unit_height;
-        for (int row = 0; row < actual_window_height;
-             row += current_process_unit_height) {
-          const int unit_y = y + row;
-          const int expected_height = plane_process_unit_height[plane] -
-                                      ((unit_y == 0) ? unit_height_offset : 0);
-          current_process_unit_height =
-              std::min(expected_height, plane_height - unit_y);
-          const int unit_row =
-              std::min((unit_y + unit_height_offset) / plane_unit_size,
-                       num_vertical_units - 1);
-
-          if (job_count < jobs_for_threadpool) {
-            thread_pool_->Schedule(
-                [this, src_buffer, plane, plane_height, plane_width, y, x, row,
-                 unit_row, current_process_unit_height, plane_unit_size,
-                 actual_window_width, &loop_restored_window, &pending_jobs]() {
-                  ApplyLoopRestorationForOneRowInWindow<Pixel>(
-                      src_buffer, static_cast<Plane>(plane), plane_height,
-                      plane_width, y, x, row, unit_row,
-                      current_process_unit_height, plane_unit_size,
-                      actual_window_width, &loop_restored_window);
-                  pending_jobs.Decrement();
-                });
-          } else {
-            ApplyLoopRestorationForOneRowInWindow<Pixel>(
-                src_buffer, static_cast<Plane>(plane), plane_height,
-                plane_width, y, x, row, unit_row, current_process_unit_height,
-                plane_unit_size, actual_window_width, &loop_restored_window);
-          }
-          ++job_count;
-        }
-        // Wait for all jobs of current window to finish.
-        pending_jobs.Wait();
-        if (!DoCdef()) {
-          // Copy |threaded_window_buffer_| to output frame.
-          CopyPlane<Pixel>(
-              reinterpret_cast<const Pixel*>(threaded_window_buffer_),
-              window_buffer_width_, actual_window_width, actual_window_height,
-              reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) +
-                  y * src_stride + x,
-              src_stride);
-        }
-      }
-      if (y == 0) y -= unit_height_offset;
-    }
-  }
+  } while (++plane < planes_);
 }
 
 void PostFilter::ApplyLoopRestoration(const int row4x4_start, const int sb4x4) {
 #if LIBGAV1_MAX_BITDEPTH >= 10
   if (bitdepth_ >= 10) {
-    ApplyLoopRestorationSingleThread<uint16_t>(row4x4_start, sb4x4);
+    ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(row4x4_start, sb4x4);
     return;
   }
 #endif
-  ApplyLoopRestorationSingleThread<uint8_t>(row4x4_start, sb4x4);
+  ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(row4x4_start, sb4x4);
 }
 
-void PostFilter::ApplyLoopRestoration() {
-  assert(threaded_window_buffer_ != nullptr);
+void PostFilter::ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic) {
+  int row4x4;
+  // Loop Restoration operates with a lag of 8 rows (4 for chroma with
+  // subsampling) and hence we need to make sure to cover the last 8 rows of the
+  // last superblock row. So we run this loop for an extra iteration to
+  // accomplish that.
+  const int row4x4_end = frame_header_.rows4x4 + kNum4x4InLoopRestorationUnit;
+  while ((row4x4 = row4x4_atomic->fetch_add(kNum4x4InLoopRestorationUnit,
+                                            std::memory_order_relaxed)) <
+         row4x4_end) {
+    CopyBordersForOneSuperBlockRow(row4x4, kNum4x4InLoopRestorationUnit,
+                                   /*for_loop_restoration=*/true);
 #if LIBGAV1_MAX_BITDEPTH >= 10
-  if (bitdepth_ >= 10) {
-    ApplyLoopRestorationThreaded<uint16_t>();
-    return;
-  }
+    if (bitdepth_ >= 10) {
+      ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(
+          row4x4, kNum4x4InLoopRestorationUnit);
+      continue;
+    }
 #endif
-  ApplyLoopRestorationThreaded<uint8_t>();
+    ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(
+        row4x4, kNum4x4InLoopRestorationUnit);
+  }
 }
 
 }  // namespace libgav1
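
Editor's note: as the comment in ApplyLoopRestorationWorker says, restoration lags the filtered output by 8 luma rows, so the row counter is deliberately allowed to run one unit past rows4x4 and the extra claim picks up the trailing rows. A small sketch of that loop; kNum4x4InLoopRestorationUnit is assumed to be 16 (one 64-pixel restoration unit) and the per-row work is elided:

    #include <atomic>

    namespace {

    constexpr int kNum4x4InLoopRestorationUnit = 16;

    // Note the "+ kNum4x4InLoopRestorationUnit" on the end bound. For a frame
    // with rows4x4 = 64, the claims 0, 16, 32 and 48 each restore rows that lag
    // the cdef/superres output by 8 luma rows; the extra claim at 64 (still
    // < 64 + 16) restores the final 8 rows that no earlier claim reached.
    void LoopRestorationWorkerSketch(std::atomic<int>* row4x4_atomic,
                                     int rows4x4) {
      int row4x4;
      const int row4x4_end = rows4x4 + kNum4x4InLoopRestorationUnit;
      while ((row4x4 = row4x4_atomic->fetch_add(
                  kNum4x4InLoopRestorationUnit, std::memory_order_relaxed)) <
             row4x4_end) {
        // CopyBordersForOneSuperBlockRow() followed by
        // ApplyLoopRestorationForOneSuperBlockRow() would run here for this
        // superblock row.
      }
    }

    }  // namespace

    int main() {
      std::atomic<int> row4x4(0);
      LoopRestorationWorkerSketch(&row4x4, /*rows4x4=*/64);
      return 0;
    }
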
diff --git a/libgav1/src/post_filter/post_filter.cc b/libgav1/src/post_filter/post_filter.cc
index 6d5ef31..7671f01 100644
--- a/libgav1/src/post_filter/post_filter.cc
+++ b/libgav1/src/post_filter/post_filter.cc
@@ -24,6 +24,8 @@
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
 #include "src/utils/array_2d.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
 #include "src/utils/constants.h"
 #include "src/utils/memory.h"
 #include "src/utils/types.h"
@@ -34,11 +36,10 @@
 // Import all the constants in the anonymous namespace.
 #include "src/post_filter/deblock_thresholds.inc"
 
-// Row indices of deblocked pixels needed by loop restoration. This is used to
-// populate the |deblock_buffer_| when cdef is on. The first dimension is
-// subsampling_y.
-constexpr int kDeblockedRowsForLoopRestoration[2][4] = {{54, 55, 56, 57},
-                                                        {26, 27, 28, 29}};
+// Row indices of loop restoration border. This is used to populate the
+// |loop_restoration_border_| when either cdef is on or multithreading is
+// enabled. The dimension is subsampling_y.
+constexpr int kLoopRestorationBorderRows[2] = {54, 26};
 
 }  // namespace
 
@@ -77,15 +78,13 @@
                              const int height, const ptrdiff_t stride,
                              const int left, const int right, const int top,
                              const int bottom) {
-  const Pixel* src = frame_start;
-  Pixel* dst = frame_start - left;
+  Pixel* src = frame_start;
   // Copy to left and right borders.
-  for (int y = 0; y < height; ++y) {
-    Memset(dst, src[0], left);
-    Memset(dst + left + width, src[width - 1], right);
+  int y = height;
+  do {
+    ExtendLine<Pixel>(src, width, left, right);
     src += stride;
-    dst += stride;
-  }
+  } while (--y != 0);
   // Copy to bottom borders. For performance we copy |stride| pixels
   // (including some padding pixels potentially) in each row, ending at the
   // bottom right border pixel. In the diagram the asterisks indicate padding
@@ -98,7 +97,7 @@
   // **YYY|YZabcdef|fff
   // **YYY|YZabcdef|fff <-- bottom right border pixel
   assert(src == frame_start + height * stride);
-  dst = const_cast<Pixel*>(src) + width + right - stride;
+  Pixel* dst = src - left;
   src = dst - stride;
   for (int y = 0; y < bottom; ++y) {
     memcpy(dst, src, sizeof(Pixel) * stride);
@@ -159,34 +158,42 @@
                      sequence_header.color_config.subsampling_y},
       planes_(sequence_header.color_config.is_monochrome ? kMaxPlanesMonochrome
                                                          : kMaxPlanes),
-      pixel_size_(static_cast<int>((bitdepth_ == 8) ? sizeof(uint8_t)
-                                                    : sizeof(uint16_t))),
+      pixel_size_log2_(static_cast<int>((bitdepth_ == 8) ? sizeof(uint8_t)
+                                                         : sizeof(uint16_t)) -
+                       1),
       inner_thresh_(kInnerThresh[frame_header.loop_filter.sharpness]),
       outer_thresh_(kOuterThresh[frame_header.loop_filter.sharpness]),
       needs_chroma_deblock_(frame_header.loop_filter.level[kPlaneU + 1] != 0 ||
                             frame_header.loop_filter.level[kPlaneV + 1] != 0),
       cdef_index_(frame_scratch_buffer->cdef_index),
       inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
-      threaded_window_buffer_(
-          frame_scratch_buffer->threaded_window_buffer.get()),
       restoration_info_(&frame_scratch_buffer->loop_restoration_info),
-      superres_line_buffer_(frame_scratch_buffer->superres_line_buffer.get()),
+      superres_coefficients_{
+          frame_scratch_buffer->superres_coefficients[kPlaneTypeY].get(),
+          frame_scratch_buffer
+              ->superres_coefficients
+                  [(sequence_header.color_config.is_monochrome ||
+                    sequence_header.color_config.subsampling_x == 0)
+                       ? kPlaneTypeY
+                       : kPlaneTypeUV]
+              .get()},
+      superres_line_buffer_(frame_scratch_buffer->superres_line_buffer),
       block_parameters_(frame_scratch_buffer->block_parameters_holder),
       frame_buffer_(*frame_buffer),
-      deblock_buffer_(frame_scratch_buffer->deblock_buffer),
+      cdef_border_(frame_scratch_buffer->cdef_border),
+      loop_restoration_border_(frame_scratch_buffer->loop_restoration_border),
       do_post_filter_mask_(do_post_filter_mask),
       thread_pool_(
-          frame_scratch_buffer->threading_strategy.post_filter_thread_pool()),
-      window_buffer_width_(GetWindowBufferWidth(thread_pool_, frame_header)),
-      window_buffer_height_(GetWindowBufferHeight(thread_pool_, frame_header)) {
+          frame_scratch_buffer->threading_strategy.post_filter_thread_pool()) {
   const int8_t zero_delta_lf[kFrameLfCount] = {};
   ComputeDeblockFilterLevels(zero_delta_lf, deblock_filter_levels_);
   if (DoSuperRes()) {
-    for (int plane = 0; plane < planes_; ++plane) {
+    int plane = kPlaneY;
+    do {
       const int downscaled_width =
-          RightShiftWithRounding(width_, subsampling_x_[plane]);
+          SubsampledValue(width_, subsampling_x_[plane]);
       const int upscaled_width =
-          RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]);
+          SubsampledValue(upscaled_width_, subsampling_x_[plane]);
       const int superres_width = downscaled_width << kSuperResScaleBits;
       super_res_info_[plane].step =
           (superres_width + upscaled_width / 2) / upscaled_width;
@@ -199,46 +206,58 @@
            (1 << (kSuperResExtraBits - 1)) - error / 2) &
           kSuperResScaleMask;
       super_res_info_[plane].upscaled_width = upscaled_width;
+    } while (++plane < planes_);
+    if (dsp->super_res_coefficients != nullptr) {
+      int plane = kPlaneY;
+      const int number_loops = (superres_coefficients_[kPlaneTypeY] ==
+                                superres_coefficients_[kPlaneTypeUV])
+                                   ? kMaxPlanesMonochrome
+                                   : static_cast<int>(kNumPlaneTypes);
+      do {
+        dsp->super_res_coefficients(
+            SubsampledValue(upscaled_width_, subsampling_x_[plane]),
+            super_res_info_[plane].initial_subpixel_x,
+            super_res_info_[plane].step, superres_coefficients_[plane]);
+      } while (++plane < number_loops);
     }
   }
-  for (int plane = 0; plane < planes_; ++plane) {
+  int plane = kPlaneY;
+  do {
     loop_restoration_buffer_[plane] = frame_buffer_.data(plane);
     cdef_buffer_[plane] = frame_buffer_.data(plane);
     superres_buffer_[plane] = frame_buffer_.data(plane);
     source_buffer_[plane] = frame_buffer_.data(plane);
-  }
-  // In single threaded mode, we apply SuperRes without making a copy of the
-  // input row by writing the output to one row to the top (we refer to this
-  // process as "in place superres" in our code).
-  const bool in_place_superres = DoSuperRes() && thread_pool_ == nullptr;
-  if (DoCdef() || DoRestoration() || in_place_superres) {
-    for (int plane = 0; plane < planes_; ++plane) {
+  } while (++plane < planes_);
+  if (DoCdef() || DoRestoration() || DoSuperRes()) {
+    plane = kPlaneY;
+    const int pixel_size_log2 = pixel_size_log2_;
+    do {
       int horizontal_shift = 0;
       int vertical_shift = 0;
       if (DoRestoration() &&
           loop_restoration_.type[plane] != kLoopRestorationTypeNone) {
         horizontal_shift += frame_buffer_.alignment();
-        if (!DoCdef()) {
+        if (!DoCdef() && thread_pool_ == nullptr) {
           vertical_shift += kRestorationVerticalBorder;
         }
         superres_buffer_[plane] +=
             vertical_shift * frame_buffer_.stride(plane) +
-            horizontal_shift * pixel_size_;
+            (horizontal_shift << pixel_size_log2);
       }
-      if (in_place_superres) {
+      if (DoSuperRes()) {
         vertical_shift += kSuperResVerticalBorder;
       }
       cdef_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) +
-                             horizontal_shift * pixel_size_;
-      if (DoCdef()) {
+                             (horizontal_shift << pixel_size_log2);
+      if (DoCdef() && thread_pool_ == nullptr) {
         horizontal_shift += frame_buffer_.alignment();
         vertical_shift += kCdefBorder;
       }
       assert(horizontal_shift <= frame_buffer_.right_border(plane));
       assert(vertical_shift <= frame_buffer_.bottom_border(plane));
       source_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) +
-                               horizontal_shift * pixel_size_;
-    }
+                               (horizontal_shift << pixel_size_log2);
+    } while (++plane < planes_);
   }
 }
 
@@ -261,11 +280,11 @@
 
 void PostFilter::ExtendBordersForReferenceFrame() {
   if (frame_header_.refresh_frame_flags == 0) return;
-  for (int plane = kPlaneY; plane < planes_; ++plane) {
+  int plane = kPlaneY;
+  do {
     const int plane_width =
-        RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]);
-    const int plane_height =
-        RightShiftWithRounding(height_, subsampling_y_[plane]);
+        SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+    const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
     assert(frame_buffer_.left_border(plane) >= kMinLeftBorderPixels &&
            frame_buffer_.right_border(plane) >= kMinRightBorderPixels &&
            frame_buffer_.top_border(plane) >= kMinTopBorderPixels &&
@@ -283,45 +302,31 @@
         frame_buffer_.stride(plane), frame_buffer_.left_border(plane),
         frame_buffer_.right_border(plane), frame_buffer_.top_border(plane),
         frame_buffer_.bottom_border(plane));
-  }
+  } while (++plane < planes_);
 }
 
 void PostFilter::CopyDeblockedPixels(Plane plane, int row4x4) {
   const ptrdiff_t src_stride = frame_buffer_.stride(plane);
-  const uint8_t* const src =
-      GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0);
-  const ptrdiff_t dst_stride = deblock_buffer_.stride(plane);
+  const uint8_t* const src = GetSourceBuffer(plane, row4x4, 0);
   const int row_offset = DivideBy4(row4x4);
-  uint8_t* dst = deblock_buffer_.data(plane) + dst_stride * row_offset;
+  const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane);
+  uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * dst_stride;
   const int num_pixels = SubsampledValue(MultiplyBy4(frame_header_.columns4x4),
                                          subsampling_x_[plane]);
+  const int row_width = num_pixels << pixel_size_log2_;
   int last_valid_row = -1;
   const int plane_height =
       SubsampledValue(frame_header_.height, subsampling_y_[plane]);
-  for (int i = 0; i < 4; ++i) {
-    int row = kDeblockedRowsForLoopRestoration[subsampling_y_[plane]][i];
-    const int absolute_row =
-        (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
-    if (absolute_row >= plane_height) {
-      if (last_valid_row == -1) {
-        // We have run out of rows and there no valid row to copy. This will not
-        // be used by loop restoration, so we can simply break here. However,
-        // MSAN does not know that this is never used (since we sometimes apply
-        // superres to this row as well). So zero it out in case of MSAN.
-#if LIBGAV1_MSAN
-        if (DoSuperRes()) {
-          memset(dst, 0, num_pixels * pixel_size_);
-          dst += dst_stride;
-          continue;
-        }
-#endif
-        break;
-      }
+  int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+  const int absolute_row = (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+  for (int i = 0; i < 4; ++i, ++row) {
+    if (absolute_row + i >= plane_height) {
+      if (last_valid_row == -1) break;
       // If we run out of rows, copy the last valid row (mimics the bottom
       // border extension).
       row = last_valid_row;
     }
-    memcpy(dst, src + src_stride * row, num_pixels * pixel_size_);
+    memcpy(dst, src + row * src_stride, row_width);
     last_valid_row = row;
     dst += dst_stride;
   }
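
Editor's note: the rewritten loop above starts from a single row index (kLoopRestorationBorderRows) and replicates the last valid row whenever the 4-row window runs past the bottom of the plane. A standalone sketch of just that copy-and-clamp step; the names mirror the code above, everything else is illustrative:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    namespace {

    // Copies 4 border rows starting at |row| within |src|. Once |absolute_row + i|
    // runs past |plane_height|, the last valid row is copied again, mimicking the
    // bottom border extension.
    void CopyBorderRowsSketch(const uint8_t* src, std::ptrdiff_t src_stride,
                              uint8_t* dst, std::ptrdiff_t dst_stride, int row,
                              int absolute_row, int plane_height, int row_width) {
      int last_valid_row = -1;
      for (int i = 0; i < 4; ++i, ++row) {
        if (absolute_row + i >= plane_height) {
          if (last_valid_row == -1) break;  // Nothing valid to replicate.
          row = last_valid_row;
        }
        std::memcpy(dst, src + row * src_stride, row_width);
        last_valid_row = row;
        dst += dst_stride;
      }
    }

    }  // namespace

    int main() {
      // Toy 8-row plane: rows 6 and 7 are copied, then row 7 is replicated twice
      // because the plane ends.
      uint8_t src[8 * 8] = {};
      uint8_t dst[4 * 8] = {};
      CopyBorderRowsSketch(src, /*src_stride=*/8, dst, /*dst_stride=*/8,
                           /*row=*/6, /*absolute_row=*/6, /*plane_height=*/8,
                           /*row_width=*/8);
      return 0;
    }
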
@@ -334,20 +339,21 @@
   const int row_offset = (row4x4 == 0) ? 0 : 8;
   // Number of rows to be subtracted from the height described by sb4x4.
   const int height_offset = (row4x4 == 0) ? 8 : 0;
-  // If cdef is off, then loop restoration needs 2 extra rows for the bottom
-  // border in each plane.
-  const int extra_rows = (for_loop_restoration && !DoCdef()) ? 2 : 0;
-  for (int plane = 0; plane < planes_; ++plane) {
+  // If cdef is off and post filter multithreading is off, then loop restoration
+  // needs 2 extra rows for the bottom border in each plane.
+  const int extra_rows =
+      (for_loop_restoration && thread_pool_ == nullptr && !DoCdef()) ? 2 : 0;
+  int plane = kPlaneY;
+  do {
     const int plane_width =
-        RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]);
-    const int plane_height =
-        RightShiftWithRounding(height_, subsampling_y_[plane]);
+        SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+    const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
     const int row = (MultiplyBy4(row4x4) - row_offset) >> subsampling_y_[plane];
     assert(row >= 0);
     if (row >= plane_height) break;
     const int num_rows =
-        std::min(RightShiftWithRounding(MultiplyBy4(sb4x4) - height_offset,
-                                        subsampling_y_[plane]) +
+        std::min(SubsampledValue(MultiplyBy4(sb4x4) - height_offset,
+                                 subsampling_y_[plane]) +
                      extra_rows,
                  plane_height - row);
     // We only need to track the progress of the Y plane since the progress of
@@ -377,20 +383,182 @@
             : 0;
     ExtendFrameBoundary(start, plane_width, num_rows, stride, left_border,
                         right_border, top_border, bottom_border);
+  } while (++plane < planes_);
+}
+
+void PostFilter::SetupLoopRestorationBorder(const int row4x4) {
+  assert(row4x4 >= 0);
+  assert(!DoCdef());
+  assert(DoRestoration());
+  int plane = kPlaneY;
+  do {
+    if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+      continue;
+    }
+    const int row_offset = DivideBy4(row4x4);
+    const int num_pixels =
+        SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+    const int row_width = num_pixels << pixel_size_log2_;
+    const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
+    const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+    const int absolute_row =
+        (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+    const ptrdiff_t src_stride = frame_buffer_.stride(plane);
+    const uint8_t* src =
+        GetSuperResBuffer(static_cast<Plane>(plane), row4x4, 0) +
+        row * src_stride;
+    const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane);
+    uint8_t* dst =
+        loop_restoration_border_.data(plane) + row_offset * dst_stride;
+    for (int i = 0; i < 4; ++i) {
+      memcpy(dst, src, row_width);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+      if (bitdepth_ >= 10) {
+        ExtendLine<uint16_t>(dst, num_pixels, kRestorationHorizontalBorder,
+                             kRestorationHorizontalBorder);
+      } else  // NOLINT.
+#endif
+        ExtendLine<uint8_t>(dst, num_pixels, kRestorationHorizontalBorder,
+                            kRestorationHorizontalBorder);
+      // If we run out of rows, copy the last valid row (mimics the bottom
+      // border extension).
+      if (absolute_row + i < plane_height - 1) src += src_stride;
+      dst += dst_stride;
+    }
+  } while (++plane < planes_);
+}
+
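Editor's note: the bitdepth dispatch above leans on the ExtendLine<Pixel> helper, which, judging from the Memset-based loop it replaces in ExtendFrame, replicates the first and last pixels of a row into the left and right borders. A hedged sketch of that behaviour (the real helper lives in libgav1's common utilities and may differ in signature):

    #include <cstdint>

    namespace {

    // Illustrative only: replicate the edge pixels of one row into |left| pixels
    // before it and |right| pixels after it, as the removed ExtendFrame code did
    // with Memset(dst, src[0], left) and
    // Memset(dst + left + width, src[width - 1], right).
    template <typename Pixel>
    void ExtendLineSketch(Pixel* const line_start, const int width,
                          const int left, const int right) {
      for (int i = 1; i <= left; ++i) line_start[-i] = line_start[0];
      for (int i = 0; i < right; ++i) {
        line_start[width + i] = line_start[width - 1];
      }
    }

    }  // namespace

    int main() {
      // Two border pixels on each side of a 4-pixel row.
      uint16_t row[2 + 4 + 2] = {0, 0, 7, 8, 9, 10, 0, 0};
      ExtendLineSketch<uint16_t>(row + 2, /*width=*/4, /*left=*/2, /*right=*/2);
      // row is now {7, 7, 7, 8, 9, 10, 10, 10}.
      return 0;
    }
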
+void PostFilter::SetupLoopRestorationBorder(int row4x4_start, int sb4x4) {
+  assert(row4x4_start >= 0);
+  assert(DoCdef());
+  assert(DoRestoration());
+  for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) {
+    const int row4x4 = row4x4_start + sb_y;
+    const int row_offset_start = DivideBy4(row4x4);
+    const std::array<uint8_t*, kMaxPlanes> dst = {
+        loop_restoration_border_.data(kPlaneY) +
+            row_offset_start * loop_restoration_border_.stride(kPlaneY),
+        loop_restoration_border_.data(kPlaneU) +
+            row_offset_start * loop_restoration_border_.stride(kPlaneU),
+        loop_restoration_border_.data(kPlaneV) +
+            row_offset_start * loop_restoration_border_.stride(kPlaneV)};
+    // If SuperRes is enabled, apply it to the rows being copied, writing the
+    // output directly into |loop_restoration_border_|. Otherwise, simply copy
+    // the rows.
+    if (DoSuperRes()) {
+      std::array<uint8_t*, kMaxPlanes> src;
+      std::array<int, kMaxPlanes> rows;
+      int plane = kPlaneY;
+      do {
+        if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+          rows[plane] = 0;
+          continue;
+        }
+        const int plane_height =
+            SubsampledValue(frame_header_.height, subsampling_y_[plane]);
+        const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+        const int absolute_row =
+            (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+        src[plane] = GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+                     row * frame_buffer_.stride(plane);
+        rows[plane] = Clip3(plane_height - absolute_row, 0, 4);
+      } while (++plane < planes_);
+      ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst,
+                    /*dst_is_loop_restoration_border=*/true);
+      // If we run out of rows, copy the last valid row (mimics the bottom
+      // border extension).
+      plane = kPlaneY;
+      do {
+        if (rows[plane] == 0 || rows[plane] >= 4) continue;
+        const ptrdiff_t stride = loop_restoration_border_.stride(plane);
+        uint8_t* dst_line = dst[plane] + rows[plane] * stride;
+        const uint8_t* const src_line = dst_line - stride;
+        const int upscaled_width = super_res_info_[plane].upscaled_width
+                                   << pixel_size_log2_;
+        for (int i = rows[plane]; i < 4; ++i) {
+          memcpy(dst_line, src_line, upscaled_width);
+          dst_line += stride;
+        }
+      } while (++plane < planes_);
+    } else {
+      int plane = kPlaneY;
+      do {
+        CopyDeblockedPixels(static_cast<Plane>(plane), row4x4);
+      } while (++plane < planes_);
+    }
+    // Extend the left and right boundaries needed for loop restoration.
+    int plane = kPlaneY;
+    do {
+      if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+        continue;
+      }
+      uint8_t* dst_line = dst[plane];
+      const int plane_width =
+          SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+      for (int i = 0; i < 4; ++i) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+        if (bitdepth_ >= 10) {
+          ExtendLine<uint16_t>(dst_line, plane_width,
+                               kRestorationHorizontalBorder,
+                               kRestorationHorizontalBorder);
+        } else  // NOLINT.
+#endif
+        {
+          ExtendLine<uint8_t>(dst_line, plane_width,
+                              kRestorationHorizontalBorder,
+                              kRestorationHorizontalBorder);
+        }
+        dst_line += loop_restoration_border_.stride(plane);
+      }
+    } while (++plane < planes_);
   }
 }
 
+void PostFilter::RunJobs(WorkerFunction worker) {
+  std::atomic<int> row4x4(0);
+  const int num_workers = thread_pool_->num_threads();
+  BlockingCounter pending_workers(num_workers);
+  for (int i = 0; i < num_workers; ++i) {
+    thread_pool_->Schedule([this, &row4x4, &pending_workers, worker]() {
+      (this->*worker)(&row4x4);
+      pending_workers.Decrement();
+    });
+  }
+  // Run the jobs on the current thread.
+  (this->*worker)(&row4x4);
+  // Wait for the threadpool jobs to finish.
+  pending_workers.Wait();
+}
+
 void PostFilter::ApplyFilteringThreaded() {
-  if (DoDeblock()) ApplyDeblockFilterThreaded();
+  if (DoDeblock()) {
+    RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>);
+    RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>);
+  }
   if (DoCdef() && DoRestoration()) {
     for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
          row4x4 += kNum4x4InLoopFilterUnit) {
-      SetupDeblockBuffer(row4x4, kNum4x4InLoopFilterUnit);
+      SetupLoopRestorationBorder(row4x4, kNum4x4InLoopFilterUnit);
     }
   }
-  if (DoCdef()) ApplyCdef();
+  if (DoCdef()) {
+    for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
+         row4x4 += kNum4x4InLoopFilterUnit) {
+      SetupCdefBorder(row4x4);
+    }
+    RunJobs(&PostFilter::ApplyCdefWorker);
+  }
   if (DoSuperRes()) ApplySuperResThreaded();
-  if (DoRestoration()) ApplyLoopRestoration();
+  if (DoRestoration()) {
+    if (!DoCdef()) {
+      int row4x4 = 0;
+      do {
+        SetupLoopRestorationBorder(row4x4);
+        row4x4 += kNum4x4InLoopFilterUnit;
+      } while (row4x4 < frame_header_.rows4x4);
+    }
+    RunJobs(&PostFilter::ApplyLoopRestorationWorker);
+  }
   ExtendBordersForReferenceFrame();
 }
 
@@ -402,7 +570,7 @@
     ApplyDeblockFilterForOneSuperBlockRow(row4x4, sb4x4);
   }
   if (DoRestoration() && DoCdef()) {
-    SetupDeblockBuffer(row4x4, sb4x4);
+    SetupLoopRestorationBorder(row4x4, sb4x4);
   }
   if (DoCdef()) {
     ApplyCdefForOneSuperBlockRow(row4x4, sb4x4, is_last_row);
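The RunJobs() helper added above fans a PostFilter worker method out over the
thread pool while the calling thread also participates, and a BlockingCounter
gates completion; each worker claims superblock rows from a shared atomic
counter. A standalone sketch of that pattern follows, using only the standard
library; ProcessRow, kRowsPerJob and the ad-hoc std::thread pool are
illustrative stand-ins, not libgav1 APIs.

#include <atomic>
#include <thread>
#include <vector>

constexpr int kRowsPerJob = 16;  // Plays the role of kNum4x4InLoopFilterUnit.

void ProcessRow(int /*row4x4*/) { /* filter one superblock row */ }

void RunJobs(int rows4x4, int num_workers) {
  std::atomic<int> next_row(0);
  auto worker = [&]() {
    int row;
    // Each thread atomically claims the next unprocessed superblock row.
    while ((row = next_row.fetch_add(kRowsPerJob)) < rows4x4) {
      ProcessRow(row);
    }
  };
  std::vector<std::thread> threads;
  threads.reserve(num_workers);
  for (int i = 0; i < num_workers; ++i) threads.emplace_back(worker);
  worker();  // The calling thread runs jobs too, as in PostFilter::RunJobs().
  for (auto& t : threads) t.join();  // Stands in for BlockingCounter::Wait().
}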
diff --git a/libgav1/src/post_filter/super_res.cc b/libgav1/src/post_filter/super_res.cc
index f6594f4..554e537 100644
--- a/libgav1/src/post_filter/super_res.cc
+++ b/libgav1/src/post_filter/super_res.cc
@@ -15,218 +15,197 @@
 #include "src/utils/blocking_counter.h"
 
 namespace libgav1 {
-namespace {
 
-template <typename Pixel>
-void ExtendLine(uint8_t* const line_start, const int width, const int left,
-                const int right) {
-  auto* const start = reinterpret_cast<Pixel*>(line_start);
-  const Pixel* src = start;
-  Pixel* dst = start - left;
-  // Copy to left and right borders.
-  Memset(dst, src[0], left);
-  Memset(dst + (left + width), src[width - 1], right);
-}
-
-}  // namespace
-
-template <bool in_place>
-void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& buffers,
-                               const std::array<int, kMaxPlanes>& strides,
+void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src,
                                const std::array<int, kMaxPlanes>& rows,
-                               size_t line_buffer_offset) {
-  // Only used when |in_place| == false.
-  uint8_t* const line_buffer_start = superres_line_buffer_ +
-                                     line_buffer_offset +
-                                     kSuperResHorizontalBorder * pixel_size_;
-  for (int plane = kPlaneY; plane < planes_; ++plane) {
-    const int8_t subsampling_x = subsampling_x_[plane];
+                               const int line_buffer_row,
+                               const std::array<uint8_t*, kMaxPlanes>& dst,
+                               bool dst_is_loop_restoration_border /*=false*/) {
+  int plane = kPlaneY;
+  do {
     const int plane_width =
-        MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
-    uint8_t* input = buffers[plane];
-    const uint32_t input_stride = strides[plane];
+        MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
 #if LIBGAV1_MAX_BITDEPTH >= 10
     if (bitdepth_ >= 10) {
-      for (int y = 0; y < rows[plane]; ++y, input += input_stride) {
-        if (!in_place) {
-          memcpy(line_buffer_start, input, plane_width * sizeof(uint16_t));
-        }
-        ExtendLine<uint16_t>(in_place ? input : line_buffer_start, plane_width,
-                             kSuperResHorizontalBorder,
-                             kSuperResHorizontalBorder);
-        dsp_.super_res_row(in_place ? input : line_buffer_start,
-                           super_res_info_[plane].upscaled_width,
-                           super_res_info_[plane].initial_subpixel_x,
-                           super_res_info_[plane].step,
-                           input - (in_place ? input_stride : 0));
+      auto* input = reinterpret_cast<uint16_t*>(src[plane]);
+      auto* output = reinterpret_cast<uint16_t*>(dst[plane]);
+      const ptrdiff_t input_stride =
+          frame_buffer_.stride(plane) / sizeof(uint16_t);
+      const ptrdiff_t output_stride =
+          (dst_is_loop_restoration_border
+               ? loop_restoration_border_.stride(plane)
+               : frame_buffer_.stride(plane)) /
+          sizeof(uint16_t);
+      if (rows[plane] > 0) {
+        dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+                       input, input_stride, rows[plane], plane_width,
+                       super_res_info_[plane].upscaled_width,
+                       super_res_info_[plane].initial_subpixel_x,
+                       super_res_info_[plane].step, output, output_stride);
+      }
+      // In the multi-threaded case, the |superres_line_buffer_| holds the last
+      // input row. Apply SuperRes for that row.
+      if (line_buffer_row >= 0) {
+        auto* const line_buffer_start =
+            reinterpret_cast<uint16_t*>(superres_line_buffer_.data(plane)) +
+            line_buffer_row * superres_line_buffer_.stride(plane) /
+                sizeof(uint16_t) +
+            kSuperResHorizontalBorder;
+        dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+                       line_buffer_start, /*source_stride=*/0,
+                       /*height=*/1, plane_width,
+                       super_res_info_[plane].upscaled_width,
+                       super_res_info_[plane].initial_subpixel_x,
+                       super_res_info_[plane].step,
+                       output + rows[plane] * output_stride, /*dest_stride=*/0);
       }
       continue;
     }
 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
-    for (int y = 0; y < rows[plane]; ++y, input += input_stride) {
-      if (!in_place) {
-        memcpy(line_buffer_start, input, plane_width);
-      }
-      ExtendLine<uint8_t>(in_place ? input : line_buffer_start, plane_width,
-                          kSuperResHorizontalBorder, kSuperResHorizontalBorder);
-      dsp_.super_res_row(in_place ? input : line_buffer_start,
-                         super_res_info_[plane].upscaled_width,
-                         super_res_info_[plane].initial_subpixel_x,
-                         super_res_info_[plane].step,
-                         input - (in_place ? input_stride : 0));
+    uint8_t* input = src[plane];
+    uint8_t* output = dst[plane];
+    const ptrdiff_t input_stride = frame_buffer_.stride(plane);
+    const ptrdiff_t output_stride = dst_is_loop_restoration_border
+                                        ? loop_restoration_border_.stride(plane)
+                                        : frame_buffer_.stride(plane);
+    if (rows[plane] > 0) {
+      dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+                     input, input_stride, rows[plane], plane_width,
+                     super_res_info_[plane].upscaled_width,
+                     super_res_info_[plane].initial_subpixel_x,
+                     super_res_info_[plane].step, output, output_stride);
     }
-  }
+    // In the multi-threaded case, the |superres_line_buffer_| holds the last
+    // input row. Apply SuperRes for that row.
+    if (line_buffer_row >= 0) {
+      uint8_t* const line_buffer_start =
+          superres_line_buffer_.data(plane) +
+          line_buffer_row * superres_line_buffer_.stride(plane) +
+          kSuperResHorizontalBorder;
+      dsp_.super_res(
+          superres_coefficients_[static_cast<int>(plane != 0)],
+          line_buffer_start, /*source_stride=*/0,
+          /*height=*/1, plane_width, super_res_info_[plane].upscaled_width,
+          super_res_info_[plane].initial_subpixel_x,
+          super_res_info_[plane].step, output + rows[plane] * output_stride,
+          /*dest_stride=*/0);
+    }
+  } while (++plane < planes_);
 }
 
-// Used by post_filter_test.cc.
-template void PostFilter::ApplySuperRes<false>(
-    const std::array<uint8_t*, kMaxPlanes>& buffers,
-    const std::array<int, kMaxPlanes>& strides,
-    const std::array<int, kMaxPlanes>& rows, size_t line_buffer_offset);
-
 void PostFilter::ApplySuperResForOneSuperBlockRow(int row4x4_start, int sb4x4,
                                                   bool is_last_row) {
   assert(row4x4_start >= 0);
   assert(DoSuperRes());
   // If not doing cdef, then LR needs two rows of border with superres applied.
   const int num_rows_extra = (DoCdef() || !DoRestoration()) ? 0 : 2;
-  std::array<uint8_t*, kMaxPlanes> buffers;
-  std::array<int, kMaxPlanes> strides;
+  std::array<uint8_t*, kMaxPlanes> src;
+  std::array<uint8_t*, kMaxPlanes> dst;
   std::array<int, kMaxPlanes> rows;
-  // Apply superres for the last 8-num_rows_extra rows of the previous
-  // superblock.
-  if (row4x4_start > 0) {
-    const int row4x4 = row4x4_start - 2;
-    for (int plane = 0; plane < planes_; ++plane) {
-      const int row =
-          (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + num_rows_extra;
-      const ptrdiff_t row_offset = row * frame_buffer_.stride(plane);
-      buffers[plane] = cdef_buffer_[plane] + row_offset;
-      strides[plane] = frame_buffer_.stride(plane);
-      // Note that the |num_rows_extra| subtraction is done after the value is
-      // subsampled since we always need to work on |num_rows_extra| extra rows
-      // irrespective of the plane subsampling.
-      rows[plane] = (8 >> subsampling_y_[plane]) - num_rows_extra;
-    }
-    ApplySuperRes<true>(buffers, strides, rows, /*line_buffer_offset=*/0);
-  }
-  // Apply superres for the current superblock row (except for the last
-  // 8-num_rows_extra rows).
   const int num_rows4x4 =
       std::min(sb4x4, frame_header_.rows4x4 - row4x4_start) -
       (is_last_row ? 0 : 2);
-  for (int plane = 0; plane < planes_; ++plane) {
-    const ptrdiff_t row_offset =
-        (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) *
-        frame_buffer_.stride(plane);
-    buffers[plane] = cdef_buffer_[plane] + row_offset;
-    strides[plane] = frame_buffer_.stride(plane);
-    // Note that the |num_rows_extra| subtraction is done after the value is
-    // subsampled since we always need to work on |num_rows_extra| extra rows
-    // irrespective of the plane subsampling.
-    rows[plane] = (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
-                  (is_last_row ? 0 : num_rows_extra);
+  if (row4x4_start > 0) {
+    const int row4x4 = row4x4_start - 2;
+    int plane = kPlaneY;
+    do {
+      const int row =
+          (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + num_rows_extra;
+      const ptrdiff_t row_offset = row * frame_buffer_.stride(plane);
+      src[plane] = cdef_buffer_[plane] + row_offset;
+      dst[plane] = superres_buffer_[plane] + row_offset;
+      // Note that the |num_rows_extra| subtraction is done after the value is
+      // subsampled since we always need to work on |num_rows_extra| extra rows
+      // irrespective of the plane subsampling.
+      // Apply superres for the last 8-|num_rows_extra| rows of the previous
+      // superblock.
+      rows[plane] = (8 >> subsampling_y_[plane]) - num_rows_extra;
+      // Apply superres for the current superblock row (except for the last
+      // 8-|num_rows_extra| rows).
+      rows[plane] += (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
+                     (is_last_row ? 0 : num_rows_extra);
+    } while (++plane < planes_);
+  } else {
+    // Apply superres for the current superblock row (except for the last
+    // 8-|num_rows_extra| rows).
+    int plane = kPlaneY;
+    do {
+      const ptrdiff_t row_offset =
+          (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) *
+          frame_buffer_.stride(plane);
+      src[plane] = cdef_buffer_[plane] + row_offset;
+      dst[plane] = superres_buffer_[plane] + row_offset;
+      // Note that the |num_rows_extra| addition is done after the value is
+      // subsampled since we always need to work on |num_rows_extra| extra rows
+      // irrespective of the plane subsampling.
+      rows[plane] = (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
+                    (is_last_row ? 0 : num_rows_extra);
+    } while (++plane < planes_);
   }
-  ApplySuperRes<true>(buffers, strides, rows, /*line_buffer_offset=*/0);
+  ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst);
 }
 
 void PostFilter::ApplySuperResThreaded() {
-  const int num_threads = thread_pool_->num_threads() + 1;
-  // The number of rows4x4 that will be processed by each thread in the thread
-  // pool (other than the current thread).
-  const int thread_pool_rows4x4 = frame_header_.rows4x4 / num_threads;
-  // For the current thread, we round up to process all the remaining rows so
-  // that the current thread's job will potentially run the longest.
-  const int current_thread_rows4x4 =
-      frame_header_.rows4x4 - (thread_pool_rows4x4 * (num_threads - 1));
-  // The size of the line buffer required by each thread. In the multi-threaded
-  // case we are guaranteed to have a line buffer which can store |num_threads|
-  // rows at the same time.
-  const size_t line_buffer_size =
-      (MultiplyBy4(frame_header_.columns4x4) +
-       MultiplyBy2(kSuperResHorizontalBorder) + kSuperResHorizontalPadding) *
-      pixel_size_;
-  size_t line_buffer_offset = 0;
+  int num_threads = thread_pool_->num_threads() + 1;
+  // The number of rows that will be processed by each thread in the thread
+  // pool (other than the current thread).
+  int thread_pool_rows = height_ / num_threads;
+  thread_pool_rows = std::max(thread_pool_rows, 1);
+  // Make rows of Y plane even when there is subsampling for the other planes.
+  if ((thread_pool_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
+    ++thread_pool_rows;
+  }
+  // Adjust the number of threads to what we really need.
+  num_threads = Clip3(height_ / thread_pool_rows, 1, num_threads);
+  // For the current thread, we round up to process all the remaining rows.
+  int current_thread_rows = height_ - thread_pool_rows * (num_threads - 1);
+  // Make rows of Y plane even when there is subsampling for the other planes.
+  if ((current_thread_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
+    ++current_thread_rows;
+  }
+  assert(current_thread_rows > 0);
   BlockingCounter pending_workers(num_threads - 1);
-  for (int i = 0, row4x4_start = 0; i < num_threads; ++i,
-           row4x4_start += thread_pool_rows4x4,
-           line_buffer_offset += line_buffer_size) {
-    std::array<uint8_t*, kMaxPlanes> buffers;
-    std::array<int, kMaxPlanes> strides;
+  for (int line_buffer_row = 0, row_start = 0; line_buffer_row < num_threads;
+       ++line_buffer_row, row_start += thread_pool_rows) {
+    std::array<uint8_t*, kMaxPlanes> src;
+    std::array<uint8_t*, kMaxPlanes> dst;
     std::array<int, kMaxPlanes> rows;
-    for (int plane = 0; plane < planes_; ++plane) {
-      strides[plane] = frame_buffer_.stride(plane);
-      buffers[plane] =
-          GetBufferOffset(cdef_buffer_[plane], strides[plane],
-                          static_cast<Plane>(plane), row4x4_start, 0);
-      if (i < num_threads - 1) {
-        rows[plane] = MultiplyBy4(thread_pool_rows4x4) >> subsampling_y_[plane];
-      } else {
-        rows[plane] =
-            MultiplyBy4(current_thread_rows4x4) >> subsampling_y_[plane];
-      }
-    }
-    if (i < num_threads - 1) {
-      thread_pool_->Schedule([this, buffers, strides, rows, line_buffer_offset,
-                              &pending_workers]() {
-        ApplySuperRes<false>(buffers, strides, rows, line_buffer_offset);
-        pending_workers.Decrement();
-      });
+    int plane = kPlaneY;
+    const int pixel_size_log2 = pixel_size_log2_;
+    do {
+      src[plane] =
+          GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+                          static_cast<Plane>(plane), row_start, 0);
+      dst[plane] =
+          GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane),
+                          static_cast<Plane>(plane), row_start, 0);
+      rows[plane] =
+          (((line_buffer_row < num_threads - 1) ? thread_pool_rows
+                                                : current_thread_rows) >>
+           subsampling_y_[plane]) -
+          1;
+      const int plane_width =
+          MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
+      uint8_t* const input =
+          src[plane] + rows[plane] * frame_buffer_.stride(plane);
+      uint8_t* const line_buffer_start =
+          superres_line_buffer_.data(plane) +
+          line_buffer_row * superres_line_buffer_.stride(plane) +
+          (kSuperResHorizontalBorder << pixel_size_log2);
+      memcpy(line_buffer_start, input, plane_width << pixel_size_log2);
+    } while (++plane < planes_);
+    if (line_buffer_row < num_threads - 1) {
+      thread_pool_->Schedule(
+          [this, src, rows, line_buffer_row, dst, &pending_workers]() {
+            ApplySuperRes(src, rows, line_buffer_row, dst);
+            pending_workers.Decrement();
+          });
     } else {
-      ApplySuperRes<false>(buffers, strides, rows, line_buffer_offset);
+      ApplySuperRes(src, rows, line_buffer_row, dst);
     }
   }
   // Wait for the threadpool jobs to finish.
   pending_workers.Wait();
 }
 
-// This function lives in this file so that it has access to ExtendLine<>.
-void PostFilter::SetupDeblockBuffer(int row4x4_start, int sb4x4) {
-  assert(row4x4_start >= 0);
-  assert(DoCdef());
-  assert(DoRestoration());
-  for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) {
-    const int row4x4 = row4x4_start + sb_y;
-    for (int plane = 0; plane < planes_; ++plane) {
-      CopyDeblockedPixels(static_cast<Plane>(plane), row4x4);
-    }
-    const int row_offset_start = DivideBy4(row4x4);
-    if (DoSuperRes()) {
-      std::array<uint8_t*, kMaxPlanes> buffers = {
-          deblock_buffer_.data(kPlaneY) +
-              row_offset_start * deblock_buffer_.stride(kPlaneY),
-          deblock_buffer_.data(kPlaneU) +
-              row_offset_start * deblock_buffer_.stride(kPlaneU),
-          deblock_buffer_.data(kPlaneV) +
-              row_offset_start * deblock_buffer_.stride(kPlaneV)};
-      std::array<int, kMaxPlanes> strides = {deblock_buffer_.stride(kPlaneY),
-                                             deblock_buffer_.stride(kPlaneU),
-                                             deblock_buffer_.stride(kPlaneV)};
-      std::array<int, kMaxPlanes> rows = {4, 4, 4};
-      ApplySuperRes<false>(buffers, strides, rows,
-                           /*line_buffer_offset=*/0);
-    }
-    // Extend the left and right boundaries needed for loop restoration.
-    for (int plane = 0; plane < planes_; ++plane) {
-      uint8_t* src = deblock_buffer_.data(plane) +
-                     row_offset_start * deblock_buffer_.stride(plane);
-      const int plane_width =
-          RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]);
-      for (int i = 0; i < 4; ++i) {
-#if LIBGAV1_MAX_BITDEPTH >= 10
-        if (bitdepth_ >= 10) {
-          ExtendLine<uint16_t>(src, plane_width, kRestorationHorizontalBorder,
-                               kRestorationHorizontalBorder);
-        } else  // NOLINT.
-#endif
-        {
-          ExtendLine<uint8_t>(src, plane_width, kRestorationHorizontalBorder,
-                              kRestorationHorizontalBorder);
-        }
-        src += deblock_buffer_.stride(plane);
-      }
-    }
-  }
-}
-
 }  // namespace libgav1
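ApplySuperResThreaded() now splits whole pixel rows (rather than rows4x4)
across the pool, rounds each slice up to an even Y-row count when chroma is
vertically subsampled, and drops pool threads that would receive no work; each
job also copies its last input row into superres_line_buffer_ and passes its
line_buffer_row so that row is upscaled from the saved copy, presumably so a
neighbouring job's output cannot clobber it first. Below is a minimal sketch
of just the row-splitting arithmetic, with hypothetical names (SuperResSplit,
SplitRows) rather than libgav1 APIs.

#include <algorithm>

struct SuperResSplit {
  int num_threads;          // Possibly reduced from the requested count.
  int thread_pool_rows;     // Rows handled by each pool thread.
  int current_thread_rows;  // Rows handled by the calling thread.
};

SuperResSplit SplitRows(int height, int num_threads,
                        bool chroma_subsampled_y) {
  SuperResSplit split;
  split.thread_pool_rows = std::max(height / num_threads, 1);
  // Keep the Y row count even so the subsampled chroma rows stay aligned.
  if ((split.thread_pool_rows & 1) != 0 && chroma_subsampled_y) {
    ++split.thread_pool_rows;
  }
  // Drop pool threads that would have nothing to do.
  split.num_threads =
      std::min(std::max(height / split.thread_pool_rows, 1), num_threads);
  // The calling thread takes whatever remains.
  split.current_thread_rows =
      height - split.thread_pool_rows * (split.num_threads - 1);
  if ((split.current_thread_rows & 1) != 0 && chroma_subsampled_y) {
    ++split.current_thread_rows;
  }
  return split;
}
// Example: height=1080 with 4 threads and 4:2:0 subsampling gives 270 rows
// per pool thread and 270 rows for the calling thread.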
diff --git a/libgav1/src/quantizer.cc b/libgav1/src/quantizer.cc
index b26024d..cd720d6 100644
--- a/libgav1/src/quantizer.cc
+++ b/libgav1/src/quantizer.cc
@@ -18,6 +18,7 @@
 #include <cstdint>
 
 #include "src/utils/common.h"
+#include "src/utils/constants.h"
 
 #if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10
 #error LIBGAV1_MAX_BITDEPTH must be 8 or 10
@@ -26,6 +27,9 @@
 namespace libgav1 {
 namespace {
 
+// Pull all the quantizer table constants into the anonymous namespace.
+#include "src/quantizer_tables.inc"
+
 // Format the kDcLookup and kAcLookup arrays manually for easier comparison
 // with the Dc_Qlookup and Ac_Qlookup arrays in Section 7.12.2.
 
@@ -141,8 +145,99 @@
 };
 // clang-format on
 
+void Transpose(uint8_t* const dst, const uint8_t* const src, int src_width,
+               int src_height) {
+  const int dst_width = src_height;
+  const int dst_height = src_width;
+  Array2DView<const uint8_t> source(src_height, src_width, src);
+  Array2DView<uint8_t> dest(dst_height, dst_width, dst);
+  for (int y = 0; y < dst_height; ++y) {
+    for (int x = 0; x < dst_width; ++x) {
+      dest[y][x] = source[x][y];
+    }
+  }
+}
+
+// Copies the lower triangle and fills the upper triangle of |dst| using |src|
+// as the source.
+void FillUpperTriangle(uint8_t* dst, const uint8_t* src, int size) {
+  Array2DView<uint8_t> dest(size, size, dst);
+  int k = 0;
+  for (int y = 0; y < size; ++y) {
+    for (int x = 0; x <= y; ++x) {
+      dest[y][x] = dest[x][y] = src[k++];
+    }
+  }
+}
+
 }  // namespace
 
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix_ptr) {
+  for (int level = 0; level < kNumQuantizerLevelsForQuantizerMatrix; ++level) {
+    for (int plane_type = kPlaneTypeY; plane_type < kNumPlaneTypes;
+         ++plane_type) {
+      auto& quantizer_matrix = (*quantizer_matrix_ptr)[level][plane_type];
+      // Notes about how these matrices are populated:
+      // * For square transforms, we store only the lower left triangle (it is
+      // symmetric about the main diagonal), so when populating the matrix, we
+      // will have to fill in the upper right triangle.
+      // * For rectangular transforms, the matrices for swapped width and
+      // height are transposes of each other, so we populate with memcpy when
+      // w < h and by transposing when w > h.
+      // * There is a special case for 16x16 where the matrix is the same as
+      // 32x32 with some offsets.
+      // * We use the "adjusted transform size" when using these matrices, so we
+      // won't have to populate them for transform sizes with one of the
+      // dimensions equal to 64.
+      for (int tx_size = 0; tx_size < kNumTransformSizes; ++tx_size) {
+        if (kTransformWidth[tx_size] == 64 || kTransformHeight[tx_size] == 64) {
+          continue;
+        }
+        const int size = kTransformWidth[tx_size] * kTransformHeight[tx_size];
+        if (!quantizer_matrix[tx_size].Resize(size)) {
+          return false;
+        }
+      }
+#define QUANTIZER_MEMCPY(W, H)                            \
+  memcpy(quantizer_matrix[kTransformSize##W##x##H].get(), \
+         kQuantizerMatrix##W##x##H[level][plane_type], (W) * (H))
+#define QUANTIZER_TRANSPOSE(W, H)                            \
+  Transpose(quantizer_matrix[kTransformSize##W##x##H].get(), \
+            kQuantizerMatrix##H##x##W[level][plane_type], H, W)
+#define QUANTIZER_FILL_UPPER_TRIANGLE(SIZE)                                \
+  FillUpperTriangle(quantizer_matrix[kTransformSize##SIZE##x##SIZE].get(), \
+                    kQuantizerMatrix##SIZE##x##SIZE[level][plane_type], SIZE)
+      QUANTIZER_FILL_UPPER_TRIANGLE(4);   // 4x4
+      QUANTIZER_MEMCPY(4, 8);             // 4x8
+      QUANTIZER_MEMCPY(4, 16);            // 4x16
+      QUANTIZER_TRANSPOSE(8, 4);          // 8x4
+      QUANTIZER_FILL_UPPER_TRIANGLE(8);   // 8x8
+      QUANTIZER_MEMCPY(8, 16);            // 8x16
+      QUANTIZER_MEMCPY(8, 32);            // 8x32
+      QUANTIZER_TRANSPOSE(16, 4);         // 16x4
+      QUANTIZER_TRANSPOSE(16, 8);         // 16x8
+      QUANTIZER_MEMCPY(16, 32);           // 16x32
+      QUANTIZER_TRANSPOSE(32, 8);         // 32x8
+      QUANTIZER_TRANSPOSE(32, 16);        // 32x16
+      QUANTIZER_FILL_UPPER_TRIANGLE(32);  // 32x32
+      // 16x16.
+      Array2DView<uint8_t> dst16x16(
+          16, 16, quantizer_matrix[kTransformSize16x16].get());
+      Array2DView<const uint8_t> src32x32(
+          32, 32, quantizer_matrix[kTransformSize32x32].get());
+      for (int y = 0; y < 16; ++y) {
+        for (int x = 0; x < 16; ++x) {
+          dst16x16[y][x] = src32x32[MultiplyBy2(y)][MultiplyBy2(x)];
+        }
+      }
+#undef QUANTIZER_FILL_UPPER_TRIANGLE
+#undef QUANTIZER_TRANSPOSE
+#undef QUANTIZER_MEMCPY
+    }
+  }
+  return true;
+}
+
 int GetQIndex(const Segmentation& segmentation, int index, int base_qindex) {
   if (segmentation.FeatureActive(index, kSegmentFeatureQuantizer)) {
     const int segment_qindex =
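InitializeQuantizerMatrix() above rebuilds full matrices from compact tables:
square sizes are stored as a packed lower triangle and expanded by
FillUpperTriangle(), rectangular sizes reuse their transposed counterparts,
and 16x16 is sampled from 32x32. A tiny self-contained demonstration of the
triangle expansion follows, using plain arrays instead of Array2DView; the
packed input is derived from the old full 4x4 luma matrix removed later in
this diff, so the expansion reproduces that matrix exactly.

#include <cstdint>
#include <cstdio>

// Same loop structure as the FillUpperTriangle() added in quantizer.cc, but
// over a flat row-major array.
void FillUpperTriangle(uint8_t* dst, const uint8_t* src, int size) {
  int k = 0;
  for (int y = 0; y < size; ++y) {
    for (int x = 0; x <= y; ++x) {
      dst[y * size + x] = dst[x * size + y] = src[k++];
    }
  }
}

int main() {
  // Packed lower triangle of the level-0 luma 4x4 matrix (10 values).
  const uint8_t packed[10] = {32, 43, 67, 73, 94, 137, 97, 110, 150, 200};
  uint8_t full[16];
  FillUpperTriangle(full, packed, 4);
  // Prints the symmetric 4x4 matrix:
  //   32  43  73  97
  //   43  67  94 110
  //   73  94 137 150
  //   97 110 150 200
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 4; ++x) std::printf("%4d", full[y * 4 + x]);
    std::printf("\n");
  }
  return 0;
}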
diff --git a/libgav1/src/quantizer.h b/libgav1/src/quantizer.h
index e555115..00c53ab 100644
--- a/libgav1/src/quantizer.h
+++ b/libgav1/src/quantizer.h
@@ -20,11 +20,17 @@
 #include <cstdint>
 
 #include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
 #include "src/utils/segmentation.h"
 #include "src/utils/types.h"
 
 namespace libgav1 {
 
+using QuantizerMatrix = std::array<
+    std::array<std::array<DynamicBuffer<uint8_t>, kNumTransformSizes>,
+               kNumPlaneTypes>,
+    kNumQuantizerLevelsForQuantizerMatrix>;
+
 // Implements the dequantization functions of Section 7.12.2.
 class Quantizer {
  public:
@@ -48,6 +54,9 @@
   const int16_t* ac_lookup_;
 };
 
+// Initialize the quantizer matrix.
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix);
+
 // Get the quantizer index for the |index|th segment.
 //
 // This function has two use cases. What should be passed as the |base_qindex|
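For reference, the new QuantizerMatrix alias nests quantizer level, plane type
and transform size, with each leaf a DynamicBuffer sized by
InitializeQuantizerMatrix(). A minimal lookup sketch follows, assuming the
existing libgav1 enums and constants (PlaneType, TransformSize,
kTransformWidth/kTransformHeight, kNumQuantizerLevelsForQuantizerMatrix); the
GetWeights() helper itself is hypothetical and not part of this change.

#include <cstdint>

#include "src/quantizer.h"
#include "src/utils/constants.h"

namespace libgav1 {

// Hypothetical helper: returns the weight table for one (level, plane type,
// transform size) triple, or nullptr when no quantizer matrix applies.
inline const uint8_t* GetWeights(const QuantizerMatrix& matrices, int level,
                                 PlaneType plane_type, TransformSize tx_size) {
  // Matrices exist only for level < kNumQuantizerLevelsForQuantizerMatrix and
  // for transform sizes whose dimensions are both < 64
  // (InitializeQuantizerMatrix() skips the rest).
  if (level >= kNumQuantizerLevelsForQuantizerMatrix ||
      kTransformWidth[tx_size] == 64 || kTransformHeight[tx_size] == 64) {
    return nullptr;
  }
  return matrices[level][plane_type][tx_size].get();
}

}  // namespace libgav1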
diff --git a/libgav1/src/quantizer_tables.inc b/libgav1/src/quantizer_tables.inc
index b5a89a8..34342c4 100644
--- a/libgav1/src/quantizer_tables.inc
+++ b/libgav1/src/quantizer_tables.inc
@@ -15,6729 +15,3066 @@
 // This file is just a convenience to separate out all the quantizer table
 // definitions from the quantizer functions.
 
-// Quantizer matrix is used only when level < 15.
-constexpr int kNumQuantizerLevelsForQuantizerMatrix = 15;
-constexpr int kQuantizerMatrixSize = 3344;
-
-constexpr uint16_t kQuantizerMatrixOffset[kNumTransformSizes] = {
-    0,    1360, 2704, 1392, 16,  1424, 2832, 2768, 1552, 80,
-    1680, 1680, 3088, 2192, 336, 336,  2192, 336,  336};
-
-constexpr uint8_t kQuantizerMatrix
-    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes]
-    [kQuantizerMatrixSize] = {
-        // Quantizer level 0.
-        {
-            {// Luma
-             // Size 4x4
-             32, 43, 73, 97, 43, 67, 94, 110, 73, 94, 137, 150, 97, 110, 150,
-             200,
-             // Size 8x8
-             32, 32, 38, 51, 68, 84, 95, 109, 32, 35, 40, 49, 63, 76, 89, 102,
-             38, 40, 54, 65, 78, 91, 98, 106, 51, 49, 65, 82, 97, 111, 113, 121,
-             68, 63, 78, 97, 117, 134, 138, 142, 84, 76, 91, 111, 134, 152, 159,
-             168, 95, 89, 98, 113, 138, 159, 183, 199, 109, 102, 106, 121, 142,
-             168, 199, 220,
-             // Size 16x16
-             32, 31, 31, 34, 36, 44, 48, 59, 65, 80, 83, 91, 97, 104, 111, 119,
-             31, 32, 32, 33, 34, 41, 44, 54, 59, 72, 75, 83, 90, 97, 104, 112,
-             31, 32, 33, 35, 36, 42, 45, 54, 59, 71, 74, 81, 86, 93, 100, 107,
-             34, 33, 35, 39, 42, 47, 51, 58, 63, 74, 76, 81, 84, 90, 97, 105,
-             36, 34, 36, 42, 48, 54, 57, 64, 68, 79, 81, 88, 91, 96, 102, 105,
-             44, 41, 42, 47, 54, 63, 67, 75, 79, 90, 92, 95, 100, 102, 109, 112,
-             48, 44, 45, 51, 57, 67, 71, 80, 85, 96, 99, 107, 108, 111, 117,
-             120, 59, 54, 54, 58, 64, 75, 80, 92, 98, 110, 113, 115, 116, 122,
-             125, 130, 65, 59, 59, 63, 68, 79, 85, 98, 105, 118, 121, 127, 130,
-             134, 135, 140, 80, 72, 71, 74, 79, 90, 96, 110, 118, 134, 137, 140,
-             143, 144, 146, 152, 83, 75, 74, 76, 81, 92, 99, 113, 121, 137, 140,
-             151, 152, 155, 158, 165, 91, 83, 81, 81, 88, 95, 107, 115, 127,
-             140, 151, 159, 166, 169, 173, 179, 97, 90, 86, 84, 91, 100, 108,
-             116, 130, 143, 152, 166, 174, 182, 189, 193, 104, 97, 93, 90, 96,
-             102, 111, 122, 134, 144, 155, 169, 182, 191, 200, 210, 111, 104,
-             100, 97, 102, 109, 117, 125, 135, 146, 158, 173, 189, 200, 210,
-             220, 119, 112, 107, 105, 105, 112, 120, 130, 140, 152, 165, 179,
-             193, 210, 220, 231,
-             // Size 32x32
-             32, 31, 31, 31, 31, 32, 34, 35, 36, 39, 44, 46, 48, 54, 59, 62, 65,
-             71, 80, 81, 83, 88, 91, 94, 97, 101, 104, 107, 111, 115, 119, 123,
-             31, 32, 32, 32, 32, 32, 34, 34, 35, 38, 42, 44, 46, 51, 56, 59, 62,
-             68, 76, 77, 78, 84, 86, 89, 92, 95, 99, 102, 105, 109, 113, 116,
-             31, 32, 32, 32, 32, 32, 33, 34, 34, 37, 41, 42, 44, 49, 54, 56, 59,
-             65, 72, 73, 75, 80, 83, 86, 90, 93, 97, 101, 104, 108, 112, 116,
-             31, 32, 32, 32, 33, 33, 34, 35, 35, 38, 41, 43, 45, 49, 54, 56, 59,
-             64, 72, 73, 74, 79, 82, 85, 88, 91, 94, 97, 101, 104, 107, 111, 31,
-             32, 32, 33, 33, 34, 35, 36, 36, 39, 42, 44, 45, 50, 54, 56, 59, 64,
-             71, 72, 74, 78, 81, 84, 86, 89, 93, 96, 100, 104, 107, 111, 32, 32,
-             32, 33, 34, 35, 37, 37, 38, 40, 42, 44, 46, 49, 53, 55, 58, 63, 69,
-             70, 72, 76, 79, 82, 85, 89, 93, 96, 99, 102, 106, 109, 34, 34, 33,
-             34, 35, 37, 39, 41, 42, 45, 47, 49, 51, 54, 58, 60, 63, 68, 74, 75,
-             76, 80, 81, 82, 84, 87, 90, 93, 97, 101, 105, 110, 35, 34, 34, 35,
-             36, 37, 41, 43, 45, 47, 50, 52, 53, 57, 61, 63, 65, 70, 76, 77, 79,
-             82, 84, 86, 89, 91, 92, 93, 96, 100, 103, 107, 36, 35, 34, 35, 36,
-             38, 42, 45, 48, 50, 54, 55, 57, 60, 64, 66, 68, 73, 79, 80, 81, 85,
-             88, 90, 91, 93, 96, 99, 102, 103, 105, 107, 39, 38, 37, 38, 39, 40,
-             45, 47, 50, 54, 58, 59, 61, 65, 69, 71, 73, 78, 84, 85, 86, 91, 92,
-             92, 95, 98, 100, 101, 103, 106, 110, 114, 44, 42, 41, 41, 42, 42,
-             47, 50, 54, 58, 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 95, 95,
-             97, 100, 101, 102, 105, 109, 111, 112, 114, 46, 44, 42, 43, 44, 44,
-             49, 52, 55, 59, 65, 67, 69, 74, 78, 80, 82, 87, 93, 94, 95, 98,
-             100, 103, 102, 105, 108, 110, 111, 113, 117, 121, 48, 46, 44, 45,
-             45, 46, 51, 53, 57, 61, 67, 69, 71, 76, 80, 83, 85, 90, 96, 97, 99,
-             103, 107, 105, 108, 111, 111, 113, 117, 119, 120, 122, 54, 51, 49,
-             49, 50, 49, 54, 57, 60, 65, 71, 74, 76, 82, 87, 89, 92, 97, 104,
-             105, 106, 111, 110, 111, 114, 113, 116, 120, 120, 121, 125, 130,
-             59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, 92, 95, 98,
-             103, 110, 111, 113, 115, 115, 119, 116, 120, 122, 122, 125, 129,
-             130, 130, 62, 59, 56, 56, 56, 55, 60, 63, 66, 71, 77, 80, 83, 89,
-             95, 98, 101, 107, 114, 115, 117, 119, 123, 121, 125, 126, 125, 129,
-             131, 131, 135, 140, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82,
-             85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 127, 128, 130, 130,
-             134, 133, 135, 140, 140, 140, 71, 68, 65, 64, 64, 63, 68, 70, 73,
-             78, 84, 87, 90, 97, 103, 107, 111, 117, 125, 126, 128, 134, 132,
-             136, 133, 138, 137, 140, 143, 142, 145, 150, 80, 76, 72, 72, 71,
-             69, 74, 76, 79, 84, 90, 93, 96, 104, 110, 114, 118, 125, 134, 135,
-             137, 139, 140, 139, 143, 142, 144, 146, 146, 151, 152, 151, 81, 77,
-             73, 73, 72, 70, 75, 77, 80, 85, 91, 94, 97, 105, 111, 115, 119,
-             126, 135, 137, 138, 144, 147, 146, 148, 149, 151, 150, 156, 155,
-             157, 163, 83, 78, 75, 74, 74, 72, 76, 79, 81, 86, 92, 95, 99, 106,
-             113, 117, 121, 128, 137, 138, 140, 147, 151, 156, 152, 157, 155,
-             161, 158, 162, 165, 164, 88, 84, 80, 79, 78, 76, 80, 82, 85, 91,
-             95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152, 154, 158,
-             163, 159, 165, 163, 168, 168, 169, 176, 91, 86, 83, 82, 81, 79, 81,
-             84, 88, 92, 95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151,
-             154, 159, 161, 166, 171, 169, 173, 173, 176, 179, 177, 94, 89, 86,
-             85, 84, 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136,
-             139, 146, 156, 158, 161, 166, 168, 174, 179, 178, 180, 183, 183,
-             190, 97, 92, 90, 88, 86, 85, 84, 89, 91, 95, 100, 102, 108, 114,
-             116, 125, 130, 133, 143, 148, 152, 163, 166, 168, 174, 176, 182,
-             187, 189, 188, 193, 191, 101, 95, 93, 91, 89, 89, 87, 91, 93, 98,
-             101, 105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171,
-             174, 176, 183, 184, 191, 195, 199, 197, 204, 104, 99, 97, 94, 93,
-             93, 90, 92, 96, 100, 102, 108, 111, 116, 122, 125, 134, 137, 144,
-             151, 155, 165, 169, 179, 182, 184, 191, 193, 200, 204, 210, 206,
-             107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, 113, 120,
-             122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193,
-             200, 202, 210, 214, 222, 111, 105, 104, 101, 100, 99, 97, 96, 102,
-             103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168,
-             173, 180, 189, 195, 200, 202, 210, 212, 220, 224, 115, 109, 108,
-             104, 104, 102, 101, 100, 103, 106, 111, 113, 119, 121, 129, 131,
-             140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204, 210, 212,
-             220, 222, 230, 119, 113, 112, 107, 107, 106, 105, 103, 105, 110,
-             112, 117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179,
-             183, 193, 197, 210, 214, 220, 222, 231, 232, 123, 116, 116, 111,
-             111, 109, 110, 107, 107, 114, 114, 121, 122, 130, 130, 140, 140,
-             150, 151, 163, 164, 176, 177, 190, 191, 204, 206, 222, 224, 230,
-             232, 242,
-             // Size 4x8
-             32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84, 91, 49, 71, 103, 110,
-             65, 84, 125, 128, 80, 97, 142, 152, 91, 100, 145, 178, 104, 112,
-             146, 190,
-             // Size 8x4
-             32, 33, 37, 49, 65, 80, 91, 104, 42, 42, 58, 71, 84, 97, 100, 112,
-             75, 69, 84, 103, 125, 142, 145, 146, 91, 86, 91, 110, 128, 152,
-             178, 190,
-             // Size 8x16
-             32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59, 78, 86, 93, 32,
-             34, 36, 50, 59, 77, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88, 36, 38,
-             48, 60, 68, 84, 86, 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56,
-             76, 85, 102, 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65, 58,
-             68, 92, 105, 124, 122, 124, 79, 70, 79, 104, 118, 141, 135, 135,
-             82, 72, 81, 106, 121, 144, 149, 146, 91, 80, 88, 106, 130, 148,
-             162, 159, 97, 86, 94, 107, 128, 157, 167, 171, 103, 93, 98, 114,
-             131, 150, 174, 186, 110, 100, 101, 117, 138, 161, 183, 193, 118,
-             107, 105, 118, 136, 157, 182, 203,
-             // Size 16x8
-             32, 31, 32, 34, 36, 44, 48, 58, 65, 79, 82, 91, 97, 103, 110, 118,
-             32, 33, 34, 37, 38, 43, 46, 54, 58, 70, 72, 80, 86, 93, 100, 107,
-             36, 34, 36, 42, 48, 53, 56, 63, 68, 79, 81, 88, 94, 98, 101, 105,
-             53, 49, 50, 54, 60, 71, 76, 87, 92, 104, 106, 106, 107, 114, 117,
-             118, 65, 59, 59, 63, 68, 79, 85, 98, 105, 118, 121, 130, 128, 131,
-             138, 136, 87, 78, 77, 79, 84, 95, 102, 116, 124, 141, 144, 148,
-             157, 150, 161, 157, 93, 86, 82, 80, 86, 94, 105, 112, 122, 135,
-             149, 162, 167, 174, 183, 182, 99, 93, 89, 88, 90, 97, 105, 115,
-             124, 135, 146, 159, 171, 186, 193, 203,
-             // Size 16x32
-             32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96, 99, 102,
-             31, 32, 32, 34, 35, 42, 51, 56, 62, 75, 82, 85, 88, 91, 94, 97, 31,
-             32, 33, 33, 34, 41, 49, 54, 59, 72, 78, 82, 86, 90, 93, 97, 31, 32,
-             33, 34, 35, 41, 49, 54, 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34,
-             35, 36, 42, 50, 54, 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37,
-             38, 42, 49, 53, 58, 69, 75, 78, 82, 86, 89, 92, 34, 34, 37, 39, 42,
-             48, 54, 58, 63, 73, 79, 78, 80, 83, 88, 92, 35, 34, 37, 41, 45, 50,
-             57, 61, 65, 76, 82, 83, 84, 84, 87, 90, 36, 34, 38, 43, 48, 54, 60,
-             64, 68, 78, 84, 87, 86, 89, 90, 90, 39, 37, 40, 45, 50, 58, 65, 69,
-             73, 84, 89, 89, 91, 91, 93, 96, 44, 41, 43, 48, 53, 63, 71, 75, 79,
-             90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49, 55, 65, 73, 78, 82, 93,
-             98, 100, 98, 100, 99, 103, 48, 45, 46, 51, 56, 67, 76, 80, 85, 96,
-             102, 102, 105, 102, 105, 104, 53, 49, 50, 54, 60, 71, 82, 87, 92,
-             103, 109, 107, 107, 110, 107, 111, 58, 54, 54, 58, 63, 75, 87, 92,
-             98, 110, 116, 115, 112, 111, 115, 112, 61, 57, 56, 60, 66, 77, 89,
-             95, 101, 114, 120, 118, 119, 118, 116, 120, 65, 60, 58, 63, 68, 79,
-             92, 98, 105, 118, 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73,
-             84, 97, 103, 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70,
-             74, 79, 90, 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81,
-             74, 71, 75, 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139,
-             142, 82, 75, 72, 76, 81, 92, 106, 113, 121, 136, 144, 151, 149,
-             149, 146, 143, 88, 80, 77, 80, 85, 97, 108, 115, 126, 142, 149,
-             153, 153, 152, 152, 154, 91, 83, 80, 81, 88, 100, 106, 114, 130,
-             142, 148, 155, 162, 160, 159, 155, 94, 85, 83, 82, 91, 100, 105,
-             118, 131, 137, 153, 160, 165, 167, 166, 168, 97, 88, 86, 85, 94,
-             100, 107, 123, 128, 140, 157, 161, 167, 173, 171, 169, 100, 91, 89,
-             87, 97, 100, 111, 121, 127, 145, 152, 164, 173, 178, 182, 181, 103,
-             94, 93, 90, 98, 101, 114, 120, 131, 144, 150, 170, 174, 180, 186,
-             183, 107, 97, 96, 93, 100, 104, 117, 119, 136, 142, 155, 168, 177,
-             187, 191, 198, 110, 101, 100, 97, 101, 108, 117, 123, 138, 141,
-             161, 165, 183, 188, 193, 200, 114, 104, 104, 100, 103, 112, 117,
-             127, 137, 146, 159, 167, 185, 190, 201, 206, 118, 108, 107, 103,
-             105, 115, 118, 131, 136, 151, 157, 172, 182, 197, 203, 208, 122,
-             111, 111, 107, 107, 119, 119, 136, 136, 156, 156, 178, 179, 203,
-             204, 217,
-             // Size 32x16
-             32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65,
-             71, 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122,
-             31, 32, 32, 32, 32, 33, 34, 34, 34, 37, 41, 43, 45, 49, 54, 57, 60,
-             65, 72, 74, 75, 80, 83, 85, 88, 91, 94, 97, 101, 104, 108, 111, 32,
-             32, 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63,
-             70, 71, 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, 34, 34,
-             33, 34, 35, 37, 39, 41, 43, 45, 48, 49, 51, 54, 58, 60, 63, 68, 74,
-             75, 76, 80, 81, 82, 85, 87, 90, 93, 97, 100, 103, 107, 36, 35, 34,
-             35, 36, 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80,
-             81, 85, 88, 91, 94, 97, 98, 100, 101, 103, 105, 107, 44, 42, 41,
-             41, 42, 42, 48, 50, 54, 58, 63, 65, 67, 71, 75, 77, 79, 84, 90, 91,
-             92, 97, 100, 100, 100, 100, 101, 104, 108, 112, 115, 119, 53, 51,
-             49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, 87, 89, 92, 97,
-             104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118,
-             119, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, 92,
-             95, 98, 103, 110, 112, 113, 115, 114, 118, 123, 121, 120, 119, 123,
-             127, 131, 136, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85,
-             92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131,
-             136, 138, 137, 136, 136, 79, 75, 72, 71, 71, 69, 73, 76, 78, 84,
-             90, 93, 96, 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137,
-             140, 145, 144, 142, 141, 146, 151, 156, 87, 82, 78, 78, 77, 75, 79,
-             82, 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144,
-             149, 148, 153, 157, 152, 150, 155, 161, 159, 157, 156, 90, 85, 82,
-             81, 80, 78, 78, 83, 87, 89, 93, 100, 102, 107, 115, 118, 123, 132,
-             136, 140, 151, 153, 155, 160, 161, 164, 170, 168, 165, 167, 172,
-             178, 93, 88, 86, 84, 82, 82, 80, 84, 86, 91, 94, 98, 105, 107, 112,
-             119, 122, 130, 135, 140, 149, 153, 162, 165, 167, 173, 174, 177,
-             183, 185, 182, 179, 96, 91, 90, 87, 86, 86, 83, 84, 89, 91, 95,
-             100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152, 160, 167,
-             173, 178, 180, 187, 188, 190, 197, 203, 99, 94, 93, 90, 89, 89, 88,
-             87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146,
-             152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, 102, 97, 97,
-             93, 93, 92, 92, 90, 90, 96, 97, 103, 104, 111, 112, 120, 121, 130,
-             131, 142, 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208,
-             217,
-             // Size 4x16
-             31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34, 48, 73, 83, 34,
-             54, 78, 89, 41, 63, 90, 95, 45, 67, 96, 102, 54, 75, 110, 111, 60,
-             79, 118, 123, 72, 90, 133, 135, 75, 92, 136, 149, 83, 100, 142,
-             160, 88, 100, 140, 173, 94, 101, 144, 180, 101, 108, 141, 188, 108,
-             115, 151, 197,
-             // Size 16x4
-             31, 32, 32, 34, 34, 41, 45, 54, 60, 72, 75, 83, 88, 94, 101, 108,
-             44, 41, 42, 48, 54, 63, 67, 75, 79, 90, 92, 100, 100, 101, 108,
-             115, 79, 72, 71, 73, 78, 90, 96, 110, 118, 133, 136, 142, 140, 144,
-             141, 151, 96, 90, 86, 83, 89, 95, 102, 111, 123, 135, 149, 160,
-             173, 180, 188, 197,
-             // Size 8x32
-             32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82, 88, 94, 31,
-             33, 34, 49, 59, 78, 86, 93, 31, 33, 35, 49, 59, 78, 84, 90, 32, 34,
-             36, 50, 59, 77, 82, 89, 32, 35, 38, 49, 58, 75, 82, 89, 34, 37, 42,
-             54, 63, 79, 80, 88, 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60,
-             68, 84, 86, 90, 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71, 79,
-             95, 94, 97, 46, 44, 55, 73, 82, 98, 98, 99, 48, 46, 56, 76, 85,
-             102, 105, 105, 53, 50, 60, 82, 92, 109, 107, 107, 58, 54, 63, 87,
-             98, 116, 112, 115, 61, 56, 66, 89, 101, 120, 119, 116, 65, 58, 68,
-             92, 105, 124, 122, 124, 71, 63, 73, 97, 111, 132, 130, 127, 79, 70,
-             79, 104, 118, 141, 135, 135, 81, 71, 80, 105, 119, 142, 140, 139,
-             82, 72, 81, 106, 121, 144, 149, 146, 88, 77, 85, 108, 126, 149,
-             153, 152, 91, 80, 88, 106, 130, 148, 162, 159, 94, 83, 91, 105,
-             131, 153, 165, 166, 97, 86, 94, 107, 128, 157, 167, 171, 100, 89,
-             97, 111, 127, 152, 173, 182, 103, 93, 98, 114, 131, 150, 174, 186,
-             107, 96, 100, 117, 136, 155, 177, 191, 110, 100, 101, 117, 138,
-             161, 183, 193, 114, 104, 103, 117, 137, 159, 185, 201, 118, 107,
-             105, 118, 136, 157, 182, 203, 122, 111, 107, 119, 136, 156, 179,
-             204,
-             // Size 32x8
-             32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65,
-             71, 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122,
-             32, 32, 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58,
-             63, 70, 71, 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, 36,
-             35, 34, 35, 36, 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73,
-             79, 80, 81, 85, 88, 91, 94, 97, 98, 100, 101, 103, 105, 107, 53,
-             51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, 87, 89, 92, 97,
-             104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118,
-             119, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98,
-             101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136,
-             138, 137, 136, 136, 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98,
-             102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157,
-             152, 150, 155, 161, 159, 157, 156, 93, 88, 86, 84, 82, 82, 80, 84,
-             86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153,
-             162, 165, 167, 173, 174, 177, 183, 185, 182, 179, 99, 94, 93, 90,
-             89, 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135,
-             139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204},
-            {// Chroma
-             // Size 4x4
-             35, 46, 57, 66, 46, 60, 69, 71, 57, 69, 90, 90, 66, 71, 90, 109,
-             // Size 8x8
-             31, 38, 47, 50, 57, 63, 67, 71, 38, 47, 46, 47, 52, 57, 62, 67, 47,
-             46, 54, 57, 61, 66, 67, 68, 50, 47, 57, 66, 72, 77, 75, 75, 57, 52,
-             61, 72, 82, 88, 86, 84, 63, 57, 66, 77, 88, 96, 95, 95, 67, 62, 67,
-             75, 86, 95, 104, 107, 71, 67, 68, 75, 84, 95, 107, 113,
-             // Size 16x16
-             32, 30, 33, 41, 49, 49, 50, 54, 57, 63, 65, 68, 70, 72, 74, 76, 30,
-             32, 35, 42, 46, 45, 46, 49, 52, 57, 58, 62, 64, 67, 70, 72, 33, 35,
-             39, 45, 47, 45, 46, 49, 51, 56, 57, 60, 62, 64, 66, 69, 41, 42, 45,
-             48, 50, 49, 50, 52, 53, 57, 58, 59, 60, 61, 64, 67, 49, 46, 47, 50,
-             53, 53, 54, 55, 56, 60, 61, 64, 64, 65, 66, 66, 49, 45, 45, 49, 53,
-             58, 60, 62, 63, 67, 68, 67, 69, 68, 70, 70, 50, 46, 46, 50, 54, 60,
-             61, 65, 67, 71, 71, 74, 73, 73, 74, 74, 54, 49, 49, 52, 55, 62, 65,
-             71, 73, 78, 79, 78, 77, 78, 78, 78, 57, 52, 51, 53, 56, 63, 67, 73,
-             76, 82, 83, 84, 84, 84, 82, 83, 63, 57, 56, 57, 60, 67, 71, 78, 82,
-             89, 90, 90, 89, 88, 87, 88, 65, 58, 57, 58, 61, 68, 71, 79, 83, 90,
-             91, 94, 93, 93, 92, 93, 68, 62, 60, 59, 64, 67, 74, 78, 84, 90, 94,
-             98, 99, 98, 98, 98, 70, 64, 62, 60, 64, 69, 73, 77, 84, 89, 93, 99,
-             102, 103, 104, 104, 72, 67, 64, 61, 65, 68, 73, 78, 84, 88, 93, 98,
-             103, 106, 108, 109, 74, 70, 66, 64, 66, 70, 74, 78, 82, 87, 92, 98,
-             104, 108, 111, 112, 76, 72, 69, 67, 66, 70, 74, 78, 83, 88, 93, 98,
-             104, 109, 112, 116,
-             // Size 32x32
-             32, 31, 30, 32, 33, 36, 41, 45, 49, 48, 49, 50, 50, 52, 54, 56, 57,
-             60, 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 31, 31,
-             31, 33, 34, 38, 42, 45, 47, 47, 47, 47, 48, 50, 52, 53, 54, 57, 60,
-             61, 61, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 30, 31, 32, 33,
-             35, 40, 42, 44, 46, 45, 45, 45, 46, 47, 49, 51, 52, 54, 57, 58, 58,
-             61, 62, 63, 64, 66, 67, 68, 70, 71, 72, 74, 32, 33, 33, 35, 37, 41,
-             43, 45, 47, 46, 45, 46, 46, 47, 49, 50, 51, 54, 57, 57, 58, 60, 61,
-             62, 63, 64, 65, 66, 67, 68, 69, 70, 33, 34, 35, 37, 39, 43, 45, 46,
-             47, 46, 45, 46, 46, 47, 49, 50, 51, 53, 56, 57, 57, 59, 60, 61, 62,
-             63, 64, 65, 66, 68, 69, 70, 36, 38, 40, 41, 43, 47, 47, 47, 48, 46,
-             45, 46, 46, 47, 48, 49, 50, 52, 54, 55, 55, 57, 58, 59, 61, 62, 64,
-             65, 66, 67, 68, 69, 41, 42, 42, 43, 45, 47, 48, 49, 50, 49, 49, 49,
-             50, 50, 52, 52, 53, 55, 57, 58, 58, 60, 59, 59, 60, 61, 61, 63, 64,
-             66, 67, 69, 45, 45, 44, 45, 46, 47, 49, 50, 51, 51, 51, 51, 52, 52,
-             53, 54, 55, 57, 59, 59, 60, 61, 61, 62, 63, 63, 63, 63, 63, 64, 65,
-             66, 49, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 54, 54, 54, 55, 56,
-             56, 58, 60, 61, 61, 63, 64, 64, 64, 64, 65, 66, 66, 66, 66, 66, 48,
-             47, 45, 46, 46, 46, 49, 51, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61,
-             63, 64, 64, 66, 66, 65, 66, 67, 67, 67, 67, 68, 69, 70, 49, 47, 45,
-             45, 45, 45, 49, 51, 53, 55, 58, 59, 60, 61, 62, 63, 63, 65, 67, 67,
-             68, 69, 67, 68, 69, 68, 68, 69, 70, 70, 70, 70, 50, 47, 45, 46, 46,
-             46, 49, 51, 54, 56, 59, 60, 60, 62, 64, 64, 65, 67, 69, 69, 70, 70,
-             71, 71, 70, 70, 71, 71, 71, 71, 72, 74, 50, 48, 46, 46, 46, 46, 50,
-             52, 54, 56, 60, 60, 61, 63, 65, 66, 67, 68, 71, 71, 71, 73, 74, 72,
-             73, 74, 73, 73, 74, 74, 74, 74, 52, 50, 47, 47, 47, 47, 50, 52, 54,
-             57, 61, 62, 63, 66, 68, 69, 70, 72, 75, 75, 75, 77, 75, 75, 76, 75,
-             75, 76, 75, 75, 76, 77, 54, 52, 49, 49, 49, 48, 52, 53, 55, 58, 62,
-             64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 77, 78, 78, 77,
-             78, 79, 78, 78, 56, 53, 51, 50, 50, 49, 52, 54, 56, 59, 63, 64, 66,
-             69, 72, 73, 75, 77, 80, 80, 81, 81, 82, 80, 81, 81, 79, 81, 80, 79,
-             81, 82, 57, 54, 52, 51, 51, 50, 53, 55, 56, 60, 63, 65, 67, 70, 73,
-             75, 76, 79, 82, 82, 83, 85, 84, 83, 84, 83, 84, 82, 82, 84, 83, 82,
-             60, 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, 75, 77, 79,
-             82, 85, 85, 86, 88, 86, 87, 85, 86, 85, 85, 86, 84, 85, 86, 63, 60,
-             57, 57, 56, 54, 57, 59, 60, 63, 67, 69, 71, 75, 78, 80, 82, 85, 89,
-             89, 90, 90, 90, 89, 89, 88, 88, 88, 87, 88, 88, 87, 64, 61, 58, 57,
-             57, 55, 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90, 91,
-             92, 93, 92, 92, 91, 91, 90, 91, 90, 90, 92, 65, 61, 58, 58, 57, 55,
-             58, 60, 61, 64, 68, 70, 71, 75, 79, 81, 83, 86, 90, 91, 91, 94, 94,
-             96, 93, 94, 93, 94, 92, 93, 93, 92, 67, 63, 61, 60, 59, 57, 60, 61,
-             63, 66, 69, 70, 73, 77, 79, 81, 85, 88, 90, 92, 94, 96, 96, 97, 98,
-             95, 97, 95, 96, 95, 95, 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66,
-             67, 71, 74, 75, 78, 82, 84, 86, 90, 93, 94, 96, 98, 98, 99, 100,
-             98, 99, 98, 98, 98, 97, 69, 65, 63, 62, 61, 59, 59, 62, 64, 65, 68,
-             71, 72, 75, 79, 80, 83, 87, 89, 92, 96, 97, 98, 100, 100, 101, 102,
-             101, 101, 101, 100, 102, 70, 66, 64, 63, 62, 61, 60, 63, 64, 66,
-             69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98, 99, 100, 102, 102,
-             103, 104, 104, 103, 104, 102, 71, 67, 66, 64, 63, 62, 61, 63, 64,
-             67, 68, 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, 101, 102,
-             104, 104, 105, 106, 107, 105, 107, 72, 68, 67, 65, 64, 64, 61, 63,
-             65, 67, 68, 71, 73, 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102,
-             103, 104, 106, 106, 108, 108, 109, 107, 73, 69, 68, 66, 65, 65, 63,
-             63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 88, 90, 94, 95, 99,
-             101, 104, 105, 106, 109, 108, 110, 111, 112, 74, 70, 70, 67, 66,
-             66, 64, 63, 66, 67, 70, 71, 74, 75, 78, 80, 82, 86, 87, 91, 92, 96,
-             98, 101, 104, 106, 108, 108, 111, 111, 112, 113, 75, 71, 71, 68,
-             68, 67, 66, 64, 66, 68, 70, 71, 74, 75, 79, 79, 84, 84, 88, 90, 93,
-             95, 98, 101, 103, 107, 108, 110, 111, 113, 113, 115, 76, 72, 72,
-             69, 69, 68, 67, 65, 66, 69, 70, 72, 74, 76, 78, 81, 83, 85, 88, 90,
-             93, 95, 98, 100, 104, 105, 109, 111, 112, 113, 116, 115, 78, 74,
-             74, 70, 70, 69, 69, 66, 66, 70, 70, 74, 74, 77, 78, 82, 82, 86, 87,
-             92, 92, 96, 97, 102, 102, 107, 107, 112, 113, 115, 115, 118,
-             // Size 4x8
-             31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73, 54,
-             65, 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105,
-             // Size 8x4
-             31, 40, 46, 48, 54, 61, 64, 68, 47, 45, 56, 61, 65, 69, 68, 71, 60,
-             54, 64, 75, 85, 92, 90, 87, 66, 61, 64, 73, 82, 92, 102, 105,
-             // Size 8x16
-             32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66, 33,
-             43, 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62, 49, 48,
-             53, 54, 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66, 50, 46, 54,
-             64, 67, 73, 72, 70, 54, 49, 55, 68, 73, 80, 76, 75, 57, 50, 56, 70,
-             76, 84, 80, 79, 63, 55, 60, 75, 82, 92, 87, 84, 64, 56, 61, 75, 83,
-             93, 93, 89, 68, 59, 64, 74, 86, 94, 98, 94, 70, 62, 66, 73, 83, 96,
-             99, 98, 72, 64, 66, 75, 83, 92, 101, 104, 74, 67, 66, 74, 84, 94,
-             103, 106, 76, 69, 67, 73, 82, 91, 101, 109,
-             // Size 16x8
-             32, 30, 33, 42, 49, 49, 50, 54, 57, 63, 64, 68, 70, 72, 74, 76, 37,
-             40, 43, 47, 48, 46, 46, 49, 50, 55, 56, 59, 62, 64, 67, 69, 48, 46,
-             47, 50, 53, 53, 54, 55, 56, 60, 61, 64, 66, 66, 66, 67, 52, 48, 47,
-             50, 54, 61, 64, 68, 70, 75, 75, 74, 73, 75, 74, 73, 57, 52, 51, 53,
-             57, 64, 67, 73, 76, 82, 83, 86, 83, 83, 84, 82, 66, 60, 59, 60, 62,
-             69, 73, 80, 84, 92, 93, 94, 96, 92, 94, 91, 68, 63, 60, 59, 62, 66,
-             72, 76, 80, 87, 93, 98, 99, 101, 103, 101, 71, 66, 63, 62, 62, 66,
-             70, 75, 79, 84, 89, 94, 98, 104, 106, 109,
-             // Size 16x32
-             32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72, 31,
-             31, 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68, 30, 32,
-             40, 42, 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68, 32, 34, 41,
-             44, 46, 45, 48, 49, 51, 57, 59, 61, 62, 63, 64, 65, 33, 36, 43, 45,
-             47, 46, 47, 49, 51, 56, 59, 60, 60, 62, 63, 65, 37, 40, 47, 47, 47,
-             45, 47, 48, 50, 54, 57, 58, 60, 61, 62, 63, 42, 43, 47, 48, 50, 49,
-             50, 52, 53, 57, 60, 58, 59, 60, 62, 63, 45, 44, 47, 49, 51, 51, 52,
-             54, 55, 59, 61, 61, 61, 60, 61, 61, 49, 46, 48, 50, 53, 53, 54, 55,
-             57, 60, 62, 63, 62, 63, 62, 62, 48, 46, 47, 50, 53, 56, 57, 59, 60,
-             64, 66, 65, 65, 64, 64, 65, 49, 45, 46, 49, 53, 58, 61, 62, 64, 67,
-             69, 67, 66, 66, 66, 65, 49, 46, 46, 49, 53, 59, 62, 64, 65, 69, 71,
-             70, 68, 68, 67, 68, 50, 46, 46, 50, 54, 59, 64, 65, 67, 71, 73, 72,
-             72, 70, 70, 69, 52, 48, 47, 50, 54, 61, 66, 68, 71, 75, 77, 74, 73,
-             73, 71, 72, 54, 50, 49, 52, 55, 62, 68, 71, 73, 78, 80, 78, 76, 74,
-             75, 73, 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80, 79, 78, 76,
-             77, 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80, 79, 77,
-             60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81, 63,
-             57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81, 64, 58,
-             55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86, 64, 59, 56,
-             58, 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87, 67, 61, 58, 60,
-             63, 69, 76, 79, 85, 92, 95, 96, 94, 92, 91, 91, 68, 62, 59, 60, 64,
-             71, 74, 78, 86, 91, 94, 96, 98, 96, 94, 91, 69, 62, 60, 60, 65, 70,
-             72, 79, 85, 88, 95, 98, 99, 98, 97, 96, 70, 63, 62, 60, 66, 69, 73,
-             81, 83, 89, 96, 97, 99, 101, 98, 97, 71, 64, 63, 61, 67, 68, 74,
-             79, 82, 90, 93, 98, 102, 102, 102, 101, 72, 65, 64, 62, 66, 68, 75,
-             78, 83, 89, 92, 100, 101, 103, 104, 102, 73, 66, 65, 63, 66, 69,
-             75, 76, 84, 87, 93, 98, 102, 105, 106, 107, 74, 67, 67, 64, 66, 70,
-             74, 77, 84, 86, 94, 96, 103, 105, 106, 107, 75, 68, 68, 65, 66, 71,
-             74, 78, 83, 87, 93, 96, 103, 105, 109, 109, 76, 69, 69, 66, 67, 72,
-             73, 80, 82, 88, 91, 97, 101, 107, 109, 110, 77, 70, 70, 67, 67, 73,
-             73, 81, 81, 90, 90, 99, 99, 108, 108, 113,
-             // Size 32x16
-             32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57,
-             60, 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 31, 31,
-             32, 34, 36, 40, 43, 44, 46, 46, 45, 46, 46, 48, 50, 51, 52, 54, 57,
-             58, 59, 61, 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 37, 38, 40, 41,
-             43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56,
-             58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 42, 42, 42, 44, 45, 47,
-             48, 49, 50, 50, 49, 49, 50, 50, 52, 52, 53, 55, 58, 58, 58, 60, 60,
-             60, 60, 61, 62, 63, 64, 65, 66, 67, 48, 47, 46, 46, 47, 47, 50, 51,
-             53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66,
-             67, 66, 66, 66, 66, 67, 67, 49, 47, 45, 45, 46, 45, 49, 51, 53, 56,
-             58, 59, 59, 61, 62, 63, 64, 65, 67, 68, 68, 69, 71, 70, 69, 68, 68,
-             69, 70, 71, 72, 73, 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62,
-             64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74,
-             74, 73, 73, 54, 52, 50, 49, 49, 48, 52, 54, 55, 59, 62, 64, 65, 68,
-             71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 81, 79, 78, 76, 77, 78, 80,
-             81, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75,
-             76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, 63,
-             60, 57, 57, 56, 54, 57, 59, 60, 64, 67, 69, 71, 75, 78, 80, 82, 85,
-             89, 89, 90, 92, 91, 88, 89, 90, 89, 87, 86, 87, 88, 90, 66, 63, 60,
-             59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92,
-             93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 67, 64, 62, 61, 60,
-             58, 58, 61, 63, 65, 67, 70, 72, 74, 78, 80, 82, 86, 88, 90, 95, 96,
-             96, 98, 97, 98, 100, 98, 96, 96, 97, 99, 68, 65, 63, 62, 60, 60,
-             59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98,
-             99, 99, 102, 101, 102, 103, 103, 101, 99, 69, 66, 65, 63, 62, 61,
-             60, 60, 63, 64, 66, 68, 70, 73, 74, 78, 80, 82, 85, 87, 91, 92, 96,
-             98, 101, 102, 103, 105, 105, 105, 107, 108, 71, 67, 66, 64, 63, 62,
-             62, 61, 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94,
-             97, 98, 102, 104, 106, 106, 109, 109, 108, 72, 68, 68, 65, 65, 63,
-             63, 61, 62, 65, 65, 68, 69, 72, 73, 77, 77, 81, 81, 86, 87, 91, 91,
-             96, 97, 101, 102, 107, 107, 109, 110, 113,
-             // Size 4x16
-             31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60, 46,
-             53, 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74, 52, 64,
-             82, 80, 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96, 63, 69, 89,
-             101, 65, 68, 89, 103, 67, 70, 86, 105, 69, 72, 88, 107,
-             // Size 16x4
-             31, 32, 36, 43, 46, 45, 46, 50, 52, 57, 59, 62, 63, 65, 67, 69, 49,
-             45, 46, 49, 53, 58, 59, 62, 64, 67, 68, 71, 69, 68, 70, 72, 63, 57,
-             56, 57, 60, 67, 71, 78, 82, 89, 90, 91, 89, 89, 86, 88, 69, 65, 62,
-             60, 63, 66, 70, 74, 80, 85, 91, 96, 101, 103, 105, 107,
-             // Size 8x32
-             32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67, 30,
-             40, 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64, 33, 43,
-             47, 47, 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62, 42, 47, 50,
-             50, 53, 60, 59, 62, 45, 47, 51, 52, 55, 61, 61, 61, 49, 48, 53, 54,
-             57, 62, 62, 62, 48, 47, 53, 57, 60, 66, 65, 64, 49, 46, 53, 61, 64,
-             69, 66, 66, 49, 46, 53, 62, 65, 71, 68, 67, 50, 46, 54, 64, 67, 73,
-             72, 70, 52, 47, 54, 66, 71, 77, 73, 71, 54, 49, 55, 68, 73, 80, 76,
-             75, 55, 49, 56, 69, 75, 82, 79, 76, 57, 50, 56, 70, 76, 84, 80, 79,
-             60, 52, 58, 72, 79, 88, 84, 81, 63, 55, 60, 75, 82, 92, 87, 84, 64,
-             55, 61, 75, 82, 92, 89, 86, 64, 56, 61, 75, 83, 93, 93, 89, 67, 58,
-             63, 76, 85, 95, 94, 91, 68, 59, 64, 74, 86, 94, 98, 94, 69, 60, 65,
-             72, 85, 95, 99, 97, 70, 62, 66, 73, 83, 96, 99, 98, 71, 63, 67, 74,
-             82, 93, 102, 102, 72, 64, 66, 75, 83, 92, 101, 104, 73, 65, 66, 75,
-             84, 93, 102, 106, 74, 67, 66, 74, 84, 94, 103, 106, 75, 68, 66, 74,
-             83, 93, 103, 109, 76, 69, 67, 73, 82, 91, 101, 109, 77, 70, 67, 73,
-             81, 90, 99, 108,
-             // Size 32x8
-             32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57,
-             60, 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 37, 38,
-             40, 41, 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55,
-             55, 56, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 48, 47, 46, 46,
-             47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61,
-             63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, 52, 50, 48, 48, 47, 47,
-             50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74,
-             72, 73, 74, 75, 75, 74, 74, 73, 73, 57, 54, 52, 51, 51, 50, 53, 55,
-             57, 60, 64, 65, 67, 71, 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83,
-             82, 83, 84, 84, 83, 82, 81, 66, 63, 60, 59, 59, 57, 60, 61, 62, 66,
-             69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, 94, 95, 96, 93, 92,
-             93, 94, 93, 91, 90, 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68,
-             72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102,
-             103, 103, 101, 99, 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67,
-             70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106,
-             106, 109, 109, 108},
-        },
-        // Quantizer level 1.
-        {
-            {// Luma
-             // Size 4x4
-             32, 41, 69, 92, 41, 63, 88, 103, 69, 88, 127, 140, 92, 103, 140,
-             184,
-             // Size 8x8
-             32, 32, 37, 47, 62, 78, 90, 102, 32, 35, 39, 46, 58, 72, 84, 96,
-             37, 39, 51, 60, 71, 84, 93, 100, 47, 46, 60, 73, 87, 100, 106, 113,
-             62, 58, 71, 87, 105, 121, 129, 132, 78, 72, 84, 100, 121, 140, 148,
-             155, 90, 84, 93, 106, 129, 148, 169, 183, 102, 96, 100, 113, 132,
-             155, 183, 201,
-             // Size 16x16
-             32, 31, 31, 32, 36, 39, 47, 54, 61, 71, 80, 86, 92, 98, 104, 111,
-             31, 32, 32, 33, 34, 37, 44, 50, 56, 65, 73, 79, 85, 91, 98, 105,
-             31, 32, 33, 34, 36, 39, 45, 50, 56, 64, 71, 77, 82, 88, 94, 100,
-             32, 33, 34, 36, 40, 42, 47, 51, 57, 65, 71, 76, 80, 85, 91, 98, 36,
-             34, 36, 40, 48, 50, 56, 60, 65, 73, 79, 84, 86, 90, 95, 98, 39, 37,
-             39, 42, 50, 54, 60, 65, 70, 78, 84, 89, 95, 96, 102, 105, 47, 44,
-             45, 47, 56, 60, 69, 75, 81, 89, 95, 100, 102, 104, 109, 112, 54,
-             50, 50, 51, 60, 65, 75, 82, 89, 97, 104, 109, 110, 114, 117, 121,
-             61, 56, 56, 57, 65, 70, 81, 89, 97, 106, 113, 119, 122, 126, 125,
-             130, 71, 65, 64, 65, 73, 78, 89, 97, 106, 117, 125, 131, 134, 134,
-             136, 141, 80, 73, 71, 71, 79, 84, 95, 104, 113, 125, 134, 140, 142,
-             145, 146, 152, 86, 79, 77, 76, 84, 89, 100, 109, 119, 131, 140,
-             147, 154, 157, 160, 165, 92, 85, 82, 80, 86, 95, 102, 110, 122,
-             134, 142, 154, 162, 168, 174, 178, 98, 91, 88, 85, 90, 96, 104,
-             114, 126, 134, 145, 157, 168, 176, 184, 193, 104, 98, 94, 91, 95,
-             102, 109, 117, 125, 136, 146, 160, 174, 184, 193, 201, 111, 105,
-             100, 98, 98, 105, 112, 121, 130, 141, 152, 165, 178, 193, 201, 210,
-             // Size 32x32
-             32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 61,
-             65, 71, 76, 80, 83, 86, 89, 92, 95, 98, 101, 104, 108, 111, 114,
-             31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 58,
-             62, 68, 72, 76, 78, 82, 85, 88, 90, 93, 96, 99, 102, 105, 109, 31,
-             32, 32, 32, 32, 32, 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 56, 60,
-             65, 70, 73, 76, 79, 82, 85, 88, 91, 95, 98, 101, 105, 109, 31, 32,
-             32, 32, 32, 33, 33, 34, 35, 36, 38, 41, 44, 45, 49, 54, 56, 59, 65,
-             69, 72, 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, 31, 32, 32,
-             32, 33, 34, 34, 35, 36, 38, 39, 42, 45, 46, 50, 54, 56, 59, 64, 68,
-             71, 74, 77, 79, 82, 85, 88, 91, 94, 97, 100, 104, 32, 32, 32, 33,
-             34, 35, 36, 37, 38, 39, 40, 42, 45, 46, 49, 53, 55, 58, 63, 66, 69,
-             72, 74, 78, 81, 84, 87, 90, 93, 96, 99, 102, 32, 33, 33, 33, 34,
-             36, 36, 38, 40, 41, 42, 44, 47, 48, 51, 55, 57, 60, 65, 68, 71, 73,
-             76, 78, 80, 82, 85, 88, 91, 95, 98, 102, 34, 34, 33, 34, 35, 37,
-             38, 39, 42, 44, 45, 47, 50, 51, 54, 58, 60, 63, 68, 71, 74, 76, 79,
-             82, 85, 86, 87, 88, 90, 93, 96, 99, 36, 35, 34, 35, 36, 38, 40, 42,
-             48, 50, 50, 54, 56, 57, 60, 64, 65, 68, 73, 76, 79, 81, 84, 86, 86,
-             88, 90, 93, 95, 97, 98, 100, 38, 37, 36, 36, 38, 39, 41, 44, 50,
-             51, 52, 56, 58, 60, 63, 67, 68, 71, 76, 79, 82, 84, 87, 87, 90, 93,
-             94, 95, 96, 100, 103, 106, 39, 38, 37, 38, 39, 40, 42, 45, 50, 52,
-             54, 58, 60, 62, 65, 69, 70, 73, 78, 81, 84, 86, 89, 92, 95, 95, 96,
-             99, 102, 104, 105, 106, 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58,
-             63, 66, 68, 71, 75, 77, 79, 84, 88, 90, 92, 95, 97, 97, 99, 102,
-             103, 103, 106, 109, 113, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58,
-             60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 100, 102, 105,
-             104, 106, 109, 111, 112, 113, 49, 47, 46, 45, 46, 46, 48, 51, 57,
-             60, 62, 68, 71, 73, 77, 81, 83, 87, 92, 95, 98, 100, 103, 105, 107,
-             106, 109, 112, 112, 113, 117, 120, 54, 51, 50, 49, 50, 49, 51, 54,
-             60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, 106, 109,
-             112, 110, 113, 114, 114, 117, 121, 121, 121, 59, 56, 54, 54, 54,
-             53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 94, 98, 103, 107, 110,
-             113, 116, 114, 117, 118, 117, 121, 122, 122, 125, 129, 61, 58, 56,
-             56, 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 101, 106,
-             110, 113, 116, 119, 120, 122, 121, 126, 124, 125, 130, 130, 130,
-             65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, 98,
-             101, 105, 111, 115, 118, 121, 124, 128, 125, 129, 128, 131, 133,
-             132, 135, 139, 71, 68, 65, 65, 64, 63, 65, 68, 73, 76, 78, 84, 89,
-             92, 97, 103, 106, 111, 117, 122, 125, 128, 131, 131, 134, 132, 134,
-             136, 136, 140, 141, 140, 76, 72, 70, 69, 68, 66, 68, 71, 76, 79,
-             81, 88, 92, 95, 101, 107, 110, 115, 122, 127, 130, 133, 136, 136,
-             138, 139, 141, 140, 145, 143, 146, 151, 80, 76, 73, 72, 71, 69, 71,
-             74, 79, 82, 84, 90, 95, 98, 104, 110, 113, 118, 125, 130, 134, 137,
-             140, 146, 142, 146, 145, 149, 146, 150, 152, 151, 83, 78, 76, 75,
-             74, 72, 73, 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, 128,
-             133, 137, 140, 144, 147, 152, 148, 154, 151, 156, 155, 156, 162,
-             86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, 103, 109, 116,
-             119, 124, 131, 136, 140, 144, 147, 150, 154, 159, 157, 160, 160,
-             162, 165, 162, 89, 85, 82, 81, 79, 78, 78, 82, 86, 87, 92, 97, 100,
-             105, 112, 114, 120, 128, 131, 136, 146, 147, 150, 155, 156, 161,
-             166, 165, 167, 169, 169, 175, 92, 88, 85, 84, 82, 81, 80, 85, 86,
-             90, 95, 97, 102, 107, 110, 117, 122, 125, 134, 138, 142, 152, 154,
-             156, 162, 163, 168, 173, 174, 174, 178, 176, 95, 90, 88, 86, 85,
-             84, 82, 86, 88, 93, 95, 99, 105, 106, 113, 118, 121, 129, 132, 139,
-             146, 148, 159, 161, 163, 169, 170, 176, 180, 183, 181, 187, 98, 93,
-             91, 89, 88, 87, 85, 87, 90, 94, 96, 102, 104, 109, 114, 117, 126,
-             128, 134, 141, 145, 154, 157, 166, 168, 170, 176, 178, 184, 188,
-             193, 188, 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, 99, 103, 106,
-             112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176,
-             178, 184, 186, 192, 196, 203, 104, 99, 98, 95, 94, 93, 91, 90, 95,
-             96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156,
-             160, 167, 174, 180, 184, 186, 193, 194, 201, 204, 108, 102, 101,
-             98, 97, 96, 95, 93, 97, 100, 104, 106, 111, 113, 121, 122, 130,
-             132, 140, 143, 150, 155, 162, 169, 174, 183, 188, 192, 194, 201,
-             202, 210, 111, 105, 105, 101, 100, 99, 98, 96, 98, 103, 105, 109,
-             112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178,
-             181, 193, 196, 201, 202, 210, 211, 114, 109, 109, 104, 104, 102,
-             102, 99, 100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140,
-             151, 151, 162, 162, 175, 176, 187, 188, 203, 204, 210, 211, 219,
-             // Size 4x8
-             32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77, 88, 46, 67, 93, 105,
-             60, 79, 112, 122, 75, 92, 130, 144, 86, 95, 136, 167, 98, 105, 136,
-             177,
-             // Size 8x4
-             32, 33, 36, 46, 60, 75, 86, 98, 42, 42, 56, 67, 79, 92, 95, 105,
-             69, 64, 77, 93, 112, 130, 136, 136, 88, 83, 88, 105, 122, 144, 167,
-             177,
-             // Size 8x16
-             32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60, 72, 84, 90, 32,
-             34, 36, 45, 59, 71, 80, 87, 32, 35, 40, 47, 60, 71, 78, 85, 36, 37,
-             48, 56, 68, 78, 83, 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56,
-             69, 84, 95, 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61, 56,
-             65, 81, 100, 113, 116, 118, 71, 64, 73, 89, 111, 125, 129, 129, 79,
-             70, 79, 95, 118, 133, 142, 138, 86, 76, 84, 100, 124, 140, 153,
-             150, 92, 82, 89, 101, 121, 148, 157, 161, 98, 88, 93, 108, 124,
-             141, 163, 174, 104, 94, 95, 110, 129, 151, 171, 181, 110, 100, 98,
-             111, 127, 147, 169, 188,
-             // Size 16x8
-             32, 31, 32, 32, 36, 39, 47, 53, 61, 71, 79, 86, 92, 98, 104, 110,
-             32, 32, 34, 35, 37, 40, 45, 50, 56, 64, 70, 76, 82, 88, 94, 100,
-             36, 35, 36, 40, 48, 50, 56, 60, 65, 73, 79, 84, 89, 93, 95, 98, 47,
-             44, 45, 47, 56, 60, 69, 75, 81, 89, 95, 100, 101, 108, 110, 111,
-             65, 60, 59, 60, 68, 73, 84, 92, 100, 111, 118, 124, 121, 124, 129,
-             127, 79, 72, 71, 71, 78, 84, 95, 103, 113, 125, 133, 140, 148, 141,
-             151, 147, 90, 84, 80, 78, 83, 91, 101, 108, 116, 129, 142, 153,
-             157, 163, 171, 169, 96, 90, 87, 85, 87, 94, 101, 110, 118, 129,
-             138, 150, 161, 174, 181, 188,
-             // Size 16x32
-             32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93, 96, 99, 31,
-             32, 32, 33, 35, 42, 45, 51, 62, 69, 75, 83, 86, 88, 91, 94, 31, 32,
-             32, 33, 35, 41, 44, 49, 60, 67, 72, 80, 84, 87, 90, 94, 31, 32, 33,
-             33, 35, 41, 44, 49, 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34,
-             36, 42, 45, 50, 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36, 38,
-             42, 45, 49, 58, 64, 69, 76, 80, 83, 86, 88, 32, 33, 35, 36, 40, 44,
-             47, 51, 60, 66, 71, 76, 78, 81, 85, 89, 34, 34, 36, 38, 42, 48, 50,
-             54, 63, 69, 73, 80, 82, 81, 84, 86, 36, 34, 37, 40, 48, 54, 56, 60,
-             68, 74, 78, 84, 83, 86, 87, 87, 38, 36, 39, 41, 49, 56, 58, 63, 71,
-             77, 81, 86, 88, 88, 90, 93, 39, 37, 40, 42, 50, 58, 60, 65, 73, 79,
-             84, 90, 91, 92, 94, 93, 44, 41, 42, 45, 53, 63, 66, 71, 79, 85, 90,
-             96, 94, 96, 96, 99, 47, 44, 45, 47, 56, 66, 69, 75, 84, 90, 95, 99,
-             101, 98, 101, 99, 49, 46, 47, 48, 57, 67, 71, 77, 86, 93, 97, 103,
-             103, 105, 102, 106, 53, 49, 50, 51, 60, 71, 75, 82, 92, 99, 103,
-             111, 108, 107, 110, 107, 58, 54, 54, 55, 63, 75, 79, 87, 98, 105,
-             110, 114, 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89, 100,
-             107, 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92,
-             105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84, 89,
-             97, 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69, 76, 88,
-             92, 101, 115, 123, 130, 134, 134, 131, 132, 135, 79, 72, 70, 71,
-             79, 90, 95, 104, 118, 127, 133, 143, 142, 141, 138, 136, 82, 75,
-             73, 74, 81, 92, 97, 106, 121, 130, 136, 146, 145, 144, 144, 145,
-             86, 78, 76, 77, 84, 95, 100, 109, 124, 133, 140, 147, 153, 151,
-             150, 146, 89, 81, 79, 78, 87, 95, 99, 112, 124, 130, 145, 152, 156,
-             157, 156, 158, 92, 84, 82, 80, 89, 95, 101, 116, 121, 132, 148,
-             151, 157, 163, 161, 159, 95, 86, 85, 83, 92, 95, 105, 114, 120,
-             136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85, 93, 95, 108,
-             113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92, 91, 88, 94,
-             98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185, 104, 95, 94,
-             91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175, 181, 186, 107,
-             98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156, 173, 177, 188,
-             192, 110, 101, 100, 97, 98, 108, 111, 123, 127, 141, 147, 161, 169,
-             183, 188, 193, 114, 104, 104, 100, 100, 111, 111, 126, 127, 145,
-             145, 166, 166, 189, 190, 201,
-             // Size 32x16
-             32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61,
-             65, 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114,
-             31, 32, 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 56,
-             60, 65, 69, 72, 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, 32,
-             32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59,
-             64, 68, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, 32, 33,
-             33, 33, 34, 36, 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 57, 60, 65,
-             69, 71, 74, 77, 78, 80, 83, 85, 88, 91, 94, 97, 100, 36, 35, 35,
-             35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76,
-             79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, 100, 44, 42, 41, 41,
-             42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 77, 79, 84, 88, 90,
-             92, 95, 95, 95, 95, 95, 98, 101, 105, 108, 111, 47, 45, 44, 44, 45,
-             45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97,
-             100, 99, 101, 105, 108, 110, 110, 110, 111, 111, 53, 51, 49, 49,
-             50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101,
-             104, 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126, 65, 62,
-             60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105,
-             111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128, 127,
-             127, 73, 69, 67, 66, 65, 64, 66, 69, 74, 77, 79, 85, 90, 93, 99,
-             105, 107, 112, 119, 123, 127, 130, 133, 130, 132, 136, 136, 133,
-             132, 136, 141, 145, 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90,
-             95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143,
-             141, 146, 151, 149, 147, 145, 87, 83, 80, 79, 78, 76, 76, 80, 84,
-             86, 90, 96, 99, 103, 111, 114, 118, 126, 130, 134, 143, 146, 147,
-             152, 151, 155, 160, 158, 154, 156, 161, 166, 90, 86, 84, 82, 80,
-             80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, 124, 129, 134,
-             142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, 166, 93, 88,
-             87, 84, 83, 83, 81, 81, 86, 88, 92, 96, 98, 105, 107, 113, 117,
-             122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177,
-             183, 189, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102,
-             110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174,
-             179, 181, 188, 188, 190, 99, 94, 94, 90, 90, 88, 89, 86, 87, 93,
-             93, 99, 99, 106, 107, 115, 116, 124, 125, 135, 136, 145, 146, 158,
-             159, 170, 171, 185, 186, 192, 193, 201,
-             // Size 4x16
-             31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33, 44, 66, 81, 34,
-             54, 74, 86, 37, 58, 79, 92, 44, 66, 90, 98, 49, 71, 99, 107, 56,
-             77, 107, 117, 65, 84, 119, 129, 72, 90, 127, 141, 78, 95, 133, 151,
-             84, 95, 132, 163, 89, 95, 136, 169, 95, 101, 132, 175, 101, 108,
-             141, 183,
-             // Size 16x4
-             31, 32, 32, 33, 34, 37, 44, 49, 56, 65, 72, 78, 84, 89, 95, 101,
-             44, 41, 42, 44, 54, 58, 66, 71, 77, 84, 90, 95, 95, 95, 101, 108,
-             73, 67, 65, 66, 74, 79, 90, 99, 107, 119, 127, 133, 132, 136, 132,
-             141, 93, 87, 83, 81, 86, 92, 98, 107, 117, 129, 141, 151, 163, 169,
-             175, 183,
-             // Size 8x32
-             32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75, 86, 91, 31,
-             32, 35, 44, 60, 72, 84, 90, 31, 33, 35, 44, 59, 71, 82, 87, 32, 34,
-             36, 45, 59, 71, 80, 87, 32, 35, 38, 45, 58, 69, 80, 86, 32, 35, 40,
-             47, 60, 71, 78, 85, 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56,
-             68, 78, 83, 87, 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60, 73,
-             84, 91, 94, 44, 42, 53, 66, 79, 90, 94, 96, 47, 45, 56, 69, 84, 95,
-             101, 101, 49, 47, 57, 71, 86, 97, 103, 102, 53, 50, 60, 75, 92,
-             103, 108, 110, 58, 54, 63, 79, 98, 110, 114, 111, 61, 56, 65, 81,
-             100, 113, 116, 118, 65, 59, 68, 84, 105, 118, 124, 121, 71, 64, 73,
-             89, 111, 125, 129, 129, 76, 68, 76, 92, 115, 130, 134, 132, 79, 70,
-             79, 95, 118, 133, 142, 138, 82, 73, 81, 97, 121, 136, 145, 144, 86,
-             76, 84, 100, 124, 140, 153, 150, 89, 79, 87, 99, 124, 145, 156,
-             156, 92, 82, 89, 101, 121, 148, 157, 161, 95, 85, 92, 105, 120,
-             143, 163, 171, 98, 88, 93, 108, 124, 141, 163, 174, 101, 91, 94,
-             110, 128, 146, 166, 179, 104, 94, 95, 110, 129, 151, 171, 181, 107,
-             97, 96, 110, 128, 149, 173, 188, 110, 100, 98, 111, 127, 147, 169,
-             188, 114, 104, 100, 111, 127, 145, 166, 190,
-             // Size 32x8
-             32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61,
-             65, 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114,
-             32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56,
-             59, 64, 68, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, 36,
-             35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68,
-             73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, 100, 47, 45,
-             44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89,
-             92, 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, 65,
-             62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100,
-             105, 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128,
-             127, 127, 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97,
-             103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141,
-             146, 151, 149, 147, 145, 90, 86, 84, 82, 80, 80, 78, 82, 83, 88,
-             91, 94, 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156,
-             157, 163, 163, 166, 171, 173, 169, 166, 96, 91, 90, 87, 87, 86, 85,
-             84, 87, 90, 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138,
-             144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190},
-            {// Chroma
-             // Size 4x4
-             33, 45, 56, 64, 45, 58, 66, 69, 56, 66, 86, 87, 64, 69, 87, 105,
-             // Size 8x8
-             31, 38, 47, 48, 54, 61, 66, 69, 38, 47, 47, 46, 50, 55, 61, 65, 47,
-             47, 53, 55, 58, 63, 65, 66, 48, 46, 55, 62, 67, 72, 73, 73, 54, 50,
-             58, 67, 76, 83, 84, 82, 61, 55, 63, 72, 83, 91, 92, 92, 66, 61, 65,
-             73, 84, 92, 101, 103, 69, 65, 66, 73, 82, 92, 103, 109,
-             // Size 16x16
-             32, 30, 33, 38, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 30,
-             31, 35, 41, 46, 46, 46, 48, 51, 55, 58, 60, 63, 65, 68, 70, 33, 35,
-             39, 44, 47, 46, 46, 47, 50, 53, 56, 58, 60, 62, 65, 67, 38, 41, 44,
-             47, 49, 48, 47, 48, 50, 53, 55, 58, 58, 60, 62, 65, 49, 46, 47, 49,
-             53, 53, 54, 54, 56, 58, 60, 62, 62, 63, 64, 64, 48, 46, 46, 48, 53,
-             54, 56, 57, 59, 61, 63, 65, 67, 66, 68, 68, 50, 46, 46, 47, 54, 56,
-             61, 63, 65, 68, 70, 72, 71, 71, 72, 72, 52, 48, 47, 48, 54, 57, 63,
-             66, 69, 72, 75, 76, 75, 76, 76, 76, 55, 51, 50, 50, 56, 59, 65, 69,
-             73, 77, 79, 81, 81, 81, 80, 80, 60, 55, 53, 53, 58, 61, 68, 72, 77,
-             82, 85, 87, 87, 85, 84, 85, 63, 58, 56, 55, 60, 63, 70, 75, 79, 85,
-             89, 91, 91, 90, 89, 90, 66, 60, 58, 58, 62, 65, 72, 76, 81, 87, 91,
-             94, 96, 95, 95, 95, 68, 63, 60, 58, 62, 67, 71, 75, 81, 87, 91, 96,
-             99, 100, 100, 100, 70, 65, 62, 60, 63, 66, 71, 76, 81, 85, 90, 95,
-             100, 103, 104, 105, 72, 68, 65, 62, 64, 68, 72, 76, 80, 84, 89, 95,
-             100, 104, 107, 108, 74, 70, 67, 65, 64, 68, 72, 76, 80, 85, 90, 95,
-             100, 105, 108, 111,
-             // Size 32x32
-             32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 55,
-             57, 60, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31,
-             31, 32, 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 53, 54, 57,
-             59, 60, 61, 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 30, 31, 31, 32,
-             35, 39, 41, 42, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 55, 57, 58,
-             59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 31, 32, 32, 33, 36, 40,
-             41, 43, 46, 46, 45, 45, 46, 46, 47, 49, 50, 51, 54, 56, 57, 58, 59,
-             61, 62, 63, 63, 64, 65, 66, 67, 68, 33, 34, 35, 36, 39, 43, 44, 45,
-             47, 46, 46, 45, 46, 47, 47, 49, 50, 51, 53, 55, 56, 57, 58, 59, 60,
-             61, 62, 63, 65, 66, 67, 68, 36, 38, 39, 40, 43, 47, 47, 47, 48, 47,
-             46, 45, 46, 46, 47, 48, 49, 50, 52, 53, 54, 55, 56, 58, 59, 61, 62,
-             63, 64, 65, 66, 66, 38, 40, 41, 41, 44, 47, 47, 48, 49, 48, 48, 47,
-             47, 47, 48, 49, 50, 51, 53, 54, 55, 56, 58, 58, 58, 59, 60, 61, 62,
-             64, 65, 66, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, 49, 49, 50, 50,
-             50, 52, 52, 53, 55, 56, 57, 58, 59, 60, 61, 61, 61, 61, 62, 63, 63,
-             64, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, 54, 55,
-             56, 56, 58, 59, 60, 61, 62, 63, 62, 62, 63, 64, 64, 64, 64, 64, 49,
-             47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 58, 58,
-             60, 61, 62, 63, 64, 64, 64, 65, 65, 65, 65, 66, 67, 68, 48, 47, 46,
-             45, 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 63,
-             63, 64, 65, 66, 67, 66, 66, 67, 68, 68, 68, 68, 49, 47, 45, 45, 45,
-             45, 47, 49, 53, 55, 55, 58, 59, 60, 61, 62, 63, 63, 65, 66, 67, 68,
-             69, 69, 68, 68, 69, 69, 69, 69, 70, 71, 50, 48, 46, 46, 46, 46, 47,
-             50, 54, 55, 56, 59, 61, 61, 63, 64, 65, 66, 68, 69, 70, 71, 72, 71,
-             71, 72, 71, 71, 72, 72, 72, 71, 51, 48, 47, 46, 47, 46, 47, 50, 54,
-             55, 56, 60, 61, 62, 64, 66, 66, 67, 69, 70, 71, 72, 73, 73, 74, 73,
-             73, 74, 73, 73, 74, 75, 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57,
-             61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 77, 75, 76, 76, 75,
-             76, 77, 76, 75, 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64,
-             66, 68, 71, 72, 73, 75, 77, 78, 79, 80, 78, 79, 78, 77, 78, 78, 77,
-             78, 79, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, 59, 63, 65, 66, 69,
-             72, 73, 74, 77, 78, 79, 80, 81, 81, 81, 80, 81, 80, 80, 81, 80, 79,
-             57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, 66, 67, 70, 73, 74,
-             76, 79, 80, 82, 83, 84, 85, 83, 84, 83, 83, 83, 82, 82, 83, 60, 57,
-             55, 54, 53, 52, 53, 55, 58, 60, 61, 65, 68, 69, 72, 75, 77, 79, 82,
-             84, 85, 86, 87, 86, 87, 85, 85, 85, 84, 86, 85, 84, 62, 59, 57, 56,
-             55, 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86, 87,
-             88, 90, 89, 89, 88, 88, 87, 88, 87, 87, 88, 63, 60, 58, 57, 56, 54,
-             55, 57, 60, 62, 63, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91,
-             93, 91, 91, 90, 91, 89, 90, 90, 89, 65, 61, 59, 58, 57, 55, 56, 58,
-             61, 63, 64, 68, 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 93, 94, 95,
-             92, 94, 92, 93, 92, 91, 93, 66, 63, 60, 59, 58, 56, 58, 59, 62, 64,
-             65, 69, 72, 73, 76, 80, 81, 84, 87, 90, 91, 93, 94, 95, 96, 97, 95,
-             95, 95, 95, 95, 93, 67, 64, 62, 61, 59, 58, 58, 60, 63, 64, 66, 69,
-             71, 73, 77, 78, 81, 85, 86, 89, 93, 94, 95, 97, 97, 98, 99, 97, 97,
-             97, 96, 98, 68, 65, 63, 62, 60, 59, 58, 61, 62, 64, 67, 68, 71, 74,
-             75, 79, 81, 83, 87, 89, 91, 95, 96, 97, 99, 98, 100, 100, 100, 99,
-             100, 98, 69, 66, 64, 63, 61, 61, 59, 61, 62, 65, 66, 68, 72, 73,
-             76, 78, 80, 84, 85, 88, 91, 92, 97, 98, 98, 101, 100, 102, 102,
-             103, 101, 102, 70, 67, 65, 63, 62, 62, 60, 61, 63, 65, 66, 69, 71,
-             73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, 100, 100, 103, 102,
-             104, 104, 105, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69,
-             71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, 105,
-             104, 106, 106, 108, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69,
-             72, 73, 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104,
-             107, 106, 108, 108, 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69,
-             72, 73, 77, 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106,
-             106, 109, 108, 110, 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70,
-             72, 74, 76, 78, 80, 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106,
-             108, 108, 111, 110, 75, 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71,
-             71, 75, 75, 79, 79, 83, 84, 88, 89, 93, 93, 98, 98, 102, 103, 108,
-             108, 110, 110, 113,
-             // Size 4x8
-             31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72, 52,
-             64, 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102,
-             // Size 8x4
-             31, 40, 46, 47, 52, 59, 63, 66, 47, 45, 55, 60, 64, 68, 66, 69, 57,
-             52, 61, 70, 79, 87, 88, 85, 65, 61, 63, 72, 81, 90, 99, 102,
-             // Size 8x16
-             32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65, 33,
-             41, 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61, 49, 48,
-             53, 54, 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65, 50, 46, 54,
-             61, 66, 70, 71, 69, 52, 47, 54, 63, 71, 75, 75, 74, 55, 49, 56, 65,
-             74, 79, 79, 78, 60, 53, 58, 68, 79, 85, 85, 82, 63, 55, 60, 70, 82,
-             89, 91, 87, 66, 58, 62, 72, 84, 91, 95, 91, 68, 60, 64, 71, 81, 94,
-             97, 96, 70, 62, 65, 73, 81, 89, 98, 101, 72, 65, 65, 72, 82, 92,
-             100, 103, 74, 67, 65, 71, 79, 89, 98, 105,
-             // Size 16x8
-             32, 30, 33, 39, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 35,
-             38, 41, 46, 48, 46, 46, 47, 49, 53, 55, 58, 60, 62, 65, 67, 48, 46,
-             47, 48, 53, 53, 54, 54, 56, 58, 60, 62, 64, 65, 65, 65, 50, 46, 46,
-             47, 54, 56, 61, 63, 65, 68, 70, 72, 71, 73, 72, 71, 57, 52, 51, 51,
-             57, 60, 66, 71, 74, 79, 82, 84, 81, 81, 82, 79, 63, 58, 56, 55, 60,
-             64, 70, 75, 79, 85, 89, 91, 94, 89, 92, 89, 68, 63, 60, 58, 61, 65,
-             71, 75, 79, 85, 91, 95, 97, 98, 100, 98, 70, 65, 63, 61, 61, 65,
-             69, 74, 78, 82, 87, 91, 96, 101, 103, 105,
-             // Size 16x32
-             32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71, 31,
-             31, 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67, 30, 32,
-             38, 40, 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67, 31, 33, 38,
-             41, 46, 45, 46, 48, 52, 55, 57, 60, 61, 62, 63, 64, 33, 36, 41, 44,
-             47, 46, 46, 47, 51, 54, 56, 59, 60, 61, 63, 64, 37, 40, 45, 47, 47,
-             45, 46, 47, 50, 52, 54, 57, 59, 61, 62, 62, 39, 41, 46, 47, 48, 47,
-             47, 48, 51, 54, 55, 57, 58, 59, 61, 62, 42, 43, 46, 48, 50, 49, 50,
-             50, 53, 56, 57, 60, 60, 59, 60, 60, 49, 46, 48, 49, 53, 53, 54, 54,
-             57, 59, 60, 63, 61, 62, 61, 61, 48, 46, 47, 48, 53, 55, 55, 56, 58,
-             61, 62, 64, 64, 63, 63, 64, 48, 46, 46, 48, 53, 56, 56, 57, 60, 62,
-             64, 66, 65, 65, 65, 64, 49, 45, 45, 47, 53, 58, 59, 61, 64, 66, 67,
-             69, 67, 67, 66, 67, 50, 46, 46, 48, 54, 59, 61, 63, 66, 68, 70, 71,
-             71, 68, 69, 67, 51, 47, 47, 48, 54, 60, 61, 64, 68, 70, 71, 73, 72,
-             72, 70, 71, 52, 48, 47, 48, 54, 61, 63, 66, 71, 73, 75, 77, 75, 73,
-             74, 71, 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, 78, 76, 74,
-             75, 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78, 78, 75,
-             57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79, 60,
-             54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80, 62, 56,
-             54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84, 63, 57, 55,
-             56, 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84, 64, 59, 56, 57,
-             61, 68, 71, 75, 83, 87, 90, 93, 92, 90, 89, 89, 66, 60, 58, 58, 62,
-             69, 72, 76, 84, 88, 91, 94, 95, 93, 91, 89, 67, 61, 59, 58, 63, 68,
-             71, 78, 83, 86, 93, 96, 96, 96, 94, 94, 68, 62, 60, 59, 64, 67, 71,
-             79, 81, 86, 94, 95, 97, 98, 96, 94, 69, 63, 61, 60, 65, 66, 72, 77,
-             80, 88, 91, 96, 99, 99, 100, 98, 70, 64, 62, 60, 65, 66, 73, 76,
-             81, 87, 89, 97, 98, 100, 101, 99, 71, 65, 64, 61, 65, 67, 73, 74,
-             82, 85, 90, 95, 99, 102, 103, 104, 72, 65, 65, 62, 65, 68, 72, 75,
-             82, 83, 92, 93, 100, 102, 103, 104, 73, 66, 66, 63, 65, 69, 72, 76,
-             81, 85, 90, 93, 100, 102, 105, 106, 74, 67, 67, 64, 65, 70, 71, 77,
-             79, 86, 89, 94, 98, 103, 105, 106, 75, 68, 68, 65, 65, 71, 71, 78,
-             78, 87, 87, 96, 96, 105, 105, 109,
-             // Size 32x16
-             32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55,
-             57, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31,
-             32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 54,
-             56, 57, 59, 60, 61, 62, 63, 64, 65, 65, 66, 67, 68, 35, 37, 38, 38,
-             41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55,
-             56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 38, 40, 40, 41, 44, 47,
-             47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 50, 51, 53, 55, 56, 57, 58,
-             58, 59, 60, 60, 61, 62, 63, 64, 65, 48, 47, 46, 46, 47, 47, 48, 50,
-             53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64,
-             65, 65, 65, 65, 65, 65, 65, 49, 47, 45, 45, 46, 45, 47, 49, 53, 55,
-             56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 68, 67, 66, 66,
-             67, 68, 69, 70, 71, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
-             61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72,
-             72, 71, 71, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64,
-             66, 68, 69, 70, 72, 74, 75, 75, 76, 78, 79, 77, 76, 74, 75, 76, 77,
-             78, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73,
-             74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, 61,
-             57, 55, 55, 54, 52, 54, 56, 59, 61, 62, 66, 68, 70, 73, 76, 77, 79,
-             82, 84, 86, 87, 88, 86, 86, 88, 87, 85, 83, 85, 86, 87, 63, 60, 58,
-             57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87,
-             89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 67, 63, 61, 60, 59,
-             57, 57, 60, 63, 64, 66, 69, 71, 73, 77, 79, 81, 85, 87, 88, 92, 93,
-             94, 96, 95, 96, 97, 95, 93, 93, 94, 96, 68, 64, 63, 61, 60, 59, 58,
-             60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96,
-             97, 99, 98, 99, 100, 100, 98, 96, 69, 65, 64, 62, 61, 61, 59, 59,
-             62, 63, 65, 67, 68, 72, 73, 76, 78, 81, 84, 85, 89, 90, 93, 96, 98,
-             99, 100, 102, 102, 102, 103, 105, 70, 66, 65, 63, 63, 62, 61, 60,
-             61, 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96,
-             100, 101, 103, 103, 105, 105, 105, 71, 67, 67, 64, 64, 62, 62, 60,
-             61, 64, 64, 67, 67, 71, 71, 75, 75, 79, 80, 84, 84, 89, 89, 94, 94,
-             98, 99, 104, 104, 106, 106, 109,
-             // Size 4x16
-             31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59, 46,
-             53, 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73, 51, 63,
-             77, 78, 54, 65, 82, 84, 57, 67, 86, 89, 60, 69, 88, 93, 62, 67, 86,
-             98, 64, 66, 87, 100, 65, 68, 83, 102, 67, 70, 86, 103,
-             // Size 16x4
-             31, 32, 36, 41, 46, 46, 46, 48, 51, 54, 57, 60, 62, 64, 65, 67, 49,
-             45, 46, 47, 53, 56, 59, 61, 63, 65, 67, 69, 67, 66, 68, 70, 61, 55,
-             54, 54, 59, 62, 68, 73, 77, 82, 86, 88, 86, 87, 83, 86, 69, 64, 61,
-             59, 62, 65, 68, 73, 78, 84, 89, 93, 98, 100, 102, 103,
-             // Size 8x32
-             32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66, 30,
-             38, 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63, 33, 41,
-             47, 46, 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62, 39, 46, 48,
-             47, 51, 55, 58, 61, 42, 46, 50, 50, 53, 57, 60, 60, 49, 48, 53, 54,
-             57, 60, 61, 61, 48, 47, 53, 55, 58, 62, 64, 63, 48, 46, 53, 56, 60,
-             64, 65, 65, 49, 45, 53, 59, 64, 67, 67, 66, 50, 46, 54, 61, 66, 70,
-             71, 69, 51, 47, 54, 61, 68, 71, 72, 70, 52, 47, 54, 63, 71, 75, 75,
-             74, 54, 49, 55, 65, 73, 78, 78, 74, 55, 49, 56, 65, 74, 79, 79, 78,
-             57, 50, 56, 66, 76, 82, 83, 79, 60, 53, 58, 68, 79, 85, 85, 82, 62,
-             54, 60, 69, 81, 87, 87, 84, 63, 55, 60, 70, 82, 89, 91, 87, 64, 56,
-             61, 71, 83, 90, 92, 89, 66, 58, 62, 72, 84, 91, 95, 91, 67, 59, 63,
-             71, 83, 93, 96, 94, 68, 60, 64, 71, 81, 94, 97, 96, 69, 61, 65, 72,
-             80, 91, 99, 100, 70, 62, 65, 73, 81, 89, 98, 101, 71, 64, 65, 73,
-             82, 90, 99, 103, 72, 65, 65, 72, 82, 92, 100, 103, 73, 66, 65, 72,
-             81, 90, 100, 105, 74, 67, 65, 71, 79, 89, 98, 105, 75, 68, 65, 71,
-             78, 87, 96, 105,
-             // Size 32x8
-             32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55,
-             57, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 35, 37,
-             38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53,
-             54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 48, 47, 46, 46,
-             47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60,
-             61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, 50, 48, 46, 46, 46, 46,
-             47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72,
-             71, 71, 72, 73, 73, 72, 72, 71, 71, 57, 54, 52, 52, 51, 50, 51, 53,
-             57, 58, 60, 64, 66, 68, 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81,
-             80, 81, 82, 82, 81, 79, 78, 63, 60, 58, 57, 56, 54, 55, 57, 60, 62,
-             64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 94, 91, 89,
-             90, 92, 90, 89, 87, 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67,
-             71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99,
-             100, 100, 98, 96, 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66,
-             69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103,
-             103, 105, 105, 105},
-        },
-        // Quantizer level 2.
-        {
-            {// Luma
-             // Size 4x4
-             32, 38, 63, 86, 38, 56, 78, 97, 63, 78, 113, 130, 86, 97, 130, 169,
-             // Size 8x8
-             32, 32, 35, 46, 57, 76, 85, 96, 32, 34, 37, 45, 54, 70, 79, 90, 35,
-             37, 48, 56, 64, 79, 87, 93, 46, 45, 56, 70, 80, 96, 100, 105, 57,
-             54, 64, 80, 93, 111, 121, 122, 76, 70, 79, 96, 111, 134, 138, 144,
-             85, 79, 87, 100, 121, 138, 156, 168, 96, 90, 93, 105, 122, 144,
-             168, 184,
-             // Size 16x16
-             32, 31, 31, 32, 34, 39, 44, 49, 58, 65, 71, 81, 87, 93, 98, 104,
-             31, 32, 32, 32, 34, 38, 41, 46, 54, 60, 66, 75, 81, 86, 92, 98, 31,
-             32, 33, 34, 36, 39, 42, 46, 53, 59, 64, 73, 78, 83, 88, 94, 32, 32,
-             34, 35, 37, 40, 42, 46, 52, 58, 63, 71, 75, 80, 86, 92, 34, 34, 36,
-             37, 42, 47, 50, 53, 59, 65, 70, 77, 82, 85, 89, 92, 39, 38, 39, 40,
-             47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 96, 98, 44, 41, 42, 42, 50,
-             58, 63, 68, 74, 79, 84, 91, 96, 98, 102, 104, 49, 46, 46, 46, 53,
-             62, 68, 73, 81, 87, 92, 99, 103, 107, 109, 112, 58, 54, 53, 52, 59,
-             68, 74, 81, 90, 97, 102, 110, 114, 118, 117, 121, 65, 60, 59, 58,
-             65, 73, 79, 87, 97, 105, 111, 120, 125, 125, 126, 130, 71, 66, 64,
-             63, 70, 78, 84, 92, 102, 111, 117, 127, 133, 134, 136, 141, 81, 75,
-             73, 71, 77, 85, 91, 99, 110, 120, 127, 137, 143, 145, 148, 152, 87,
-             81, 78, 75, 82, 90, 96, 103, 114, 125, 133, 143, 150, 156, 160,
-             163, 93, 86, 83, 80, 85, 90, 98, 107, 118, 125, 134, 145, 156, 163,
-             169, 177, 98, 92, 88, 86, 89, 96, 102, 109, 117, 126, 136, 148,
-             160, 169, 176, 184, 104, 98, 94, 92, 92, 98, 104, 112, 121, 130,
-             141, 152, 163, 177, 184, 191,
-             // Size 32x32
-             32, 31, 31, 31, 31, 32, 32, 34, 34, 36, 39, 41, 44, 48, 49, 54, 58,
-             59, 65, 69, 71, 80, 81, 83, 87, 90, 93, 95, 98, 101, 104, 107, 31,
-             32, 32, 32, 32, 32, 32, 34, 34, 35, 38, 39, 42, 46, 47, 51, 55, 57,
-             62, 66, 68, 76, 77, 78, 83, 85, 88, 90, 93, 96, 99, 101, 31, 32,
-             32, 32, 32, 32, 32, 33, 34, 34, 38, 39, 41, 45, 46, 50, 54, 55, 60,
-             64, 66, 73, 75, 76, 81, 83, 86, 89, 92, 95, 98, 101, 31, 32, 32,
-             32, 32, 32, 32, 33, 34, 34, 37, 38, 41, 44, 45, 49, 53, 54, 59, 63,
-             65, 72, 74, 75, 79, 81, 84, 86, 89, 91, 94, 97, 31, 32, 32, 32, 33,
-             33, 34, 35, 36, 36, 39, 40, 42, 45, 46, 50, 53, 54, 59, 63, 64, 71,
-             73, 74, 78, 80, 83, 85, 88, 91, 94, 97, 32, 32, 32, 32, 33, 34, 34,
-             36, 36, 37, 40, 40, 42, 45, 46, 49, 53, 54, 58, 62, 63, 70, 72, 73,
-             77, 79, 82, 85, 87, 90, 92, 95, 32, 32, 32, 32, 34, 34, 35, 37, 37,
-             38, 40, 41, 42, 45, 46, 49, 52, 54, 58, 61, 63, 69, 71, 72, 75, 78,
-             80, 83, 86, 89, 92, 95, 34, 34, 33, 33, 35, 36, 37, 39, 41, 42, 45,
-             46, 47, 50, 51, 54, 57, 59, 63, 66, 68, 74, 75, 76, 80, 81, 82, 83,
-             85, 87, 90, 93, 34, 34, 34, 34, 36, 36, 37, 41, 42, 45, 47, 48, 50,
-             53, 53, 56, 59, 61, 65, 68, 70, 76, 77, 78, 82, 83, 85, 88, 89, 90,
-             92, 93, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 50, 51, 54, 56, 57,
-             60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 89, 89, 90, 93, 96, 99,
-             39, 38, 38, 37, 39, 40, 40, 45, 47, 50, 54, 55, 58, 61, 62, 65, 68,
-             69, 73, 76, 78, 84, 85, 86, 90, 89, 90, 93, 96, 97, 98, 99, 41, 39,
-             39, 38, 40, 40, 41, 46, 48, 51, 55, 56, 59, 62, 63, 67, 70, 71, 75,
-             78, 80, 86, 87, 88, 91, 93, 96, 97, 97, 99, 102, 105, 44, 42, 41,
-             41, 42, 42, 42, 47, 50, 54, 58, 59, 63, 66, 68, 71, 74, 75, 79, 83,
-             84, 90, 91, 92, 96, 98, 98, 99, 102, 104, 104, 105, 48, 46, 45, 44,
-             45, 45, 45, 50, 53, 56, 61, 62, 66, 70, 71, 76, 79, 80, 85, 88, 90,
-             96, 97, 98, 101, 100, 102, 105, 105, 105, 109, 112, 49, 47, 46, 45,
-             46, 46, 46, 51, 53, 57, 62, 63, 68, 71, 73, 77, 81, 82, 87, 90, 92,
-             98, 99, 100, 103, 106, 107, 106, 109, 112, 112, 112, 54, 51, 50,
-             49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 76, 77, 82, 86, 87, 92, 96,
-             97, 104, 105, 106, 110, 110, 109, 113, 114, 113, 116, 120, 58, 55,
-             54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97,
-             100, 102, 109, 110, 111, 114, 114, 118, 116, 117, 121, 121, 120,
-             59, 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82, 87, 91,
-             93, 99, 102, 104, 111, 112, 113, 117, 121, 120, 122, 124, 122, 125,
-             129, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 87,
-             92, 97, 99, 105, 109, 111, 118, 120, 121, 125, 124, 125, 127, 126,
-             130, 130, 129, 69, 66, 64, 63, 63, 62, 61, 66, 68, 71, 76, 78, 83,
-             88, 90, 96, 100, 102, 109, 113, 115, 123, 125, 126, 129, 130, 131,
-             130, 134, 133, 135, 139, 71, 68, 66, 65, 64, 63, 63, 68, 70, 73,
-             78, 80, 84, 90, 92, 97, 102, 104, 111, 115, 117, 125, 127, 128,
-             133, 136, 134, 139, 136, 139, 141, 140, 80, 76, 73, 72, 71, 70, 69,
-             74, 76, 79, 84, 86, 90, 96, 98, 104, 109, 111, 118, 123, 125, 134,
-             136, 137, 142, 138, 143, 140, 144, 144, 144, 149, 81, 77, 75, 74,
-             73, 72, 71, 75, 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 120,
-             125, 127, 136, 137, 139, 143, 148, 145, 148, 148, 150, 152, 149,
-             83, 78, 76, 75, 74, 73, 72, 76, 78, 81, 86, 88, 92, 98, 100, 106,
-             111, 113, 121, 126, 128, 137, 139, 140, 145, 149, 153, 153, 154,
-             155, 155, 161, 87, 83, 81, 79, 78, 77, 75, 80, 82, 85, 90, 91, 96,
-             101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150, 151,
-             156, 159, 160, 160, 163, 161, 90, 85, 83, 81, 80, 79, 78, 81, 83,
-             87, 89, 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148,
-             149, 151, 156, 157, 162, 166, 168, 166, 172, 93, 88, 86, 84, 83,
-             82, 80, 82, 85, 89, 90, 96, 98, 102, 107, 109, 118, 120, 125, 131,
-             134, 143, 145, 153, 156, 157, 163, 164, 169, 172, 177, 172, 95, 90,
-             89, 86, 85, 85, 83, 83, 88, 89, 93, 97, 99, 105, 106, 113, 116,
-             122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169, 170, 176,
-             179, 185, 98, 93, 92, 89, 88, 87, 86, 85, 89, 90, 96, 97, 102, 105,
-             109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169,
-             170, 176, 177, 184, 186, 101, 96, 95, 91, 91, 90, 89, 87, 90, 93,
-             97, 99, 104, 105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155,
-             160, 168, 172, 176, 177, 184, 185, 191, 104, 99, 98, 94, 94, 92,
-             92, 90, 92, 96, 98, 102, 104, 109, 112, 116, 121, 125, 130, 135,
-             141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191, 192, 107,
-             101, 101, 97, 97, 95, 95, 93, 93, 99, 99, 105, 105, 112, 112, 120,
-             120, 129, 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186,
-             191, 192, 199,
-             // Size 4x8
-             32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68, 85, 44, 61, 85, 101,
-             54, 69, 98, 117, 72, 84, 118, 136, 82, 89, 129, 157, 92, 98, 127,
-             165,
-             // Size 8x4
-             32, 32, 34, 44, 54, 72, 82, 92, 38, 40, 51, 61, 69, 84, 89, 98, 62,
-             58, 68, 85, 98, 118, 129, 127, 86, 80, 85, 101, 117, 136, 157, 165,
-             // Size 8x16
-             32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54, 73, 81, 88, 32,
-             33, 36, 42, 53, 71, 78, 84, 32, 34, 38, 42, 52, 69, 76, 82, 34, 36,
-             44, 50, 59, 75, 81, 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53,
-             63, 74, 90, 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57, 53, 63,
-             74, 90, 108, 111, 113, 65, 59, 68, 79, 97, 118, 123, 122, 71, 64,
-             73, 84, 102, 125, 135, 131, 81, 72, 80, 91, 110, 135, 145, 141, 87,
-             77, 85, 96, 114, 140, 148, 151, 92, 83, 88, 102, 117, 133, 153,
-             163, 98, 88, 89, 103, 121, 141, 160, 169, 103, 94, 92, 103, 119,
-             137, 158, 175,
-             // Size 16x8
-             32, 31, 32, 32, 34, 39, 44, 49, 57, 65, 71, 81, 87, 92, 98, 103,
-             32, 32, 33, 34, 36, 39, 42, 46, 53, 59, 64, 72, 77, 83, 88, 94, 36,
-             35, 36, 38, 44, 50, 53, 57, 63, 68, 73, 80, 85, 88, 89, 92, 44, 41,
-             42, 42, 50, 58, 63, 67, 74, 79, 84, 91, 96, 102, 103, 103, 58, 54,
-             53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 114, 117, 121, 119, 79,
-             73, 71, 69, 75, 84, 90, 97, 108, 118, 125, 135, 140, 133, 141, 137,
-             88, 81, 78, 76, 81, 88, 97, 104, 111, 123, 135, 145, 148, 153, 160,
-             158, 93, 88, 84, 82, 84, 90, 97, 105, 113, 122, 131, 141, 151, 163,
-             169, 175,
-             // Size 16x32
-             32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90, 93, 96, 31,
-             32, 32, 32, 35, 38, 42, 51, 55, 62, 75, 77, 83, 86, 88, 91, 31, 32,
-             32, 32, 35, 38, 41, 50, 54, 60, 73, 75, 81, 84, 88, 91, 31, 32, 32,
-             33, 34, 37, 41, 49, 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34,
-             36, 39, 42, 50, 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34, 37,
-             40, 42, 49, 53, 58, 70, 71, 77, 80, 83, 85, 32, 33, 34, 35, 38, 40,
-             42, 49, 52, 58, 69, 70, 76, 78, 82, 86, 34, 34, 35, 37, 42, 45, 48,
-             54, 57, 63, 73, 75, 79, 79, 81, 83, 34, 34, 36, 37, 44, 47, 50, 56,
-             59, 65, 75, 77, 81, 83, 84, 84, 36, 34, 37, 38, 48, 51, 54, 60, 63,
-             68, 78, 80, 85, 85, 86, 89, 39, 37, 39, 40, 50, 54, 58, 65, 68, 73,
-             84, 85, 88, 89, 90, 89, 40, 38, 40, 41, 51, 55, 59, 67, 70, 75, 85,
-             87, 91, 92, 92, 95, 44, 41, 42, 43, 53, 58, 63, 71, 74, 79, 90, 91,
-             97, 94, 97, 95, 47, 44, 45, 46, 56, 61, 66, 75, 79, 85, 95, 97, 99,
-             101, 98, 102, 49, 46, 46, 47, 57, 62, 67, 77, 81, 86, 97, 99, 104,
-             102, 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92, 103, 105,
-             109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97, 108,
-             110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87, 91, 98,
-             111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73, 79, 92, 97,
-             105, 118, 119, 123, 123, 122, 119, 69, 63, 62, 62, 71, 76, 83, 96,
-             100, 109, 122, 124, 127, 125, 125, 128, 71, 65, 64, 63, 73, 78, 84,
-             97, 102, 111, 125, 127, 135, 134, 131, 129, 79, 72, 71, 70, 79, 84,
-             90, 104, 109, 118, 133, 135, 137, 136, 136, 137, 81, 74, 72, 71,
-             80, 85, 91, 105, 110, 120, 135, 137, 145, 143, 141, 138, 82, 75,
-             73, 72, 81, 86, 92, 106, 111, 121, 136, 139, 147, 148, 147, 149,
-             87, 79, 77, 76, 85, 90, 96, 110, 114, 125, 140, 143, 148, 154, 151,
-             149, 90, 82, 80, 78, 87, 89, 99, 108, 113, 129, 135, 146, 153, 157,
-             160, 159, 92, 84, 83, 81, 88, 90, 102, 106, 117, 128, 133, 150,
-             153, 158, 163, 160, 95, 87, 85, 83, 88, 92, 103, 105, 120, 125,
-             137, 148, 155, 164, 168, 173, 98, 89, 88, 85, 89, 95, 103, 108,
-             121, 124, 141, 144, 160, 164, 169, 174, 100, 92, 91, 88, 90, 98,
-             103, 111, 120, 127, 139, 146, 161, 165, 175, 179, 103, 94, 94, 90,
-             92, 101, 103, 114, 119, 131, 137, 150, 158, 170, 175, 180, 106, 97,
-             97, 93, 93, 104, 104, 118, 118, 135, 135, 154, 155, 175, 176, 187,
-             // Size 32x16
-             32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57,
-             59, 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 31,
-             32, 32, 32, 32, 32, 33, 34, 34, 34, 37, 38, 41, 44, 46, 49, 53, 54,
-             60, 63, 65, 72, 74, 75, 79, 82, 84, 87, 89, 92, 94, 97, 32, 32, 32,
-             32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62,
-             64, 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, 32, 32, 32, 33, 34,
-             34, 35, 37, 37, 38, 40, 41, 43, 46, 47, 50, 53, 54, 58, 62, 63, 70,
-             71, 72, 76, 78, 81, 83, 85, 88, 90, 93, 36, 35, 35, 34, 36, 37, 38,
-             42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81,
-             85, 87, 88, 88, 89, 90, 92, 93, 39, 38, 38, 37, 39, 40, 40, 45, 47,
-             51, 54, 55, 58, 61, 62, 65, 68, 69, 73, 76, 78, 84, 85, 86, 90, 89,
-             90, 92, 95, 98, 101, 104, 44, 42, 41, 41, 42, 42, 42, 48, 50, 54,
-             58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99,
-             102, 103, 103, 103, 103, 104, 53, 51, 50, 49, 50, 49, 49, 54, 56,
-             60, 65, 67, 71, 75, 77, 82, 86, 87, 92, 96, 97, 104, 105, 106, 110,
-             108, 106, 105, 108, 111, 114, 118, 58, 55, 54, 53, 53, 53, 52, 57,
-             59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110,
-             111, 114, 113, 117, 120, 121, 120, 119, 118, 65, 62, 60, 59, 59,
-             58, 58, 63, 65, 68, 73, 75, 79, 85, 86, 92, 97, 98, 105, 109, 111,
-             118, 120, 121, 125, 129, 128, 125, 124, 127, 131, 135, 79, 75, 73,
-             72, 71, 70, 69, 73, 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118,
-             122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135,
-             81, 77, 75, 74, 72, 71, 70, 75, 77, 80, 85, 87, 91, 97, 99, 105,
-             110, 112, 119, 124, 127, 135, 137, 139, 143, 146, 150, 148, 144,
-             146, 150, 154, 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97,
-             99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153,
-             153, 155, 160, 161, 158, 155, 90, 86, 84, 82, 81, 80, 78, 79, 83,
-             85, 89, 92, 94, 101, 102, 108, 112, 117, 123, 125, 134, 136, 143,
-             148, 154, 157, 158, 164, 164, 165, 170, 175, 93, 88, 88, 84, 84,
-             83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125,
-             131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, 96, 91,
-             91, 87, 87, 85, 86, 83, 84, 89, 89, 95, 95, 102, 102, 110, 110,
-             118, 119, 128, 129, 137, 138, 149, 149, 159, 160, 173, 174, 179,
-             180, 187,
-             // Size 4x16
-             31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33, 40, 58, 78, 34,
-             47, 65, 83, 37, 54, 73, 89, 41, 58, 79, 94, 46, 62, 86, 102, 53,
-             68, 97, 112, 60, 73, 105, 123, 65, 78, 111, 134, 74, 85, 120, 143,
-             79, 90, 125, 154, 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131,
-             170,
-             // Size 16x4
-             31, 32, 32, 33, 34, 37, 41, 46, 53, 60, 65, 74, 79, 84, 89, 94, 39,
-             38, 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 95, 101, 65,
-             60, 59, 58, 65, 73, 79, 86, 97, 105, 111, 120, 125, 128, 124, 131,
-             90, 84, 81, 78, 83, 89, 94, 102, 112, 123, 134, 143, 154, 158, 164,
-             170,
-             // Size 8x32
-             32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75, 83, 88, 31,
-             32, 35, 41, 54, 73, 81, 88, 31, 32, 34, 41, 53, 72, 79, 84, 32, 33,
-             36, 42, 53, 71, 78, 84, 32, 34, 37, 42, 53, 70, 77, 83, 32, 34, 38,
-             42, 52, 69, 76, 82, 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50,
-             59, 75, 81, 84, 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58, 68,
-             84, 88, 90, 40, 40, 51, 59, 70, 85, 91, 92, 44, 42, 53, 63, 74, 90,
-             97, 97, 47, 45, 56, 66, 79, 95, 99, 98, 49, 46, 57, 67, 81, 97,
-             104, 105, 53, 50, 60, 71, 86, 103, 109, 106, 57, 53, 63, 74, 90,
-             108, 111, 113, 59, 54, 64, 75, 91, 111, 119, 115, 65, 59, 68, 79,
-             97, 118, 123, 122, 69, 62, 71, 83, 100, 122, 127, 125, 71, 64, 73,
-             84, 102, 125, 135, 131, 79, 71, 79, 90, 109, 133, 137, 136, 81, 72,
-             80, 91, 110, 135, 145, 141, 82, 73, 81, 92, 111, 136, 147, 147, 87,
-             77, 85, 96, 114, 140, 148, 151, 90, 80, 87, 99, 113, 135, 153, 160,
-             92, 83, 88, 102, 117, 133, 153, 163, 95, 85, 88, 103, 120, 137,
-             155, 168, 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90, 103,
-             120, 139, 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97,
-             93, 104, 118, 135, 155, 176,
-             // Size 32x8
-             32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57,
-             59, 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 32,
-             32, 32, 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54,
-             59, 62, 64, 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, 36, 35, 35,
-             34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71,
-             73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, 44, 42, 41, 41, 42,
-             42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90,
-             91, 92, 96, 99, 102, 103, 103, 103, 103, 104, 58, 55, 54, 53, 53,
-             53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102,
-             109, 110, 111, 114, 113, 117, 120, 121, 120, 119, 118, 79, 75, 73,
-             72, 71, 70, 69, 73, 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118,
-             122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135,
-             88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, 99, 104, 109,
-             111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, 155, 160,
-             161, 158, 155, 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, 97,
-             98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160,
-             163, 168, 169, 175, 175, 176},
-            {// Chroma
-             // Size 4x4
-             32, 45, 53, 63, 45, 55, 62, 67, 53, 62, 80, 84, 63, 67, 84, 101,
-             // Size 8x8
-             31, 36, 47, 48, 52, 60, 64, 67, 36, 43, 47, 46, 49, 55, 59, 63, 47,
-             47, 53, 54, 55, 60, 63, 64, 48, 46, 54, 61, 65, 70, 71, 71, 52, 49,
-             55, 65, 71, 78, 81, 79, 60, 55, 60, 70, 78, 89, 89, 89, 64, 59, 63,
-             71, 81, 89, 97, 99, 67, 63, 64, 71, 79, 89, 99, 104,
-             // Size 16x16
-             32, 30, 33, 36, 44, 48, 49, 51, 54, 57, 60, 64, 67, 68, 70, 72, 30,
-             31, 35, 39, 44, 46, 46, 47, 50, 53, 55, 59, 61, 64, 66, 68, 33, 35,
-             39, 43, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 36, 39, 43,
-             47, 47, 46, 45, 46, 48, 50, 52, 55, 57, 58, 61, 63, 44, 44, 46, 47,
-             50, 51, 51, 51, 53, 54, 56, 59, 61, 61, 63, 62, 48, 46, 46, 46, 51,
-             54, 55, 56, 58, 60, 61, 64, 65, 64, 66, 66, 49, 46, 45, 45, 51, 55,
-             58, 60, 62, 63, 65, 68, 69, 69, 69, 69, 51, 47, 47, 46, 51, 56, 60,
-             62, 65, 67, 69, 72, 73, 74, 73, 73, 54, 50, 49, 48, 53, 58, 62, 65,
-             70, 73, 75, 78, 79, 79, 77, 77, 57, 53, 51, 50, 54, 60, 63, 67, 73,
-             76, 79, 82, 84, 83, 82, 82, 60, 55, 53, 52, 56, 61, 65, 69, 75, 79,
-             82, 86, 88, 87, 86, 87, 64, 59, 57, 55, 59, 64, 68, 72, 78, 82, 86,
-             90, 93, 92, 91, 92, 67, 61, 59, 57, 61, 65, 69, 73, 79, 84, 88, 93,
-             95, 96, 96, 96, 68, 64, 61, 58, 61, 64, 69, 74, 79, 83, 87, 92, 96,
-             99, 100, 101, 70, 66, 63, 61, 63, 66, 69, 73, 77, 82, 86, 91, 96,
-             100, 103, 104, 72, 68, 65, 63, 62, 66, 69, 73, 77, 82, 87, 92, 96,
-             101, 104, 106,
-             // Size 32x32
-             32, 31, 30, 30, 33, 35, 36, 41, 44, 49, 48, 48, 49, 50, 51, 52, 54,
-             55, 57, 59, 60, 63, 64, 65, 67, 68, 68, 69, 70, 71, 72, 73, 31, 31,
-             31, 31, 34, 36, 38, 42, 44, 47, 47, 47, 47, 48, 48, 50, 51, 52, 54,
-             56, 57, 60, 61, 61, 63, 64, 65, 66, 67, 67, 68, 69, 30, 31, 31, 31,
-             35, 37, 39, 42, 44, 47, 46, 46, 46, 47, 47, 48, 50, 51, 53, 54, 55,
-             58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69, 30, 31, 31, 32, 35, 37,
-             40, 42, 44, 46, 45, 45, 45, 46, 46, 47, 49, 50, 52, 53, 54, 57, 58,
-             58, 60, 61, 62, 63, 63, 64, 65, 66, 33, 34, 35, 35, 39, 41, 43, 45,
-             46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 53, 53, 56, 57, 57, 59,
-             60, 61, 62, 63, 64, 65, 66, 35, 36, 37, 37, 41, 43, 45, 46, 46, 47,
-             46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 53, 55, 56, 56, 58, 59, 60,
-             61, 62, 63, 64, 64, 36, 38, 39, 40, 43, 45, 47, 47, 47, 48, 46, 46,
-             45, 46, 46, 47, 48, 48, 50, 51, 52, 54, 55, 55, 57, 58, 58, 59, 61,
-             62, 63, 64, 41, 42, 42, 42, 45, 46, 47, 48, 49, 50, 49, 49, 49, 50,
-             50, 50, 51, 52, 53, 54, 55, 57, 58, 58, 60, 60, 59, 59, 60, 61, 61,
-             62, 44, 44, 44, 44, 46, 46, 47, 49, 50, 51, 51, 51, 51, 51, 51, 52,
-             53, 53, 54, 56, 56, 59, 59, 59, 61, 61, 61, 62, 63, 62, 62, 62, 49,
-             47, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55,
-             56, 58, 58, 60, 61, 61, 63, 63, 64, 63, 63, 64, 65, 66, 48, 47, 46,
-             45, 46, 46, 46, 49, 51, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61,
-             61, 63, 64, 64, 65, 65, 64, 65, 66, 66, 66, 66, 48, 47, 46, 45, 46,
-             46, 46, 49, 51, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 63, 65,
-             65, 65, 66, 67, 68, 67, 67, 67, 68, 69, 49, 47, 46, 45, 45, 45, 45,
-             49, 51, 53, 55, 56, 58, 59, 60, 61, 62, 62, 63, 65, 65, 67, 68, 68,
-             69, 70, 69, 69, 69, 70, 69, 69, 50, 48, 47, 46, 46, 46, 46, 50, 51,
-             54, 56, 57, 59, 61, 62, 63, 64, 65, 66, 68, 68, 70, 71, 71, 72, 71,
-             71, 72, 71, 71, 71, 72, 51, 48, 47, 46, 47, 46, 46, 50, 51, 54, 56,
-             57, 60, 62, 62, 64, 65, 66, 67, 69, 69, 71, 72, 72, 73, 74, 74, 72,
-             73, 74, 73, 73, 52, 50, 48, 47, 47, 47, 47, 50, 52, 54, 57, 58, 61,
-             63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 76, 75, 76, 76, 74,
-             75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 64, 65,
-             68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 78, 77, 78, 77, 77,
-             55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, 62, 65, 66, 68, 70,
-             71, 73, 75, 76, 78, 79, 79, 80, 81, 80, 80, 81, 79, 79, 81, 57, 54,
-             53, 52, 51, 50, 50, 53, 54, 56, 60, 61, 63, 66, 67, 70, 73, 73, 76,
-             78, 79, 82, 82, 83, 84, 83, 83, 83, 82, 83, 82, 81, 59, 56, 54, 53,
-             53, 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80, 81,
-             84, 85, 85, 86, 86, 86, 84, 85, 84, 84, 85, 60, 57, 55, 54, 53, 53,
-             52, 55, 56, 58, 61, 63, 65, 68, 69, 72, 75, 76, 79, 81, 82, 85, 86,
-             86, 88, 88, 87, 88, 86, 87, 87, 85, 63, 60, 58, 57, 56, 55, 54, 57,
-             59, 60, 63, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 92,
-             89, 91, 89, 90, 89, 88, 89, 64, 61, 59, 58, 57, 56, 55, 58, 59, 61,
-             64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 89, 90, 91, 93, 94, 92,
-             92, 91, 91, 92, 90, 65, 61, 60, 58, 57, 56, 55, 58, 59, 61, 64, 65,
-             68, 71, 72, 75, 78, 79, 83, 85, 86, 90, 91, 91, 93, 94, 95, 94, 94,
-             94, 93, 94, 67, 63, 61, 60, 59, 58, 57, 60, 61, 63, 65, 66, 69, 72,
-             73, 77, 79, 80, 84, 86, 88, 92, 93, 93, 95, 95, 96, 97, 96, 95, 96,
-             94, 68, 64, 63, 61, 60, 59, 58, 60, 61, 63, 65, 67, 70, 71, 74, 76,
-             78, 81, 83, 86, 88, 89, 94, 94, 95, 97, 97, 98, 99, 99, 97, 99, 68,
-             65, 64, 62, 61, 60, 58, 59, 61, 64, 64, 68, 69, 71, 74, 75, 79, 80,
-             83, 86, 87, 91, 92, 95, 96, 97, 99, 99, 100, 100, 101, 99, 69, 66,
-             65, 63, 62, 61, 59, 59, 62, 63, 65, 67, 69, 72, 72, 76, 78, 80, 83,
-             84, 88, 89, 92, 94, 97, 98, 99, 101, 100, 102, 102, 104, 70, 67,
-             66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82,
-             85, 86, 90, 91, 94, 96, 99, 100, 100, 103, 102, 104, 104, 71, 67,
-             67, 64, 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, 74, 78, 79, 83,
-             84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104, 104, 106, 72, 68,
-             68, 65, 65, 64, 63, 61, 62, 65, 66, 68, 69, 71, 73, 75, 77, 79, 82,
-             84, 87, 88, 92, 93, 96, 97, 101, 102, 104, 104, 106, 106, 73, 69,
-             69, 66, 66, 64, 64, 62, 62, 66, 66, 69, 69, 72, 73, 76, 77, 81, 81,
-             85, 85, 89, 90, 94, 94, 99, 99, 104, 104, 106, 106, 108,
-             // Size 4x8
-             31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71, 50,
-             59, 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99,
-             // Size 8x4
-             31, 38, 46, 46, 50, 57, 61, 65, 47, 46, 53, 56, 59, 64, 65, 67, 54,
-             50, 57, 66, 74, 82, 85, 82, 64, 60, 62, 71, 79, 88, 97, 99,
-             // Size 8x16
-             32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65, 33,
-             40, 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60, 44, 46,
-             51, 51, 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64, 49, 45, 53,
-             58, 62, 67, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 54, 49, 55, 62,
-             70, 77, 77, 76, 57, 51, 56, 64, 73, 82, 83, 81, 60, 53, 58, 65, 75,
-             85, 89, 85, 64, 57, 61, 68, 78, 89, 93, 89, 66, 59, 63, 69, 79, 91,
-             94, 93, 68, 61, 63, 71, 79, 87, 96, 98, 70, 63, 63, 70, 80, 89, 97,
-             100, 72, 65, 63, 69, 77, 86, 95, 102,
-             // Size 16x8
-             32, 31, 33, 37, 44, 48, 49, 51, 54, 57, 60, 64, 66, 68, 70, 72, 34,
-             36, 40, 44, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 48, 46,
-             47, 47, 51, 53, 53, 54, 55, 56, 58, 61, 63, 63, 63, 63, 49, 46, 46,
-             45, 51, 56, 58, 60, 62, 64, 65, 68, 69, 71, 70, 69, 54, 50, 49, 48,
-             53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 80, 77, 63, 58, 56, 54, 59,
-             64, 67, 71, 77, 82, 85, 89, 91, 87, 89, 86, 67, 62, 59, 57, 60, 64,
-             70, 73, 77, 83, 89, 93, 94, 96, 97, 95, 69, 65, 62, 60, 61, 64, 68,
-             72, 76, 81, 85, 89, 93, 98, 100, 102,
-             // Size 16x32
-             32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69, 31,
-             31, 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64, 65, 66, 31, 32,
-             36, 39, 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66, 30, 32, 36,
-             40, 46, 45, 45, 48, 49, 52, 57, 58, 60, 61, 62, 63, 33, 36, 40, 43,
-             47, 46, 46, 47, 49, 51, 56, 57, 59, 60, 62, 63, 35, 38, 42, 45, 47,
-             46, 45, 47, 48, 50, 55, 56, 58, 60, 61, 61, 37, 40, 44, 47, 47, 46,
-             45, 47, 48, 50, 54, 55, 57, 58, 60, 61, 42, 43, 45, 47, 50, 50, 49,
-             50, 51, 53, 57, 58, 59, 58, 59, 59, 44, 44, 46, 47, 51, 51, 51, 52,
-             53, 54, 59, 59, 60, 61, 61, 60, 49, 46, 47, 48, 53, 53, 53, 54, 55,
-             57, 60, 61, 63, 62, 62, 63, 48, 46, 46, 47, 53, 54, 56, 57, 58, 60,
-             64, 64, 64, 64, 64, 63, 48, 45, 46, 46, 53, 55, 56, 58, 59, 61, 65,
-             65, 66, 66, 65, 66, 49, 45, 45, 46, 53, 56, 58, 61, 62, 64, 67, 68,
-             70, 67, 68, 66, 50, 46, 46, 46, 54, 56, 59, 63, 65, 66, 70, 71, 70,
-             71, 68, 70, 51, 47, 47, 47, 54, 57, 60, 64, 65, 68, 71, 72, 73, 71,
-             72, 70, 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, 76, 75, 73,
-             73, 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77, 76, 74,
-             54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78, 57,
-             52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78, 59, 54,
-             52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82, 60, 54, 53,
-             52, 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82, 63, 57, 56, 55,
-             60, 64, 67, 75, 77, 82, 89, 90, 90, 88, 87, 86, 64, 58, 57, 55, 61,
-             64, 68, 75, 78, 82, 89, 90, 93, 91, 89, 87, 64, 59, 57, 56, 61, 65,
-             68, 75, 78, 83, 90, 91, 94, 93, 92, 91, 66, 60, 59, 57, 63, 66, 69,
-             77, 79, 84, 91, 93, 94, 95, 93, 91, 67, 61, 60, 58, 63, 65, 70, 75,
-             78, 85, 88, 93, 96, 97, 97, 95, 68, 62, 61, 59, 63, 64, 71, 74, 79,
-             84, 87, 94, 96, 97, 98, 96, 69, 63, 62, 60, 63, 65, 71, 72, 80, 82,
-             88, 93, 96, 99, 100, 101, 70, 64, 63, 60, 63, 66, 70, 73, 80, 81,
-             89, 90, 97, 99, 100, 101, 71, 65, 64, 61, 63, 67, 70, 74, 78, 82,
-             88, 90, 97, 99, 102, 103, 72, 65, 65, 62, 63, 68, 69, 75, 77, 83,
-             86, 92, 95, 100, 102, 103, 73, 66, 66, 63, 63, 69, 69, 76, 76, 84,
-             84, 93, 93, 101, 101, 105,
-             // Size 32x16
-             32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54,
-             54, 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 31, 31,
-             32, 32, 36, 38, 40, 43, 44, 46, 46, 45, 45, 46, 47, 48, 49, 50, 52,
-             54, 54, 57, 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, 34, 35, 36, 36,
-             40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53,
-             56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 37, 38, 39, 40, 43, 45,
-             47, 47, 47, 48, 47, 46, 46, 46, 47, 47, 48, 49, 50, 52, 52, 55, 55,
-             56, 57, 58, 59, 60, 60, 61, 62, 63, 48, 47, 46, 46, 47, 47, 47, 50,
-             51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63,
-             63, 63, 63, 63, 63, 63, 63, 48, 47, 46, 45, 46, 46, 46, 50, 51, 53,
-             54, 55, 56, 56, 57, 57, 58, 59, 60, 61, 62, 64, 64, 65, 66, 65, 64,
-             65, 66, 67, 68, 69, 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56,
-             58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70,
-             70, 69, 69, 52, 50, 48, 48, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63,
-             64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 75, 74, 72, 73, 74, 75,
-             76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68,
-             70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, 57,
-             54, 53, 52, 51, 50, 50, 53, 54, 57, 60, 61, 64, 66, 68, 71, 73, 74,
-             76, 78, 79, 82, 82, 83, 84, 85, 84, 82, 81, 82, 83, 84, 63, 60, 58,
-             57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84,
-             85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 64, 61, 59, 58, 57,
-             56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 90,
-             90, 91, 93, 93, 94, 93, 90, 90, 92, 93, 67, 63, 62, 60, 59, 58, 57,
-             59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94,
-             94, 96, 96, 96, 97, 97, 95, 93, 68, 64, 63, 61, 60, 60, 58, 58, 61,
-             62, 64, 66, 67, 71, 71, 75, 77, 79, 82, 83, 87, 88, 91, 93, 95, 97,
-             97, 99, 99, 99, 100, 101, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62,
-             64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98,
-             100, 100, 102, 102, 101, 69, 66, 66, 63, 63, 61, 61, 59, 60, 63,
-             63, 66, 66, 70, 70, 73, 74, 78, 78, 82, 82, 86, 87, 91, 91, 95, 96,
-             101, 101, 103, 103, 105,
-             // Size 4x16
-             31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58, 44,
-             51, 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71, 49, 58,
-             73, 77, 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91, 60, 66, 84,
-             95, 62, 64, 84, 97, 64, 66, 81, 99, 65, 68, 83, 100,
-             // Size 16x4
-             31, 32, 36, 40, 44, 46, 45, 47, 49, 52, 54, 58, 60, 62, 64, 65, 48,
-             46, 46, 46, 51, 54, 56, 57, 58, 60, 62, 64, 66, 64, 66, 68, 57, 53,
-             51, 50, 54, 60, 64, 68, 73, 76, 79, 82, 84, 84, 81, 83, 68, 63, 60,
-             58, 61, 64, 67, 71, 77, 82, 87, 91, 95, 97, 99, 100,
-             // Size 8x32
-             32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65, 31,
-             36, 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62, 33, 40,
-             47, 46, 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61, 37, 44, 47,
-             45, 48, 54, 57, 60, 42, 45, 50, 49, 51, 57, 59, 59, 44, 46, 51, 51,
-             53, 59, 60, 61, 49, 47, 53, 53, 55, 60, 63, 62, 48, 46, 53, 56, 58,
-             64, 64, 64, 48, 46, 53, 56, 59, 65, 66, 65, 49, 45, 53, 58, 62, 67,
-             70, 68, 50, 46, 54, 59, 65, 70, 70, 68, 51, 47, 54, 60, 65, 71, 73,
-             72, 52, 47, 54, 61, 68, 75, 76, 73, 54, 49, 55, 62, 70, 77, 77, 76,
-             54, 49, 55, 62, 70, 78, 81, 77, 57, 51, 56, 64, 73, 82, 83, 81, 59,
-             52, 58, 65, 74, 84, 85, 82, 60, 53, 58, 65, 75, 85, 89, 85, 63, 56,
-             60, 67, 77, 89, 90, 87, 64, 57, 61, 68, 78, 89, 93, 89, 64, 57, 61,
-             68, 78, 90, 94, 92, 66, 59, 63, 69, 79, 91, 94, 93, 67, 60, 63, 70,
-             78, 88, 96, 97, 68, 61, 63, 71, 79, 87, 96, 98, 69, 62, 63, 71, 80,
-             88, 96, 100, 70, 63, 63, 70, 80, 89, 97, 100, 71, 64, 63, 70, 78,
-             88, 97, 102, 72, 65, 63, 69, 77, 86, 95, 102, 73, 66, 63, 69, 76,
-             84, 93, 101,
-             // Size 32x8
-             32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54,
-             54, 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 34, 35,
-             36, 36, 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51,
-             52, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 48, 47, 46, 46,
-             47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58,
-             60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, 49, 47, 46, 45, 46, 45,
-             45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68,
-             68, 69, 70, 71, 71, 70, 70, 69, 69, 54, 51, 50, 49, 49, 48, 48, 51,
-             53, 55, 58, 59, 62, 65, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79,
-             78, 79, 80, 80, 78, 77, 76, 63, 60, 58, 57, 56, 55, 54, 57, 59, 60,
-             64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 91, 88, 87,
-             88, 89, 88, 86, 84, 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66,
-             70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97,
-             97, 95, 93, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68,
-             72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102,
-             102, 101},
-        },
-        // Quantizer level 3.
-        {
-            {// Luma
-             // Size 4x4
-             32, 37, 58, 81, 37, 54, 72, 91, 58, 72, 102, 121, 81, 91, 121, 156,
-             // Size 8x8
-             32, 32, 35, 42, 53, 68, 78, 90, 32, 33, 36, 42, 51, 64, 74, 84, 35,
-             36, 46, 52, 60, 72, 80, 87, 42, 42, 52, 63, 73, 84, 92, 98, 53, 51,
-             60, 73, 86, 100, 109, 114, 68, 64, 72, 84, 100, 117, 128, 133, 78,
-             74, 80, 92, 109, 128, 140, 155, 90, 84, 87, 98, 114, 133, 155, 168,
-             // Size 16x16
-             32, 31, 31, 32, 34, 36, 41, 47, 54, 59, 65, 74, 82, 87, 92, 97, 31,
-             32, 32, 32, 34, 35, 39, 45, 50, 55, 61, 69, 76, 81, 87, 92, 31, 32,
-             33, 33, 35, 36, 40, 44, 49, 54, 59, 67, 73, 78, 83, 88, 32, 32, 33,
-             35, 37, 38, 41, 45, 49, 53, 58, 65, 71, 75, 80, 86, 34, 34, 35, 37,
-             39, 42, 46, 50, 54, 58, 63, 70, 76, 80, 84, 85, 36, 35, 36, 38, 42,
-             48, 52, 56, 60, 64, 68, 75, 80, 85, 90, 91, 41, 39, 40, 41, 46, 52,
-             57, 62, 67, 71, 75, 83, 88, 92, 95, 97, 47, 45, 44, 45, 50, 56, 62,
-             69, 75, 79, 84, 91, 97, 100, 102, 104, 54, 50, 49, 49, 54, 60, 67,
-             75, 82, 87, 92, 100, 106, 110, 109, 112, 59, 55, 54, 53, 58, 64,
-             71, 79, 87, 92, 98, 106, 112, 117, 117, 121, 65, 61, 59, 58, 63,
-             68, 75, 84, 92, 98, 105, 114, 120, 125, 126, 130, 74, 69, 67, 65,
-             70, 75, 83, 91, 100, 106, 114, 123, 131, 135, 137, 140, 82, 76, 73,
-             71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144, 148, 150, 87, 81,
-             78, 75, 80, 85, 92, 100, 110, 117, 125, 135, 144, 150, 155, 162,
-             92, 87, 83, 80, 84, 90, 95, 102, 109, 117, 126, 137, 148, 155, 162,
-             168, 97, 92, 88, 86, 85, 91, 97, 104, 112, 121, 130, 140, 150, 162,
-             168, 174,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 54,
-             56, 59, 64, 65, 71, 74, 80, 82, 83, 87, 90, 92, 95, 97, 100, 31,
-             32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 51, 53,
-             56, 61, 62, 68, 71, 76, 78, 78, 83, 85, 88, 90, 92, 95, 31, 32, 32,
-             32, 32, 32, 32, 33, 34, 34, 35, 38, 39, 42, 45, 45, 50, 52, 55, 60,
-             61, 67, 69, 74, 76, 77, 81, 84, 87, 89, 92, 95, 31, 32, 32, 32, 32,
-             32, 32, 33, 33, 34, 34, 37, 38, 41, 44, 44, 49, 51, 54, 58, 59, 65,
-             68, 72, 74, 75, 79, 81, 84, 86, 88, 90, 31, 32, 32, 32, 33, 33, 33,
-             34, 35, 36, 36, 39, 40, 42, 44, 45, 49, 51, 54, 58, 59, 64, 67, 71,
-             73, 74, 78, 80, 83, 85, 88, 90, 31, 32, 32, 32, 33, 33, 34, 34, 35,
-             36, 36, 39, 40, 42, 45, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74,
-             78, 80, 82, 84, 86, 89, 32, 32, 32, 32, 33, 34, 35, 36, 37, 38, 38,
-             40, 41, 42, 45, 46, 49, 51, 53, 57, 58, 63, 65, 69, 71, 72, 75, 78,
-             80, 83, 86, 89, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, 40, 42, 43,
-             44, 47, 47, 51, 53, 55, 59, 60, 65, 67, 71, 73, 73, 77, 78, 80, 82,
-             84, 86, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, 50,
-             51, 54, 56, 58, 62, 63, 68, 70, 74, 76, 76, 80, 82, 84, 85, 85, 86,
-             35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 59,
-             60, 62, 66, 67, 72, 74, 78, 79, 80, 83, 84, 85, 87, 90, 92, 36, 35,
-             35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 60, 61, 64,
-             67, 68, 73, 75, 79, 80, 81, 85, 87, 90, 91, 91, 92, 39, 38, 38, 37,
-             39, 39, 40, 42, 45, 49, 50, 54, 55, 58, 60, 61, 65, 66, 69, 72, 73,
-             78, 80, 84, 86, 86, 90, 91, 91, 92, 95, 97, 41, 40, 39, 38, 40, 40,
-             41, 43, 46, 50, 52, 55, 57, 60, 62, 63, 67, 69, 71, 75, 75, 80, 83,
-             86, 88, 89, 92, 93, 95, 97, 97, 98, 44, 42, 42, 41, 42, 42, 42, 44,
-             47, 52, 54, 58, 60, 63, 66, 67, 71, 73, 75, 79, 79, 84, 86, 90, 92,
-             92, 96, 98, 98, 98, 101, 104, 47, 45, 45, 44, 44, 45, 45, 47, 50,
-             55, 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97,
-             100, 99, 102, 105, 104, 104, 48, 46, 45, 44, 45, 45, 46, 47, 51,
-             55, 57, 61, 63, 67, 70, 71, 76, 78, 80, 84, 85, 90, 93, 96, 98, 99,
-             102, 106, 106, 105, 108, 111, 54, 51, 50, 49, 49, 50, 49, 51, 54,
-             59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 106,
-             106, 110, 108, 109, 112, 112, 111, 56, 53, 52, 51, 51, 51, 51, 53,
-             56, 60, 61, 66, 69, 73, 77, 78, 84, 86, 89, 93, 94, 100, 102, 106,
-             108, 109, 112, 113, 115, 114, 116, 119, 59, 56, 55, 54, 54, 54, 53,
-             55, 58, 62, 64, 69, 71, 75, 79, 80, 87, 89, 92, 97, 98, 103, 106,
-             110, 112, 113, 117, 118, 117, 121, 121, 119, 64, 61, 60, 58, 58,
-             58, 57, 59, 62, 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102, 103,
-             109, 112, 116, 118, 119, 122, 121, 125, 123, 125, 128, 65, 62, 61,
-             59, 59, 59, 58, 60, 63, 67, 68, 73, 75, 79, 84, 85, 92, 94, 98,
-             103, 105, 111, 114, 118, 120, 121, 125, 129, 126, 129, 130, 129,
-             71, 68, 67, 65, 64, 64, 63, 65, 68, 72, 73, 78, 80, 84, 89, 90, 97,
-             100, 103, 109, 111, 117, 120, 125, 127, 128, 133, 130, 134, 133,
-             133, 137, 74, 71, 69, 68, 67, 67, 65, 67, 70, 74, 75, 80, 83, 86,
-             91, 93, 100, 102, 106, 112, 114, 120, 123, 128, 131, 131, 135, 137,
-             137, 138, 140, 137, 80, 76, 74, 72, 71, 71, 69, 71, 74, 78, 79, 84,
-             86, 90, 95, 96, 104, 106, 110, 116, 118, 125, 128, 134, 136, 137,
-             142, 141, 142, 143, 143, 147, 82, 78, 76, 74, 73, 73, 71, 73, 76,
-             79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136,
-             139, 139, 144, 147, 148, 147, 150, 148, 83, 78, 77, 75, 74, 74, 72,
-             73, 76, 80, 81, 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128,
-             131, 137, 139, 140, 145, 150, 152, 155, 152, 157, 87, 83, 81, 79,
-             78, 78, 75, 77, 80, 83, 85, 90, 92, 96, 100, 102, 110, 112, 117,
-             122, 125, 133, 135, 142, 144, 145, 150, 151, 155, 158, 162, 158,
-             90, 85, 84, 81, 80, 80, 78, 78, 82, 84, 87, 91, 93, 98, 99, 106,
-             108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156, 156,
-             161, 164, 169, 92, 88, 87, 84, 83, 82, 80, 80, 84, 85, 90, 91, 95,
-             98, 102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152,
-             155, 156, 162, 162, 168, 170, 95, 90, 89, 86, 85, 84, 83, 82, 85,
-             87, 91, 92, 97, 98, 105, 105, 112, 114, 121, 123, 129, 133, 138,
-             143, 147, 155, 158, 161, 162, 168, 168, 174, 97, 92, 92, 88, 88,
-             86, 86, 84, 85, 90, 91, 95, 97, 101, 104, 108, 112, 116, 121, 125,
-             130, 133, 140, 143, 150, 152, 162, 164, 168, 168, 174, 175, 100,
-             95, 95, 90, 90, 89, 89, 86, 86, 92, 92, 97, 98, 104, 104, 111, 111,
-             119, 119, 128, 129, 137, 137, 147, 148, 157, 158, 169, 170, 174,
-             175, 181,
-             // Size 4x8
-             32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65, 82, 41, 53, 78, 97, 51,
-             61, 92, 111, 65, 73, 108, 129, 75, 81, 117, 148, 86, 92, 119, 154,
-             // Size 8x4
-             32, 32, 34, 41, 51, 65, 75, 86, 35, 36, 47, 53, 61, 73, 81, 92, 59,
-             57, 65, 78, 92, 108, 117, 119, 83, 78, 82, 97, 111, 129, 148, 154,
-             // Size 8x16
-             32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76, 85, 31,
-             33, 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58, 71, 79, 34, 35,
-             41, 48, 54, 63, 76, 81, 36, 36, 46, 54, 60, 68, 80, 87, 41, 40, 49,
-             60, 67, 76, 88, 93, 47, 44, 53, 66, 75, 84, 97, 101, 53, 50, 57,
-             71, 82, 92, 106, 108, 58, 54, 61, 75, 87, 98, 112, 116, 65, 59, 66,
-             79, 92, 105, 120, 124, 74, 67, 73, 86, 100, 113, 131, 134, 82, 73,
-             79, 92, 105, 120, 139, 142, 87, 78, 83, 96, 110, 125, 144, 153, 92,
-             83, 84, 97, 114, 132, 150, 157, 97, 88, 86, 97, 111, 128, 147, 163,
-             // Size 16x8
-             32, 31, 31, 32, 34, 36, 41, 47, 53, 58, 65, 74, 82, 87, 92, 97, 31,
-             32, 33, 34, 35, 36, 40, 44, 50, 54, 59, 67, 73, 78, 83, 88, 35, 34,
-             35, 37, 41, 46, 49, 53, 57, 61, 66, 73, 79, 83, 84, 86, 44, 41, 42,
-             42, 48, 54, 60, 66, 71, 75, 79, 86, 92, 96, 97, 97, 53, 50, 49, 49,
-             54, 60, 67, 75, 82, 87, 92, 100, 105, 110, 114, 111, 65, 61, 59,
-             58, 63, 68, 76, 84, 92, 98, 105, 113, 120, 125, 132, 128, 82, 76,
-             73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144, 150, 147, 90,
-             85, 81, 79, 81, 87, 93, 101, 108, 116, 124, 134, 142, 153, 157,
-             163,
-             // Size 16x32
-             32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88, 90, 93, 31,
-             32, 32, 32, 35, 35, 42, 45, 51, 59, 62, 75, 78, 83, 86, 88, 31, 32,
-             32, 32, 34, 35, 41, 45, 50, 58, 61, 74, 76, 82, 85, 88, 31, 32, 32,
-             33, 34, 34, 41, 44, 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34,
-             35, 36, 42, 44, 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34, 36,
-             36, 42, 45, 50, 57, 59, 71, 73, 78, 80, 82, 32, 33, 34, 35, 37, 38,
-             42, 45, 49, 56, 58, 69, 71, 76, 79, 83, 32, 33, 34, 36, 39, 40, 44,
-             47, 51, 58, 60, 71, 73, 76, 78, 80, 34, 34, 35, 37, 41, 42, 48, 50,
-             54, 61, 63, 73, 76, 81, 81, 80, 35, 34, 36, 38, 45, 47, 52, 55, 59,
-             65, 67, 77, 79, 82, 83, 86, 36, 34, 36, 38, 46, 48, 54, 56, 60, 66,
-             68, 78, 80, 85, 87, 86, 39, 37, 39, 40, 48, 50, 58, 60, 65, 71, 73,
-             84, 86, 89, 88, 91, 41, 39, 40, 41, 49, 51, 60, 62, 67, 74, 76, 86,
-             88, 91, 93, 91, 44, 41, 42, 43, 51, 53, 63, 66, 71, 78, 79, 90, 92,
-             97, 94, 97, 47, 44, 44, 45, 53, 56, 66, 69, 75, 82, 84, 95, 97, 98,
-             101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96, 98, 104,
-             101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90, 92, 103, 106,
-             107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77, 84, 92, 94, 106,
-             108, 111, 110, 112, 58, 54, 54, 54, 61, 63, 75, 79, 87, 95, 98,
-             110, 112, 117, 116, 113, 63, 58, 58, 57, 65, 67, 78, 83, 91, 100,
-             103, 116, 118, 119, 119, 121, 65, 60, 59, 58, 66, 68, 79, 84, 92,
-             102, 105, 118, 120, 127, 124, 122, 71, 65, 64, 63, 71, 73, 84, 89,
-             97, 108, 111, 125, 127, 129, 129, 130, 74, 68, 67, 66, 73, 75, 86,
-             91, 100, 110, 113, 128, 131, 135, 134, 130, 79, 72, 71, 70, 77, 79,
-             90, 95, 104, 115, 118, 133, 136, 140, 139, 140, 82, 75, 73, 72, 79,
-             81, 92, 97, 105, 117, 120, 136, 139, 145, 142, 140, 82, 75, 74, 72,
-             79, 81, 92, 97, 106, 117, 121, 136, 139, 148, 150, 149, 87, 79, 78,
-             76, 83, 85, 96, 100, 110, 120, 125, 141, 144, 148, 153, 150, 89,
-             82, 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145, 153, 157, 161,
-             92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150, 153, 157,
-             162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136, 151, 154,
-             163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123, 128, 140, 147,
-             159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110, 110, 126, 126, 144,
-             144, 163, 163, 173,
-             // Size 32x16
-             32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53,
-             55, 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32,
-             32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 49, 51, 54,
-             58, 60, 65, 68, 72, 75, 75, 79, 82, 84, 86, 88, 91, 31, 32, 32, 32,
-             33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59,
-             64, 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, 32, 32, 32, 33, 34, 34,
-             35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 50, 51, 54, 57, 58, 63, 66,
-             70, 72, 72, 76, 78, 80, 82, 85, 87, 35, 35, 34, 34, 35, 36, 37, 39,
-             41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79,
-             79, 83, 83, 84, 85, 86, 87, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47,
-             48, 50, 51, 53, 56, 56, 60, 61, 63, 67, 68, 73, 75, 79, 81, 81, 85,
-             87, 89, 92, 94, 97, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58,
-             60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97,
-             97, 97, 97, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66,
-             69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99, 101, 104,
-             107, 110, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71,
-             75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114,
-             112, 111, 110, 62, 59, 58, 57, 57, 57, 56, 58, 61, 65, 66, 71, 74,
-             78, 82, 83, 90, 92, 95, 100, 102, 108, 110, 115, 117, 117, 120,
-             118, 116, 119, 123, 126, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67,
-             68, 73, 76, 79, 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120,
-             121, 125, 128, 132, 130, 128, 126, 79, 75, 74, 72, 71, 71, 69, 71,
-             73, 77, 78, 84, 86, 90, 95, 96, 103, 106, 110, 116, 118, 125, 128,
-             133, 136, 136, 141, 139, 135, 136, 140, 144, 82, 78, 76, 74, 73,
-             73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118,
-             120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144, 88, 83,
-             82, 79, 79, 78, 76, 76, 81, 82, 85, 89, 91, 97, 98, 104, 107, 111,
-             117, 119, 127, 129, 135, 140, 145, 148, 148, 153, 153, 154, 159,
-             163, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101,
-             101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157,
-             157, 163, 163, 163, 93, 88, 88, 84, 84, 82, 83, 80, 80, 86, 86, 91,
-             91, 97, 98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149,
-             150, 161, 162, 166, 167, 173,
-             // Size 4x16
-             31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33, 38, 56, 76, 34,
-             42, 61, 81, 34, 48, 66, 85, 39, 51, 74, 91, 44, 56, 82, 98, 49, 60,
-             90, 107, 54, 63, 95, 117, 60, 68, 102, 127, 68, 75, 110, 135, 75,
-             81, 117, 145, 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159,
-             // Size 16x4
-             31, 32, 32, 33, 34, 34, 39, 44, 49, 54, 60, 68, 75, 79, 84, 88, 36,
-             35, 36, 38, 42, 48, 51, 56, 60, 63, 68, 75, 81, 85, 89, 94, 62, 58,
-             57, 56, 61, 66, 74, 82, 90, 95, 102, 110, 117, 120, 116, 123, 88,
-             82, 79, 76, 81, 85, 91, 98, 107, 117, 127, 135, 145, 148, 153, 159,
-             // Size 8x32
-             32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62, 78, 86, 31,
-             32, 34, 41, 50, 61, 76, 85, 31, 32, 34, 41, 49, 59, 74, 82, 31, 33,
-             35, 42, 49, 59, 73, 81, 32, 33, 36, 42, 50, 59, 73, 80, 32, 34, 37,
-             42, 49, 58, 71, 79, 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48,
-             54, 63, 76, 81, 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54, 60,
-             68, 80, 87, 39, 39, 48, 58, 65, 73, 86, 88, 41, 40, 49, 60, 67, 76,
-             88, 93, 44, 42, 51, 63, 71, 79, 92, 94, 47, 44, 53, 66, 75, 84, 97,
-             101, 48, 45, 54, 67, 76, 85, 98, 101, 53, 50, 57, 71, 82, 92, 106,
-             108, 55, 51, 59, 72, 84, 94, 108, 110, 58, 54, 61, 75, 87, 98, 112,
-             116, 63, 58, 65, 78, 91, 103, 118, 119, 65, 59, 66, 79, 92, 105,
-             120, 124, 71, 64, 71, 84, 97, 111, 127, 129, 74, 67, 73, 86, 100,
-             113, 131, 134, 79, 71, 77, 90, 104, 118, 136, 139, 82, 73, 79, 92,
-             105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139, 150, 87, 78, 83,
-             96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145, 157, 92, 83,
-             84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130, 151, 163, 97,
-             88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97, 110, 126, 144, 163,
-             // Size 32x8
-             32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53,
-             55, 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32,
-             32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54,
-             58, 59, 64, 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, 35, 35, 34, 34,
-             35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66,
-             71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, 44, 42, 41, 41, 42, 42,
-             42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86,
-             90, 92, 92, 96, 97, 97, 97, 97, 97, 53, 51, 50, 49, 49, 50, 49, 51,
-             54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104,
-             105, 106, 110, 113, 114, 112, 111, 110, 65, 62, 61, 59, 59, 59, 58,
-             60, 63, 67, 68, 73, 76, 79, 84, 85, 92, 94, 98, 103, 105, 111, 113,
-             118, 120, 121, 125, 128, 132, 130, 128, 126, 82, 78, 76, 74, 73,
-             73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118,
-             120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144, 90, 86,
-             85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, 108, 110,
-             116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163,
-             163},
-            {// Chroma
-             // Size 4x4
-             32, 45, 51, 61, 45, 54, 59, 65, 51, 59, 75, 81, 61, 65, 81, 97,
-             // Size 8x8
-             31, 34, 46, 47, 50, 57, 61, 65, 34, 39, 47, 45, 48, 53, 57, 61, 46,
-             47, 52, 52, 54, 58, 61, 62, 47, 45, 52, 58, 62, 65, 68, 68, 50, 48,
-             54, 62, 68, 73, 77, 76, 57, 53, 58, 65, 73, 82, 86, 86, 61, 57, 61,
-             68, 77, 86, 91, 95, 65, 61, 62, 68, 76, 86, 95, 100,
-             // Size 16x16
-             32, 31, 33, 36, 41, 49, 49, 50, 52, 54, 57, 61, 64, 67, 68, 70, 31,
-             31, 34, 39, 42, 47, 46, 47, 49, 51, 53, 57, 60, 62, 64, 66, 33, 34,
-             37, 42, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 36, 39, 42,
-             47, 47, 48, 46, 46, 47, 48, 50, 53, 55, 57, 59, 61, 41, 42, 44, 47,
-             48, 50, 49, 50, 50, 52, 53, 56, 58, 60, 61, 60, 49, 47, 47, 48, 50,
-             53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 64, 49, 46, 46, 46, 49, 53,
-             55, 57, 59, 60, 61, 64, 66, 67, 67, 67, 50, 47, 46, 46, 50, 54, 57,
-             61, 63, 64, 66, 69, 70, 72, 71, 71, 52, 49, 47, 47, 50, 54, 59, 63,
-             66, 68, 70, 73, 75, 77, 75, 75, 54, 51, 49, 48, 52, 55, 60, 64, 68,
-             71, 73, 76, 79, 80, 79, 79, 57, 53, 51, 50, 53, 56, 61, 66, 70, 73,
-             76, 80, 82, 84, 83, 84, 61, 57, 55, 53, 56, 59, 64, 69, 73, 76, 80,
-             84, 87, 89, 88, 88, 64, 60, 57, 55, 58, 61, 66, 70, 75, 79, 82, 87,
-             91, 93, 93, 93, 67, 62, 59, 57, 60, 63, 67, 72, 77, 80, 84, 89, 93,
-             95, 96, 97, 68, 64, 61, 59, 61, 64, 67, 71, 75, 79, 83, 88, 93, 96,
-             99, 100, 70, 66, 63, 61, 60, 64, 67, 71, 75, 79, 84, 88, 93, 97,
-             100, 102,
-             // Size 32x32
-             32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 52,
-             53, 54, 56, 57, 60, 61, 63, 64, 65, 67, 67, 68, 69, 70, 71, 31, 31,
-             31, 31, 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 50, 50, 52,
-             54, 54, 57, 58, 60, 61, 61, 63, 64, 65, 65, 66, 67, 31, 31, 31, 31,
-             34, 35, 39, 40, 42, 46, 47, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53,
-             56, 57, 59, 60, 60, 62, 63, 64, 65, 66, 67, 30, 31, 31, 32, 34, 35,
-             40, 41, 42, 45, 46, 45, 45, 45, 46, 46, 47, 48, 49, 51, 52, 54, 55,
-             57, 58, 58, 60, 61, 62, 62, 63, 64, 33, 34, 34, 34, 37, 38, 42, 43,
-             44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 53, 55, 56, 57,
-             57, 59, 60, 61, 62, 63, 64, 33, 34, 35, 35, 38, 39, 43, 44, 45, 47,
-             47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 53, 54, 56, 57, 57, 59,
-             60, 60, 61, 62, 62, 36, 38, 39, 40, 42, 43, 47, 47, 47, 47, 48, 46,
-             46, 45, 46, 46, 47, 47, 48, 49, 50, 52, 53, 54, 55, 55, 57, 58, 59,
-             60, 61, 62, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, 49, 48, 47, 47,
-             47, 47, 48, 49, 49, 51, 51, 53, 54, 55, 56, 56, 58, 58, 58, 59, 60,
-             60, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, 50, 50,
-             50, 51, 52, 53, 53, 55, 56, 57, 58, 58, 60, 61, 61, 61, 60, 60, 47,
-             46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 53, 54,
-             55, 55, 56, 58, 58, 60, 60, 61, 62, 61, 61, 62, 63, 64, 49, 47, 47,
-             46, 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56,
-             56, 58, 59, 60, 61, 61, 63, 63, 64, 64, 64, 64, 48, 47, 46, 45, 46,
-             46, 46, 48, 49, 52, 53, 54, 55, 55, 56, 56, 57, 58, 58, 59, 60, 61,
-             62, 63, 64, 64, 66, 65, 65, 65, 66, 67, 49, 47, 46, 45, 46, 46, 46,
-             47, 49, 52, 53, 55, 55, 57, 57, 58, 59, 59, 60, 61, 61, 63, 64, 65,
-             66, 66, 67, 67, 67, 68, 67, 67, 49, 47, 46, 45, 45, 45, 45, 47, 49,
-             52, 53, 55, 57, 58, 59, 60, 61, 62, 62, 63, 63, 65, 66, 67, 68, 68,
-             69, 70, 69, 68, 69, 70, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54,
-             56, 57, 59, 61, 61, 63, 64, 64, 66, 66, 68, 69, 70, 70, 71, 72, 70,
-             71, 72, 71, 70, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58,
-             60, 61, 61, 63, 64, 65, 66, 67, 68, 69, 71, 71, 71, 73, 74, 73, 72,
-             73, 74, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63,
-             63, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 75, 75, 76, 75, 74,
-             53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, 59, 62, 64, 64, 67,
-             68, 69, 71, 71, 73, 74, 76, 77, 77, 78, 78, 78, 76, 77, 78, 54, 52,
-             51, 49, 49, 49, 48, 49, 52, 55, 55, 58, 60, 62, 64, 65, 68, 69, 71,
-             73, 73, 75, 76, 78, 79, 79, 80, 80, 79, 80, 79, 78, 56, 54, 53, 51,
-             51, 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75, 76,
-             78, 79, 81, 82, 82, 83, 81, 83, 81, 81, 82, 57, 54, 53, 52, 51, 51,
-             50, 51, 53, 56, 56, 60, 61, 63, 66, 67, 70, 71, 73, 76, 76, 79, 80,
-             82, 82, 83, 84, 85, 83, 84, 84, 82, 60, 57, 56, 54, 53, 53, 52, 53,
-             55, 58, 58, 61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 83, 85, 86,
-             86, 88, 86, 87, 86, 85, 86, 61, 58, 57, 55, 55, 54, 53, 54, 56, 58,
-             59, 62, 64, 66, 69, 69, 73, 74, 76, 79, 80, 83, 84, 86, 87, 88, 89,
-             89, 88, 88, 88, 86, 63, 60, 59, 57, 56, 56, 54, 55, 57, 60, 60, 63,
-             65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90, 92, 91, 91,
-             90, 89, 91, 64, 61, 60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68,
-             70, 71, 75, 77, 79, 82, 82, 86, 87, 90, 91, 91, 93, 93, 93, 92, 93,
-             91, 65, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71,
-             75, 77, 79, 82, 83, 86, 88, 90, 91, 91, 93, 94, 95, 95, 93, 95, 67,
-             63, 62, 60, 59, 59, 57, 58, 60, 62, 63, 66, 67, 69, 72, 73, 77, 78,
-             80, 83, 84, 88, 89, 92, 93, 93, 95, 95, 96, 96, 97, 95, 67, 64, 63,
-             61, 60, 60, 58, 58, 61, 61, 63, 65, 67, 70, 70, 74, 75, 78, 80, 81,
-             85, 86, 89, 91, 93, 94, 95, 97, 97, 98, 98, 100, 68, 65, 64, 62,
-             61, 60, 59, 58, 61, 61, 64, 65, 67, 69, 71, 73, 75, 78, 79, 83, 83,
-             87, 88, 91, 93, 95, 96, 97, 99, 98, 100, 100, 69, 65, 65, 62, 62,
-             61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 72, 76, 76, 80, 81, 84, 86,
-             88, 90, 92, 95, 96, 98, 98, 100, 100, 101, 70, 66, 66, 63, 63, 62,
-             61, 60, 60, 63, 64, 66, 67, 69, 71, 73, 75, 77, 79, 81, 84, 85, 88,
-             89, 93, 93, 97, 98, 100, 100, 102, 101, 71, 67, 67, 64, 64, 62, 62,
-             60, 60, 64, 64, 67, 67, 70, 70, 74, 74, 78, 78, 82, 82, 86, 86, 91,
-             91, 95, 95, 100, 100, 101, 101, 104,
-             // Size 4x8
-             31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70, 49,
-             55, 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95,
-             // Size 8x4
-             31, 36, 46, 45, 49, 54, 59, 63, 47, 47, 52, 53, 55, 58, 61, 65, 53,
-             50, 55, 63, 71, 77, 81, 80, 63, 59, 61, 70, 77, 86, 94, 95,
-             // Size 8x16
-             32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64, 33,
-             37, 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59, 42, 44,
-             49, 49, 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51,
-             57, 59, 61, 66, 67, 50, 46, 52, 59, 63, 66, 71, 71, 52, 47, 53, 61,
-             66, 71, 75, 74, 54, 49, 54, 62, 68, 73, 79, 79, 57, 51, 55, 64, 70,
-             76, 83, 83, 61, 55, 58, 66, 73, 80, 87, 87, 64, 57, 60, 68, 75, 83,
-             91, 91, 66, 59, 61, 69, 77, 84, 93, 95, 68, 61, 61, 68, 77, 86, 94,
-             97, 70, 63, 61, 67, 75, 83, 92, 98,
-             // Size 16x8
-             32, 31, 33, 37, 42, 49, 48, 50, 52, 54, 57, 61, 64, 66, 68, 70, 33,
-             34, 37, 43, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 45, 45,
-             46, 47, 49, 52, 51, 52, 53, 54, 55, 58, 60, 61, 61, 61, 49, 46, 45,
-             45, 49, 53, 57, 59, 61, 62, 64, 66, 68, 69, 68, 67, 52, 49, 47, 47,
-             50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 77, 75, 57, 53, 51, 50, 53,
-             57, 61, 66, 71, 73, 76, 80, 83, 84, 86, 83, 64, 60, 57, 55, 58, 61,
-             66, 71, 75, 79, 83, 87, 91, 93, 94, 92, 68, 64, 61, 59, 60, 63, 67,
-             71, 74, 79, 83, 87, 91, 95, 97, 98,
-             // Size 16x32
-             32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31,
-             31, 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32,
-             34, 39, 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35,
-             40, 44, 46, 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42,
-             46, 47, 45, 46, 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46,
-             47, 46, 46, 47, 50, 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47,
-             45, 46, 47, 49, 50, 54, 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47,
-             47, 48, 50, 51, 55, 56, 57, 58, 59, 42, 43, 44, 47, 49, 50, 49, 50,
-             50, 53, 53, 57, 58, 60, 60, 59, 47, 46, 46, 48, 51, 52, 53, 53, 53,
-             55, 56, 60, 61, 61, 61, 62, 49, 46, 47, 48, 52, 53, 53, 54, 54, 56,
-             57, 60, 61, 63, 63, 62, 48, 46, 46, 47, 51, 53, 56, 56, 57, 59, 60,
-             64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, 57, 57, 59, 61, 61, 65,
-             66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, 61, 63, 64, 67, 68,
-             70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, 66, 70, 71, 70,
-             71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71, 71, 73, 71,
-             72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, 74, 72,
-             53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, 54,
-             50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51,
-             51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51,
-             50, 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52,
-             57, 58, 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58,
-             59, 66, 69, 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60,
-             67, 70, 75, 80, 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68,
-             71, 75, 81, 83, 90, 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71,
-             75, 81, 83, 90, 91, 94, 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77,
-             82, 84, 92, 93, 94, 95, 93, 67, 61, 60, 58, 61, 63, 69, 70, 78, 80,
-             85, 90, 93, 96, 97, 97, 68, 62, 61, 59, 61, 64, 68, 71, 77, 79, 86,
-             88, 94, 96, 97, 98, 69, 63, 62, 59, 61, 65, 68, 72, 76, 80, 85, 88,
-             94, 95, 99, 99, 70, 63, 63, 60, 61, 66, 67, 73, 75, 81, 83, 89, 92,
-             97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, 74, 82, 82, 90, 90, 98,
-             98, 102,
-             // Size 32x16
-             32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52,
-             53, 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 31, 31,
-             32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 48, 49, 50,
-             51, 52, 54, 56, 57, 58, 59, 60, 61, 62, 63, 63, 64, 33, 34, 34, 35,
-             37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51,
-             54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 37, 38, 39, 40, 42, 43,
-             47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 47, 48, 49, 50, 50, 52, 53,
-             55, 56, 56, 57, 58, 59, 59, 60, 61, 45, 45, 45, 44, 46, 46, 47, 48,
-             49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60,
-             60, 61, 61, 61, 61, 61, 61, 48, 47, 46, 46, 47, 47, 47, 48, 50, 52,
-             53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58, 59, 60, 61, 61, 63,
-             63, 64, 65, 66, 67, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56,
-             57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68,
-             68, 67, 67, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59,
-             61, 61, 63, 64, 65, 66, 66, 68, 69, 70, 71, 71, 72, 70, 71, 72, 73,
-             74, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64,
-             66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, 56,
-             53, 52, 51, 50, 50, 49, 50, 53, 55, 56, 59, 61, 63, 65, 66, 70, 71,
-             72, 74, 75, 77, 79, 80, 81, 81, 82, 80, 79, 80, 81, 82, 57, 54, 53,
-             52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76,
-             76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 63, 60, 59, 57, 56,
-             56, 54, 55, 57, 60, 60, 64, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85,
-             86, 89, 90, 90, 92, 90, 88, 88, 89, 90, 64, 61, 60, 58, 57, 57, 55,
-             56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90,
-             91, 91, 93, 93, 94, 94, 92, 90, 67, 63, 62, 60, 60, 59, 57, 57, 60,
-             61, 63, 65, 66, 70, 70, 73, 75, 77, 80, 81, 85, 86, 89, 91, 93, 94,
-             94, 96, 96, 95, 97, 98, 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, 63,
-             64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97,
-             97, 99, 98, 98, 68, 65, 65, 62, 62, 60, 61, 59, 59, 62, 62, 65, 65,
-             68, 68, 72, 72, 76, 76, 80, 80, 84, 84, 89, 89, 93, 93, 97, 98, 99,
-             99, 102,
-             // Size 4x16
-             31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57, 43,
-             50, 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70, 48, 54,
-             70, 75, 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89, 58, 61, 81,
-             93, 60, 63, 82, 94, 62, 64, 79, 96, 63, 66, 81, 97,
-             // Size 16x4
-             31, 32, 35, 40, 43, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 63, 48,
-             46, 47, 47, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 66, 56, 52,
-             50, 49, 53, 56, 61, 65, 70, 72, 75, 79, 81, 82, 79, 81, 67, 62, 60,
-             57, 60, 63, 66, 70, 75, 80, 85, 89, 93, 94, 96, 97,
-             // Size 8x32
-             32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64, 31,
-             34, 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61, 33, 37,
-             46, 45, 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60, 37, 43, 47,
-             45, 47, 50, 55, 59, 39, 43, 48, 47, 48, 51, 56, 58, 42, 44, 49, 49,
-             50, 53, 58, 60, 47, 46, 51, 53, 53, 56, 61, 61, 49, 47, 52, 53, 54,
-             57, 61, 63, 48, 46, 51, 56, 57, 60, 64, 64, 48, 46, 51, 57, 59, 61,
-             66, 67, 49, 45, 51, 58, 61, 64, 68, 67, 50, 46, 52, 59, 63, 66, 71,
-             71, 50, 46, 52, 59, 64, 67, 71, 71, 52, 47, 53, 61, 66, 71, 75, 74,
-             53, 48, 53, 61, 67, 72, 77, 75, 54, 49, 54, 62, 68, 73, 79, 79, 56,
-             51, 55, 63, 70, 76, 82, 80, 57, 51, 55, 64, 70, 76, 83, 83, 60, 54,
-             57, 65, 72, 79, 86, 85, 61, 55, 58, 66, 73, 80, 87, 87, 63, 56, 59,
-             67, 75, 82, 90, 89, 64, 57, 60, 68, 75, 83, 91, 91, 64, 58, 60, 68,
-             75, 83, 91, 94, 66, 59, 61, 69, 77, 84, 93, 95, 67, 60, 61, 69, 78,
-             85, 93, 97, 68, 61, 61, 68, 77, 86, 94, 97, 69, 62, 61, 68, 76, 85,
-             94, 99, 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, 74, 82, 90,
-             98,
-             // Size 32x8
-             32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52,
-             53, 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 33, 34,
-             34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49,
-             51, 51, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 45, 45, 45, 44,
-             46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55,
-             57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, 49, 47, 46, 45, 45, 46,
-             45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66,
-             67, 68, 68, 69, 69, 68, 68, 67, 67, 52, 50, 49, 48, 47, 47, 47, 48,
-             50, 53, 54, 57, 59, 61, 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75,
-             75, 77, 78, 77, 76, 75, 74, 57, 54, 53, 52, 51, 51, 50, 51, 53, 56,
-             57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, 80, 82, 83, 83, 84,
-             85, 86, 85, 83, 82, 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64,
-             66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94,
-             94, 92, 90, 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67,
-             71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98,
-             98},
-        },
-        // Quantizer level 4.
-        {
-            {// Luma
-             // Size 4x4
-             32, 34, 53, 75, 34, 49, 64, 81, 53, 64, 91, 112, 75, 81, 112, 140,
-             // Size 8x8
-             32, 32, 34, 39, 50, 62, 76, 84, 32, 33, 35, 40, 48, 59, 71, 79, 34,
-             35, 39, 46, 53, 63, 74, 81, 39, 40, 46, 56, 65, 75, 86, 92, 50, 48,
-             53, 65, 78, 90, 101, 106, 62, 59, 63, 75, 90, 105, 118, 123, 76,
-             71, 74, 86, 101, 118, 134, 142, 84, 79, 81, 92, 106, 123, 142, 153,
-             // Size 16x16
-             32, 31, 31, 32, 33, 36, 39, 44, 48, 54, 59, 66, 74, 81, 86, 91, 31,
-             32, 32, 32, 33, 35, 38, 42, 46, 51, 56, 63, 70, 77, 81, 86, 31, 32,
-             32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 77, 82, 32, 32, 33,
-             34, 36, 37, 40, 42, 45, 49, 53, 59, 66, 71, 75, 80, 33, 33, 34, 36,
-             38, 42, 44, 46, 50, 53, 57, 63, 69, 74, 78, 80, 36, 35, 35, 37, 42,
-             48, 50, 54, 57, 60, 64, 69, 75, 80, 84, 85, 39, 38, 38, 40, 44, 50,
-             54, 58, 61, 65, 69, 74, 80, 85, 89, 91, 44, 42, 41, 42, 46, 54, 58,
-             63, 67, 71, 75, 80, 86, 91, 95, 97, 48, 46, 45, 45, 50, 57, 61, 67,
-             71, 76, 80, 86, 93, 98, 101, 104, 54, 51, 49, 49, 53, 60, 65, 71,
-             76, 82, 87, 93, 100, 105, 109, 112, 59, 56, 54, 53, 57, 64, 69, 75,
-             80, 87, 92, 99, 106, 112, 116, 120, 66, 63, 60, 59, 63, 69, 74, 80,
-             86, 93, 99, 107, 115, 121, 125, 129, 74, 70, 67, 66, 69, 75, 80,
-             86, 93, 100, 106, 115, 123, 130, 135, 138, 81, 77, 73, 71, 74, 80,
-             85, 91, 98, 105, 112, 121, 130, 137, 142, 148, 86, 81, 77, 75, 78,
-             84, 89, 95, 101, 109, 116, 125, 135, 142, 147, 153, 91, 86, 82, 80,
-             80, 85, 91, 97, 104, 112, 120, 129, 138, 148, 153, 159,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 32, 32, 33, 34, 36, 36, 39, 41, 44, 46, 48,
-             52, 54, 58, 59, 65, 66, 71, 74, 80, 81, 83, 86, 89, 91, 93, 31, 32,
-             32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 39, 42, 44, 46, 50, 51,
-             56, 56, 62, 63, 68, 71, 76, 77, 78, 82, 84, 86, 88, 31, 32, 32, 32,
-             32, 32, 32, 32, 33, 34, 35, 35, 38, 39, 42, 44, 46, 49, 51, 55, 56,
-             61, 63, 67, 70, 75, 77, 78, 81, 84, 86, 88, 31, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 34, 34, 37, 38, 41, 42, 44, 48, 49, 53, 54, 59, 60,
-             65, 68, 72, 74, 75, 78, 80, 82, 84, 31, 32, 32, 32, 32, 33, 33, 33,
-             34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67,
-             72, 73, 74, 77, 80, 82, 84, 31, 32, 32, 32, 33, 33, 33, 34, 35, 35,
-             36, 36, 39, 40, 42, 44, 45, 48, 50, 53, 54, 59, 60, 64, 67, 71, 73,
-             74, 77, 79, 81, 83, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 37, 38,
-             40, 40, 42, 44, 45, 48, 49, 53, 53, 58, 59, 63, 66, 70, 71, 72, 75,
-             78, 80, 83, 32, 32, 32, 32, 33, 34, 35, 35, 36, 37, 38, 38, 40, 41,
-             42, 44, 46, 48, 49, 53, 53, 58, 59, 63, 65, 69, 71, 72, 74, 77, 79,
-             80, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48,
-             50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 74, 75, 78, 79, 80, 81, 34,
-             34, 34, 33, 34, 35, 36, 37, 39, 39, 42, 43, 45, 46, 47, 49, 51, 53,
-             54, 58, 58, 63, 64, 68, 70, 74, 75, 76, 79, 81, 84, 86, 36, 35, 35,
-             34, 35, 36, 37, 38, 42, 42, 48, 48, 50, 51, 54, 55, 57, 59, 60, 63,
-             64, 68, 69, 73, 75, 79, 80, 81, 84, 85, 85, 86, 36, 35, 35, 34, 35,
-             36, 38, 38, 42, 43, 48, 49, 51, 52, 54, 55, 57, 59, 60, 64, 64, 68,
-             69, 73, 75, 79, 80, 81, 84, 86, 88, 91, 39, 38, 38, 37, 38, 39, 40,
-             40, 44, 45, 50, 51, 54, 55, 58, 59, 61, 64, 65, 68, 69, 73, 74, 78,
-             80, 84, 85, 86, 89, 91, 91, 91, 41, 39, 39, 38, 39, 40, 40, 41, 45,
-             46, 51, 52, 55, 56, 59, 61, 63, 65, 67, 70, 70, 75, 76, 80, 82, 86,
-             87, 88, 91, 92, 94, 96, 44, 42, 42, 41, 41, 42, 42, 42, 46, 47, 54,
-             54, 58, 59, 63, 65, 67, 70, 71, 75, 75, 79, 80, 84, 86, 90, 91, 92,
-             95, 97, 97, 97, 46, 44, 44, 42, 43, 44, 44, 44, 48, 49, 55, 55, 59,
-             61, 65, 67, 69, 72, 74, 77, 78, 82, 83, 87, 89, 93, 94, 95, 98, 98,
-             100, 103, 48, 46, 46, 44, 45, 45, 45, 46, 50, 51, 57, 57, 61, 63,
-             67, 69, 71, 74, 76, 80, 80, 85, 86, 90, 93, 96, 98, 99, 101, 104,
-             104, 103, 52, 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, 64, 65,
-             70, 72, 74, 78, 80, 84, 85, 90, 91, 95, 97, 101, 103, 104, 106,
-             106, 107, 110, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65,
-             67, 71, 74, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106,
-             109, 112, 112, 110, 58, 56, 55, 53, 53, 53, 53, 53, 57, 58, 63, 64,
-             68, 70, 75, 77, 80, 84, 86, 91, 91, 97, 98, 103, 105, 110, 111,
-             112, 115, 114, 115, 118, 59, 56, 56, 54, 54, 54, 53, 53, 57, 58,
-             64, 64, 69, 70, 75, 78, 80, 85, 87, 91, 92, 98, 99, 103, 106, 110,
-             112, 113, 116, 119, 120, 119, 65, 62, 61, 59, 59, 59, 58, 58, 62,
-             63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111, 114,
-             118, 120, 121, 124, 123, 123, 126, 66, 63, 63, 60, 60, 60, 59, 59,
-             63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 112,
-             115, 119, 121, 122, 125, 128, 129, 126, 71, 68, 67, 65, 65, 64, 63,
-             63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, 97, 103, 103, 111, 112,
-             117, 120, 125, 127, 128, 131, 132, 132, 135, 74, 71, 70, 68, 67,
-             67, 66, 65, 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, 106,
-             114, 115, 120, 123, 128, 130, 131, 135, 135, 138, 136, 80, 76, 75,
-             72, 72, 71, 70, 69, 73, 74, 79, 79, 84, 86, 90, 93, 96, 101, 104,
-             110, 110, 118, 119, 125, 128, 134, 136, 137, 140, 142, 140, 144,
-             81, 77, 77, 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, 87, 91, 94, 98,
-             103, 105, 111, 112, 120, 121, 127, 130, 136, 137, 139, 142, 145,
-             148, 144, 83, 78, 78, 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88,
-             92, 95, 99, 104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140,
-             144, 148, 150, 155, 86, 82, 81, 78, 77, 77, 75, 74, 78, 79, 84, 84,
-             89, 91, 95, 98, 101, 106, 109, 115, 116, 124, 125, 131, 135, 140,
-             142, 144, 147, 149, 153, 155, 89, 84, 84, 80, 80, 79, 78, 77, 79,
-             81, 85, 86, 91, 92, 97, 98, 104, 106, 112, 114, 119, 123, 128, 132,
-             135, 142, 145, 148, 149, 153, 154, 159, 91, 86, 86, 82, 82, 81, 80,
-             79, 80, 84, 85, 88, 91, 94, 97, 100, 104, 107, 112, 115, 120, 123,
-             129, 132, 138, 140, 148, 150, 153, 154, 159, 159, 93, 88, 88, 84,
-             84, 83, 83, 80, 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110,
-             118, 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164,
-             // Size 4x8
-             32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87, 48,
-             59, 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144,
-             // Size 8x4
-             32, 32, 34, 38, 48, 60, 72, 81, 35, 36, 42, 51, 59, 68, 79, 86, 51,
-             50, 54, 67, 80, 92, 104, 112, 77, 72, 75, 87, 103, 119, 135, 144,
-             // Size 8x16
-             32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74, 82, 31,
-             32, 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58, 69, 77, 33, 34,
-             38, 44, 52, 62, 72, 78, 36, 35, 42, 51, 58, 68, 78, 84, 39, 38, 44,
-             54, 63, 73, 84, 89, 44, 41, 46, 59, 69, 79, 90, 96, 48, 45, 50, 62,
-             74, 85, 96, 103, 53, 49, 53, 66, 79, 92, 103, 111, 58, 54, 57, 70,
-             84, 98, 110, 118, 66, 60, 63, 75, 90, 106, 119, 126, 74, 67, 69,
-             81, 97, 113, 128, 134, 81, 73, 75, 86, 102, 120, 135, 143, 86, 78,
-             78, 90, 106, 124, 140, 147, 91, 82, 80, 90, 103, 119, 137, 151,
-             // Size 16x8
-             32, 31, 31, 32, 33, 36, 39, 44, 48, 53, 58, 66, 74, 81, 86, 91, 31,
-             32, 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 78, 82, 33, 33,
-             34, 36, 38, 42, 44, 46, 50, 53, 57, 63, 69, 75, 78, 80, 40, 39, 38,
-             40, 44, 51, 54, 59, 62, 66, 70, 75, 81, 86, 90, 90, 51, 49, 47, 48,
-             52, 58, 63, 69, 74, 79, 84, 90, 97, 102, 106, 103, 65, 61, 59, 58,
-             62, 68, 73, 79, 85, 92, 98, 106, 113, 120, 124, 119, 79, 74, 71,
-             69, 72, 78, 84, 90, 96, 103, 110, 119, 128, 135, 140, 137, 87, 82,
-             79, 77, 78, 84, 89, 96, 103, 111, 118, 126, 134, 143, 147, 151,
-             // Size 16x32
-             32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81, 87, 90, 31,
-             32, 32, 32, 33, 35, 39, 42, 49, 51, 62, 63, 75, 77, 83, 85, 31, 32,
-             32, 32, 33, 35, 39, 42, 49, 51, 61, 62, 74, 76, 82, 85, 31, 32, 32,
-             33, 33, 34, 38, 41, 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33,
-             34, 35, 38, 41, 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34, 35,
-             36, 39, 42, 48, 50, 59, 60, 71, 72, 78, 80, 32, 32, 33, 35, 36, 37,
-             40, 42, 48, 49, 58, 59, 69, 71, 77, 80, 32, 33, 33, 35, 36, 38, 41,
-             42, 48, 49, 58, 59, 69, 70, 75, 77, 33, 33, 34, 36, 38, 41, 44, 46,
-             52, 53, 62, 63, 72, 74, 78, 78, 34, 34, 34, 37, 39, 42, 45, 48, 53,
-             54, 63, 64, 73, 75, 80, 83, 36, 34, 35, 38, 42, 48, 51, 54, 58, 60,
-             68, 69, 78, 80, 84, 83, 36, 35, 35, 38, 42, 48, 51, 54, 59, 60, 68,
-             69, 79, 80, 85, 87, 39, 37, 38, 40, 44, 50, 54, 58, 63, 65, 73, 74,
-             84, 85, 89, 88, 40, 38, 39, 41, 45, 51, 56, 59, 65, 67, 75, 76, 85,
-             87, 90, 93, 44, 41, 41, 43, 46, 53, 59, 63, 69, 71, 79, 80, 90, 91,
-             96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83, 93, 94, 97,
-             100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76, 85, 86, 96, 98, 103,
-             100, 52, 48, 48, 49, 52, 59, 65, 70, 78, 80, 90, 91, 101, 103, 105,
-             107, 53, 49, 49, 50, 53, 60, 66, 71, 79, 82, 92, 93, 103, 105, 111,
-             107, 58, 53, 53, 53, 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113,
-             115, 58, 54, 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118,
-             115, 65, 60, 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119,
-             122, 123, 66, 61, 60, 59, 63, 69, 75, 80, 90, 93, 106, 107, 119,
-             121, 126, 123, 71, 65, 65, 63, 67, 73, 79, 84, 94, 97, 111, 112,
-             125, 127, 131, 132, 74, 68, 67, 66, 69, 75, 81, 86, 97, 100, 113,
-             115, 128, 130, 134, 132, 79, 72, 72, 70, 73, 79, 85, 90, 101, 104,
-             118, 119, 133, 135, 141, 140, 81, 74, 73, 71, 75, 80, 86, 91, 102,
-             105, 120, 121, 135, 137, 143, 140, 82, 75, 74, 72, 75, 81, 87, 92,
-             103, 106, 121, 122, 136, 139, 147, 151, 86, 78, 78, 75, 78, 84, 90,
-             95, 106, 109, 124, 125, 140, 142, 147, 151, 88, 81, 80, 77, 80, 86,
-             90, 98, 105, 112, 122, 127, 140, 144, 152, 155, 91, 83, 82, 79, 80,
-             88, 90, 100, 103, 114, 119, 130, 137, 148, 151, 155, 93, 85, 85,
-             81, 81, 90, 90, 102, 103, 117, 117, 134, 134, 151, 152, 160,
-             // Size 32x16
-             32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48,
-             52, 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32,
-             32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 37, 38, 41, 43, 45, 48, 49,
-             53, 54, 60, 61, 65, 68, 72, 74, 75, 78, 81, 83, 85, 31, 32, 32, 32,
-             32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54,
-             59, 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, 32, 32, 32, 33, 33, 34,
-             35, 35, 36, 37, 38, 38, 40, 41, 43, 44, 46, 49, 50, 53, 54, 58, 59,
-             63, 66, 70, 71, 72, 75, 77, 79, 81, 33, 33, 33, 33, 34, 35, 36, 36,
-             38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69,
-             73, 75, 75, 78, 80, 80, 81, 36, 35, 35, 34, 35, 36, 37, 38, 41, 42,
-             48, 48, 50, 51, 53, 55, 56, 59, 60, 63, 63, 68, 69, 73, 75, 79, 80,
-             81, 84, 86, 88, 90, 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51,
-             54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90,
-             90, 90, 90, 44, 42, 42, 41, 41, 42, 42, 42, 46, 48, 54, 54, 58, 59,
-             63, 65, 67, 70, 71, 74, 75, 79, 80, 84, 86, 90, 91, 92, 95, 98,
-             100, 102, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65,
-             69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106,
-             105, 103, 103, 53, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65,
-             67, 71, 73, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106,
-             109, 112, 114, 117, 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68,
-             73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111, 113, 118, 120,
-             121, 124, 122, 119, 117, 66, 63, 62, 60, 60, 60, 59, 59, 63, 64,
-             69, 69, 74, 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 112, 115,
-             119, 121, 122, 125, 127, 130, 134, 79, 75, 74, 72, 71, 71, 69, 69,
-             72, 73, 78, 79, 84, 85, 90, 93, 96, 101, 103, 109, 110, 118, 119,
-             125, 128, 133, 135, 136, 140, 140, 137, 134, 81, 77, 76, 74, 73,
-             72, 71, 70, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, 112,
-             119, 121, 127, 130, 135, 137, 139, 142, 144, 148, 151, 87, 83, 82,
-             79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, 105, 111,
-             113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152,
-             90, 85, 85, 81, 81, 80, 80, 77, 78, 83, 83, 87, 88, 93, 93, 100,
-             100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151,
-             155, 155, 160,
-             // Size 4x16
-             31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32, 37, 49, 71, 33,
-             41, 53, 74, 34, 48, 60, 80, 37, 50, 65, 85, 41, 53, 71, 91, 45, 56,
-             76, 98, 49, 60, 82, 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75,
-             100, 130, 74, 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148,
-             // Size 16x4
-             31, 32, 32, 32, 33, 34, 37, 41, 45, 49, 54, 61, 68, 74, 78, 83, 36,
-             35, 35, 37, 41, 48, 50, 53, 56, 60, 63, 69, 75, 80, 84, 88, 53, 51,
-             49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 114, 81, 76,
-             73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148,
-             // Size 8x32
-             32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62, 75, 83, 31,
-             32, 33, 39, 49, 61, 74, 82, 31, 32, 33, 38, 47, 59, 72, 79, 31, 32,
-             34, 38, 47, 59, 71, 79, 32, 33, 35, 39, 48, 59, 71, 78, 32, 33, 36,
-             40, 48, 58, 69, 77, 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44,
-             52, 62, 72, 78, 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51, 58,
-             68, 78, 84, 36, 35, 42, 51, 59, 68, 79, 85, 39, 38, 44, 54, 63, 73,
-             84, 89, 40, 39, 45, 56, 65, 75, 85, 90, 44, 41, 46, 59, 69, 79, 90,
-             96, 46, 43, 48, 60, 72, 82, 93, 97, 48, 45, 50, 62, 74, 85, 96,
-             103, 52, 48, 52, 65, 78, 90, 101, 105, 53, 49, 53, 66, 79, 92, 103,
-             111, 58, 53, 57, 69, 83, 97, 109, 113, 58, 54, 57, 70, 84, 98, 110,
-             118, 65, 59, 62, 74, 89, 105, 118, 122, 66, 60, 63, 75, 90, 106,
-             119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74, 67, 69, 81, 97,
-             113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81, 73, 75, 86,
-             102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147, 86, 78, 78,
-             90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122, 140, 152, 91, 82,
-             80, 90, 103, 119, 137, 151, 93, 85, 81, 90, 103, 117, 134, 152,
-             // Size 32x8
-             32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48,
-             52, 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32,
-             32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49,
-             53, 54, 59, 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, 33, 33, 33, 33,
-             34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57,
-             62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, 40, 39, 39, 38, 38, 39,
-             40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75,
-             79, 81, 85, 86, 87, 90, 90, 90, 90, 51, 49, 49, 47, 47, 48, 48, 48,
-             52, 53, 58, 59, 63, 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97,
-             101, 102, 103, 106, 105, 103, 103, 65, 62, 61, 59, 59, 59, 58, 58,
-             62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111,
-             113, 118, 120, 121, 124, 122, 119, 117, 79, 75, 74, 72, 71, 71, 69,
-             69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, 103, 109, 110, 118,
-             119, 125, 128, 133, 135, 136, 140, 140, 137, 134, 87, 83, 82, 79,
-             79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, 105, 111, 113,
-             118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152},
-            {// Chroma
-             // Size 4x4
-             32, 46, 49, 58, 46, 53, 55, 62, 49, 55, 70, 78, 58, 62, 78, 91,
-             // Size 8x8
-             31, 34, 42, 47, 49, 54, 60, 64, 34, 39, 45, 46, 47, 51, 56, 59, 42,
-             45, 48, 49, 50, 53, 57, 60, 47, 46, 49, 55, 58, 61, 65, 66, 49, 47,
-             50, 58, 65, 69, 73, 74, 54, 51, 53, 61, 69, 76, 82, 83, 60, 56, 57,
-             65, 73, 82, 89, 92, 64, 59, 60, 66, 74, 83, 92, 96,
-             // Size 16x16
-             32, 31, 31, 35, 40, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 31,
-             31, 32, 37, 41, 47, 47, 46, 48, 49, 51, 54, 57, 60, 62, 64, 31, 32,
-             34, 39, 43, 46, 46, 45, 46, 47, 49, 52, 55, 57, 59, 61, 35, 37, 39,
-             44, 46, 47, 46, 45, 46, 47, 48, 51, 53, 56, 57, 59, 40, 41, 43, 46,
-             48, 50, 49, 48, 49, 49, 51, 53, 55, 57, 59, 59, 49, 47, 46, 47, 50,
-             53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 62, 48, 47, 46, 46, 49, 53,
-             54, 55, 56, 57, 58, 60, 62, 64, 65, 65, 49, 46, 45, 45, 48, 53, 55,
-             58, 60, 61, 62, 64, 66, 68, 69, 69, 50, 48, 46, 46, 49, 54, 56, 60,
-             61, 63, 65, 67, 69, 71, 72, 72, 52, 49, 47, 47, 49, 54, 57, 61, 63,
-             66, 68, 71, 73, 75, 76, 77, 54, 51, 49, 48, 51, 55, 58, 62, 65, 68,
-             71, 74, 76, 78, 80, 81, 57, 54, 52, 51, 53, 57, 60, 64, 67, 71, 74,
-             77, 80, 83, 84, 85, 61, 57, 55, 53, 55, 59, 62, 66, 69, 73, 76, 80,
-             84, 87, 89, 89, 64, 60, 57, 56, 57, 61, 64, 68, 71, 75, 78, 83, 87,
-             90, 92, 94, 66, 62, 59, 57, 59, 62, 65, 69, 72, 76, 80, 84, 89, 92,
-             94, 96, 68, 64, 61, 59, 59, 62, 65, 69, 72, 77, 81, 85, 89, 94, 96,
-             98,
-             // Size 32x32
-             32, 31, 31, 30, 31, 33, 35, 36, 40, 41, 49, 49, 48, 48, 49, 50, 50,
-             52, 52, 54, 54, 57, 57, 60, 61, 63, 64, 65, 66, 67, 68, 69, 31, 31,
-             31, 31, 32, 34, 37, 38, 41, 42, 47, 47, 47, 47, 47, 47, 48, 49, 50,
-             52, 52, 54, 55, 57, 58, 60, 61, 61, 63, 64, 64, 65, 31, 31, 31, 31,
-             32, 35, 37, 39, 41, 42, 47, 47, 47, 46, 46, 47, 48, 49, 49, 51, 51,
-             54, 54, 56, 57, 59, 60, 61, 62, 63, 64, 65, 30, 31, 31, 32, 33, 35,
-             38, 40, 42, 42, 46, 46, 45, 45, 45, 45, 46, 47, 47, 49, 49, 52, 52,
-             54, 55, 57, 58, 58, 60, 61, 61, 62, 31, 32, 32, 33, 34, 37, 39, 41,
-             43, 43, 46, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51, 52, 54, 55,
-             57, 57, 58, 59, 60, 61, 62, 33, 34, 35, 35, 37, 39, 41, 43, 44, 45,
-             47, 47, 46, 46, 45, 46, 46, 47, 47, 49, 49, 51, 51, 53, 54, 56, 57,
-             57, 58, 59, 60, 61, 35, 37, 37, 38, 39, 41, 44, 46, 46, 46, 47, 47,
-             46, 46, 45, 46, 46, 47, 47, 48, 48, 50, 51, 52, 53, 55, 56, 56, 57,
-             58, 59, 61, 36, 38, 39, 40, 41, 43, 46, 47, 47, 47, 48, 47, 46, 46,
-             45, 46, 46, 46, 47, 48, 48, 50, 50, 52, 53, 54, 55, 55, 56, 57, 58,
-             58, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 49, 49, 49, 48, 49,
-             49, 49, 49, 51, 51, 52, 53, 54, 55, 57, 57, 58, 59, 59, 59, 59, 41,
-             42, 42, 42, 43, 45, 46, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50,
-             50, 52, 52, 53, 53, 55, 56, 57, 58, 58, 59, 60, 61, 62, 49, 47, 47,
-             46, 46, 47, 47, 48, 50, 50, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55,
-             55, 56, 57, 58, 59, 60, 61, 61, 62, 62, 62, 62, 49, 47, 47, 46, 46,
-             47, 47, 47, 49, 50, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 56, 57,
-             57, 59, 59, 61, 61, 62, 63, 63, 64, 65, 48, 47, 47, 45, 46, 46, 46,
-             46, 49, 49, 53, 53, 54, 54, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61,
-             62, 63, 64, 64, 65, 66, 65, 65, 48, 47, 46, 45, 45, 46, 46, 46, 49,
-             49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 60, 61, 61, 63, 63, 65,
-             65, 65, 66, 66, 67, 68, 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53,
-             54, 55, 56, 58, 59, 60, 61, 61, 62, 62, 63, 64, 65, 66, 67, 68, 68,
-             69, 70, 69, 68, 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56,
-             57, 59, 60, 60, 62, 62, 63, 64, 65, 65, 67, 68, 69, 69, 70, 70, 70,
-             71, 71, 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60,
-             60, 61, 63, 63, 65, 65, 67, 67, 68, 69, 71, 71, 71, 72, 73, 72, 71,
-             52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63,
-             65, 65, 67, 67, 69, 70, 71, 72, 73, 74, 74, 75, 74, 74, 75, 52, 50,
-             49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65, 66,
-             68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 77, 75, 54, 52, 51, 49,
-             49, 49, 48, 48, 51, 52, 55, 55, 58, 59, 62, 63, 65, 67, 68, 70, 70,
-             73, 73, 75, 76, 78, 78, 78, 79, 78, 78, 79, 54, 52, 51, 49, 49, 49,
-             48, 48, 51, 52, 55, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 73, 74,
-             75, 76, 78, 78, 79, 80, 81, 81, 79, 57, 54, 54, 52, 51, 51, 50, 50,
-             52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 77, 79, 80,
-             82, 82, 83, 84, 83, 82, 83, 57, 55, 54, 52, 52, 51, 51, 50, 53, 53,
-             57, 57, 60, 61, 64, 65, 67, 70, 71, 73, 74, 77, 77, 79, 80, 82, 83,
-             83, 84, 85, 85, 83, 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59,
-             61, 63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82, 83, 85, 86, 86, 87,
-             87, 86, 87, 61, 58, 57, 55, 55, 54, 53, 53, 55, 56, 59, 59, 62, 63,
-             66, 68, 69, 72, 73, 76, 76, 80, 80, 83, 84, 86, 87, 88, 89, 89, 89,
-             87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67, 69,
-             71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 89, 90, 91, 92, 90, 91, 64,
-             61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74,
-             75, 78, 78, 82, 83, 86, 87, 89, 90, 91, 92, 93, 94, 91, 65, 61, 61,
-             58, 58, 57, 56, 55, 58, 58, 61, 62, 64, 65, 68, 70, 71, 74, 75, 78,
-             79, 83, 83, 86, 88, 90, 91, 91, 93, 94, 94, 96, 66, 63, 62, 60, 59,
-             58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, 79, 80, 84,
-             84, 87, 89, 91, 92, 93, 94, 94, 96, 96, 67, 64, 63, 61, 60, 59, 58,
-             57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87,
-             89, 92, 93, 94, 94, 96, 96, 97, 68, 64, 64, 61, 61, 60, 59, 58, 59,
-             61, 62, 64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90,
-             94, 94, 96, 96, 98, 97, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62,
-             65, 65, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96,
-             96, 97, 97, 99,
-             // Size 4x8
-             31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65, 47,
-             54, 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93,
-             // Size 8x4
-             31, 36, 43, 45, 47, 52, 57, 61, 47, 47, 50, 53, 54, 56, 60, 63, 50,
-             47, 50, 58, 66, 70, 75, 77, 61, 57, 58, 65, 74, 82, 90, 93,
-             // Size 8x16
-             32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63, 31,
-             35, 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58, 41, 43,
-             48, 49, 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62, 48, 46, 49,
-             54, 57, 60, 64, 65, 49, 45, 48, 56, 61, 64, 67, 69, 50, 46, 49, 57,
-             63, 67, 71, 73, 52, 48, 50, 58, 65, 71, 75, 77, 54, 50, 51, 59, 67,
-             73, 78, 81, 57, 52, 53, 61, 69, 77, 82, 85, 61, 55, 56, 63, 72, 80,
-             86, 88, 64, 58, 58, 65, 73, 82, 89, 92, 66, 59, 59, 66, 75, 84, 91,
-             94, 68, 61, 59, 65, 72, 81, 89, 95,
-             // Size 16x8
-             32, 31, 31, 35, 41, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 32,
-             33, 35, 39, 43, 47, 46, 45, 46, 48, 50, 52, 55, 58, 59, 61, 40, 41,
-             43, 46, 48, 50, 49, 48, 49, 50, 51, 53, 56, 58, 59, 59, 49, 47, 46,
-             46, 49, 53, 54, 56, 57, 58, 59, 61, 63, 65, 66, 65, 51, 49, 47, 47,
-             49, 54, 57, 61, 63, 65, 67, 69, 72, 73, 75, 72, 57, 54, 51, 50, 52,
-             57, 60, 64, 67, 71, 73, 77, 80, 82, 84, 81, 63, 59, 57, 55, 57, 60,
-             64, 67, 71, 75, 78, 82, 86, 89, 91, 89, 67, 63, 60, 58, 59, 62, 65,
-             69, 73, 77, 81, 85, 88, 92, 94, 95,
-             // Size 16x32
-             32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31,
-             31, 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31,
-             33, 38, 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33,
-             40, 42, 46, 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41,
-             43, 46, 46, 45, 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44,
-             47, 46, 46, 47, 47, 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47,
-             46, 45, 47, 47, 50, 51, 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46,
-             45, 46, 47, 50, 50, 54, 55, 57, 58, 41, 42, 43, 47, 48, 49, 49, 48,
-             49, 50, 52, 53, 57, 57, 59, 58, 42, 43, 43, 47, 48, 50, 49, 49, 50,
-             50, 53, 54, 57, 58, 60, 61, 49, 46, 47, 48, 50, 53, 53, 53, 54, 54,
-             57, 57, 60, 61, 62, 61, 49, 46, 47, 48, 50, 53, 53, 54, 54, 55, 57,
-             57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, 54, 56, 57, 57, 60, 60,
-             64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, 58, 58, 61, 61, 65,
-             65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, 64, 64, 67, 68,
-             69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, 69, 69, 70,
-             70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, 73, 71,
-             51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, 52,
-             48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50,
-             49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50,
-             49, 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50,
-             52, 56, 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53,
-             57, 61, 64, 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58,
-             62, 65, 71, 72, 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63,
-             66, 72, 73, 80, 81, 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67,
-             73, 75, 82, 82, 89, 90, 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73,
-             75, 82, 83, 89, 90, 92, 90, 64, 59, 58, 56, 58, 61, 65, 68, 74, 75,
-             83, 83, 90, 91, 94, 95, 66, 60, 59, 57, 59, 62, 66, 69, 75, 76, 84,
-             85, 91, 92, 94, 95, 67, 61, 60, 58, 59, 63, 66, 70, 74, 77, 82, 85,
-             91, 93, 96, 96, 68, 62, 61, 58, 59, 64, 65, 71, 72, 78, 81, 86, 89,
-             94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, 71, 79, 79, 87, 87, 95,
-             95, 98,
-             // Size 32x16
-             32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50,
-             51, 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 31, 31,
-             31, 32, 33, 36, 38, 40, 42, 43, 46, 46, 46, 45, 45, 46, 46, 47, 48,
-             50, 50, 52, 52, 54, 56, 57, 58, 59, 60, 61, 62, 62, 32, 33, 33, 33,
-             35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50,
-             52, 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, 37, 38, 38, 40, 41, 43,
-             45, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51,
-             52, 53, 55, 55, 56, 57, 58, 58, 59, 40, 41, 41, 42, 43, 44, 46, 47,
-             48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56,
-             57, 58, 58, 59, 59, 59, 59, 48, 47, 47, 46, 46, 47, 47, 47, 49, 50,
-             53, 53, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 57, 58, 59, 60, 61,
-             61, 62, 63, 64, 65, 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53,
-             54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66,
-             66, 65, 65, 49, 47, 47, 45, 45, 46, 45, 45, 48, 49, 53, 54, 56, 56,
-             58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 66, 67, 68, 68, 69, 70, 71,
-             71, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62,
-             63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, 52,
-             50, 49, 48, 48, 47, 47, 47, 50, 50, 54, 55, 57, 58, 61, 62, 64, 66,
-             66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 78, 79, 57, 54, 54,
-             52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73,
-             73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 58, 55, 54, 52, 52,
-             52, 51, 50, 53, 54, 57, 57, 60, 61, 64, 66, 67, 70, 71, 73, 74, 77,
-             77, 79, 81, 82, 83, 83, 85, 85, 86, 87, 63, 60, 59, 57, 57, 56, 55,
-             54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85,
-             86, 89, 89, 90, 91, 91, 89, 87, 64, 61, 60, 58, 57, 57, 56, 55, 57,
-             58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 90,
-             90, 91, 92, 93, 94, 95, 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, 62,
-             63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94,
-             94, 96, 95, 95, 67, 64, 64, 61, 61, 60, 60, 58, 58, 61, 61, 64, 64,
-             67, 67, 70, 71, 74, 74, 78, 78, 82, 82, 86, 86, 90, 90, 95, 95, 96,
-             96, 98,
-             // Size 4x16
-             31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56, 42,
-             49, 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68, 46, 54,
-             64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83, 56, 59, 73,
-             87, 58, 61, 75, 90, 60, 62, 76, 92, 62, 64, 78, 94,
-             // Size 16x4
-             31, 31, 33, 38, 42, 46, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 48,
-             47, 46, 47, 49, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 64, 52, 49,
-             48, 47, 50, 54, 57, 61, 64, 66, 68, 71, 73, 75, 76, 78, 64, 60, 57,
-             56, 57, 61, 64, 68, 71, 75, 78, 83, 87, 90, 92, 94,
-             // Size 8x32
-             32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63, 31,
-             33, 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60, 31, 35,
-             43, 46, 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59, 35, 39, 46,
-             46, 47, 50, 55, 58, 37, 41, 47, 46, 46, 50, 54, 57, 41, 43, 48, 49,
-             49, 52, 57, 59, 42, 43, 48, 49, 50, 53, 57, 60, 49, 47, 50, 53, 54,
-             57, 60, 62, 49, 47, 50, 53, 54, 57, 61, 63, 48, 46, 49, 54, 57, 60,
-             64, 65, 48, 46, 49, 55, 58, 61, 65, 66, 49, 45, 48, 56, 61, 64, 67,
-             69, 49, 46, 49, 57, 62, 65, 69, 70, 50, 46, 49, 57, 63, 67, 71, 73,
-             51, 47, 49, 58, 64, 69, 73, 74, 52, 48, 50, 58, 65, 71, 75, 77, 54,
-             49, 51, 59, 67, 73, 77, 78, 54, 50, 51, 59, 67, 73, 78, 81, 57, 52,
-             52, 60, 69, 76, 82, 83, 57, 52, 53, 61, 69, 77, 82, 85, 60, 54, 55,
-             62, 71, 79, 85, 87, 61, 55, 56, 63, 72, 80, 86, 88, 63, 57, 57, 64,
-             73, 82, 89, 92, 64, 58, 58, 65, 73, 82, 89, 92, 64, 58, 58, 65, 74,
-             83, 90, 94, 66, 59, 59, 66, 75, 84, 91, 94, 67, 60, 59, 66, 74, 82,
-             91, 96, 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, 71, 79, 87,
-             95,
-             // Size 32x8
-             32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50,
-             51, 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 32, 33,
-             33, 33, 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48,
-             49, 50, 52, 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, 40, 41, 41, 42,
-             43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51,
-             52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, 49, 47, 47, 45, 46, 46,
-             46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61,
-             62, 63, 64, 65, 65, 66, 66, 65, 65, 51, 49, 49, 47, 47, 47, 47, 46,
-             49, 50, 54, 54, 57, 58, 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72,
-             73, 73, 74, 75, 74, 72, 71, 57, 54, 54, 52, 51, 51, 50, 50, 52, 53,
-             57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, 77, 79, 80, 82, 82,
-             83, 84, 82, 81, 79, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61,
-             64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91,
-             91, 89, 87, 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66,
-             69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95,
-             95},
-        },
-        // Quantizer level 5.
-        {
-            {// Luma
-             // Size 4x4
-             32, 34, 49, 72, 34, 48, 60, 79, 49, 60, 82, 104, 72, 79, 104, 134,
-             // Size 8x8
-             32, 32, 34, 38, 46, 56, 68, 78, 32, 33, 35, 39, 45, 54, 64, 74, 34,
-             35, 39, 45, 51, 58, 68, 76, 38, 39, 45, 54, 61, 69, 78, 86, 46, 45,
-             51, 61, 71, 80, 90, 99, 56, 54, 58, 69, 80, 92, 103, 113, 68, 64,
-             68, 78, 90, 103, 117, 128, 78, 74, 76, 86, 99, 113, 128, 140,
-             // Size 16x16
-             32, 31, 31, 31, 32, 34, 36, 39, 44, 48, 54, 59, 65, 71, 80, 83, 31,
-             32, 32, 32, 32, 34, 35, 38, 42, 46, 51, 56, 62, 68, 76, 78, 31, 32,
-             32, 32, 32, 33, 34, 37, 41, 44, 49, 54, 59, 65, 72, 75, 31, 32, 32,
-             33, 34, 35, 36, 39, 42, 45, 50, 54, 59, 64, 71, 74, 32, 32, 32, 34,
-             35, 37, 38, 40, 42, 46, 49, 53, 58, 63, 69, 72, 34, 34, 33, 35, 37,
-             39, 42, 45, 47, 51, 54, 58, 63, 68, 74, 76, 36, 35, 34, 36, 38, 42,
-             48, 50, 54, 57, 60, 64, 68, 73, 79, 81, 39, 38, 37, 39, 40, 45, 50,
-             54, 58, 61, 65, 69, 73, 78, 84, 86, 44, 42, 41, 42, 42, 47, 54, 58,
-             63, 67, 71, 75, 79, 84, 90, 92, 48, 46, 44, 45, 46, 51, 57, 61, 67,
-             71, 76, 80, 85, 90, 96, 99, 54, 51, 49, 50, 49, 54, 60, 65, 71, 76,
-             82, 87, 92, 97, 104, 106, 59, 56, 54, 54, 53, 58, 64, 69, 75, 80,
-             87, 92, 98, 103, 110, 113, 65, 62, 59, 59, 58, 63, 68, 73, 79, 85,
-             92, 98, 105, 111, 118, 121, 71, 68, 65, 64, 63, 68, 73, 78, 84, 90,
-             97, 103, 111, 117, 125, 128, 80, 76, 72, 71, 69, 74, 79, 84, 90,
-             96, 104, 110, 118, 125, 134, 137, 83, 78, 75, 74, 72, 76, 81, 86,
-             92, 99, 106, 113, 121, 128, 137, 140,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44,
-             48, 48, 54, 54, 59, 59, 65, 65, 71, 71, 80, 80, 83, 83, 87, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46,
-             51, 51, 56, 56, 62, 62, 68, 68, 76, 76, 78, 78, 83, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51,
-             56, 56, 62, 62, 68, 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 41, 44, 44, 49, 49, 54, 54,
-             59, 59, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 33, 33, 34, 34, 37, 37, 41, 41, 44, 44, 49, 49, 54, 54, 59, 59,
-             65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35,
-             35, 36, 36, 39, 39, 42, 42, 45, 45, 50, 50, 54, 54, 59, 59, 64, 64,
-             71, 71, 74, 74, 77, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36,
-             36, 39, 39, 42, 42, 45, 45, 50, 50, 54, 54, 59, 59, 64, 64, 71, 71,
-             74, 74, 77, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40,
-             40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, 69, 72, 72,
-             75, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 42,
-             42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, 69, 72, 72, 75, 34,
-             34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51,
-             51, 54, 54, 58, 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 34, 34, 34,
-             33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, 51, 54,
-             54, 58, 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 36, 35, 35, 34, 34,
-             36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 54, 57, 57, 60, 60, 64,
-             64, 68, 68, 73, 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, 36,
-             38, 38, 42, 42, 48, 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68,
-             68, 73, 73, 79, 79, 81, 81, 84, 39, 38, 38, 37, 37, 39, 39, 40, 40,
-             45, 45, 50, 50, 54, 54, 58, 58, 61, 61, 65, 65, 69, 69, 73, 73, 78,
-             78, 84, 84, 86, 86, 90, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45,
-             50, 50, 54, 54, 58, 58, 61, 61, 65, 65, 69, 69, 73, 73, 78, 78, 84,
-             84, 86, 86, 90, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54,
-             58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92,
-             92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58,
-             63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96,
-             48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67,
-             71, 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, 48,
-             46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71,
-             71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, 54, 51,
-             51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76,
-             82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 54, 51,
-             51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76,
-             82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 59, 56,
-             56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80,
-             87, 87, 92, 92, 98, 98, 103, 103, 110, 110, 113, 113, 116, 59, 56,
-             56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80,
-             87, 87, 92, 92, 98, 98, 103, 103, 110, 110, 113, 113, 116, 65, 62,
-             62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85,
-             92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124, 65,
-             62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85,
-             85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124,
-             71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84,
-             90, 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128,
-             132, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78,
-             84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125,
-             128, 128, 132, 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79,
-             84, 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125,
-             134, 134, 137, 137, 141, 80, 76, 76, 72, 72, 71, 71, 69, 69, 74,
-             74, 79, 79, 84, 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118,
-             125, 125, 134, 134, 137, 137, 141, 83, 78, 78, 75, 75, 74, 74, 72,
-             72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113,
-             121, 121, 128, 128, 137, 137, 140, 140, 144, 83, 78, 78, 75, 75,
-             74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, 106,
-             113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 87, 83, 83,
-             79, 79, 77, 77, 75, 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102,
-             109, 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149,
-             // Size 4x8
-             32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84, 45,
-             56, 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136,
-             // Size 8x4
-             32, 32, 34, 37, 45, 54, 65, 75, 35, 36, 42, 50, 56, 63, 73, 81, 51,
-             50, 54, 65, 76, 87, 97, 106, 75, 71, 73, 84, 96, 110, 125, 136,
-             // Size 8x16
-             32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31,
-             32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 33,
-             35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38,
-             48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53,
-             63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71,
-             82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79,
-             92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, 79, 72, 70, 79, 90,
-             104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136,
-             // Size 16x8
-             32, 31, 31, 32, 32, 34, 36, 39, 44, 48, 53, 58, 65, 71, 79, 82, 31,
-             32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 32, 32,
-             33, 34, 35, 37, 38, 40, 43, 46, 50, 54, 58, 63, 70, 72, 36, 35, 34,
-             36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 44, 42, 41, 42,
-             42, 48, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92, 53, 51, 49, 50, 49,
-             54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 65, 62, 59, 59, 58,
-             63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121, 79, 75, 72, 71, 69,
-             73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136,
-             // Size 16x32
-             32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79, 79, 87, 31,
-             32, 32, 32, 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32,
-             32, 32, 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32,
-             33, 33, 34, 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33,
-             33, 34, 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34, 34,
-             36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77, 32, 32, 32, 34, 34, 36,
-             36, 42, 42, 50, 50, 59, 59, 71, 71, 77, 32, 33, 33, 35, 35, 38, 38,
-             42, 42, 49, 49, 58, 58, 69, 69, 75, 32, 33, 33, 35, 35, 38, 38, 42,
-             42, 49, 49, 58, 58, 69, 69, 75, 34, 34, 34, 37, 37, 42, 42, 48, 48,
-             54, 54, 63, 63, 73, 73, 79, 34, 34, 34, 37, 37, 42, 42, 48, 48, 54,
-             54, 63, 63, 73, 73, 79, 36, 34, 34, 38, 38, 48, 48, 54, 54, 60, 60,
-             68, 68, 78, 78, 84, 36, 34, 34, 38, 38, 48, 48, 54, 54, 60, 60, 68,
-             68, 78, 78, 84, 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73,
-             84, 84, 89, 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84,
-             84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, 79, 90, 90,
-             95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, 79, 90, 90, 95,
-             48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102,
-             48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102,
-             53, 49, 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109,
-             53, 49, 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109,
-             58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116,
-             58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116,
-             65, 60, 60, 58, 58, 68, 68, 79, 79, 92, 92, 105, 105, 118, 118,
-             124, 65, 60, 60, 58, 58, 68, 68, 79, 79, 92, 92, 105, 105, 118,
-             118, 124, 71, 65, 65, 63, 63, 73, 73, 84, 84, 97, 97, 111, 111,
-             125, 125, 132, 71, 65, 65, 63, 63, 73, 73, 84, 84, 97, 97, 111,
-             111, 125, 125, 132, 79, 72, 72, 70, 70, 79, 79, 90, 90, 104, 104,
-             118, 118, 133, 133, 141, 79, 72, 72, 70, 70, 79, 79, 90, 90, 104,
-             104, 118, 118, 133, 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92,
-             106, 106, 121, 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92,
-             92, 106, 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84,
-             96, 96, 109, 109, 124, 124, 141, 141, 149,
-             // Size 32x16
-             32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44,
-             48, 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32,
-             32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45,
-             49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32,
-             32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49,
-             54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34,
-             34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54,
-             58, 58, 63, 63, 70, 70, 72, 72, 76, 32, 32, 32, 33, 33, 34, 34, 35,
-             35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58,
-             63, 63, 70, 70, 72, 72, 76, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42,
-             42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73,
-             79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
-             48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79,
-             81, 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58,
-             58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92,
-             96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63,
-             63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 53,
-             51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76,
-             76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 53,
-             51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76,
-             76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65,
-             62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85,
-             85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124,
-             65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79,
-             85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121,
-             124, 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84,
-             90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133,
-             136, 136, 141, 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78,
-             84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125,
-             133, 133, 136, 136, 141, 87, 82, 82, 78, 78, 77, 77, 75, 75, 79,
-             79, 84, 84, 89, 89, 95, 95, 102, 102, 109, 109, 116, 116, 124, 124,
-             132, 132, 141, 141, 144, 144, 149,
-             // Size 4x16
-             31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71, 33,
-             38, 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84, 41, 53,
-             71, 90, 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110, 60, 68,
-             92, 118, 65, 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136,
-             // Size 16x4
-             31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 36,
-             35, 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 53, 51,
-             49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 79, 75,
-             72, 71, 69, 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136,
-             // Size 8x32
-             32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31,
-             32, 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72, 31, 32,
-             33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 32, 34,
-             36, 42, 50, 59, 71, 32, 33, 35, 38, 42, 49, 58, 69, 32, 33, 35, 38,
-             42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, 34, 34, 37, 42, 48,
-             54, 63, 73, 36, 34, 38, 48, 54, 60, 68, 78, 36, 34, 38, 48, 54, 60,
-             68, 78, 39, 37, 40, 50, 58, 65, 73, 84, 39, 37, 40, 50, 58, 65, 73,
-             84, 44, 41, 43, 53, 63, 71, 79, 90, 44, 41, 43, 53, 63, 71, 79, 90,
-             48, 45, 46, 56, 67, 76, 85, 96, 48, 45, 46, 56, 67, 76, 85, 96, 53,
-             49, 50, 60, 71, 82, 92, 103, 53, 49, 50, 60, 71, 82, 92, 103, 58,
-             54, 54, 63, 75, 87, 98, 110, 58, 54, 54, 63, 75, 87, 98, 110, 65,
-             60, 58, 68, 79, 92, 105, 118, 65, 60, 58, 68, 79, 92, 105, 118, 71,
-             65, 63, 73, 84, 97, 111, 125, 71, 65, 63, 73, 84, 97, 111, 125, 79,
-             72, 70, 79, 90, 104, 118, 133, 79, 72, 70, 79, 90, 104, 118, 133,
-             82, 75, 72, 81, 92, 106, 121, 136, 82, 75, 72, 81, 92, 106, 121,
-             136, 87, 79, 76, 84, 96, 109, 124, 141,
-             // Size 32x8
-             32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44,
-             48, 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32,
-             32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45,
-             49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 32, 32, 32, 33,
-             33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50,
-             54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, 36, 35, 35, 34, 34, 36,
-             36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63,
-             68, 68, 73, 73, 79, 79, 81, 81, 84, 44, 42, 42, 41, 41, 42, 42, 42,
-             42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79,
-             84, 84, 90, 90, 92, 92, 96, 53, 51, 51, 49, 49, 50, 50, 49, 49, 54,
-             54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97,
-             104, 104, 106, 106, 109, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63,
-             63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111,
-             111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, 69, 69,
-             73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118,
-             118, 125, 125, 133, 133, 136, 136, 141},
-            {// Chroma
-             // Size 4x4
-             32, 46, 47, 57, 46, 53, 54, 60, 47, 54, 66, 75, 57, 60, 75, 89,
-             // Size 8x8
-             31, 34, 42, 47, 48, 52, 57, 61, 34, 39, 45, 46, 46, 49, 53, 57, 42,
-             45, 48, 49, 50, 52, 55, 58, 47, 46, 49, 54, 56, 58, 61, 64, 48, 46,
-             50, 56, 61, 65, 68, 71, 52, 49, 52, 58, 65, 71, 75, 79, 57, 53, 55,
-             61, 68, 75, 82, 86, 61, 57, 58, 64, 71, 79, 86, 91,
-             // Size 16x16
-             32, 31, 30, 33, 36, 41, 49, 48, 49, 50, 52, 54, 57, 60, 63, 65, 31,
-             31, 31, 34, 38, 42, 47, 47, 47, 48, 50, 52, 54, 57, 60, 61, 30, 31,
-             32, 35, 40, 42, 46, 45, 45, 46, 47, 49, 52, 54, 57, 58, 33, 34, 35,
-             39, 43, 45, 47, 46, 45, 46, 47, 49, 51, 53, 56, 57, 36, 38, 40, 43,
-             47, 47, 48, 46, 45, 46, 47, 48, 50, 52, 54, 55, 41, 42, 42, 45, 47,
-             48, 50, 49, 49, 50, 50, 52, 53, 55, 57, 58, 49, 47, 46, 47, 48, 50,
-             53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 48, 47, 45, 46, 46, 49, 53,
-             54, 55, 56, 57, 58, 60, 61, 63, 64, 49, 47, 45, 45, 45, 49, 53, 55,
-             58, 60, 61, 62, 63, 65, 67, 68, 50, 48, 46, 46, 46, 50, 54, 56, 60,
-             61, 63, 65, 67, 68, 71, 71, 52, 50, 47, 47, 47, 50, 54, 57, 61, 63,
-             66, 68, 70, 72, 75, 75, 54, 52, 49, 49, 48, 52, 55, 58, 62, 65, 68,
-             71, 73, 75, 78, 79, 57, 54, 52, 51, 50, 53, 56, 60, 63, 67, 70, 73,
-             76, 79, 82, 83, 60, 57, 54, 53, 52, 55, 58, 61, 65, 68, 72, 75, 79,
-             82, 85, 86, 63, 60, 57, 56, 54, 57, 60, 63, 67, 71, 75, 78, 82, 85,
-             89, 90, 65, 61, 58, 57, 55, 58, 61, 64, 68, 71, 75, 79, 83, 86, 90,
-             91,
-             // Size 32x32
-             32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 49,
-             50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 65, 65, 67, 31, 31,
-             31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48,
-             50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 61, 61, 63, 31, 31, 31, 31,
-             31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50,
-             52, 52, 54, 54, 57, 57, 60, 60, 61, 61, 63, 30, 31, 31, 32, 32, 35,
-             35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49,
-             52, 52, 54, 54, 57, 57, 58, 58, 60, 30, 31, 31, 32, 32, 35, 35, 40,
-             40, 42, 42, 46, 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49, 52, 52,
-             54, 54, 57, 57, 58, 58, 60, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45,
-             45, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51, 51, 53, 53,
-             56, 56, 57, 57, 59, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47,
-             47, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51, 51, 53, 53, 56, 56,
-             57, 57, 59, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46,
-             46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 55, 55,
-             57, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, 46, 45,
-             45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 55, 55, 57, 41,
-             42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50,
-             50, 50, 50, 52, 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 41, 42, 42,
-             42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, 50,
-             50, 52, 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 49, 47, 47, 46, 46,
-             47, 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55,
-             55, 56, 56, 58, 58, 60, 60, 61, 61, 62, 49, 47, 47, 46, 46, 47, 47,
-             48, 48, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56,
-             56, 58, 58, 60, 60, 61, 61, 62, 48, 47, 47, 45, 45, 46, 46, 46, 46,
-             49, 49, 53, 53, 54, 54, 55, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61,
-             61, 63, 63, 64, 64, 66, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49,
-             53, 53, 54, 54, 55, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 61, 63,
-             63, 64, 64, 66, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53,
-             55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, 63, 65, 65, 67, 67, 68,
-             68, 69, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55,
-             58, 58, 60, 60, 61, 61, 62, 62, 63, 63, 65, 65, 67, 67, 68, 68, 69,
-             50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60,
-             61, 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, 50, 48,
-             48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61, 61,
-             63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, 52, 50, 50, 47,
-             47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66,
-             68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 76, 52, 50, 50, 47, 47, 47,
-             47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, 68,
-             70, 70, 72, 72, 75, 75, 75, 75, 76, 54, 52, 52, 49, 49, 49, 49, 48,
-             48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, 73, 73,
-             75, 75, 78, 78, 79, 79, 80, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52,
-             52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75,
-             78, 78, 79, 79, 80, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56,
-             56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 76, 79, 79, 82, 82,
-             83, 83, 84, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60,
-             60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83,
-             84, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, 65,
-             65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, 85, 86, 86, 88, 60,
-             57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68,
-             68, 72, 72, 75, 75, 79, 79, 82, 82, 85, 85, 86, 86, 88, 63, 60, 60,
-             57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75,
-             75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, 60, 57, 57,
-             56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78,
-             78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 65, 61, 61, 58, 58, 57, 57,
-             55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83,
-             83, 86, 86, 90, 90, 91, 91, 93, 65, 61, 61, 58, 58, 57, 57, 55, 55,
-             58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86,
-             86, 90, 90, 91, 91, 93, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60,
-             62, 62, 66, 66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92,
-             92, 93, 93, 95,
-             // Size 4x8
-             31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64, 46,
-             54, 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90,
-             // Size 8x4
-             31, 36, 43, 46, 46, 50, 54, 59, 47, 47, 50, 53, 54, 55, 58, 61, 50,
-             47, 50, 57, 64, 68, 72, 75, 60, 56, 57, 64, 71, 78, 85, 90,
-             // Size 8x16
-             32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 30,
-             32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 37, 40,
-             47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48,
-             53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53,
-             58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61,
-             66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70,
-             76, 82, 60, 54, 52, 58, 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82,
-             89, 64, 59, 56, 61, 68, 75, 83, 90,
-             // Size 16x8
-             32, 31, 30, 33, 37, 42, 49, 48, 49, 50, 52, 54, 57, 60, 63, 64, 31,
-             31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 37, 38,
-             40, 43, 47, 47, 48, 47, 46, 46, 47, 49, 50, 52, 55, 56, 48, 47, 46,
-             47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 49, 47, 45, 46,
-             45, 49, 53, 56, 58, 59, 61, 62, 64, 65, 67, 68, 52, 50, 48, 47, 47,
-             50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, 57, 54, 52, 51, 50, 53,
-             57, 60, 64, 67, 71, 73, 76, 79, 82, 83, 63, 60, 57, 56, 54, 57, 60,
-             64, 67, 71, 75, 78, 82, 85, 89, 90,
-             // Size 16x32
-             32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31,
-             31, 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31,
-             31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32,
-             40, 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40,
-             40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43,
-             47, 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47,
-             47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47,
-             45, 45, 47, 47, 50, 50, 54, 54, 57, 37, 40, 40, 47, 47, 47, 47, 45,
-             45, 47, 47, 50, 50, 54, 54, 57, 42, 43, 43, 47, 47, 50, 50, 49, 49,
-             50, 50, 53, 53, 57, 57, 60, 42, 43, 43, 47, 47, 50, 50, 49, 49, 50,
-             50, 53, 53, 57, 57, 60, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54,
-             57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54, 57,
-             57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60,
-             64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60, 64,
-             64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67,
-             69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67, 69,
-             50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 50,
-             46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48,
-             48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48,
-             47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49,
-             49, 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49,
-             55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56,
-             56, 64, 64, 70, 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56,
-             64, 64, 70, 70, 76, 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65,
-             65, 72, 72, 79, 79, 85, 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65,
-             72, 72, 79, 79, 85, 85, 88, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75,
-             75, 82, 82, 89, 89, 92, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75,
-             82, 82, 89, 89, 92, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83,
-             83, 90, 90, 93, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83, 83,
-             90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, 69, 77, 77, 84, 84, 92,
-             92, 95,
-             // Size 32x16
-             32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49,
-             50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31,
-             31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46,
-             48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 31, 31, 31, 32,
-             32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48,
-             50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43,
-             43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49,
-             50, 50, 52, 52, 55, 55, 56, 56, 57, 37, 38, 38, 40, 40, 43, 43, 47,
-             47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50,
-             52, 52, 55, 55, 56, 56, 57, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50,
-             50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58,
-             60, 60, 61, 61, 63, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53,
-             53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60,
-             61, 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56,
-             56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68,
-             69, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58,
-             58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, 52,
-             50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64,
-             64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 52, 50, 50,
-             48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66,
-             66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52,
-             51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73,
-             73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, 51,
-             50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76,
-             76, 79, 79, 82, 82, 83, 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54,
-             57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85,
-             85, 89, 89, 90, 90, 92, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57,
-             60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89,
-             89, 90, 90, 92, 66, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62,
-             66, 66, 69, 69, 73, 73, 77, 77, 80, 80, 84, 84, 88, 88, 92, 92, 93,
-             93, 95,
-             // Size 4x16
-             31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56, 40,
-             47, 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64, 45, 53,
-             61, 67, 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 56, 70,
-             82, 54, 58, 72, 85, 57, 60, 75, 89, 59, 61, 75, 90,
-             // Size 16x4
-             31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 48,
-             47, 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 52, 50,
-             48, 47, 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, 63, 60, 57,
-             56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, 89, 90,
-             // Size 8x32
-             32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 31,
-             31, 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57, 30, 32,
-             40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 33, 36, 43,
-             47, 46, 47, 51, 56, 37, 40, 47, 47, 45, 47, 50, 54, 37, 40, 47, 47,
-             45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, 42, 43, 47, 50, 49,
-             50, 53, 57, 49, 46, 48, 53, 53, 54, 57, 60, 49, 46, 48, 53, 53, 54,
-             57, 60, 48, 46, 47, 53, 56, 57, 60, 64, 48, 46, 47, 53, 56, 57, 60,
-             64, 49, 45, 46, 53, 58, 61, 64, 67, 49, 45, 46, 53, 58, 61, 64, 67,
-             50, 46, 46, 54, 59, 64, 67, 71, 50, 46, 46, 54, 59, 64, 67, 71, 52,
-             48, 47, 54, 61, 66, 71, 75, 52, 48, 47, 54, 61, 66, 71, 75, 54, 50,
-             49, 55, 62, 68, 73, 78, 54, 50, 49, 55, 62, 68, 73, 78, 57, 52, 50,
-             56, 64, 70, 76, 82, 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58,
-             65, 72, 79, 85, 60, 54, 52, 58, 65, 72, 79, 85, 63, 57, 55, 60, 67,
-             75, 82, 89, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75,
-             83, 90, 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, 69, 77, 84,
-             92,
-             // Size 32x8
-             32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49,
-             50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31,
-             31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46,
-             48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 37, 38, 38, 40,
-             40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47,
-             49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, 48, 47, 47, 46, 46, 47,
-             47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55,
-             56, 56, 58, 58, 60, 60, 61, 61, 63, 49, 47, 47, 45, 45, 46, 46, 45,
-             45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64,
-             65, 65, 67, 67, 68, 68, 69, 52, 50, 50, 48, 48, 47, 47, 47, 47, 50,
-             50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72,
-             75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57,
-             57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82,
-             83, 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64,
-             64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90,
-             92},
-        },
-        // Quantizer level 6.
-        {
-            {// Luma
-             // Size 4x4
-             32, 33, 45, 62, 33, 39, 51, 64, 45, 51, 71, 87, 62, 64, 87, 108,
-             // Size 8x8
-             31, 32, 32, 35, 42, 51, 59, 69, 32, 32, 33, 35, 41, 49, 56, 65, 32,
-             33, 35, 38, 43, 49, 56, 64, 35, 35, 38, 48, 54, 59, 66, 73, 42, 41,
-             43, 54, 63, 71, 77, 85, 51, 49, 49, 59, 71, 81, 89, 97, 59, 56, 56,
-             66, 77, 89, 98, 108, 69, 65, 64, 73, 85, 97, 108, 119,
-             // Size 16x16
-             32, 31, 31, 31, 32, 34, 35, 38, 41, 45, 48, 54, 59, 65, 71, 80, 31,
-             32, 32, 32, 32, 34, 35, 37, 40, 43, 46, 51, 56, 62, 68, 76, 31, 32,
-             32, 32, 32, 33, 34, 36, 38, 41, 44, 49, 54, 59, 65, 72, 31, 32, 32,
-             33, 34, 35, 36, 38, 40, 42, 45, 50, 54, 59, 64, 71, 32, 32, 32, 34,
-             35, 37, 38, 39, 41, 43, 46, 49, 53, 58, 63, 69, 34, 34, 33, 35, 37,
-             39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, 35, 35, 34, 36, 38, 42,
-             46, 48, 50, 53, 55, 59, 62, 67, 72, 78, 38, 37, 36, 38, 39, 44, 48,
-             51, 54, 57, 59, 63, 67, 71, 76, 82, 41, 40, 38, 40, 41, 46, 50, 54,
-             57, 60, 63, 67, 71, 75, 80, 86, 45, 43, 41, 42, 43, 48, 53, 57, 60,
-             65, 68, 72, 76, 81, 85, 91, 48, 46, 44, 45, 46, 51, 55, 59, 63, 68,
-             71, 76, 80, 85, 90, 96, 54, 51, 49, 50, 49, 54, 59, 63, 67, 72, 76,
-             82, 87, 92, 97, 104, 59, 56, 54, 54, 53, 58, 62, 67, 71, 76, 80,
-             87, 92, 98, 103, 110, 65, 62, 59, 59, 58, 63, 67, 71, 75, 81, 85,
-             92, 98, 105, 111, 118, 71, 68, 65, 64, 63, 68, 72, 76, 80, 85, 90,
-             97, 103, 111, 117, 125, 80, 76, 72, 71, 69, 74, 78, 82, 86, 91, 96,
-             104, 110, 118, 125, 134,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41,
-             44, 45, 48, 48, 53, 54, 57, 59, 62, 65, 67, 71, 72, 80, 80, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43,
-             46, 46, 51, 52, 55, 56, 59, 62, 64, 68, 69, 76, 76, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46,
-             51, 51, 55, 56, 59, 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 34, 34, 36, 38, 39, 41, 42, 45, 45, 49, 50,
-             53, 54, 57, 60, 62, 66, 66, 73, 73, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 33, 33, 33, 34, 34, 36, 37, 38, 41, 41, 44, 44, 49, 49, 52, 54,
-             56, 59, 61, 65, 65, 72, 72, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33,
-             34, 34, 35, 35, 37, 38, 39, 41, 42, 45, 45, 49, 49, 52, 54, 56, 59,
-             61, 64, 65, 72, 72, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35,
-             36, 36, 38, 39, 40, 42, 42, 45, 45, 49, 50, 52, 54, 56, 59, 60, 64,
-             65, 71, 71, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37,
-             38, 39, 40, 42, 43, 45, 45, 49, 49, 52, 54, 56, 59, 60, 64, 64, 70,
-             70, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 37, 38, 38, 39, 40,
-             41, 42, 43, 46, 46, 49, 49, 52, 53, 55, 58, 59, 63, 63, 69, 69, 32,
-             32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 41, 41, 43,
-             43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, 34,
-             33, 33, 34, 35, 35, 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51,
-             51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 34, 34, 34, 33, 33,
-             34, 35, 35, 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54,
-             54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 35, 35, 35, 34, 34, 35, 36,
-             36, 38, 38, 42, 42, 46, 47, 48, 49, 50, 52, 53, 55, 55, 58, 59, 61,
-             62, 64, 67, 68, 72, 72, 78, 78, 36, 35, 35, 34, 34, 35, 36, 37, 38,
-             38, 42, 42, 47, 48, 50, 50, 52, 54, 54, 57, 57, 59, 60, 62, 64, 66,
-             68, 69, 73, 73, 79, 79, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, 44,
-             44, 48, 50, 51, 52, 54, 56, 57, 59, 59, 62, 63, 65, 67, 69, 71, 72,
-             76, 76, 82, 82, 39, 38, 38, 38, 37, 38, 39, 39, 40, 41, 45, 45, 49,
-             50, 52, 54, 55, 58, 58, 61, 61, 64, 65, 67, 69, 71, 73, 74, 78, 78,
-             84, 84, 41, 40, 40, 39, 38, 39, 40, 40, 41, 41, 46, 46, 50, 52, 54,
-             55, 57, 60, 60, 63, 63, 67, 67, 70, 71, 73, 75, 77, 80, 81, 86, 86,
-             44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, 56, 58, 60,
-             63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 45, 43,
-             43, 42, 41, 42, 42, 43, 43, 43, 48, 48, 53, 54, 57, 58, 60, 64, 65,
-             68, 68, 72, 72, 75, 76, 78, 81, 82, 85, 86, 91, 91, 48, 46, 46, 45,
-             44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71,
-             75, 76, 79, 80, 83, 85, 87, 90, 91, 96, 96, 48, 46, 46, 45, 44, 45,
-             45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, 76,
-             79, 80, 83, 85, 87, 90, 91, 96, 96, 53, 51, 51, 49, 49, 49, 49, 49,
-             49, 49, 54, 54, 58, 59, 62, 64, 67, 71, 72, 75, 75, 81, 81, 85, 86,
-             89, 91, 93, 97, 97, 103, 103, 54, 52, 51, 50, 49, 49, 50, 49, 49,
-             50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89,
-             92, 94, 97, 98, 104, 104, 57, 55, 55, 53, 52, 52, 52, 52, 52, 52,
-             57, 57, 61, 62, 65, 67, 70, 74, 75, 79, 79, 85, 85, 89, 90, 93, 96,
-             98, 102, 102, 108, 108, 59, 56, 56, 54, 54, 54, 54, 54, 53, 54, 58,
-             58, 62, 64, 67, 69, 71, 75, 76, 80, 80, 86, 87, 90, 92, 95, 98, 99,
-             103, 104, 110, 110, 62, 59, 59, 57, 56, 56, 56, 56, 55, 56, 60, 60,
-             64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, 95, 98, 101, 103,
-             107, 108, 114, 114, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63,
-             67, 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106,
-             111, 111, 118, 118, 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, 64, 64,
-             68, 69, 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, 108,
-             113, 113, 120, 120, 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68,
-             72, 73, 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111,
-             113, 117, 118, 125, 125, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64,
-             68, 68, 72, 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108,
-             111, 113, 118, 119, 126, 126, 80, 76, 76, 73, 72, 72, 71, 70, 69,
-             70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110,
-             114, 118, 120, 125, 126, 134, 134, 80, 76, 76, 73, 72, 72, 71, 70,
-             69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108,
-             110, 114, 118, 120, 125, 126, 134, 134,
-             // Size 4x8
-             32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68, 41,
-             48, 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111,
-             // Size 8x4
-             32, 32, 33, 35, 41, 49, 57, 66, 34, 34, 37, 43, 48, 54, 60, 68, 43,
-             42, 44, 54, 64, 71, 78, 86, 62, 59, 58, 68, 79, 91, 101, 111,
-             // Size 8x16
-             32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69, 31,
-             32, 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65, 32, 33,
-             35, 38, 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 35, 34, 38,
-             47, 52, 59, 65, 73, 38, 36, 40, 49, 56, 63, 69, 77, 41, 39, 41, 51,
-             60, 67, 74, 81, 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67,
-             76, 83, 91, 53, 49, 50, 60, 71, 82, 90, 99, 58, 54, 54, 63, 75, 87,
-             95, 105, 65, 60, 58, 68, 79, 92, 102, 112, 71, 65, 63, 73, 84, 97,
-             108, 119, 79, 72, 70, 79, 90, 104, 115, 127,
-             // Size 16x8
-             32, 31, 31, 32, 32, 34, 35, 38, 41, 44, 48, 53, 58, 65, 71, 79, 31,
-             32, 32, 32, 33, 34, 34, 36, 39, 42, 45, 49, 54, 60, 65, 72, 32, 32,
-             33, 34, 35, 37, 38, 40, 41, 43, 46, 50, 54, 58, 63, 70, 36, 35, 34,
-             36, 38, 42, 47, 49, 51, 54, 56, 60, 63, 68, 73, 79, 44, 42, 41, 42,
-             42, 48, 52, 56, 60, 64, 67, 71, 75, 79, 84, 90, 53, 51, 49, 50, 49,
-             54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104, 62, 59, 57, 57, 56,
-             61, 65, 69, 74, 79, 83, 90, 95, 102, 108, 115, 73, 69, 66, 65, 64,
-             69, 73, 77, 81, 86, 91, 99, 105, 112, 119, 127,
-             // Size 16x32
-             32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79, 31,
-             32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75, 31, 32,
-             32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75, 31, 32, 32,
-             32, 32, 33, 35, 36, 41, 42, 50, 50, 58, 60, 67, 73, 31, 32, 32, 32,
-             33, 33, 34, 36, 41, 41, 49, 49, 57, 59, 66, 72, 31, 32, 32, 33, 33,
-             34, 35, 37, 41, 42, 49, 49, 57, 59, 66, 71, 32, 32, 32, 33, 34, 35,
-             36, 38, 42, 43, 50, 50, 57, 59, 65, 71, 32, 32, 32, 34, 34, 35, 37,
-             38, 42, 43, 49, 49, 56, 59, 65, 70, 32, 32, 33, 34, 35, 37, 38, 39,
-             42, 43, 49, 49, 56, 58, 64, 69, 32, 33, 33, 34, 35, 37, 39, 40, 43,
-             44, 50, 50, 56, 58, 64, 69, 34, 34, 34, 36, 37, 39, 42, 44, 48, 48,
-             54, 54, 61, 63, 69, 73, 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54,
-             54, 61, 63, 69, 73, 35, 34, 34, 37, 38, 42, 47, 48, 52, 53, 59, 59,
-             65, 67, 73, 77, 36, 35, 34, 37, 38, 43, 48, 49, 54, 54, 60, 60, 66,
-             68, 74, 78, 38, 36, 36, 38, 40, 44, 49, 51, 56, 57, 63, 63, 69, 71,
-             77, 81, 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65, 71, 73, 79,
-             84, 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76, 81, 86,
-             44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90, 44,
-             42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91, 48, 45,
-             45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 48, 45, 45,
-             46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 53, 49, 49, 49,
-             49, 54, 59, 62, 71, 71, 81, 81, 89, 91, 98, 103, 53, 50, 49, 50,
-             50, 54, 60, 63, 71, 72, 82, 82, 90, 92, 99, 103, 57, 53, 52, 52,
-             52, 57, 62, 65, 74, 75, 85, 85, 94, 96, 103, 108, 58, 54, 54, 54,
-             54, 58, 63, 67, 75, 76, 87, 87, 95, 98, 105, 110, 61, 57, 57, 56,
-             56, 60, 66, 69, 77, 78, 89, 89, 98, 101, 108, 114, 65, 60, 60, 59,
-             58, 63, 68, 71, 79, 80, 92, 92, 102, 105, 112, 118, 67, 62, 61, 60,
-             60, 64, 69, 72, 81, 82, 94, 94, 103, 106, 114, 120, 71, 66, 65, 64,
-             63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119, 125, 72, 66, 66, 64,
-             64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125, 79, 73, 72, 71,
-             70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133, 79, 73, 72,
-             71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133,
-             // Size 32x16
-             32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41,
-             44, 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 38, 39, 41, 42,
-             45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66, 73, 73, 31, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45,
-             49, 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 32, 33,
-             33, 34, 34, 34, 36, 36, 37, 37, 38, 40, 41, 42, 43, 46, 46, 49, 50,
-             52, 54, 56, 59, 60, 64, 64, 71, 71, 32, 32, 32, 32, 33, 33, 34, 34,
-             35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54,
-             56, 58, 60, 63, 64, 70, 70, 34, 34, 34, 33, 33, 34, 35, 35, 37, 37,
-             39, 39, 42, 43, 44, 45, 46, 48, 48, 51, 51, 54, 54, 57, 58, 60, 63,
-             64, 68, 68, 74, 74, 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42,
-             47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73,
-             73, 79, 79, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 49,
-             51, 52, 54, 56, 56, 59, 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82,
-             82, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58,
-             60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 44,
-             43, 43, 42, 41, 42, 43, 43, 43, 44, 48, 48, 53, 54, 57, 58, 60, 64,
-             64, 67, 67, 71, 72, 75, 76, 78, 80, 82, 85, 86, 91, 91, 53, 51, 51,
-             50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76,
-             76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 53, 51, 51, 50,
-             49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76,
-             81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57,
-             57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89,
-             90, 94, 95, 98, 102, 103, 108, 108, 115, 115, 65, 62, 62, 60, 59,
-             59, 59, 59, 58, 58, 63, 63, 67, 68, 71, 73, 76, 79, 81, 85, 85, 91,
-             92, 96, 98, 101, 105, 106, 111, 111, 118, 118, 73, 70, 69, 67, 66,
-             66, 65, 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98,
-             99, 103, 105, 108, 112, 114, 119, 119, 127, 127, 79, 75, 75, 73,
-             72, 71, 71, 70, 69, 69, 73, 73, 77, 78, 81, 84, 86, 90, 91, 96, 96,
-             103, 103, 108, 110, 114, 118, 120, 125, 125, 133, 133,
-             // Size 4x16
-             31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59, 32,
-             37, 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71, 39, 46,
-             60, 76, 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92, 54, 58, 76,
-             98, 60, 63, 80, 105, 66, 68, 85, 111, 73, 74, 91, 118,
-             // Size 16x4
-             31, 32, 32, 32, 32, 34, 34, 36, 39, 42, 45, 50, 54, 60, 66, 73, 34,
-             34, 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, 44, 43,
-             41, 43, 43, 48, 53, 57, 60, 64, 67, 72, 76, 80, 85, 91, 65, 62, 59,
-             59, 58, 63, 67, 71, 76, 81, 85, 92, 98, 105, 111, 118,
-             // Size 8x32
-             32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70, 31,
-             32, 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67, 31, 32,
-             33, 34, 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66, 32, 32, 34,
-             36, 42, 50, 57, 65, 32, 32, 34, 37, 42, 49, 56, 65, 32, 33, 35, 38,
-             42, 49, 56, 64, 32, 33, 35, 39, 43, 50, 56, 64, 34, 34, 37, 42, 48,
-             54, 61, 69, 34, 34, 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59,
-             65, 73, 36, 34, 38, 48, 54, 60, 66, 74, 38, 36, 40, 49, 56, 63, 69,
-             77, 39, 37, 40, 50, 58, 65, 71, 79, 41, 39, 41, 51, 60, 67, 74, 81,
-             44, 41, 43, 53, 63, 71, 78, 85, 44, 42, 43, 54, 64, 72, 79, 86, 48,
-             45, 46, 56, 67, 76, 83, 91, 48, 45, 46, 56, 67, 76, 83, 91, 53, 49,
-             49, 59, 71, 81, 89, 98, 53, 49, 50, 60, 71, 82, 90, 99, 57, 52, 52,
-             62, 74, 85, 94, 103, 58, 54, 54, 63, 75, 87, 95, 105, 61, 57, 56,
-             66, 77, 89, 98, 108, 65, 60, 58, 68, 79, 92, 102, 112, 67, 61, 60,
-             69, 81, 94, 103, 114, 71, 65, 63, 73, 84, 97, 108, 119, 72, 66, 64,
-             73, 85, 98, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127, 79, 72,
-             70, 79, 90, 104, 115, 127,
-             // Size 32x8
-             32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41,
-             44, 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32,
-             32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42,
-             45, 45, 49, 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, 32, 32, 32, 32,
-             33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46,
-             49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 36, 35, 35, 35, 34, 35,
-             36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60,
-             62, 63, 66, 68, 69, 73, 73, 79, 79, 44, 42, 42, 41, 41, 41, 42, 42,
-             42, 43, 48, 48, 52, 54, 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75,
-             77, 79, 81, 84, 85, 90, 90, 53, 51, 51, 50, 49, 49, 50, 49, 49, 50,
-             54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92,
-             94, 97, 98, 104, 104, 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, 61,
-             61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102,
-             103, 108, 108, 115, 115, 73, 70, 69, 67, 66, 66, 65, 65, 64, 64,
-             69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108,
-             112, 114, 119, 119, 127, 127},
-            {// Chroma
-             // Size 4x4
-             31, 42, 47, 53, 42, 48, 50, 54, 47, 50, 61, 67, 53, 54, 67, 78,
-             // Size 8x8
-             31, 32, 38, 48, 47, 50, 53, 57, 32, 35, 42, 47, 45, 47, 50, 54, 38,
-             42, 47, 48, 45, 47, 49, 52, 48, 47, 48, 53, 53, 54, 56, 58, 47, 45,
-             45, 53, 58, 61, 63, 65, 50, 47, 47, 54, 61, 66, 69, 72, 53, 50, 49,
-             56, 63, 69, 73, 77, 57, 54, 52, 58, 65, 72, 77, 82,
-             // Size 16x16
-             32, 31, 30, 33, 36, 41, 47, 49, 49, 49, 50, 52, 54, 57, 60, 63, 31,
-             31, 31, 34, 38, 42, 46, 47, 47, 47, 48, 50, 52, 54, 57, 60, 30, 31,
-             32, 35, 40, 42, 45, 46, 45, 45, 46, 47, 49, 52, 54, 57, 33, 34, 35,
-             39, 43, 45, 47, 46, 46, 45, 46, 47, 49, 51, 53, 56, 36, 38, 40, 43,
-             47, 47, 47, 47, 46, 45, 46, 47, 48, 50, 52, 54, 41, 42, 42, 45, 47,
-             48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 57, 47, 46, 45, 47, 47, 50,
-             52, 52, 52, 52, 53, 53, 55, 56, 58, 60, 49, 47, 46, 46, 47, 50, 52,
-             53, 54, 55, 55, 56, 57, 58, 60, 62, 49, 47, 45, 46, 46, 49, 52, 54,
-             55, 57, 58, 59, 60, 61, 63, 65, 49, 47, 45, 45, 45, 49, 52, 55, 57,
-             59, 60, 61, 63, 64, 66, 68, 50, 48, 46, 46, 46, 50, 53, 55, 58, 60,
-             61, 63, 65, 67, 68, 71, 52, 50, 47, 47, 47, 50, 53, 56, 59, 61, 63,
-             66, 68, 70, 72, 75, 54, 52, 49, 49, 48, 52, 55, 57, 60, 63, 65, 68,
-             71, 73, 75, 78, 57, 54, 52, 51, 50, 53, 56, 58, 61, 64, 67, 70, 73,
-             76, 79, 82, 60, 57, 54, 53, 52, 55, 58, 60, 63, 66, 68, 72, 75, 79,
-             82, 85, 63, 60, 57, 56, 54, 57, 60, 62, 65, 68, 71, 75, 78, 82, 85,
-             89,
-             // Size 32x32
-             32, 31, 31, 30, 30, 32, 33, 34, 36, 37, 41, 41, 47, 49, 49, 48, 49,
-             49, 49, 50, 50, 52, 52, 54, 54, 56, 57, 58, 60, 60, 63, 63, 31, 31,
-             31, 31, 31, 32, 34, 35, 38, 38, 42, 42, 46, 48, 47, 47, 47, 47, 47,
-             48, 48, 50, 50, 51, 52, 53, 54, 55, 57, 57, 60, 60, 31, 31, 31, 31,
-             31, 33, 34, 35, 38, 39, 42, 42, 46, 47, 47, 47, 47, 47, 47, 48, 48,
-             49, 50, 51, 52, 53, 54, 55, 57, 57, 60, 60, 30, 31, 31, 31, 31, 33,
-             35, 36, 39, 40, 42, 42, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48,
-             50, 50, 51, 52, 53, 55, 55, 58, 58, 30, 31, 31, 31, 32, 33, 35, 36,
-             40, 40, 42, 42, 45, 46, 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49,
-             51, 52, 52, 54, 54, 57, 57, 32, 32, 33, 33, 33, 35, 37, 38, 41, 42,
-             43, 43, 46, 47, 46, 46, 45, 45, 45, 46, 46, 47, 47, 49, 49, 50, 51,
-             52, 54, 54, 57, 57, 33, 34, 34, 35, 35, 37, 39, 40, 43, 43, 45, 45,
-             47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 50, 51, 52, 53,
-             54, 56, 56, 34, 35, 35, 36, 36, 38, 40, 41, 44, 44, 45, 45, 47, 47,
-             47, 46, 46, 45, 45, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53, 55,
-             55, 36, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 47, 48, 47, 46,
-             46, 45, 45, 46, 46, 46, 47, 48, 48, 49, 50, 50, 52, 52, 54, 54, 37,
-             38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 45,
-             46, 46, 46, 47, 47, 48, 48, 49, 50, 51, 52, 52, 55, 55, 41, 42, 42,
-             42, 42, 43, 45, 45, 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50,
-             50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 57, 57, 41, 42, 42, 42, 42,
-             43, 45, 45, 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50,
-             50, 51, 52, 52, 53, 54, 55, 55, 57, 57, 47, 46, 46, 46, 45, 46, 47,
-             47, 47, 48, 50, 50, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 54,
-             55, 55, 56, 56, 58, 58, 60, 60, 49, 48, 47, 47, 46, 47, 47, 47, 48,
-             48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56,
-             56, 57, 58, 58, 60, 60, 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50,
-             50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59,
-             60, 60, 62, 62, 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52,
-             53, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 59, 60, 60, 61, 62,
-             63, 63, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54,
-             55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 63, 63, 65, 65,
-             49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57,
-             58, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64, 65, 65, 67, 67, 49, 47,
-             47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, 57, 59, 59,
-             60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 50, 48, 48, 47,
-             46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61,
-             63, 63, 65, 65, 66, 67, 67, 68, 69, 71, 71, 50, 48, 48, 47, 46, 46,
-             46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, 63,
-             65, 65, 66, 67, 67, 68, 69, 71, 71, 52, 50, 49, 48, 47, 47, 47, 47,
-             46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, 67, 68,
-             69, 70, 71, 72, 72, 74, 74, 52, 50, 50, 48, 47, 47, 47, 47, 47, 47,
-             50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, 68, 68, 69, 70,
-             71, 72, 73, 75, 75, 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51,
-             54, 55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69, 70, 71, 72, 73, 74,
-             75, 77, 77, 54, 52, 52, 50, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55,
-             57, 58, 60, 62, 63, 65, 65, 68, 68, 70, 71, 72, 73, 74, 75, 76, 78,
-             78, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58, 59,
-             61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 75, 75, 77, 77, 80, 80, 57,
-             54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63,
-             64, 67, 67, 70, 70, 72, 73, 75, 76, 77, 79, 79, 82, 82, 58, 55, 55,
-             53, 52, 52, 52, 51, 50, 51, 54, 54, 56, 57, 59, 60, 62, 64, 65, 67,
-             67, 71, 71, 73, 74, 75, 77, 78, 80, 80, 83, 83, 60, 57, 57, 55, 54,
-             54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, 68, 68, 72,
-             72, 74, 75, 77, 79, 80, 82, 82, 85, 85, 60, 57, 57, 55, 54, 54, 54,
-             53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75,
-             76, 77, 79, 80, 82, 82, 85, 85, 63, 60, 60, 58, 57, 57, 56, 55, 54,
-             55, 57, 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80,
-             82, 83, 85, 85, 89, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57,
-             57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83,
-             85, 85, 89, 89,
-             // Size 4x8
-             31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57, 45,
-             49, 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79,
-             // Size 8x4
-             31, 33, 40, 47, 45, 48, 51, 55, 42, 44, 47, 50, 49, 50, 52, 55, 47,
-             45, 46, 54, 59, 61, 63, 66, 54, 51, 50, 57, 64, 70, 75, 79,
-             // Size 8x16
-             32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57, 30,
-             32, 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54, 37, 40,
-             47, 47, 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56, 47, 46, 48,
-             52, 53, 53, 55, 58, 48, 46, 47, 53, 55, 56, 58, 61, 48, 45, 46, 53,
-             57, 59, 61, 63, 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59,
-             64, 66, 69, 52, 48, 47, 54, 61, 66, 70, 73, 54, 50, 49, 55, 62, 68,
-             72, 76, 57, 52, 50, 56, 64, 70, 75, 79, 60, 54, 52, 58, 65, 72, 77,
-             82, 63, 57, 55, 60, 67, 75, 80, 86,
-             // Size 16x8
-             32, 31, 30, 33, 37, 42, 47, 48, 48, 49, 50, 52, 54, 57, 60, 63, 31,
-             31, 32, 36, 40, 43, 46, 46, 45, 45, 46, 48, 50, 52, 54, 57, 37, 38,
-             40, 43, 47, 47, 48, 47, 46, 46, 46, 47, 49, 50, 52, 55, 48, 47, 46,
-             47, 47, 50, 52, 53, 53, 53, 54, 54, 55, 56, 58, 60, 49, 47, 45, 46,
-             45, 49, 53, 55, 57, 58, 59, 61, 62, 64, 65, 67, 52, 50, 48, 47, 47,
-             50, 53, 56, 59, 62, 64, 66, 68, 70, 72, 75, 56, 53, 51, 50, 49, 53,
-             55, 58, 61, 64, 66, 70, 72, 75, 77, 80, 61, 57, 55, 54, 52, 56, 58,
-             61, 63, 66, 69, 73, 76, 79, 82, 86,
-             // Size 16x32
-             32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31,
-             31, 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31,
-             31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32,
-             37, 39, 42, 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37,
-             40, 42, 46, 46, 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41,
-             44, 46, 46, 45, 45, 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45,
-             47, 46, 46, 46, 47, 47, 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47,
-             47, 45, 46, 47, 47, 50, 51, 53, 55, 37, 40, 40, 45, 47, 47, 47, 47,
-             45, 46, 47, 47, 49, 50, 52, 54, 37, 40, 40, 45, 47, 47, 48, 47, 46,
-             46, 47, 47, 49, 50, 53, 55, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49,
-             50, 50, 53, 53, 56, 57, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49, 50,
-             50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50, 52, 52, 53, 53, 53, 53,
-             55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 54, 56,
-             57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, 56, 56, 58, 58,
-             61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, 59, 60, 62,
-             64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, 63, 65,
-             49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, 49,
-             46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47,
-             46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46,
-             46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47,
-             47, 50, 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47,
-             50, 54, 56, 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51,
-             55, 57, 62, 62, 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55,
-             57, 62, 63, 68, 68, 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58,
-             63, 63, 69, 69, 74, 75, 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64,
-             64, 70, 70, 75, 76, 79, 82, 58, 53, 53, 51, 51, 54, 57, 59, 64, 65,
-             71, 71, 76, 77, 80, 83, 60, 55, 54, 53, 52, 55, 58, 60, 65, 66, 72,
-             72, 77, 79, 82, 85, 60, 55, 55, 53, 53, 55, 59, 60, 65, 66, 73, 73,
-             78, 79, 83, 85, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80,
-             82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80, 82,
-             86, 89,
-             // Size 32x16
-             32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48,
-             49, 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31,
-             31, 32, 32, 33, 35, 37, 40, 40, 43, 43, 46, 47, 46, 46, 46, 45, 46,
-             47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55, 58, 58, 31, 31, 31, 32,
-             32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46,
-             48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 35, 36, 36, 37, 37, 39,
-             40, 42, 45, 45, 46, 46, 47, 47, 47, 46, 46, 45, 46, 46, 46, 47, 47,
-             48, 49, 50, 51, 51, 53, 53, 56, 56, 37, 38, 38, 39, 40, 41, 43, 44,
-             47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49,
-             49, 50, 51, 52, 53, 55, 55, 42, 42, 42, 42, 42, 44, 45, 45, 47, 47,
-             48, 48, 50, 50, 50, 50, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52, 53,
-             54, 55, 55, 58, 58, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50,
-             52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58,
-             59, 60, 60, 48, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53,
-             53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62,
-             62, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56,
-             57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, 49,
-             47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 53, 54, 55, 56, 57, 59,
-             59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 52, 50, 50,
-             48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64,
-             64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 52, 50, 50, 48, 48,
-             48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66,
-             66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, 50,
-             50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71,
-             72, 74, 75, 76, 77, 78, 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50,
-             50, 53, 53, 56, 57, 58, 60, 61, 64, 64, 67, 67, 70, 71, 72, 73, 75,
-             76, 77, 79, 79, 82, 82, 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, 56,
-             56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80,
-             82, 83, 86, 86, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60,
-             60, 62, 64, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85,
-             89, 89,
-             // Size 4x16
-             31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51, 40,
-             47, 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58, 46, 49,
-             57, 61, 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71, 50, 52, 63,
-             73, 52, 53, 64, 76, 55, 55, 66, 79, 58, 58, 68, 82,
-             // Size 16x4
-             31, 31, 32, 35, 40, 43, 46, 46, 46, 46, 47, 48, 50, 52, 55, 58, 42,
-             42, 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 58, 49, 47,
-             45, 46, 46, 49, 53, 55, 57, 59, 60, 61, 63, 64, 66, 68, 57, 54, 52,
-             51, 50, 53, 56, 58, 61, 64, 67, 71, 73, 76, 79, 82,
-             // Size 8x32
-             32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58, 31,
-             31, 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56, 30, 32,
-             40, 46, 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54, 33, 36, 43,
-             47, 46, 47, 50, 54, 34, 37, 44, 47, 45, 47, 50, 53, 37, 40, 47, 47,
-             45, 47, 49, 52, 37, 40, 47, 48, 46, 47, 49, 53, 42, 43, 47, 50, 49,
-             50, 53, 56, 42, 43, 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53,
-             55, 58, 49, 46, 48, 53, 53, 54, 56, 59, 48, 46, 47, 53, 55, 56, 58,
-             61, 48, 46, 47, 53, 56, 57, 59, 62, 48, 45, 46, 53, 57, 59, 61, 63,
-             49, 45, 46, 53, 58, 61, 63, 66, 49, 45, 46, 53, 58, 62, 64, 66, 50,
-             46, 46, 54, 59, 64, 66, 69, 50, 46, 46, 54, 59, 64, 66, 69, 52, 48,
-             47, 54, 61, 66, 69, 72, 52, 48, 47, 54, 61, 66, 70, 73, 53, 49, 48,
-             55, 62, 68, 71, 75, 54, 50, 49, 55, 62, 68, 72, 76, 55, 51, 49, 56,
-             63, 69, 74, 78, 57, 52, 50, 56, 64, 70, 75, 79, 58, 53, 51, 57, 64,
-             71, 76, 80, 60, 54, 52, 58, 65, 72, 77, 82, 60, 55, 53, 59, 65, 73,
-             78, 83, 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, 67, 75, 80,
-             86,
-             // Size 32x8
-             32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48,
-             49, 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31,
-             31, 32, 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45,
-             46, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 37, 38, 38, 39,
-             40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46,
-             47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, 48, 47, 47, 46, 46, 46,
-             47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54,
-             55, 55, 56, 56, 57, 58, 59, 60, 60, 49, 47, 47, 46, 45, 45, 46, 45,
-             45, 46, 49, 49, 53, 53, 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62,
-             63, 64, 64, 65, 65, 67, 67, 52, 50, 50, 48, 48, 48, 47, 47, 47, 47,
-             50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70,
-             71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53,
-             55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77,
-             78, 80, 80, 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59,
-             61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86,
-             86},
-        },
-        // Quantizer level 7.
-        {
-            {// Luma
-             // Size 4x4
-             32, 33, 42, 55, 33, 38, 46, 57, 42, 46, 63, 75, 55, 57, 75, 92,
-             // Size 8x8
-             31, 32, 32, 34, 38, 46, 52, 63, 32, 32, 32, 34, 37, 44, 49, 59, 32,
-             32, 35, 37, 40, 45, 49, 58, 34, 34, 37, 42, 47, 52, 56, 65, 38, 37,
-             40, 47, 54, 60, 65, 73, 46, 44, 45, 52, 60, 69, 75, 84, 52, 49, 49,
-             56, 65, 75, 82, 92, 63, 59, 58, 65, 73, 84, 92, 105,
-             // Size 16x16
-             32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 54, 58, 61, 65, 31,
-             32, 32, 32, 32, 32, 34, 35, 38, 40, 42, 46, 51, 55, 58, 62, 31, 32,
-             32, 32, 32, 32, 33, 34, 37, 38, 41, 44, 49, 53, 56, 59, 31, 32, 32,
-             33, 33, 33, 35, 36, 38, 40, 42, 45, 49, 53, 56, 59, 32, 32, 32, 33,
-             34, 34, 36, 37, 39, 40, 42, 45, 49, 53, 55, 59, 32, 32, 32, 33, 34,
-             35, 37, 38, 40, 41, 42, 46, 49, 52, 55, 58, 34, 34, 33, 35, 36, 37,
-             39, 42, 44, 46, 47, 51, 54, 57, 60, 63, 36, 35, 34, 36, 37, 38, 42,
-             48, 50, 52, 54, 57, 60, 63, 65, 68, 38, 38, 37, 38, 39, 40, 44, 50,
-             52, 54, 57, 60, 64, 67, 69, 72, 41, 40, 38, 40, 40, 41, 46, 52, 54,
-             57, 60, 63, 67, 70, 73, 75, 44, 42, 41, 42, 42, 42, 47, 54, 57, 60,
-             63, 67, 71, 74, 77, 79, 48, 46, 44, 45, 45, 46, 51, 57, 60, 63, 67,
-             71, 76, 79, 82, 85, 54, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76,
-             82, 86, 89, 92, 58, 55, 53, 53, 53, 52, 57, 63, 67, 70, 74, 79, 86,
-             90, 93, 97, 61, 58, 56, 56, 55, 55, 60, 65, 69, 73, 77, 82, 89, 93,
-             97, 101, 65, 62, 59, 59, 59, 58, 63, 68, 72, 75, 79, 85, 92, 97,
-             101, 105,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 38,
-             39, 41, 44, 44, 47, 48, 50, 54, 54, 58, 59, 61, 65, 65, 70, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40,
-             42, 42, 46, 47, 49, 52, 52, 56, 57, 59, 63, 63, 67, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42,
-             45, 46, 48, 51, 51, 55, 56, 58, 62, 62, 67, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 42, 42, 45, 45,
-             47, 50, 50, 54, 55, 57, 61, 61, 65, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 33, 33, 34, 34, 34, 37, 37, 38, 41, 41, 44, 44, 46, 49,
-             49, 53, 54, 56, 59, 59, 64, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 33, 33, 34, 34, 34, 37, 37, 38, 41, 41, 44, 44, 46, 49, 49, 53,
-             54, 56, 59, 59, 64, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
-             35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 49, 49, 53, 54, 56,
-             59, 59, 63, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36,
-             36, 36, 38, 39, 40, 42, 42, 45, 45, 47, 50, 50, 53, 54, 56, 59, 59,
-             63, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, 37, 37,
-             39, 39, 40, 42, 42, 45, 45, 47, 49, 49, 53, 54, 55, 59, 59, 63, 32,
-             32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40,
-             41, 42, 42, 45, 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 32, 32, 32,
-             32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 42,
-             42, 45, 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 33, 33, 33, 33, 33,
-             33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 43, 44, 45, 46, 46, 49,
-             50, 51, 53, 53, 56, 57, 59, 62, 62, 66, 34, 34, 34, 34, 33, 33, 35,
-             35, 36, 37, 37, 39, 39, 41, 42, 42, 44, 45, 46, 47, 47, 50, 51, 52,
-             54, 54, 57, 58, 60, 63, 63, 67, 34, 34, 34, 34, 34, 34, 35, 36, 36,
-             37, 37, 40, 41, 42, 45, 45, 46, 47, 48, 50, 50, 52, 53, 54, 56, 56,
-             59, 60, 62, 65, 65, 69, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38,
-             42, 42, 45, 48, 48, 50, 50, 52, 54, 54, 56, 57, 58, 60, 60, 63, 64,
-             65, 68, 68, 72, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42,
-             45, 48, 48, 50, 50, 52, 54, 54, 56, 57, 58, 60, 60, 63, 64, 65, 68,
-             68, 72, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50,
-             50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76,
-             39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, 45, 47, 50, 50, 53,
-             54, 55, 58, 58, 60, 61, 62, 65, 65, 68, 69, 70, 73, 73, 77, 41, 40,
-             40, 39, 38, 38, 40, 40, 40, 41, 41, 45, 46, 48, 52, 52, 54, 55, 57,
-             60, 60, 62, 63, 65, 67, 67, 70, 71, 73, 75, 75, 79, 44, 42, 42, 42,
-             41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63,
-             66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 42, 41, 41,
-             42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67,
-             68, 71, 71, 74, 75, 77, 79, 79, 83, 47, 46, 45, 45, 44, 44, 44, 45,
-             45, 45, 45, 49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 70, 72, 75,
-             75, 78, 79, 81, 84, 84, 88, 48, 47, 46, 45, 44, 44, 45, 45, 45, 46,
-             46, 50, 51, 53, 57, 57, 60, 61, 63, 67, 67, 70, 71, 73, 76, 76, 79,
-             80, 82, 85, 85, 89, 50, 49, 48, 47, 46, 46, 47, 47, 47, 47, 47, 51,
-             52, 54, 58, 58, 61, 62, 65, 68, 68, 72, 73, 75, 78, 78, 82, 83, 85,
-             88, 88, 92, 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56,
-             60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92,
-             96, 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60,
-             64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92, 96, 58,
-             56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, 67, 68,
-             70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, 59, 57,
-             56, 55, 54, 54, 54, 54, 54, 53, 53, 57, 58, 60, 64, 64, 68, 69, 71,
-             75, 75, 79, 80, 83, 87, 87, 91, 92, 94, 98, 98, 102, 61, 59, 58,
-             57, 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, 65, 65, 69, 70, 73, 77,
-             77, 81, 82, 85, 89, 89, 93, 94, 97, 101, 101, 105, 65, 63, 62, 61,
-             59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79,
-             84, 85, 88, 92, 92, 97, 98, 101, 105, 105, 109, 65, 63, 62, 61, 59,
-             59, 59, 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84,
-             85, 88, 92, 92, 97, 98, 101, 105, 105, 109, 70, 67, 67, 65, 64, 64,
-             63, 63, 63, 62, 62, 66, 67, 69, 72, 72, 76, 77, 79, 83, 83, 88, 89,
-             92, 96, 96, 101, 102, 105, 109, 109, 114,
-             // Size 4x8
-             32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59, 38,
-             40, 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97,
-             // Size 8x4
-             32, 32, 32, 34, 38, 44, 50, 61, 32, 33, 35, 37, 40, 45, 50, 58, 42,
-             41, 42, 50, 58, 66, 71, 79, 56, 53, 52, 59, 68, 78, 86, 97,
-             // Size 8x16
-             32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62, 31,
-             32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59, 32, 32,
-             34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 34, 34, 37,
-             41, 44, 48, 54, 63, 36, 34, 38, 46, 50, 54, 60, 68, 38, 37, 40, 47,
-             52, 57, 64, 72, 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57,
-             63, 71, 79, 48, 45, 46, 54, 60, 67, 76, 85, 53, 49, 50, 57, 64, 71,
-             82, 92, 57, 53, 53, 60, 67, 74, 86, 97, 61, 56, 56, 63, 69, 77, 89,
-             100, 65, 60, 58, 66, 72, 79, 92, 105,
-             // Size 16x8
-             32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 53, 57, 61, 65, 31,
-             32, 32, 32, 32, 33, 34, 34, 37, 39, 41, 45, 49, 53, 56, 60, 32, 32,
-             33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 35, 35, 34,
-             35, 36, 37, 41, 46, 47, 49, 51, 54, 57, 60, 63, 66, 39, 38, 37, 38,
-             39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72, 44, 42, 41, 42, 42,
-             42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, 53, 51, 49, 49, 49, 49,
-             54, 60, 64, 67, 71, 76, 82, 86, 89, 92, 65, 62, 59, 59, 58, 58, 63,
-             68, 72, 76, 79, 85, 92, 97, 100, 105,
-             // Size 16x32
-             32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65, 31,
-             32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63, 31, 32,
-             32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62, 31, 32, 32,
-             32, 32, 32, 34, 35, 37, 41, 41, 48, 50, 54, 61, 61, 31, 32, 32, 32,
-             33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59, 31, 32, 32, 32, 33,
-             33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59, 31, 32, 32, 33, 34, 34,
-             35, 36, 38, 42, 42, 48, 49, 53, 59, 59, 32, 32, 32, 33, 34, 34, 36,
-             36, 38, 42, 42, 48, 50, 53, 59, 59, 32, 32, 32, 33, 34, 34, 36, 37,
-             39, 42, 42, 48, 49, 53, 58, 58, 32, 32, 33, 34, 35, 35, 37, 38, 40,
-             42, 42, 48, 49, 52, 58, 58, 32, 32, 33, 34, 35, 35, 37, 38, 40, 42,
-             42, 48, 49, 52, 58, 58, 33, 33, 33, 35, 36, 36, 40, 41, 43, 46, 46,
-             52, 53, 56, 62, 62, 34, 34, 34, 35, 37, 37, 41, 42, 44, 48, 48, 53,
-             54, 57, 63, 63, 34, 34, 34, 35, 37, 37, 43, 44, 46, 50, 50, 55, 56,
-             59, 65, 65, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63,
-             68, 68, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68,
-             68, 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67, 72, 72,
-             39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73, 41,
-             39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76, 44, 41,
-             41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 44, 41, 41,
-             42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 47, 44, 44, 44,
-             45, 45, 53, 56, 59, 66, 66, 73, 75, 78, 84, 84, 48, 45, 45, 45, 46,
-             46, 54, 56, 60, 67, 67, 74, 76, 79, 85, 85, 50, 47, 46, 47, 47, 47,
-             55, 58, 61, 68, 68, 76, 78, 82, 88, 88, 53, 50, 49, 50, 50, 50, 57,
-             60, 64, 71, 71, 79, 82, 86, 92, 92, 53, 50, 49, 50, 50, 50, 57, 60,
-             64, 71, 71, 79, 82, 86, 92, 92, 57, 54, 53, 53, 53, 53, 60, 63, 67,
-             74, 74, 83, 86, 90, 97, 97, 58, 55, 54, 54, 54, 54, 61, 63, 68, 75,
-             75, 84, 87, 91, 98, 98, 61, 57, 56, 56, 56, 56, 63, 65, 69, 77, 77,
-             86, 89, 93, 100, 100, 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79,
-             89, 92, 97, 105, 105, 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79,
-             89, 92, 97, 105, 105, 70, 65, 64, 63, 62, 62, 70, 72, 76, 83, 83,
-             93, 96, 101, 109, 109,
-             // Size 32x16
-             32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38,
-             39, 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39,
-             41, 41, 44, 45, 47, 50, 50, 54, 55, 57, 61, 61, 65, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41,
-             44, 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, 31, 32, 32, 32, 32, 32,
-             33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45,
-             47, 50, 50, 53, 54, 56, 59, 59, 63, 32, 32, 32, 32, 33, 33, 34, 34,
-             34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50,
-             50, 53, 54, 56, 58, 58, 62, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35,
-             35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53,
-             54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40,
-             41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63,
-             66, 66, 70, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44,
-             48, 48, 50, 50, 51, 53, 53, 56, 56, 58, 60, 60, 63, 63, 65, 68, 68,
-             72, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50,
-             52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, 44,
-             42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58,
-             60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42,
-             41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63,
-             63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 51, 49, 49, 48, 47,
-             47, 48, 48, 48, 48, 48, 52, 53, 55, 58, 58, 62, 63, 66, 69, 69, 73,
-             74, 76, 79, 79, 83, 84, 86, 89, 89, 93, 53, 52, 51, 50, 49, 49, 49,
-             50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78,
-             82, 82, 86, 87, 89, 92, 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53,
-             52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86,
-             90, 91, 93, 97, 97, 101, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58,
-             58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97,
-             98, 100, 105, 105, 109, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58,
-             62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98,
-             100, 105, 105, 109,
-             // Size 4x16
-             31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53, 32,
-             34, 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63, 37, 40,
-             57, 67, 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79, 50, 50, 71,
-             86, 54, 53, 74, 90, 57, 56, 77, 93, 61, 58, 79, 97,
-             // Size 16x4
-             31, 32, 32, 32, 32, 32, 34, 35, 37, 39, 41, 45, 50, 54, 57, 61, 32,
-             32, 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 44, 42,
-             41, 42, 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, 58, 55, 53,
-             53, 53, 52, 57, 63, 67, 70, 74, 79, 86, 90, 93, 97,
-             // Size 8x32
-             32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63, 31,
-             32, 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61, 31, 32,
-             33, 34, 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34,
-             35, 38, 42, 49, 59, 32, 32, 34, 36, 38, 42, 50, 59, 32, 32, 34, 36,
-             39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 32, 33, 35, 37, 40,
-             42, 49, 58, 33, 33, 36, 40, 43, 46, 53, 62, 34, 34, 37, 41, 44, 48,
-             54, 63, 34, 34, 37, 43, 46, 50, 56, 65, 36, 34, 38, 46, 50, 54, 60,
-             68, 36, 34, 38, 46, 50, 54, 60, 68, 38, 37, 40, 47, 52, 57, 64, 72,
-             39, 37, 40, 48, 53, 58, 65, 73, 41, 39, 41, 49, 54, 60, 67, 76, 44,
-             41, 43, 51, 57, 63, 71, 79, 44, 41, 43, 51, 57, 63, 71, 79, 47, 44,
-             45, 53, 59, 66, 75, 84, 48, 45, 46, 54, 60, 67, 76, 85, 50, 46, 47,
-             55, 61, 68, 78, 88, 53, 49, 50, 57, 64, 71, 82, 92, 53, 49, 50, 57,
-             64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97, 58, 54, 54, 61, 68,
-             75, 87, 98, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72,
-             79, 92, 105, 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, 70, 76,
-             83, 96, 109,
-             // Size 32x8
-             32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38,
-             39, 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39,
-             41, 41, 44, 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, 32, 32, 32, 32,
-             33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43,
-             45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34,
-             35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54,
-             55, 57, 57, 60, 61, 63, 66, 66, 70, 39, 38, 38, 37, 37, 37, 38, 38,
-             39, 40, 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64,
-             64, 67, 68, 69, 72, 72, 76, 44, 42, 42, 41, 41, 41, 42, 42, 42, 42,
-             42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74,
-             75, 77, 79, 79, 83, 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53,
-             54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89,
-             92, 92, 96, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65,
-             68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105,
-             105, 109},
-            {// Chroma
-             // Size 4x4
-             31, 41, 46, 51, 41, 48, 48, 51, 46, 48, 58, 62, 51, 51, 62, 71,
-             // Size 8x8
-             31, 31, 38, 44, 47, 48, 50, 55, 31, 32, 40, 44, 45, 46, 47, 52, 38,
-             40, 47, 47, 46, 46, 47, 50, 44, 44, 47, 50, 51, 51, 52, 54, 47, 45,
-             46, 51, 54, 56, 57, 60, 48, 46, 46, 51, 56, 61, 63, 66, 50, 47, 47,
-             52, 57, 63, 66, 70, 55, 52, 50, 54, 60, 66, 70, 76,
-             // Size 16x16
-             32, 31, 30, 33, 34, 36, 41, 49, 48, 49, 49, 50, 52, 54, 55, 57, 31,
-             31, 31, 34, 36, 38, 42, 47, 47, 47, 47, 48, 50, 51, 53, 54, 30, 31,
-             32, 34, 37, 40, 42, 46, 45, 45, 45, 46, 47, 49, 50, 52, 33, 34, 34,
-             37, 40, 42, 44, 47, 46, 46, 45, 46, 47, 49, 50, 51, 34, 36, 37, 40,
-             42, 45, 46, 47, 46, 46, 45, 46, 47, 48, 49, 50, 36, 38, 40, 42, 45,
-             47, 47, 48, 47, 46, 45, 46, 47, 48, 49, 50, 41, 42, 42, 44, 46, 47,
-             48, 50, 50, 49, 49, 50, 50, 51, 52, 53, 49, 47, 46, 47, 47, 48, 50,
-             53, 53, 53, 53, 54, 54, 55, 56, 56, 48, 47, 45, 46, 46, 47, 50, 53,
-             54, 54, 55, 56, 57, 58, 58, 59, 49, 47, 45, 46, 46, 46, 49, 53, 54,
-             55, 57, 58, 59, 60, 60, 61, 49, 47, 45, 45, 45, 45, 49, 53, 55, 57,
-             58, 60, 61, 62, 63, 63, 50, 48, 46, 46, 46, 46, 50, 54, 56, 58, 60,
-             61, 63, 65, 66, 67, 52, 50, 47, 47, 47, 47, 50, 54, 57, 59, 61, 63,
-             66, 68, 69, 70, 54, 51, 49, 49, 48, 48, 51, 55, 58, 60, 62, 65, 68,
-             70, 71, 73, 55, 53, 50, 50, 49, 49, 52, 56, 58, 60, 63, 66, 69, 71,
-             73, 74, 57, 54, 52, 51, 50, 50, 53, 56, 59, 61, 63, 67, 70, 73, 74,
-             76,
-             // Size 32x32
-             32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 48,
-             48, 49, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31,
-             31, 31, 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 47, 47, 47,
-             47, 47, 48, 49, 49, 50, 50, 52, 52, 53, 55, 55, 57, 31, 31, 31, 31,
-             31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 47, 47, 47, 47, 47, 47, 47,
-             48, 48, 49, 50, 50, 51, 52, 53, 54, 54, 56, 31, 31, 31, 31, 31, 31,
-             34, 35, 36, 39, 39, 41, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47,
-             48, 49, 49, 50, 51, 52, 53, 53, 55, 30, 31, 31, 31, 32, 32, 34, 35,
-             37, 40, 40, 42, 42, 44, 46, 46, 45, 45, 45, 45, 45, 46, 46, 47, 47,
-             47, 49, 49, 50, 52, 52, 54, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40,
-             40, 42, 42, 44, 46, 46, 45, 45, 45, 45, 45, 46, 46, 47, 47, 47, 49,
-             49, 50, 52, 52, 54, 33, 33, 34, 34, 34, 34, 37, 38, 40, 42, 42, 44,
-             44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50,
-             51, 51, 53, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, 43, 44, 45, 46,
-             47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51, 51,
-             53, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, 47, 47,
-             46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 36,
-             38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46,
-             46, 45, 45, 46, 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 36, 38, 38,
-             39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45,
-             45, 46, 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 40, 41, 41, 41, 42,
-             42, 44, 44, 45, 47, 47, 48, 48, 49, 50, 50, 49, 49, 49, 48, 48, 49,
-             49, 49, 49, 49, 51, 51, 51, 52, 52, 54, 41, 42, 42, 42, 42, 42, 44,
-             45, 46, 47, 47, 48, 48, 49, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50,
-             50, 50, 51, 52, 52, 53, 53, 55, 44, 44, 44, 44, 44, 44, 45, 46, 46,
-             47, 47, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52,
-             53, 53, 54, 54, 54, 56, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48,
-             50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55,
-             56, 56, 56, 58, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50,
-             51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, 56, 56,
-             56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53,
-             53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 60,
-             48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54,
-             54, 55, 55, 55, 56, 56, 57, 57, 57, 58, 58, 59, 60, 60, 61, 49, 47,
-             47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 55, 55,
-             57, 57, 57, 58, 58, 59, 59, 60, 60, 60, 61, 61, 63, 49, 47, 47, 46,
-             45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58,
-             59, 60, 60, 61, 61, 62, 62, 63, 63, 63, 65, 49, 47, 47, 46, 45, 45,
-             45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, 60,
-             60, 61, 61, 62, 62, 63, 63, 63, 65, 50, 48, 48, 47, 46, 46, 46, 46,
-             46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 61, 62, 63,
-             63, 64, 64, 65, 66, 66, 67, 50, 49, 48, 47, 46, 46, 46, 46, 46, 46,
-             46, 49, 50, 51, 54, 54, 56, 56, 58, 60, 60, 61, 61, 62, 63, 63, 65,
-             65, 66, 67, 67, 68, 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49,
-             50, 51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63, 65, 65, 66, 66, 67,
-             68, 68, 70, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52,
-             54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 68, 68, 69, 70, 70,
-             72, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54, 54,
-             57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 68, 68, 69, 70, 70, 72, 54,
-             52, 51, 50, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58,
-             60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, 54, 52, 52,
-             51, 49, 49, 49, 49, 49, 48, 48, 51, 52, 53, 55, 55, 58, 58, 60, 62,
-             62, 64, 65, 66, 68, 68, 70, 71, 72, 73, 73, 75, 55, 53, 53, 52, 50,
-             50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, 63, 63, 65,
-             66, 67, 69, 69, 71, 72, 73, 74, 74, 76, 57, 55, 54, 53, 52, 52, 51,
-             51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68,
-             70, 70, 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, 51, 51, 50,
-             50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70,
-             73, 73, 74, 76, 76, 78, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51,
-             54, 55, 56, 58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75,
-             76, 78, 78, 80,
-             // Size 4x8
-             31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53, 46,
-             47, 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73,
-             // Size 8x4
-             31, 32, 39, 44, 46, 47, 48, 53, 38, 40, 47, 47, 47, 46, 47, 50, 47,
-             45, 45, 51, 56, 59, 61, 64, 52, 49, 48, 53, 58, 64, 68, 73,
-             // Size 8x16
-             32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54, 30,
-             32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51, 35, 37,
-             44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, 42, 43, 47,
-             49, 50, 49, 50, 53, 49, 46, 48, 52, 53, 53, 54, 57, 48, 46, 47, 51,
-             54, 55, 57, 59, 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55,
-             58, 61, 64, 50, 46, 46, 52, 56, 59, 64, 67, 52, 48, 47, 53, 57, 61,
-             66, 71, 54, 49, 48, 54, 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69,
-             74, 57, 52, 50, 55, 59, 64, 70, 76,
-             // Size 16x8
-             32, 31, 30, 33, 35, 37, 42, 49, 48, 48, 49, 50, 52, 54, 55, 57, 31,
-             31, 32, 35, 37, 40, 43, 46, 46, 45, 45, 46, 48, 49, 51, 52, 37, 38,
-             40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 45, 45, 44,
-             46, 46, 47, 49, 52, 51, 51, 51, 52, 53, 54, 54, 55, 48, 47, 45, 46,
-             46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59, 49, 47, 45, 45, 45,
-             45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, 52, 50, 48, 47, 47, 47,
-             50, 54, 57, 59, 61, 64, 66, 68, 69, 70, 57, 54, 52, 51, 51, 50, 53,
-             57, 59, 61, 64, 67, 71, 73, 74, 76,
-             // Size 16x32
-             32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31,
-             31, 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31,
-             31, 34, 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32,
-             34, 39, 39, 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35,
-             40, 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40,
-             40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42,
-             46, 47, 46, 45, 45, 47, 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46,
-             47, 46, 46, 46, 47, 47, 49, 51, 51, 35, 37, 37, 40, 44, 44, 46, 47,
-             46, 45, 45, 47, 47, 48, 51, 51, 37, 39, 40, 43, 47, 47, 47, 47, 47,
-             45, 45, 46, 47, 48, 50, 50, 37, 39, 40, 43, 47, 47, 47, 47, 47, 45,
-             45, 46, 47, 48, 50, 50, 41, 42, 42, 44, 47, 47, 49, 49, 49, 48, 48,
-             49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, 49, 50, 50, 49, 49, 50,
-             50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, 51, 51, 51, 52, 52,
-             53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55,
-             57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55, 57,
-             57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58, 59, 59,
-             48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, 48,
-             46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46,
-             45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45,
-             45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46,
-             46, 46, 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46,
-             46, 52, 54, 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47,
-             52, 54, 56, 60, 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53,
-             54, 57, 61, 61, 65, 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54,
-             57, 61, 61, 65, 66, 68, 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58,
-             62, 62, 67, 68, 70, 73, 73, 54, 51, 50, 49, 49, 49, 54, 55, 58, 62,
-             62, 67, 68, 70, 73, 73, 55, 51, 51, 50, 49, 49, 54, 56, 58, 63, 63,
-             68, 69, 71, 74, 74, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69,
-             70, 73, 76, 76, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69, 70,
-             73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, 61, 65, 65, 70, 72, 74,
-             78, 78,
-             // Size 32x16
-             32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48,
-             48, 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31,
-             31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 46, 46, 46,
-             46, 46, 47, 47, 48, 48, 48, 50, 51, 51, 53, 53, 55, 31, 31, 31, 32,
-             32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45,
-             46, 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, 33, 34, 34, 34, 35, 35,
-             37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46,
-             47, 47, 47, 49, 49, 50, 51, 51, 53, 37, 38, 38, 39, 40, 40, 42, 43,
-             44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47,
-             47, 48, 49, 49, 50, 50, 52, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47,
-             47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48,
-             49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49,
-             49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54,
-             55, 55, 57, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51,
-             53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56,
-             58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53,
-             54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, 49,
-             47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56,
-             57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 49, 47, 47,
-             46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58,
-             58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 51, 50, 49, 48, 47,
-             47, 47, 47, 47, 46, 46, 49, 50, 52, 54, 54, 56, 57, 58, 61, 61, 62,
-             63, 64, 65, 65, 67, 67, 68, 69, 69, 70, 52, 50, 50, 49, 48, 48, 47,
-             47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65,
-             66, 66, 68, 68, 69, 70, 70, 72, 54, 52, 51, 51, 49, 49, 49, 49, 48,
-             48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68,
-             70, 70, 71, 73, 73, 74, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50,
-             52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73,
-             74, 76, 76, 78, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53,
-             54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76,
-             76, 78,
-             // Size 4x16
-             31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49, 37,
-             44, 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55, 46, 47,
-             55, 58, 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65, 48, 47, 61,
-             68, 50, 48, 62, 70, 51, 49, 63, 71, 53, 50, 64, 73,
-             // Size 16x4
-             31, 31, 32, 34, 37, 39, 42, 47, 46, 46, 46, 47, 48, 50, 51, 53, 37,
-             38, 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 49, 47,
-             45, 45, 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, 54, 51, 49,
-             49, 48, 48, 51, 55, 58, 60, 62, 65, 68, 70, 71, 73,
-             // Size 8x32
-             32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55, 31,
-             31, 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53, 30, 32,
-             40, 44, 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42,
-             46, 46, 45, 47, 51, 33, 36, 43, 46, 46, 46, 47, 51, 35, 37, 44, 46,
-             46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, 37, 40, 47, 47, 47,
-             45, 47, 50, 41, 42, 47, 49, 49, 48, 50, 52, 42, 43, 47, 49, 50, 49,
-             50, 53, 44, 44, 47, 50, 51, 51, 52, 54, 49, 46, 48, 52, 53, 53, 54,
-             57, 49, 46, 48, 52, 53, 53, 54, 57, 48, 46, 47, 51, 54, 55, 57, 59,
-             48, 46, 47, 51, 54, 56, 57, 60, 48, 45, 46, 51, 54, 57, 59, 61, 49,
-             45, 46, 51, 55, 58, 61, 64, 49, 45, 46, 51, 55, 58, 61, 64, 50, 46,
-             46, 52, 56, 59, 63, 66, 50, 46, 46, 52, 56, 59, 64, 67, 51, 47, 47,
-             52, 56, 60, 65, 68, 52, 48, 47, 53, 57, 61, 66, 71, 52, 48, 47, 53,
-             57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73, 54, 50, 49, 54, 58,
-             62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64,
-             70, 76, 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57, 61, 65, 72,
-             78,
-             // Size 32x8
-             32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48,
-             48, 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31,
-             31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45,
-             45, 45, 46, 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, 37, 38, 38, 39,
-             40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46,
-             46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44,
-             46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52,
-             52, 53, 53, 54, 54, 54, 55, 55, 57, 48, 47, 47, 46, 45, 45, 46, 46,
-             46, 47, 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57,
-             57, 58, 58, 58, 59, 59, 61, 49, 47, 47, 46, 45, 45, 45, 46, 45, 45,
-             45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62,
-             62, 63, 64, 64, 65, 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50,
-             50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69,
-             70, 70, 72, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54,
-             57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76,
-             78},
-        },
-        // Quantizer level 8.
-        {
-            {// Luma
-             // Size 4x4
-             32, 32, 38, 51, 32, 35, 40, 49, 38, 40, 54, 64, 51, 49, 64, 81,
-             // Size 8x8
-             31, 32, 32, 34, 35, 41, 47, 53, 32, 32, 32, 33, 34, 40, 44, 50, 32,
-             32, 34, 35, 37, 41, 45, 51, 34, 33, 35, 39, 42, 47, 51, 55, 35, 34,
-             37, 42, 48, 53, 57, 61, 41, 40, 41, 47, 53, 60, 65, 70, 47, 44, 45,
-             51, 57, 65, 71, 77, 53, 50, 51, 55, 61, 70, 77, 85,
-             // Size 16x16
-             32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 31,
-             32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 31, 32,
-             32, 32, 32, 32, 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 31, 32, 32,
-             32, 32, 33, 33, 34, 35, 36, 38, 41, 44, 45, 49, 54, 31, 32, 32, 32,
-             33, 34, 34, 35, 36, 38, 39, 42, 45, 46, 50, 54, 32, 32, 32, 33, 34,
-             35, 36, 37, 38, 39, 40, 42, 45, 46, 49, 53, 32, 33, 33, 33, 34, 36,
-             36, 38, 40, 41, 42, 44, 47, 48, 51, 55, 34, 34, 33, 34, 35, 37, 38,
-             39, 42, 44, 45, 47, 50, 51, 54, 58, 36, 35, 34, 35, 36, 38, 40, 42,
-             48, 50, 50, 54, 56, 57, 60, 64, 38, 37, 36, 36, 38, 39, 41, 44, 50,
-             51, 52, 56, 58, 60, 63, 67, 39, 38, 37, 38, 39, 40, 42, 45, 50, 52,
-             54, 58, 60, 62, 65, 69, 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58,
-             63, 66, 68, 71, 75, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66,
-             69, 71, 75, 79, 49, 47, 46, 45, 46, 46, 48, 51, 57, 60, 62, 68, 71,
-             73, 77, 81, 54, 51, 50, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77,
-             82, 87, 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87,
-             92,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36,
-             36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 54, 55, 59, 59, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37,
-             39, 39, 41, 43, 43, 46, 47, 48, 51, 52, 53, 57, 57, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38,
-             41, 42, 43, 45, 46, 47, 51, 51, 53, 56, 56, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42,
-             42, 45, 46, 47, 51, 51, 52, 56, 56, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44,
-             45, 46, 49, 50, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 33, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 44, 45,
-             49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
-             33, 34, 34, 34, 35, 35, 36, 38, 38, 40, 41, 41, 44, 45, 45, 49, 49,
-             50, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35,
-             35, 35, 36, 36, 38, 39, 39, 41, 42, 42, 44, 45, 46, 49, 50, 51, 54,
-             54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 36,
-             36, 36, 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 50, 51, 54, 54, 32,
-             32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37,
-             38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32,
-             32, 32, 32, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40,
-             40, 42, 42, 43, 45, 46, 46, 49, 49, 50, 53, 53, 32, 32, 32, 32, 32,
-             32, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42,
-             42, 43, 45, 46, 46, 49, 49, 50, 53, 53, 32, 33, 33, 33, 33, 33, 33,
-             34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 44, 45,
-             47, 47, 48, 51, 51, 52, 55, 55, 34, 34, 34, 34, 33, 33, 34, 35, 35,
-             35, 37, 37, 38, 39, 39, 41, 42, 42, 44, 45, 45, 47, 47, 48, 50, 51,
-             51, 54, 54, 55, 58, 58, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37,
-             37, 38, 39, 39, 41, 42, 42, 44, 45, 45, 47, 47, 48, 50, 51, 51, 54,
-             54, 55, 58, 58, 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39,
-             41, 41, 43, 45, 45, 47, 47, 47, 49, 50, 51, 53, 53, 54, 57, 57, 58,
-             61, 61, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42,
-             45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, 60, 61, 64, 64,
-             36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
-             48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, 60, 61, 64, 64, 38, 37,
-             37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, 50, 50, 51,
-             52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 39, 39, 38, 38,
-             37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54,
-             56, 58, 58, 60, 61, 62, 64, 65, 66, 69, 69, 39, 39, 38, 38, 37, 37,
-             38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, 58,
-             58, 60, 61, 62, 64, 65, 66, 69, 69, 42, 41, 41, 41, 40, 40, 40, 41,
-             41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 61, 62, 64,
-             65, 66, 69, 69, 70, 73, 73, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42,
-             42, 42, 44, 47, 47, 50, 54, 54, 56, 58, 58, 61, 63, 64, 66, 67, 68,
-             71, 71, 72, 75, 75, 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43,
-             45, 48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64, 66, 67, 68, 71, 72,
-             73, 76, 76, 47, 46, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 47, 50,
-             50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, 79,
-             79, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51, 53,
-             57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 72, 75, 76, 77, 80, 80, 49,
-             48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57,
-             60, 62, 62, 66, 68, 68, 71, 72, 73, 77, 77, 78, 81, 81, 53, 51, 51,
-             51, 49, 49, 49, 49, 49, 49, 49, 49, 51, 54, 54, 57, 59, 59, 62, 64,
-             64, 69, 71, 71, 74, 75, 77, 81, 81, 83, 86, 86, 54, 52, 51, 51, 50,
-             49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69,
-             71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 55, 53, 53, 52, 51, 50, 50,
-             51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73,
-             76, 77, 78, 83, 83, 85, 88, 88, 59, 57, 56, 56, 54, 54, 54, 54, 54,
-             54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80,
-             81, 86, 87, 88, 92, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53,
-             53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86,
-             87, 88, 92, 92,
-             // Size 4x8
-             32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54, 35,
-             38, 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83,
-             // Size 8x4
-             32, 32, 32, 34, 35, 40, 46, 52, 32, 33, 34, 37, 38, 42, 46, 51, 37,
-             36, 38, 44, 49, 55, 59, 64, 52, 49, 49, 54, 60, 69, 76, 83,
-             // Size 8x16
-             32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51, 31,
-             32, 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49, 32, 32,
-             34, 34, 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35,
-             36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, 36, 34, 37, 40,
-             48, 54, 56, 60, 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50,
-             58, 60, 65, 44, 41, 42, 45, 53, 63, 66, 71, 47, 44, 45, 47, 56, 66,
-             69, 75, 49, 46, 47, 48, 57, 67, 71, 77, 53, 49, 50, 51, 60, 71, 75,
-             82, 58, 54, 54, 55, 63, 75, 79, 87,
-             // Size 16x8
-             32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 31,
-             32, 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 32, 32,
-             32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 32, 33, 33,
-             33, 34, 36, 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 36, 35, 35, 35,
-             36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 44, 42, 41, 41, 42,
-             42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 47, 45, 44, 44, 45, 45,
-             47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 53, 51, 49, 49, 50, 49, 51,
-             54, 60, 63, 65, 71, 75, 77, 82, 87,
-             // Size 16x32
-             32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31,
-             32, 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32,
-             32, 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32,
-             32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32,
-             32, 32, 33, 34, 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32,
-             33, 33, 34, 34, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33,
-             33, 35, 35, 36, 41, 41, 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34,
-             36, 36, 38, 42, 42, 45, 49, 49, 54, 32, 32, 32, 33, 34, 34, 34, 36,
-             36, 38, 42, 42, 45, 50, 50, 54, 32, 32, 32, 33, 34, 34, 35, 37, 37,
-             38, 42, 42, 45, 49, 49, 54, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39,
-             42, 42, 45, 49, 49, 53, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39, 42,
-             42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, 36, 39, 40, 41, 44, 44,
-             47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50,
-             54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50, 54,
-             54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, 53, 57, 57,
-             61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64,
-             36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, 38,
-             37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38,
-             37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37,
-             38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40,
-             42, 42, 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42,
-             43, 45, 52, 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43,
-             45, 52, 54, 56, 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47,
-             54, 56, 58, 66, 66, 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55,
-             56, 59, 67, 67, 70, 76, 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57,
-             60, 67, 67, 71, 77, 77, 81, 53, 50, 49, 49, 49, 49, 51, 58, 59, 62,
-             71, 71, 74, 81, 81, 86, 53, 51, 49, 49, 50, 50, 51, 59, 60, 63, 71,
-             71, 75, 82, 82, 87, 55, 52, 51, 51, 51, 51, 53, 60, 61, 64, 72, 72,
-             76, 83, 83, 88, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79,
-             87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79, 87,
-             87, 92,
-             // Size 32x16
-             32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36,
-             36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37,
-             38, 38, 40, 42, 42, 45, 46, 47, 50, 51, 52, 55, 55, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37,
-             40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 31, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 41,
-             41, 44, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, 32, 32, 33, 33,
-             34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45,
-             46, 47, 49, 50, 51, 54, 54, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
-             35, 35, 36, 37, 37, 37, 38, 38, 40, 40, 40, 42, 43, 43, 45, 46, 47,
-             49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
-             36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51,
-             53, 55, 55, 35, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 39, 42,
-             42, 44, 47, 47, 48, 49, 49, 51, 52, 52, 54, 55, 56, 58, 59, 60, 62,
-             62, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45,
-             48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, 38,
-             37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 46, 49, 49,
-             51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 44, 43, 42,
-             42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58,
-             58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 44, 43, 42, 42, 41,
-             41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61,
-             63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, 44,
-             45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66,
-             69, 70, 71, 74, 75, 76, 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50,
-             49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76,
-             77, 81, 82, 83, 87, 87, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49,
-             49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81,
-             82, 83, 87, 87, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55,
-             58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88,
-             92, 92,
-             // Size 4x16
-             31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49, 32,
-             34, 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54, 35, 38,
-             49, 60, 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71, 45, 45, 58,
-             75, 47, 47, 60, 77, 51, 50, 63, 82, 55, 54, 67, 87,
-             // Size 16x4
-             31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 55, 32,
-             32, 32, 33, 34, 35, 36, 37, 38, 40, 40, 43, 45, 47, 50, 54, 38, 37,
-             36, 36, 38, 39, 41, 44, 49, 51, 52, 56, 58, 60, 63, 67, 53, 51, 49,
-             49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87,
-             // Size 8x32
-             32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52, 31,
-             32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32,
-             32, 33, 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49, 31, 32, 33,
-             33, 35, 41, 44, 49, 32, 32, 33, 34, 36, 42, 45, 49, 32, 32, 34, 34,
-             36, 42, 45, 50, 32, 32, 34, 35, 37, 42, 45, 49, 32, 33, 35, 36, 38,
-             42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44,
-             47, 51, 34, 34, 36, 38, 42, 48, 50, 54, 34, 34, 36, 38, 42, 48, 50,
-             54, 35, 34, 37, 39, 45, 50, 53, 57, 36, 34, 37, 40, 48, 54, 56, 60,
-             36, 34, 37, 40, 48, 54, 56, 60, 38, 36, 39, 41, 49, 56, 58, 63, 39,
-             37, 40, 42, 50, 58, 60, 65, 39, 37, 40, 42, 50, 58, 60, 65, 42, 40,
-             42, 44, 52, 61, 64, 69, 44, 41, 42, 45, 53, 63, 66, 71, 44, 41, 43,
-             45, 54, 63, 66, 72, 47, 44, 45, 47, 56, 66, 69, 75, 48, 45, 46, 48,
-             56, 67, 70, 76, 49, 46, 47, 48, 57, 67, 71, 77, 53, 49, 49, 51, 59,
-             71, 74, 81, 53, 49, 50, 51, 60, 71, 75, 82, 55, 51, 51, 53, 61, 72,
-             76, 83, 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, 63, 75, 79,
-             87,
-             // Size 32x8
-             32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36,
-             36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36,
-             37, 37, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32,
-             32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40,
-             42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33,
-             33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45,
-             45, 47, 48, 48, 51, 51, 53, 55, 55, 36, 35, 35, 35, 35, 34, 35, 36,
-             36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56,
-             56, 57, 59, 60, 61, 63, 63, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42,
-             42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67,
-             71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45,
-             47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75,
-             76, 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54,
-             54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87,
-             87},
-            {// Chroma
-             // Size 4x4
-             31, 38, 47, 49, 38, 47, 46, 46, 47, 46, 54, 57, 49, 46, 57, 66,
-             // Size 8x8
-             31, 31, 35, 42, 48, 47, 49, 51, 31, 32, 36, 42, 46, 45, 46, 48, 35,
-             36, 41, 45, 47, 45, 46, 48, 42, 42, 45, 48, 50, 49, 50, 51, 48, 46,
-             47, 50, 53, 53, 54, 54, 47, 45, 45, 49, 53, 57, 59, 60, 49, 46, 46,
-             50, 54, 59, 61, 64, 51, 48, 48, 51, 54, 60, 64, 68,
-             // Size 16x16
-             32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 31,
-             31, 31, 32, 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 30, 31,
-             31, 32, 35, 39, 41, 42, 46, 46, 46, 45, 46, 47, 48, 50, 31, 32, 32,
-             33, 36, 40, 41, 43, 46, 46, 45, 45, 46, 46, 47, 49, 33, 34, 35, 36,
-             39, 43, 44, 45, 47, 46, 46, 45, 46, 47, 47, 49, 36, 38, 39, 40, 43,
-             47, 47, 47, 48, 47, 46, 45, 46, 46, 47, 48, 38, 40, 41, 41, 44, 47,
-             47, 48, 49, 48, 48, 47, 47, 47, 48, 49, 41, 42, 42, 43, 45, 47, 48,
-             48, 50, 50, 49, 49, 50, 50, 50, 52, 49, 47, 46, 46, 47, 48, 49, 50,
-             53, 53, 53, 53, 54, 54, 54, 55, 49, 47, 46, 46, 46, 47, 48, 50, 53,
-             53, 54, 55, 55, 55, 56, 57, 48, 47, 46, 45, 46, 46, 48, 49, 53, 54,
-             54, 55, 56, 56, 57, 58, 49, 47, 45, 45, 45, 45, 47, 49, 53, 55, 55,
-             58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
-             61, 61, 63, 64, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61,
-             62, 64, 66, 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64,
-             66, 68, 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64, 66, 68,
-             71,
-             // Size 32x32
-             32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 36, 36, 38, 41, 41, 45, 49,
-             49, 49, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31,
-             31, 31, 31, 31, 31, 34, 34, 35, 38, 38, 39, 42, 42, 45, 48, 48, 47,
-             47, 47, 47, 47, 47, 49, 49, 49, 50, 50, 51, 53, 53, 31, 31, 31, 31,
-             31, 31, 32, 34, 34, 35, 38, 38, 40, 42, 42, 45, 47, 47, 47, 47, 47,
-             47, 47, 47, 48, 48, 48, 49, 50, 50, 52, 52, 31, 31, 31, 31, 31, 31,
-             32, 34, 34, 36, 38, 38, 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 46,
-             47, 48, 48, 48, 49, 49, 50, 52, 52, 30, 31, 31, 31, 31, 31, 32, 35,
-             35, 36, 39, 39, 41, 42, 42, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46,
-             47, 47, 48, 48, 48, 50, 50, 30, 31, 31, 31, 31, 32, 32, 35, 35, 36,
-             40, 40, 41, 42, 42, 44, 46, 46, 46, 45, 45, 45, 45, 45, 46, 46, 46,
-             47, 47, 48, 49, 49, 31, 31, 32, 32, 32, 32, 33, 35, 36, 37, 40, 40,
-             41, 43, 43, 44, 46, 46, 46, 45, 45, 45, 45, 45, 46, 46, 46, 47, 47,
-             48, 49, 49, 33, 34, 34, 34, 35, 35, 35, 38, 38, 40, 43, 43, 43, 44,
-             44, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 49,
-             49, 33, 34, 34, 34, 35, 35, 36, 38, 39, 40, 43, 43, 44, 45, 45, 46,
-             47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, 34,
-             35, 35, 36, 36, 36, 37, 40, 40, 41, 44, 44, 45, 45, 45, 46, 47, 47,
-             47, 46, 46, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 36, 38, 38,
-             38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46,
-             46, 45, 45, 45, 46, 46, 46, 46, 47, 47, 48, 48, 36, 38, 38, 38, 39,
-             40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45,
-             45, 45, 46, 46, 46, 46, 47, 47, 48, 48, 38, 39, 40, 40, 41, 41, 41,
-             43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47,
-             47, 47, 47, 48, 48, 48, 49, 49, 41, 42, 42, 42, 42, 42, 43, 44, 45,
-             45, 47, 47, 48, 48, 48, 49, 50, 50, 50, 49, 49, 49, 49, 49, 50, 50,
-             50, 50, 50, 51, 52, 52, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47,
-             47, 48, 48, 48, 49, 50, 50, 50, 49, 49, 49, 49, 49, 50, 50, 50, 50,
-             50, 51, 52, 52, 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48,
-             49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52,
-             53, 53, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50,
-             51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55,
-             49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
-             53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 49, 47,
-             47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53,
-             54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 48, 47, 47, 47,
-             46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54,
-             55, 55, 56, 56, 56, 56, 57, 57, 58, 58, 58, 48, 47, 47, 47, 46, 45,
-             45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, 55,
-             56, 56, 56, 56, 57, 57, 58, 58, 58, 49, 47, 47, 47, 45, 45, 45, 45,
-             45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 57, 58, 58,
-             59, 59, 60, 60, 60, 61, 61, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45,
-             45, 45, 47, 49, 49, 51, 53, 53, 55, 55, 55, 57, 58, 58, 59, 60, 60,
-             61, 61, 61, 62, 62, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45,
-             47, 49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59, 59, 60, 60, 61, 61,
-             62, 63, 63, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50,
-             50, 52, 54, 54, 55, 56, 56, 58, 59, 59, 61, 61, 61, 63, 63, 63, 64,
-             64, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52,
-             54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 65, 51,
-             49, 48, 48, 47, 46, 46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54,
-             55, 56, 56, 59, 60, 60, 61, 62, 62, 64, 64, 64, 66, 66, 52, 50, 49,
-             49, 48, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 52, 54, 54, 56, 57,
-             57, 60, 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 52, 50, 50, 49, 48,
-             47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60,
-             61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 53, 51, 50, 50, 48, 48, 48,
-             48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62,
-             63, 64, 64, 67, 67, 68, 69, 69, 54, 53, 52, 52, 50, 49, 49, 49, 49,
-             49, 48, 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65,
-             66, 68, 68, 69, 71, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48,
-             48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68,
-             68, 69, 71, 71,
-             // Size 4x8
-             31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50, 47,
-             48, 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67,
-             // Size 8x4
-             31, 31, 36, 42, 47, 46, 48, 50, 38, 40, 44, 47, 48, 46, 46, 48, 47,
-             46, 47, 50, 53, 54, 55, 56, 50, 48, 47, 50, 54, 60, 64, 67,
-             // Size 8x16
-             32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50, 30,
-             32, 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48, 33, 36,
-             41, 44, 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 39, 41, 46,
-             47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, 49, 46, 48, 49,
-             53, 53, 54, 54, 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53,
-             56, 56, 57, 49, 45, 45, 47, 53, 58, 59, 61, 50, 46, 46, 48, 54, 59,
-             61, 63, 51, 47, 47, 48, 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63,
-             66, 54, 50, 49, 50, 55, 62, 65, 68,
-             // Size 16x8
-             32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 31,
-             31, 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 35, 37,
-             38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 38, 40, 40,
-             41, 44, 47, 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 48, 47, 46, 46,
-             47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 49, 47, 45, 45, 46,
-             45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46,
-             47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 52, 50, 48, 48, 47, 47, 48,
-             50, 54, 56, 57, 61, 63, 64, 66, 68,
-             // Size 16x32
-             32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31,
-             31, 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31,
-             31, 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31,
-             32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32,
-             38, 39, 40, 45, 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38,
-             40, 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40,
-             41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43,
-             46, 47, 46, 45, 45, 46, 47, 47, 49, 33, 35, 36, 36, 41, 43, 44, 46,
-             47, 46, 46, 46, 46, 47, 47, 49, 34, 36, 37, 37, 42, 44, 45, 47, 47,
-             47, 45, 45, 46, 47, 47, 49, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47,
-             45, 45, 46, 47, 47, 48, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47, 45,
-             45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, 47, 48, 48, 48, 47, 47,
-             47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50,
-             50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50, 50,
-             50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, 52, 52, 52,
-             54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55,
-             49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, 48,
-             47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46,
-             46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46,
-             46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45,
-             46, 46, 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45,
-             46, 47, 52, 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46,
-             47, 52, 53, 55, 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48,
-             53, 54, 55, 59, 59, 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53,
-             54, 55, 59, 59, 61, 64, 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54,
-             55, 60, 60, 61, 64, 64, 66, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56,
-             61, 61, 63, 66, 66, 68, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61,
-             61, 63, 66, 66, 68, 53, 50, 48, 48, 48, 48, 49, 54, 54, 56, 61, 61,
-             63, 67, 67, 69, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65,
-             68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65, 68,
-             68, 71,
-             // Size 32x16
-             32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49,
-             49, 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31,
-             31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 40, 42, 42, 45, 47, 47, 47,
-             46, 46, 46, 46, 46, 47, 48, 48, 49, 49, 50, 51, 51, 31, 31, 31, 31,
-             32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46,
-             45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 31, 32, 32, 32, 32, 33,
-             33, 36, 36, 37, 41, 41, 42, 43, 43, 45, 47, 47, 46, 46, 46, 45, 45,
-             45, 46, 46, 47, 48, 48, 48, 50, 50, 35, 36, 37, 37, 38, 38, 38, 41,
-             41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46,
-             46, 47, 47, 47, 48, 49, 49, 37, 38, 38, 38, 39, 40, 40, 43, 43, 44,
-             47, 47, 47, 47, 47, 47, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47,
-             47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47,
-             47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48,
-             49, 50, 50, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 50,
-             50, 51, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 53, 54, 55,
-             55, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51,
-             53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 48,
-             47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53,
-             53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 49, 48, 47,
-             47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56,
-             56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 49, 48, 47, 47, 45,
-             45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57,
-             58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, 46,
-             46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60,
-             61, 61, 61, 63, 63, 63, 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47,
-             47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64,
-             64, 66, 66, 67, 68, 68, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47,
-             47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66,
-             66, 67, 68, 68, 54, 53, 52, 52, 50, 50, 50, 49, 49, 49, 48, 48, 50,
-             52, 52, 54, 55, 55, 57, 59, 59, 61, 62, 63, 65, 65, 66, 68, 68, 69,
-             71, 71,
-             // Size 4x16
-             31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48, 35,
-             43, 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50, 47, 48,
-             53, 54, 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61, 47, 46, 55,
-             63, 48, 47, 55, 64, 49, 47, 56, 66, 51, 49, 57, 68,
-             // Size 16x4
-             31, 31, 31, 32, 35, 39, 40, 42, 47, 47, 46, 46, 47, 48, 49, 51, 37,
-             38, 39, 40, 43, 47, 47, 47, 48, 47, 47, 46, 46, 47, 47, 49, 48, 47,
-             46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 52, 50, 48,
-             48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68,
-             // Size 8x32
-             32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50, 31,
-             31, 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32,
-             38, 40, 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48, 31, 33, 38,
-             41, 46, 45, 46, 48, 33, 35, 41, 43, 47, 45, 46, 47, 33, 36, 41, 44,
-             47, 46, 46, 47, 34, 37, 42, 45, 47, 45, 46, 47, 37, 40, 45, 47, 47,
-             45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47,
-             47, 48, 42, 43, 46, 48, 50, 49, 50, 50, 42, 43, 46, 48, 50, 49, 50,
-             50, 45, 44, 47, 48, 51, 51, 52, 52, 49, 46, 48, 49, 53, 53, 54, 54,
-             49, 46, 48, 49, 53, 53, 54, 54, 48, 46, 47, 48, 53, 55, 55, 56, 48,
-             46, 46, 48, 53, 56, 56, 57, 48, 46, 46, 48, 53, 56, 56, 57, 49, 45,
-             46, 47, 53, 57, 58, 60, 49, 45, 45, 47, 53, 58, 59, 61, 49, 45, 46,
-             47, 53, 58, 60, 61, 50, 46, 46, 48, 54, 59, 61, 63, 50, 46, 46, 48,
-             54, 59, 61, 64, 51, 47, 47, 48, 54, 60, 61, 64, 52, 48, 47, 48, 54,
-             61, 63, 66, 52, 48, 47, 48, 54, 61, 63, 66, 53, 48, 48, 49, 54, 61,
-             63, 67, 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, 55, 62, 65,
-             68,
-             // Size 32x8
-             32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49,
-             49, 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31,
-             31, 31, 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46,
-             46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 35, 36, 37, 37,
-             38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46,
-             46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41,
-             41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47,
-             47, 48, 48, 48, 48, 48, 49, 50, 50, 48, 47, 47, 47, 46, 46, 46, 47,
-             47, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54,
-             54, 54, 54, 54, 54, 55, 55, 49, 48, 47, 47, 45, 45, 45, 45, 46, 45,
-             45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60,
-             61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46,
-             47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63,
-             63, 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50,
-             50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68,
-             68},
-        },
-        // Quantizer level 9.
-        {
-            {// Luma
-             // Size 4x4
-             32, 32, 35, 43, 32, 34, 37, 43, 35, 37, 48, 54, 43, 43, 54, 65,
-             // Size 8x8
-             31, 31, 32, 32, 34, 37, 43, 47, 31, 32, 32, 32, 34, 36, 41, 44, 32,
-             32, 33, 34, 35, 38, 42, 45, 32, 32, 34, 35, 37, 39, 42, 46, 34, 34,
-             35, 37, 41, 45, 49, 52, 37, 36, 38, 39, 45, 51, 56, 59, 43, 41, 42,
-             42, 49, 56, 63, 67, 47, 44, 45, 46, 52, 59, 67, 71,
-             // Size 16x16
-             32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31,
-             32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 31, 32,
-             32, 32, 32, 32, 32, 33, 34, 34, 35, 38, 39, 42, 45, 45, 31, 32, 32,
-             32, 32, 32, 32, 33, 33, 34, 34, 37, 38, 41, 44, 44, 31, 32, 32, 32,
-             33, 33, 33, 34, 35, 36, 36, 39, 40, 42, 44, 45, 31, 32, 32, 32, 33,
-             33, 34, 34, 35, 36, 36, 39, 40, 42, 45, 45, 32, 32, 32, 32, 33, 34,
-             35, 36, 37, 38, 38, 40, 41, 42, 45, 46, 32, 33, 33, 33, 34, 34, 36,
-             36, 38, 39, 40, 42, 43, 44, 47, 47, 34, 34, 34, 33, 35, 35, 37, 38,
-             39, 42, 42, 45, 46, 47, 50, 51, 35, 35, 34, 34, 36, 36, 38, 39, 42,
-             46, 47, 49, 50, 52, 55, 55, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47,
-             48, 50, 52, 54, 56, 57, 39, 38, 38, 37, 39, 39, 40, 42, 45, 49, 50,
-             54, 55, 58, 60, 61, 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55,
-             57, 60, 62, 63, 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60,
-             63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66,
-             69, 70, 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70,
-             71,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34,
-             34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 45, 47, 48, 48, 51, 31, 31,
-             31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35,
-             35, 35, 37, 39, 39, 40, 43, 43, 44, 46, 47, 47, 50, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35,
-             37, 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38,
-             38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 36, 38, 38, 39,
-             42, 42, 42, 45, 45, 45, 48, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 38, 41, 41,
-             41, 44, 44, 44, 47, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 38, 41, 41, 41, 44,
-             44, 44, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
-             33, 34, 34, 34, 34, 35, 35, 36, 38, 38, 39, 41, 41, 42, 44, 45, 45,
-             47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 35,
-             35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 31,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35,
-             36, 36, 36, 38, 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 31, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36,
-             36, 38, 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 39,
-             40, 40, 41, 42, 42, 43, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 39, 40, 40,
-             41, 42, 42, 43, 45, 46, 46, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 39, 40, 40, 41, 42,
-             42, 43, 45, 46, 46, 48, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34,
-             35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 44, 44, 45,
-             47, 47, 47, 50, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37,
-             37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, 46, 47, 47, 48, 50, 51,
-             51, 53, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38,
-             39, 39, 40, 42, 42, 42, 44, 45, 45, 46, 47, 47, 48, 50, 51, 51, 53,
-             34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40,
-             41, 43, 44, 44, 45, 46, 46, 47, 49, 49, 49, 51, 52, 52, 54, 35, 35,
-             35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, 42, 43, 46,
-             47, 47, 48, 49, 49, 50, 52, 52, 53, 55, 55, 55, 57, 36, 35, 35, 35,
-             35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48,
-             50, 50, 50, 52, 54, 54, 54, 56, 57, 57, 58, 36, 35, 35, 35, 35, 34,
-             34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, 50,
-             50, 52, 54, 54, 54, 56, 57, 57, 58, 38, 37, 37, 37, 36, 36, 36, 36,
-             37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 52, 52, 54,
-             56, 56, 57, 58, 59, 59, 61, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39,
-             39, 40, 40, 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 55, 58, 58,
-             58, 60, 61, 61, 63, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40,
-             40, 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 55, 58, 58, 58, 60,
-             61, 61, 63, 41, 40, 40, 40, 39, 38, 38, 39, 40, 40, 40, 41, 41, 41,
-             43, 46, 46, 47, 50, 52, 52, 54, 55, 55, 57, 60, 60, 60, 62, 63, 63,
-             66, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 47,
-             47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 44,
-             43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49,
-             52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 45, 44, 43,
-             43, 42, 41, 41, 42, 42, 42, 42, 43, 43, 43, 45, 48, 48, 49, 53, 54,
-             54, 57, 58, 58, 60, 64, 64, 65, 67, 68, 68, 70, 47, 46, 45, 45, 45,
-             44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58,
-             60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 48, 47, 46, 46, 45, 44, 44,
-             45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61,
-             63, 67, 67, 68, 70, 71, 71, 74, 48, 47, 46, 46, 45, 44, 44, 45, 45,
-             45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67,
-             67, 68, 70, 71, 71, 74, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48,
-             48, 48, 48, 50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70,
-             73, 74, 74, 77,
-             // Size 4x8
-             31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42, 34,
-             37, 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67,
-             // Size 8x4
-             31, 32, 32, 32, 34, 37, 42, 46, 32, 33, 34, 35, 37, 40, 43, 46, 35,
-             34, 36, 38, 43, 49, 53, 56, 43, 41, 42, 42, 49, 56, 63, 67,
-             // Size 8x16
-             32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45, 31,
-             32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32,
-             33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, 32, 33, 34,
-             35, 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37,
-             41, 42, 48, 50, 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46,
-             48, 54, 56, 39, 37, 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51,
-             60, 62, 44, 41, 42, 43, 51, 53, 63, 66, 47, 44, 44, 45, 53, 56, 66,
-             69, 48, 45, 45, 46, 54, 56, 67, 70,
-             // Size 16x8
-             32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31,
-             32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 31, 32,
-             32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 32, 32, 32,
-             33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 35, 35, 34, 34,
-             35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 36, 35, 35, 34, 36,
-             36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42,
-             42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45,
-             47, 50, 55, 56, 60, 62, 66, 69, 70,
-             // Size 16x32
-             32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31,
-             31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32,
-             32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32,
-             32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32,
-             32, 32, 32, 33, 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32,
-             33, 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33,
-             33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33,
-             33, 34, 35, 35, 38, 41, 41, 44, 49, 31, 32, 32, 32, 33, 34, 34, 34,
-             35, 36, 36, 39, 42, 42, 44, 49, 32, 32, 32, 32, 33, 34, 34, 34, 36,
-             36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 34, 34, 34, 36, 36,
-             36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 35, 35, 35, 37, 37, 37,
-             40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41,
-             42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41, 42,
-             42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, 40, 42, 44, 44,
-             47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50,
-             54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50, 54,
-             34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55, 35,
-             35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35,
-             34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34,
-             34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36,
-             37, 40, 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39,
-             40, 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40,
-             40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41,
-             43, 49, 51, 51, 56, 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45,
-             51, 53, 53, 59, 63, 63, 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51,
-             53, 53, 59, 63, 63, 66, 71, 44, 43, 42, 42, 42, 43, 43, 45, 51, 54,
-             54, 59, 64, 64, 67, 72, 47, 45, 44, 44, 44, 45, 45, 47, 53, 56, 56,
-             61, 66, 66, 69, 75, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62,
-             67, 67, 70, 76, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62, 67,
-             67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, 56, 58, 58, 64, 69, 69,
-             73, 79,
-             // Size 32x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34,
-             34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35,
-             35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34,
-             36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37,
-             37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40,
-             42, 42, 42, 44, 45, 45, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34,
-             34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43,
-             43, 45, 46, 46, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35,
-             35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45,
-             46, 46, 48, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36,
-             36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45, 47, 48, 48,
-             50, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41,
-             41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, 36,
-             35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43,
-             47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 36, 35, 35,
-             35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48,
-             48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 40, 39, 39, 39, 39,
-             38, 38, 38, 39, 39, 39, 40, 41, 41, 42, 45, 45, 46, 50, 51, 51, 53,
-             54, 54, 56, 59, 59, 59, 61, 62, 62, 64, 44, 43, 42, 42, 41, 41, 41,
-             41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58,
-             60, 63, 63, 64, 66, 67, 67, 69, 44, 43, 42, 42, 41, 41, 41, 41, 42,
-             42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63,
-             63, 64, 66, 67, 67, 69, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45,
-             45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67,
-             69, 70, 70, 73, 53, 52, 51, 51, 50, 49, 49, 49, 49, 50, 50, 49, 49,
-             49, 51, 54, 54, 55, 59, 60, 60, 63, 65, 65, 67, 71, 71, 72, 75, 76,
-             76, 79,
-             // Size 4x16
-             31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41, 32,
-             34, 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44, 34, 37,
-             42, 48, 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58, 40, 41, 51,
-             60, 42, 43, 53, 63, 45, 45, 56, 66, 46, 46, 56, 67,
-             // Size 16x4
-             31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 32,
-             32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 36, 35,
-             35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 44, 42, 41,
-             41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67,
-             // Size 8x32
-             32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46, 31,
-             32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32,
-             32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32,
-             33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 35, 41, 44, 31, 32, 33, 34,
-             35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 34, 36,
-             36, 42, 45, 32, 32, 33, 35, 37, 37, 42, 45, 32, 33, 34, 35, 37, 38,
-             42, 45, 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44,
-             47, 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 41, 42, 48, 50,
-             34, 34, 35, 37, 42, 43, 49, 51, 35, 34, 36, 38, 45, 47, 52, 55, 36,
-             34, 36, 38, 46, 48, 54, 56, 36, 34, 36, 38, 46, 48, 54, 56, 38, 36,
-             37, 40, 47, 49, 56, 58, 39, 37, 39, 40, 48, 50, 58, 60, 39, 37, 39,
-             40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43,
-             51, 53, 63, 66, 44, 41, 42, 43, 51, 53, 63, 66, 44, 42, 42, 43, 51,
-             54, 64, 67, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56,
-             67, 70, 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, 56, 58, 69,
-             73,
-             // Size 32x8
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34,
-             34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34,
-             34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36,
-             37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 32, 32, 32, 32, 32, 33,
-             33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40,
-             40, 41, 43, 43, 43, 45, 46, 46, 48, 35, 35, 35, 35, 34, 34, 34, 34,
-             35, 36, 36, 37, 37, 37, 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49,
-             51, 51, 51, 53, 54, 54, 56, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36,
-             36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53,
-             54, 56, 56, 56, 58, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42,
-             42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66,
-             67, 67, 69, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45,
-             47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70,
-             73},
-            {// Chroma
-             // Size 4x4
-             31, 37, 47, 47, 37, 44, 47, 45, 47, 47, 53, 53, 47, 45, 53, 59,
-             // Size 8x8
-             31, 31, 34, 37, 43, 48, 47, 49, 31, 32, 35, 40, 43, 46, 45, 46, 34,
-             35, 39, 43, 45, 46, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 43, 43,
-             45, 47, 49, 50, 50, 50, 48, 46, 46, 47, 50, 53, 55, 55, 47, 45, 45,
-             45, 50, 55, 58, 60, 49, 46, 46, 46, 50, 55, 60, 61,
-             // Size 16x16
-             32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 31,
-             31, 31, 31, 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 31, 31,
-             31, 31, 34, 35, 39, 40, 42, 46, 47, 46, 46, 46, 47, 47, 30, 31, 31,
-             32, 34, 35, 40, 41, 42, 45, 46, 45, 45, 45, 46, 46, 33, 34, 34, 34,
-             37, 38, 42, 43, 44, 46, 47, 46, 46, 45, 46, 46, 33, 34, 35, 35, 38,
-             39, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 36, 38, 39, 40, 42, 43,
-             47, 47, 47, 47, 48, 46, 46, 45, 46, 46, 38, 40, 40, 41, 43, 44, 47,
-             47, 48, 48, 49, 48, 47, 47, 47, 47, 41, 42, 42, 42, 44, 45, 47, 48,
-             48, 50, 50, 49, 49, 49, 50, 50, 47, 46, 46, 45, 46, 47, 47, 48, 50,
-             52, 52, 52, 52, 52, 53, 53, 49, 47, 47, 46, 47, 47, 48, 49, 50, 52,
-             53, 53, 53, 53, 54, 54, 48, 47, 46, 45, 46, 46, 46, 48, 49, 52, 53,
-             54, 55, 55, 56, 56, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55,
-             55, 57, 57, 58, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57,
-             58, 59, 60, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59,
-             61, 61, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60, 61,
-             61,
-             // Size 32x32
-             32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 41,
-             43, 47, 49, 49, 49, 48, 48, 49, 49, 49, 49, 50, 50, 50, 51, 31, 31,
-             31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 42, 43, 47,
-             48, 48, 48, 47, 47, 47, 47, 47, 48, 49, 49, 49, 50, 31, 31, 31, 31,
-             31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 42, 43, 46, 47, 47,
-             47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 31, 31, 31, 31, 31, 31,
-             31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47,
-             47, 47, 47, 47, 47, 48, 48, 48, 49, 31, 31, 31, 31, 31, 31, 31, 32,
-             34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 46, 46, 46, 46,
-             46, 46, 46, 47, 47, 47, 48, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35,
-             35, 38, 40, 40, 41, 42, 42, 43, 45, 46, 46, 46, 45, 45, 45, 45, 45,
-             45, 46, 46, 46, 47, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, 35, 38,
-             40, 40, 41, 42, 42, 43, 45, 46, 46, 46, 45, 45, 45, 45, 45, 45, 46,
-             46, 46, 47, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40,
-             41, 43, 43, 43, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, 46, 46, 46,
-             47, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, 43, 44,
-             44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 33,
-             34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45,
-             47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 33, 34, 34,
-             34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, 47, 47,
-             47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 35, 36, 37, 37, 37,
-             38, 38, 38, 41, 41, 41, 44, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47,
-             46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 36, 37, 38, 38, 39, 40, 40,
-             40, 42, 43, 43, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46,
-             46, 45, 45, 45, 46, 46, 46, 46, 36, 37, 38, 38, 39, 40, 40, 40, 42,
-             43, 43, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 46, 45,
-             45, 45, 46, 46, 46, 46, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44,
-             46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47,
-             47, 47, 47, 48, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47,
-             47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 50, 50,
-             50, 50, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48,
-             48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 50, 50, 50, 50,
-             43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49,
-             49, 50, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, 47, 47,
-             46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52,
-             52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 49, 48, 47, 47,
-             47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53,
-             53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 47, 46,
-             46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, 53,
-             53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 46, 46, 46,
-             46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 54, 54, 54,
-             55, 55, 55, 55, 55, 55, 56, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46,
-             46, 46, 46, 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 55, 55,
-             56, 56, 56, 56, 57, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46,
-             46, 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 55, 55, 56, 56,
-             56, 56, 57, 49, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46,
-             47, 49, 49, 50, 52, 53, 53, 54, 55, 55, 55, 57, 57, 57, 57, 58, 58,
-             58, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49,
-             49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, 59, 60, 60, 60, 49,
-             47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50,
-             52, 53, 53, 55, 55, 55, 57, 58, 58, 59, 59, 60, 60, 60, 49, 48, 47,
-             47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53,
-             53, 55, 56, 56, 57, 59, 59, 59, 60, 60, 60, 61, 50, 49, 48, 48, 47,
-             46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55,
-             56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 50, 49, 48, 48, 47, 46, 46,
-             46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56,
-             58, 60, 60, 60, 61, 61, 61, 63, 50, 49, 48, 48, 47, 46, 46, 46, 46,
-             46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60,
-             60, 60, 61, 61, 61, 63, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47,
-             47, 46, 46, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61,
-             62, 63, 63, 64,
-             // Size 4x8
-             31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45, 43,
-             47, 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59,
-             // Size 8x4
-             31, 31, 35, 39, 43, 47, 46, 48, 38, 40, 43, 47, 47, 47, 46, 46, 47,
-             46, 47, 47, 50, 53, 53, 54, 48, 45, 46, 45, 50, 55, 58, 59,
-             // Size 8x16
-             32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48, 31,
-             32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 33, 35,
-             37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, 37, 40, 43,
-             47, 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47,
-             49, 50, 49, 50, 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52,
-             53, 53, 54, 48, 46, 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53,
-             57, 57, 49, 45, 45, 46, 51, 53, 58, 59, 50, 46, 46, 46, 52, 54, 59,
-             61, 50, 46, 46, 46, 52, 54, 59, 61,
-             // Size 16x8
-             32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 31,
-             31, 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 33, 34,
-             34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 37, 38, 39,
-             40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 45, 45, 45, 44,
-             46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 48, 47, 46, 46, 47,
-             47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46,
-             45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 50, 48, 47, 46, 46, 46, 46,
-             47, 50, 53, 54, 56, 57, 59, 61, 61,
-             // Size 16x32
-             32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31,
-             31, 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31,
-             31, 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31,
-             31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32,
-             34, 39, 39, 40, 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35,
-             40, 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40,
-             40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40,
-             41, 45, 46, 46, 45, 45, 45, 46, 48, 33, 34, 35, 35, 37, 42, 42, 43,
-             46, 47, 47, 46, 45, 45, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46,
-             47, 47, 46, 46, 46, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46, 47,
-             47, 46, 46, 46, 46, 47, 35, 37, 38, 38, 41, 45, 45, 46, 47, 47, 47,
-             46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46,
-             45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46, 45,
-             45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, 48, 47, 47, 47,
-             47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50,
-             50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50, 50,
-             43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, 47,
-             46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47,
-             46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46,
-             46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46,
-             46, 47, 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46,
-             47, 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47,
-             47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46,
-             47, 51, 53, 53, 55, 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47,
-             51, 53, 53, 56, 58, 58, 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51,
-             53, 53, 56, 58, 58, 59, 61, 49, 47, 45, 45, 45, 46, 46, 47, 52, 53,
-             53, 56, 58, 58, 60, 62, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54,
-             57, 59, 59, 61, 63, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57,
-             59, 59, 61, 64, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59,
-             59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, 52, 54, 54, 58, 60, 60,
-             62, 65,
-             // Size 32x16
-             32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42,
-             43, 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31,
-             31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46,
-             47, 47, 47, 47, 47, 47, 46, 46, 47, 48, 48, 48, 49, 31, 31, 31, 31,
-             32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46,
-             46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 31, 31, 31, 31, 32, 32,
-             32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46,
-             46, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34, 34, 35, 35, 35,
-             37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46,
-             45, 45, 45, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43,
-             43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46,
-             46, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45,
-             47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46,
-             46, 46, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47,
-             47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 48, 48, 48,
-             48, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49,
-             49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 48,
-             47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50,
-             52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 48, 47, 47,
-             47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53,
-             53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46,
-             45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54,
-             54, 54, 55, 56, 56, 56, 57, 57, 57, 58, 49, 48, 47, 47, 46, 45, 45,
-             45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56,
-             57, 58, 58, 58, 59, 59, 59, 60, 49, 48, 47, 47, 46, 45, 45, 45, 45,
-             46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58,
-             58, 58, 59, 59, 59, 60, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46,
-             46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60,
-             61, 61, 61, 62, 52, 51, 50, 50, 49, 48, 48, 48, 47, 47, 47, 47, 47,
-             47, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 59, 61, 61, 62, 63, 64,
-             64, 65,
-             // Size 4x16
-             31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45, 34,
-             42, 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47, 42, 47,
-             50, 49, 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56, 47, 46, 53,
-             57, 46, 46, 53, 58, 48, 46, 54, 59, 48, 46, 54, 59,
-             // Size 16x4
-             31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 47, 47, 46, 48, 48, 37,
-             38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 48, 47,
-             46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 49, 47, 46,
-             45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59,
-             // Size 8x32
-             32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49, 31,
-             31, 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32,
-             34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 30, 32, 35,
-             40, 44, 46, 45, 46, 31, 33, 35, 40, 45, 46, 45, 46, 33, 35, 37, 42,
-             46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, 33, 36, 38, 43, 46,
-             47, 46, 46, 35, 38, 41, 45, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47,
-             45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47,
-             47, 42, 43, 44, 47, 49, 50, 49, 50, 42, 43, 44, 47, 49, 50, 49, 50,
-             43, 43, 45, 47, 50, 50, 50, 50, 47, 46, 46, 48, 51, 52, 53, 53, 49,
-             46, 47, 48, 52, 53, 53, 54, 49, 46, 47, 48, 52, 53, 53, 54, 48, 46,
-             46, 47, 52, 53, 55, 55, 48, 46, 46, 47, 51, 53, 56, 56, 48, 46, 46,
-             47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46,
-             51, 53, 58, 59, 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 52,
-             53, 58, 60, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54,
-             59, 61, 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, 52, 54, 60,
-             62,
-             // Size 32x8
-             32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42,
-             43, 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31,
-             31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46,
-             46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34,
-             34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47,
-             46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40,
-             40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47,
-             47, 46, 46, 46, 46, 46, 46, 46, 47, 45, 45, 45, 45, 45, 44, 44, 45,
-             46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51,
-             51, 51, 52, 52, 52, 52, 52, 48, 47, 47, 47, 46, 46, 46, 46, 47, 47,
-             47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53,
-             53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45,
-             45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59,
-             59, 59, 60, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46,
-             47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61,
-             62},
-        },
-        // Quantizer level 10.
-        {
-            {// Luma
-             // Size 4x4
-             32, 32, 34, 38, 32, 33, 35, 39, 34, 35, 39, 45, 38, 39, 45, 54,
-             // Size 8x8
-             31, 31, 32, 32, 33, 34, 37, 41, 31, 32, 32, 32, 33, 34, 36, 39, 32,
-             32, 32, 33, 34, 35, 37, 40, 32, 32, 33, 34, 35, 36, 38, 41, 33, 33,
-             34, 35, 37, 39, 41, 44, 34, 34, 35, 36, 39, 43, 46, 49, 37, 36, 37,
-             38, 41, 46, 51, 54, 41, 39, 40, 41, 44, 49, 54, 58,
-             // Size 16x16
-             32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32,
-             33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 31, 32, 32, 32, 32, 33,
-             33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 32, 32, 32, 32, 32, 34, 34,
-             35, 35, 37, 37, 38, 38, 40, 40, 42, 32, 32, 32, 32, 32, 34, 34, 35,
-             35, 37, 37, 38, 38, 40, 40, 42, 34, 34, 34, 33, 33, 35, 35, 37, 37,
-             39, 39, 42, 42, 45, 45, 47, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39,
-             39, 42, 42, 45, 45, 47, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42,
-             48, 48, 50, 50, 54, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
-             48, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50,
-             54, 54, 58, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54,
-             54, 58, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58,
-             63,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
-             34, 34, 34, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34,
-             34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35,
-             35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35,
-             37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 38,
-             38, 38, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37,
-             39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41,
-             41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34,
-             34, 34, 34, 35, 35, 35, 35, 37, 38, 38, 38, 40, 41, 41, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35,
-             35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36,
-             36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36,
-             36, 38, 39, 39, 39, 40, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 38,
-             40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
-             34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40,
-             40, 41, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34,
-             34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41,
-             42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35,
-             35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42,
-             33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36,
-             37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 42, 44, 45, 45, 34, 34,
-             34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39,
-             39, 39, 41, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34,
-             34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39,
-             41, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34, 34, 34,
-             33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 42,
-             42, 42, 44, 45, 45, 45, 46, 47, 47, 35, 34, 34, 34, 34, 34, 34, 34,
-             34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, 45,
-             46, 47, 47, 47, 49, 50, 50, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35,
-             36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50,
-             50, 50, 52, 54, 54, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36,
-             36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50,
-             52, 54, 54, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37,
-             38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54,
-             54, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, 39,
-             39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 52, 52, 52, 54, 56, 56, 39,
-             39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42,
-             45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 39, 39, 38,
-             38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45,
-             45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 39, 39, 38, 38, 38,
-             38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47,
-             50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 41, 41, 40, 40, 40, 39, 39,
-             39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52,
-             52, 54, 56, 56, 56, 58, 60, 60, 44, 43, 42, 42, 42, 41, 41, 41, 41,
-             41, 42, 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56,
-             58, 58, 58, 60, 63, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42,
-             42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58,
-             58, 60, 63, 63,
-             // Size 4x8
-             31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40, 33,
-             34, 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56,
-             // Size 8x4
-             31, 32, 32, 32, 33, 34, 37, 40, 32, 32, 33, 33, 34, 36, 38, 40, 34,
-             34, 34, 36, 38, 41, 44, 46, 39, 38, 38, 40, 42, 47, 52, 56,
-             // Size 8x16
-             32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42, 31,
-             32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32,
-             32, 33, 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32,
-             34, 34, 36, 36, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35,
-             35, 38, 38, 42, 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37,
-             42, 42, 48, 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48,
-             48, 54, 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50,
-             58, 44, 41, 41, 43, 43, 53, 53, 63,
-             // Size 16x8
-             32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31,
-             32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 31, 32,
-             32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 32, 32, 32,
-             33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 32, 32, 32, 33,
-             33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 36, 35, 35, 34, 34,
-             36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 36, 35, 35, 34, 34, 36,
-             36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 44, 42, 42, 41, 41, 42, 42,
-             42, 42, 48, 48, 54, 54, 58, 58, 63,
-             // Size 16x32
-             32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31,
-             31, 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 34, 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33,
-             33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33,
-             33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 33, 33, 33, 33,
-             34, 35, 35, 35, 38, 41, 41, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35,
-             36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36,
-             36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36,
-             36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 36, 37, 37, 37,
-             40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40,
-             42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42,
-             42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42,
-             33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, 34,
-             34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34,
-             34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34,
-             34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34,
-             34, 36, 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34,
-             36, 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36,
-             38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38,
-             38, 38, 43, 48, 48, 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39,
-             39, 44, 49, 49, 49, 52, 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40,
-             45, 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45,
-             50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50,
-             50, 50, 54, 58, 58, 41, 40, 39, 39, 39, 40, 42, 42, 42, 46, 52, 52,
-             52, 56, 60, 60, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53,
-             58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53, 58,
-             63, 63,
-             // Size 32x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
-             34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
-             34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34,
-             34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34,
-             36, 37, 37, 37, 39, 41, 41, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
-             33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39,
-             39, 39, 40, 42, 42, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34,
-             34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40,
-             42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
-             35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43,
-             43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35,
-             35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 34,
-             34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38,
-             39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 48, 48, 36, 35, 35,
-             35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42,
-             42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35,
-             35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45,
-             48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34,
-             34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48,
-             48, 49, 50, 50, 50, 52, 53, 53, 39, 39, 38, 38, 38, 38, 37, 37, 37,
-             38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 51, 51, 51, 52,
-             54, 54, 54, 56, 58, 58, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42,
-             42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58,
-             58, 60, 63, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42,
-             42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60,
-             63, 63,
-             // Size 4x16
-             31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37, 32,
-             32, 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40, 32, 34,
-             37, 40, 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51, 35, 36, 43,
-             51, 38, 39, 45, 54, 38, 39, 45, 54, 42, 42, 48, 58,
-             // Size 16x4
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 32,
-             32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 34, 34,
-             34, 33, 33, 35, 35, 37, 37, 39, 39, 43, 43, 45, 45, 48, 39, 38, 38,
-             37, 37, 39, 39, 40, 40, 45, 45, 51, 51, 54, 54, 58,
-             // Size 8x32
-             32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43, 31,
-             32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32,
-             32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41, 31, 32, 32,
-             33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33,
-             33, 34, 34, 41, 31, 32, 32, 33, 33, 35, 35, 41, 32, 32, 32, 34, 34,
-             36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36,
-             36, 42, 32, 32, 32, 34, 34, 37, 37, 42, 32, 33, 33, 35, 35, 38, 38,
-             42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42,
-             33, 33, 33, 36, 36, 40, 40, 45, 34, 34, 34, 37, 37, 42, 42, 48, 34,
-             34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 35, 34,
-             34, 37, 37, 45, 45, 50, 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34,
-             38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 37, 36, 36, 39,
-             39, 49, 49, 56, 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40,
-             50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 41, 39, 39, 42, 42, 52,
-             52, 60, 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, 43, 53, 53,
-             63,
-             // Size 32x8
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34,
-             34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
-             34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 32, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38,
-             38, 38, 39, 40, 40, 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33,
-             33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38,
-             39, 40, 40, 40, 42, 43, 43, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35,
-             36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50,
-             50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36,
-             36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50,
-             52, 53, 53, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42,
-             42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63,
-             63},
-            {// Chroma
-             // Size 4x4
-             31, 34, 42, 47, 34, 39, 45, 46, 42, 45, 48, 49, 47, 46, 49, 54,
-             // Size 8x8
-             31, 31, 32, 35, 39, 45, 48, 48, 31, 31, 33, 37, 41, 44, 46, 46, 32,
-             33, 35, 39, 42, 45, 46, 45, 35, 37, 39, 43, 45, 47, 47, 46, 39, 41,
-             42, 45, 47, 48, 48, 47, 45, 44, 45, 47, 48, 50, 51, 51, 48, 46, 46,
-             47, 48, 51, 53, 54, 48, 46, 45, 46, 47, 51, 54, 56,
-             // Size 16x16
-             32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 31,
-             31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 31, 31,
-             31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 30, 31, 31,
-             32, 32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 30, 31, 31, 32,
-             32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 33, 34, 34, 35, 35,
-             39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 33, 34, 34, 35, 35, 39,
-             39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 36, 38, 38, 40, 40, 43, 43,
-             47, 47, 47, 47, 48, 48, 46, 46, 45, 36, 38, 38, 40, 40, 43, 43, 47,
-             47, 47, 47, 48, 48, 46, 46, 45, 41, 42, 42, 42, 42, 45, 45, 47, 47,
-             48, 48, 50, 50, 49, 49, 49, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48,
-             48, 50, 50, 49, 49, 49, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50,
-             53, 53, 53, 53, 53, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53,
-             53, 53, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53,
-             54, 54, 55, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54,
-             54, 55, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55,
-             58,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 36, 36, 36,
-             39, 41, 41, 41, 45, 49, 49, 49, 49, 48, 48, 48, 49, 49, 49, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 35, 37, 37, 37, 39, 42,
-             42, 42, 45, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42,
-             45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47,
-             47, 47, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47,
-             47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33,
-             35, 35, 35, 37, 39, 39, 39, 41, 42, 42, 42, 44, 47, 47, 47, 46, 46,
-             46, 46, 46, 46, 46, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35,
-             35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45,
-             45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35, 35, 37,
-             40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45, 45, 45,
-             45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35, 35, 37, 40, 40,
-             40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, 32,
-             32, 33, 33, 33, 33, 33, 33, 33, 35, 37, 37, 37, 39, 41, 41, 41, 42,
-             43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 33, 34, 34,
-             34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45,
-             45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34,
-             35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46,
-             47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35,
-             35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47,
-             47, 47, 46, 46, 46, 46, 45, 45, 35, 35, 36, 36, 36, 37, 37, 37, 37,
-             39, 41, 41, 41, 43, 45, 45, 45, 45, 46, 46, 46, 47, 47, 47, 47, 47,
-             46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43,
-             43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46,
-             46, 46, 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43,
-             45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46, 46, 46,
-             45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47,
-             47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46, 46, 46, 45, 45,
-             39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, 47, 47, 47,
-             47, 48, 48, 48, 48, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, 41, 42,
-             42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48,
-             48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42,
-             42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48,
-             49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, 42, 42,
-             42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50,
-             50, 50, 50, 49, 49, 49, 49, 49, 49, 45, 45, 45, 45, 45, 44, 44, 44,
-             44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 51, 51, 51,
-             51, 51, 51, 51, 51, 51, 51, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47,
-             47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53,
-             53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47,
-             47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53,
-             53, 53, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47,
-             48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53,
-             53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47,
-             47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 48,
-             48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48,
-             49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 48, 48, 47,
-             47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49,
-             49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 48, 48, 47, 47, 47,
-             46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51,
-             53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 49, 48, 47, 47, 47, 46, 45,
-             45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53,
-             53, 54, 55, 55, 55, 56, 57, 57, 49, 48, 47, 47, 47, 46, 45, 45, 45,
-             45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54,
-             55, 55, 55, 57, 58, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45,
-             45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55,
-             55, 57, 58, 58,
-             // Size 4x8
-             31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46, 40,
-             44, 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55,
-             // Size 8x4
-             31, 31, 33, 36, 40, 45, 47, 47, 34, 35, 37, 41, 44, 46, 47, 46, 42,
-             42, 44, 46, 48, 49, 50, 49, 48, 46, 46, 46, 48, 51, 54, 55,
-             // Size 8x16
-             32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47, 31,
-             31, 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32,
-             32, 40, 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36,
-             43, 43, 47, 47, 46, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47,
-             47, 47, 47, 45, 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47,
-             50, 50, 49, 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53,
-             53, 53, 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53,
-             56, 49, 45, 45, 46, 46, 53, 53, 58,
-             // Size 16x8
-             32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 31,
-             31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 31, 31,
-             31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 37, 38, 38,
-             40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 37, 38, 38, 40,
-             40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 48, 47, 47, 46, 46,
-             47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 48, 47, 47, 46, 46, 47,
-             47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 49, 47, 47, 45, 45, 46, 46,
-             45, 45, 49, 49, 53, 53, 56, 56, 58,
-             // Size 16x32
-             32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31,
-             31, 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31,
-             31, 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31,
-             31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31,
-             31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32,
-             35, 39, 39, 39, 42, 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35,
-             40, 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40,
-             40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40,
-             40, 42, 46, 46, 46, 45, 45, 45, 32, 33, 34, 34, 34, 37, 41, 41, 41,
-             44, 46, 46, 46, 46, 45, 45, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45,
-             47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47,
-             47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, 47,
-             47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, 45, 46, 47, 47, 47,
-             46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46,
-             45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45,
-             45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, 45,
-             39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47, 42,
-             42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42,
-             43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43,
-             43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44,
-             44, 46, 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46,
-             47, 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47,
-             48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48,
-             48, 48, 50, 53, 53, 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47,
-             47, 50, 53, 53, 53, 54, 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47,
-             50, 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50,
-             53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53,
-             53, 53, 54, 56, 56, 48, 47, 45, 45, 45, 46, 46, 46, 46, 49, 53, 53,
-             53, 55, 57, 57, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53,
-             56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53, 56,
-             58, 58,
-             // Size 32x16
-             32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37,
-             39, 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42,
-             42, 42, 45, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31,
-             31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43,
-             44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32,
-             32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46,
-             46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32,
-             32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46,
-             46, 46, 46, 46, 45, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37,
-             39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46,
-             46, 46, 46, 45, 45, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43,
-             43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47,
-             46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45,
-             47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46,
-             46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47,
-             47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 42,
-             42, 42, 42, 42, 42, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 47, 48,
-             48, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 48, 47, 47,
-             47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50,
-             50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47,
-             46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51,
-             53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46,
-             46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53,
-             53, 53, 53, 53, 53, 53, 53, 53, 48, 48, 47, 47, 47, 46, 45, 45, 45,
-             46, 46, 46, 46, 46, 46, 46, 46, 48, 50, 50, 50, 51, 53, 53, 53, 54,
-             54, 54, 54, 55, 56, 56, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46,
-             46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56,
-             56, 57, 58, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46,
-             45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57,
-             58, 58,
-             // Size 4x16
-             31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45, 31,
-             35, 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46, 38, 43,
-             47, 46, 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53, 48, 47, 50,
-             53, 47, 46, 50, 54, 47, 46, 50, 54, 47, 45, 49, 56,
-             // Size 16x4
-             31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 48, 48, 47, 47, 47, 33,
-             34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 42, 42,
-             42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 48, 47, 47,
-             45, 45, 46, 46, 46, 46, 50, 50, 53, 53, 54, 54, 56,
-             // Size 8x32
-             32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48, 31,
-             31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31,
-             31, 38, 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46, 30, 32, 32,
-             40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40,
-             40, 46, 46, 45, 32, 34, 34, 41, 41, 46, 46, 45, 33, 36, 36, 43, 43,
-             47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47,
-             47, 46, 35, 38, 38, 45, 45, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47,
-             45, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45,
-             39, 41, 41, 47, 47, 49, 49, 47, 42, 43, 43, 47, 47, 50, 50, 49, 42,
-             43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 45, 44,
-             44, 47, 47, 51, 51, 51, 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46,
-             48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47,
-             47, 53, 53, 54, 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47,
-             53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 48, 45, 45, 46, 46, 53,
-             53, 57, 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, 46, 53, 53,
-             58,
-             // Size 32x8
-             32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37,
-             39, 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31,
-             31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43,
-             43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31,
-             31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43,
-             44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 37, 37, 38, 38, 38, 39,
-             40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48,
-             48, 48, 47, 47, 47, 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40,
-             40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48,
-             47, 47, 47, 47, 46, 46, 46, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46,
-             47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53,
-             53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47,
-             47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53,
-             53, 53, 53, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45,
-             45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58,
-             58},
-        },
-        // Quantizer level 11.
-        {
-            {// Luma
-             // Size 4x4
-             32, 32, 32, 35, 32, 32, 33, 35, 32, 33, 35, 38, 35, 35, 38, 46,
-             // Size 8x8
-             31, 31, 31, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 33, 34, 35, 31,
-             32, 32, 32, 32, 33, 33, 34, 32, 32, 32, 33, 34, 34, 35, 36, 32, 32,
-             32, 34, 35, 35, 36, 38, 32, 33, 33, 34, 35, 36, 38, 40, 34, 34, 33,
-             35, 36, 38, 39, 42, 35, 35, 34, 36, 38, 40, 42, 48,
-             // Size 16x16
-             32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 31, 32, 32, 32, 32, 32, 33,
-             33, 33, 34, 34, 35, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 33,
-             34, 34, 34, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 33, 34, 34,
-             35, 35, 36, 37, 37, 38, 38, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35,
-             35, 36, 37, 37, 38, 38, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 36,
-             38, 39, 40, 42, 42, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39,
-             39, 41, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41,
-             42, 45, 45, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45,
-             48, 48, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48,
-             48,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
-             32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 31, 31, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
-             34, 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
-             35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35,
-             35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34,
-             35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
-             34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35,
-             35, 35, 35, 36, 36, 36, 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35,
-             36, 36, 36, 36, 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36,
-             36, 36, 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36,
-             36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
-             33, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
-             35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35,
-             35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35,
-             36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36,
-             37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 33, 33, 33, 33, 33, 33,
-             33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 38, 38,
-             38, 38, 39, 40, 40, 40, 41, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
-             33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, 40,
-             41, 42, 42, 42, 42, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34,
-             35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 42,
-             42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35,
-             35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 42, 42, 42,
-             43, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35,
-             36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 42, 42, 42, 43, 34,
-             34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37,
-             37, 37, 37, 38, 40, 41, 41, 41, 42, 44, 45, 45, 45, 45, 35, 35, 35,
-             35, 35, 35, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 38, 38,
-             38, 39, 41, 42, 42, 42, 44, 46, 47, 47, 47, 48, 36, 35, 35, 35, 35,
-             35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40,
-             42, 42, 42, 42, 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35,
-             34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42,
-             42, 42, 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, 34,
-             34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42,
-             45, 47, 48, 48, 48, 49, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35,
-             36, 37, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48,
-             49, 49, 49, 50,
-             // Size 4x8
-             31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36, 32,
-             33, 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48,
-             // Size 8x4
-             31, 32, 32, 32, 32, 33, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, 32,
-             32, 33, 34, 35, 36, 37, 38, 35, 35, 34, 36, 38, 40, 42, 48,
-             // Size 8x16
-             32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35, 31,
-             32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32,
-             32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32,
-             33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33,
-             34, 34, 36, 37, 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35,
-             35, 37, 38, 33, 33, 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37,
-             41, 42, 34, 34, 34, 35, 37, 37, 43, 44, 36, 35, 34, 36, 38, 38, 46,
-             48, 36, 35, 34, 36, 38, 38, 46, 48,
-             // Size 16x8
-             32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32,
-             32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 32, 32, 32, 32,
-             33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, 32, 32, 33,
-             33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 35, 35, 35, 34, 34, 34,
-             35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 36, 35, 35, 35, 34, 34, 36,
-             36, 37, 38, 38, 41, 42, 44, 48, 48,
-             // Size 16x32
-             32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31,
-             31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33,
-             33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
-             34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
-             35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36,
-             36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36,
-             36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36,
-             36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37,
-             32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, 32,
-             32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32,
-             32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32,
-             33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33,
-             33, 33, 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33,
-             33, 35, 36, 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34,
-             35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35,
-             36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36,
-             37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37,
-             37, 37, 40, 43, 44, 44, 44, 35, 35, 34, 34, 34, 34, 36, 37, 38, 38,
-             38, 41, 45, 47, 47, 47, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38,
-             42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42,
-             46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46,
-             48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, 39, 39, 39, 42, 46, 49,
-             49, 49,
-             // Size 32x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
-             34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34,
-             34, 34, 34, 34, 34, 34, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
-             34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
-             33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36,
-             36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
-             34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 37,
-             38, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
-             34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35,
-             35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35,
-             35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 33, 33, 33, 33, 33,
-             33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37,
-             38, 39, 39, 39, 40, 41, 42, 42, 42, 42, 35, 35, 35, 35, 35, 35, 34,
-             34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41,
-             41, 41, 43, 45, 46, 46, 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34,
-             34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42,
-             44, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34,
-             35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47,
-             48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36,
-             36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48,
-             48, 49,
-             // Size 4x16
-             31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35, 32,
-             32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36, 32, 32,
-             34, 37, 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41, 34, 34, 37,
-             42, 34, 34, 37, 44, 35, 34, 38, 48, 35, 34, 38, 48,
-             // Size 16x4
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 32, 32,
-             32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 36, 35, 35,
-             35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48,
-             // Size 8x32
-             32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35, 31,
-             32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32,
-             32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32,
-             32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32,
-             33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33,
-             33, 34, 34, 31, 32, 32, 33, 33, 33, 35, 35, 31, 32, 32, 33, 34, 34,
-             35, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36,
-             36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 37,
-             32, 32, 33, 33, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 32,
-             32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 32, 33,
-             33, 34, 36, 36, 39, 40, 33, 33, 33, 35, 36, 36, 40, 41, 34, 34, 34,
-             35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35,
-             37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44, 35, 34, 34, 36, 38,
-             38, 45, 47, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38,
-             46, 48, 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, 39, 39, 46,
-             49,
-             // Size 32x8
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
-             33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35,
-             35, 35, 35, 35, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37,
-             37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37,
-             38, 38, 38, 38, 39, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35,
-             35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46,
-             46, 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36,
-             36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48,
-             49},
-            {// Chroma
-             // Size 4x4
-             31, 32, 38, 46, 32, 34, 41, 46, 38, 41, 47, 47, 46, 46, 47, 52,
-             // Size 8x8
-             31, 31, 30, 34, 36, 39, 42, 48, 31, 31, 31, 34, 37, 40, 42, 47, 30,
-             31, 32, 35, 39, 41, 42, 46, 34, 34, 35, 39, 42, 44, 45, 47, 36, 37,
-             39, 42, 46, 47, 47, 47, 39, 40, 41, 44, 47, 47, 48, 49, 42, 42, 42,
-             45, 47, 48, 48, 50, 48, 47, 46, 47, 47, 49, 50, 53,
-             // Size 16x16
-             32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 31,
-             31, 31, 31, 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31,
-             31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 47, 47, 31, 31, 31,
-             31, 31, 31, 34, 35, 36, 39, 39, 41, 42, 44, 47, 47, 30, 31, 31, 31,
-             32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 30, 31, 31, 31, 32,
-             32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 33, 33, 34, 34, 34, 34,
-             37, 38, 40, 42, 42, 44, 44, 45, 47, 47, 33, 34, 34, 35, 35, 35, 38,
-             39, 40, 43, 43, 44, 45, 46, 47, 47, 34, 36, 36, 36, 37, 37, 40, 40,
-             42, 45, 45, 45, 46, 46, 47, 47, 36, 38, 38, 39, 40, 40, 42, 43, 45,
-             47, 47, 47, 47, 47, 48, 48, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47,
-             47, 47, 47, 47, 48, 48, 40, 41, 41, 41, 42, 42, 44, 44, 45, 47, 47,
-             48, 48, 49, 50, 50, 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48,
-             48, 49, 50, 50, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47, 49, 49,
-             50, 51, 51, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51,
-             53, 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53,
-             53,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 34,
-             36, 36, 36, 36, 38, 40, 41, 41, 41, 44, 47, 49, 49, 49, 49, 31, 31,
-             31, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 35, 36, 37,
-             37, 37, 39, 41, 42, 42, 42, 44, 47, 48, 48, 48, 48, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 36, 37, 38, 38, 38,
-             39, 41, 42, 42, 42, 44, 46, 48, 48, 48, 47, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41,
-             42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42,
-             42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44,
-             46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33,
-             34, 35, 35, 35, 36, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44, 46, 47,
-             47, 47, 47, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35,
-             35, 35, 37, 38, 39, 39, 39, 41, 42, 42, 42, 42, 44, 46, 46, 46, 46,
-             46, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35,
-             37, 39, 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 30,
-             30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39,
-             40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 30, 30, 31,
-             31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, 40, 40,
-             40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 31, 32, 32, 32, 32,
-             32, 33, 33, 33, 33, 33, 34, 36, 37, 37, 37, 38, 40, 41, 41, 41, 42,
-             43, 43, 43, 43, 44, 46, 46, 46, 46, 46, 33, 33, 33, 34, 34, 34, 34,
-             34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 43, 44, 44,
-             44, 44, 45, 46, 47, 47, 47, 46, 33, 34, 34, 34, 34, 34, 35, 35, 35,
-             35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45, 45, 45,
-             46, 47, 47, 47, 47, 47, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35,
-             37, 38, 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45, 45, 45, 46, 47,
-             47, 47, 47, 47, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38,
-             39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 47,
-             47, 47, 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40,
-             40, 42, 44, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, 47,
-             36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44,
-             46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 36, 37,
-             38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47,
-             47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38,
-             38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47,
-             47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, 38, 38,
-             39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, 47,
-             47, 47, 47, 47, 47, 48, 48, 48, 47, 38, 39, 39, 40, 40, 40, 40, 41,
-             41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 48, 48, 48,
-             48, 48, 48, 49, 49, 49, 48, 40, 41, 41, 41, 41, 41, 41, 42, 42, 42,
-             42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49,
-             49, 50, 50, 50, 49, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43,
-             44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50,
-             50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45,
-             45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50, 50, 50,
-             50, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45,
-             46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 44,
-             44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47,
-             47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 51, 51, 51, 51, 47, 47, 46,
-             46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 47, 47, 47, 47,
-             47, 48, 49, 50, 50, 50, 51, 52, 52, 52, 52, 52, 49, 48, 48, 47, 47,
-             47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49,
-             50, 50, 50, 50, 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47,
-             46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50,
-             50, 50, 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46,
-             46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50,
-             51, 52, 53, 53, 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46,
-             46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52,
-             53, 53, 53, 53,
-             // Size 4x8
-             31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47, 37,
-             39, 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53,
-             // Size 8x4
-             31, 31, 31, 34, 37, 39, 42, 48, 31, 31, 32, 36, 39, 41, 43, 46, 37,
-             38, 40, 43, 46, 47, 47, 48, 48, 47, 46, 47, 47, 48, 50, 53,
-             // Size 8x16
-             32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47, 31,
-             31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46, 30, 32,
-             32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 33, 34, 35,
-             37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 35, 37, 37, 40,
-             44, 44, 46, 47, 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47,
-             47, 47, 47, 41, 42, 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47,
-             49, 50, 44, 44, 44, 45, 47, 47, 50, 51, 49, 47, 46, 47, 48, 48, 52,
-             53, 49, 47, 46, 47, 48, 48, 52, 53,
-             // Size 16x8
-             32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 31,
-             31, 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 31, 31,
-             31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 33, 34, 34,
-             34, 35, 35, 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 37, 38, 38, 39,
-             40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 37, 38, 38, 39, 40,
-             40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 45, 45, 45, 45, 44, 44,
-             46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 48, 47, 47, 46, 46, 46, 47,
-             47, 47, 47, 47, 49, 50, 51, 53, 53,
-             // Size 16x32
-             32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31,
-             31, 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31,
-             31, 31, 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31,
-             31, 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31,
-             31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31,
-             31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32,
-             34, 37, 39, 39, 39, 41, 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34,
-             38, 39, 39, 39, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38,
-             40, 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40,
-             40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40,
-             40, 42, 44, 46, 46, 46, 31, 32, 33, 33, 33, 33, 36, 39, 41, 41, 41,
-             43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, 37, 40, 42, 42, 42, 44,
-             46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46,
-             47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47,
-             47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, 47,
-             47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47, 47, 47,
-             36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, 37,
-             38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38,
-             39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39,
-             40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41,
-             41, 41, 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42,
-             42, 44, 46, 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43,
-             44, 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44,
-             46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46,
-             47, 47, 47, 48, 49, 50, 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47,
-             47, 47, 49, 50, 51, 51, 51, 47, 46, 46, 46, 46, 46, 46, 47, 48, 48,
-             48, 49, 51, 52, 52, 52, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48,
-             50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50,
-             52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52,
-             53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 52, 53,
-             53, 53,
-             // Size 32x16
-             32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35,
-             36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38,
-             38, 38, 39, 41, 42, 42, 42, 44, 46, 48, 48, 48, 48, 31, 31, 31, 31,
-             31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39,
-             40, 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31,
-             32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42,
-             43, 43, 43, 44, 46, 46, 46, 46, 46, 31, 31, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43,
-             43, 44, 46, 46, 46, 46, 46, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44,
-             46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36,
-             37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47,
-             47, 47, 47, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41,
-             41, 41, 43, 44, 45, 45, 45, 46, 46, 46, 46, 46, 47, 47, 48, 48, 48,
-             47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43,
-             44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37,
-             37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46,
-             47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, 38,
-             38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47,
-             47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 40, 41, 41, 41, 41,
-             41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48,
-             48, 48, 48, 48, 49, 49, 50, 50, 50, 49, 45, 45, 45, 45, 45, 45, 45,
-             44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49,
-             49, 49, 50, 51, 52, 52, 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46,
-             46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50,
-             51, 52, 53, 53, 53, 53, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46,
-             46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52,
-             53, 53, 53, 53, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47,
-             47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53,
-             53, 53,
-             // Size 4x16
-             31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46, 31,
-             32, 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47, 36, 37,
-             44, 47, 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49, 42, 43, 47,
-             50, 44, 44, 47, 51, 48, 46, 48, 53, 48, 46, 48, 53,
-             // Size 16x4
-             31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31,
-             31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 37, 38,
-             38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47,
-             46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53,
-             // Size 8x32
-             32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48, 31,
-             31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31,
-             31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32,
-             34, 39, 39, 45, 46, 30, 31, 32, 34, 39, 39, 44, 46, 30, 32, 32, 35,
-             40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40,
-             40, 44, 46, 31, 33, 33, 36, 41, 41, 45, 46, 33, 34, 35, 37, 42, 42,
-             46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46,
-             47, 33, 35, 36, 38, 43, 43, 46, 47, 35, 37, 37, 40, 44, 44, 46, 47,
-             36, 38, 39, 42, 46, 46, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 37,
-             39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 39, 40,
-             41, 43, 47, 47, 48, 48, 41, 42, 42, 44, 47, 47, 49, 49, 42, 42, 43,
-             44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44,
-             47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51, 47, 46, 46, 46, 48,
-             48, 51, 52, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48,
-             52, 53, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 47, 47, 52,
-             53,
-             // Size 32x8
-             32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35,
-             36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31,
-             31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39,
-             39, 39, 40, 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40,
-             41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34,
-             34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44,
-             44, 44, 44, 45, 46, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 39, 39,
-             40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47,
-             47, 47, 48, 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40,
-             40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47,
-             48, 48, 48, 48, 47, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45,
-             46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52,
-             52, 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47,
-             47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53,
-             53},
-        },
-        // Quantizer level 12.
-        {
-            {// Luma
-             // Size 4x4
-             31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 33, 34, 32, 33, 34, 35,
-             // Size 8x8
-             31, 31, 31, 31, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31,
-             32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32,
-             32, 32, 33, 33, 34, 35, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32,
-             32, 34, 34, 35, 36, 33, 33, 33, 33, 35, 35, 36, 38,
-             // Size 16x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 31,
-             31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 33, 33, 34, 35, 31, 32, 32, 32, 32, 32, 32, 32, 33,
-             33, 33, 33, 34, 34, 34, 35, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             33, 33, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
-             34, 35, 35, 35, 36, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35,
-             35, 35, 36, 37, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35,
-             35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36,
-             36, 38, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38,
-             39,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
-             33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
-             34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 33, 33, 33, 33, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
-             33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34,
-             34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
-             33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
-             33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
-             34, 34, 34, 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
-             34, 34, 34, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35,
-             35, 35, 35, 36, 36, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
-             36, 36, 36, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37,
-             37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34,
-             34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
-             34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 33, 33, 33,
-             33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35,
-             35, 35, 36, 36, 36, 36, 36, 37, 38, 38, 33, 33, 33, 33, 33, 33, 33,
-             33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
-             36, 36, 36, 36, 37, 38, 38, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34,
-             33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
-             37, 37, 38, 38, 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33,
-             33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37,
-             38, 38, 39, 39,
-             // Size 4x8
-             31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32,
-             32, 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36,
-             // Size 8x4
-             31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32,
-             32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 33, 34, 34, 35, 36,
-             // Size 8x16
-             32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, 31,
-             32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32,
-             32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32,
-             32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32,
-             33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33,
-             34, 34, 34, 32, 32, 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 34, 35,
-             35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36,
-             36, 34, 34, 34, 34, 35, 37, 37, 38,
-             // Size 16x8
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, 32,
-             33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 33,
-             33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33,
-             33, 34, 34, 34, 35, 36, 36, 36, 38,
-             // Size 16x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31,
-             31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
-             33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
-             33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
-             33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34,
-             34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32,
-             32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32,
-             32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33,
-             33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33,
-             33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33,
-             34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34,
-             34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34,
-             35, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36,
-             36, 36, 36, 36, 38, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
-             36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37,
-             37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37,
-             38, 39,
-             // Size 32x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
-             33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
-             33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34,
-             34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
-             33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
-             33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
-             34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34,
-             35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35,
-             35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35,
-             35, 35, 36, 36, 37, 37, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
-             33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36,
-             36, 37, 38, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33,
-             33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38,
-             39, 39,
-             // Size 4x16
-             31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31,
-             32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33, 32, 32,
-             33, 34, 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35, 32, 33, 34,
-             35, 32, 33, 34, 35, 33, 33, 35, 36, 34, 34, 36, 37,
-             // Size 16x4
-             31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 36, 32, 32, 32,
-             32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37,
-             // Size 8x32
-             32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33, 31,
-             31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32,
-             32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32,
-             32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32,
-             32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32,
-             33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33,
-             33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33,
-             33, 31, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 33, 34, 34, 34,
-             32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32,
-             32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32,
-             32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 33, 35, 35, 35, 32, 32, 33,
-             33, 33, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33,
-             34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34,
-             35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 33, 33, 33, 33, 34, 36,
-             36, 37, 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, 35, 37, 37,
-             38,
-             // Size 32x8
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34,
-             34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35,
-             35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36,
-             36, 37, 37, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
-             33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38,
-             38},
-            {// Chroma
-             // Size 4x4
-             31, 31, 34, 38, 31, 32, 35, 40, 34, 35, 39, 43, 38, 40, 43, 47,
-             // Size 8x8
-             31, 31, 31, 30, 34, 35, 37, 40, 31, 31, 31, 31, 34, 35, 38, 41, 31,
-             31, 31, 31, 35, 36, 39, 41, 30, 31, 31, 32, 35, 36, 40, 42, 34, 34,
-             35, 35, 39, 40, 43, 44, 35, 35, 36, 36, 40, 41, 44, 45, 37, 38, 39,
-             40, 43, 44, 47, 47, 40, 41, 41, 42, 44, 45, 47, 48,
-             // Size 16x16
-             32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 31,
-             31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 31, 31,
-             31, 31, 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31,
-             31, 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31,
-             31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 30, 31, 31, 31, 31,
-             32, 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 30, 31, 31, 31, 31, 32,
-             32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 31, 31, 32, 32, 32, 32, 32,
-             33, 35, 36, 36, 38, 40, 40, 41, 43, 33, 33, 34, 34, 34, 34, 34, 35,
-             37, 38, 38, 41, 42, 42, 43, 44, 33, 34, 34, 34, 35, 35, 35, 36, 38,
-             39, 39, 41, 43, 43, 44, 45, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39,
-             39, 41, 43, 43, 44, 45, 35, 36, 37, 37, 37, 38, 38, 38, 41, 41, 41,
-             44, 46, 46, 46, 46, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46,
-             47, 47, 47, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47,
-             47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47,
-             47, 48, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48,
-             48,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33,
-             33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 38, 40, 41, 41, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 32, 33, 34, 34,
-             34, 34, 35, 36, 37, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34,
-             35, 36, 37, 37, 37, 37, 38, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 34, 34, 34, 35, 36,
-             38, 38, 38, 38, 38, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38,
-             38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 38, 38,
-             39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 38, 38, 39, 40,
-             41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             32, 33, 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 38, 39, 40, 41, 42,
-             42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33,
-             34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 30,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35,
-             35, 35, 35, 36, 37, 39, 39, 39, 39, 40, 40, 41, 42, 42, 30, 30, 31,
-             31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35,
-             35, 36, 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31,
-             31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36,
-             38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31,
-             31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39,
-             40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31,
-             31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40,
-             40, 40, 41, 42, 42, 42, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 40, 40, 40, 40, 41,
-             41, 42, 43, 43, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33,
-             33, 34, 35, 36, 37, 37, 37, 37, 38, 39, 41, 41, 41, 41, 42, 42, 43,
-             43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35,
-             36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 42, 42, 43, 43, 44, 44, 44,
-             33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38,
-             39, 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34,
-             34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39,
-             39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34,
-             34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39,
-             40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, 34, 34,
-             34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41,
-             43, 43, 43, 43, 43, 44, 44, 45, 45, 34, 35, 35, 35, 35, 35, 35, 36,
-             36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 44, 44,
-             44, 44, 44, 45, 45, 45, 45, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37,
-             38, 38, 38, 38, 38, 39, 41, 41, 41, 41, 41, 42, 44, 45, 46, 46, 46,
-             46, 46, 46, 46, 46, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39,
-             39, 39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46, 47, 47, 47, 47, 47,
-             47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40,
-             40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47,
-             47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41,
-             42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 36,
-             37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43,
-             43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38,
-             38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43,
-             43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40,
-             40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45,
-             46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 40, 40, 40, 41, 41, 41, 41,
-             41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47,
-             47, 47, 47, 47, 48, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42,
-             42, 42, 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47,
-             47, 47, 48, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
-             42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47,
-             48, 48, 48, 48,
-             // Size 4x8
-             31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40, 34,
-             36, 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47,
-             // Size 8x4
-             31, 31, 31, 31, 34, 35, 38, 41, 31, 31, 32, 32, 36, 37, 40, 42, 35,
-             36, 37, 37, 40, 42, 45, 45, 37, 38, 39, 40, 43, 44, 47, 47,
-             // Size 8x16
-             32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39, 31,
-             31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31,
-             32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32,
-             32, 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 33, 34, 35, 35,
-             37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38,
-             43, 43, 44, 35, 37, 38, 38, 41, 45, 45, 46, 37, 39, 40, 40, 43, 47,
-             47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47,
-             47, 42, 42, 43, 43, 44, 47, 47, 48,
-             // Size 16x8
-             32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 31,
-             31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 31, 31,
-             31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 31, 31, 31,
-             31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 33, 33, 34, 34,
-             34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 37, 38, 38, 38, 39,
-             40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 37, 38, 38, 38, 39, 40,
-             40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41,
-             41, 43, 44, 44, 46, 47, 47, 47, 48,
-             // Size 16x32
-             32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31,
-             31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31,
-             31, 31, 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31,
-             31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31,
-             31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31,
-             31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31,
-             31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31,
-             32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 32, 32, 32, 32,
-             34, 36, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 32, 34,
-             37, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37,
-             40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40,
-             40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40,
-             40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40,
-             40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, 40, 40, 40, 40,
-             41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, 41, 41, 42,
-             44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, 43, 44,
-             33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33,
-             34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34,
-             35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35,
-             35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37,
-             37, 37, 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38,
-             38, 38, 39, 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39,
-             39, 40, 42, 44, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40,
-             41, 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41,
-             43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43,
-             45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45,
-             47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, 41, 42, 43, 45, 47,
-             47, 47, 47, 47, 48, 40, 41, 41, 42, 42, 42, 42, 42, 44, 45, 47, 47,
-             47, 47, 47, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47,
-             47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47,
-             48, 48,
-             // Size 32x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33,
-             33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34,
-             34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35,
-             36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 35, 37, 38,
-             39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40,
-             40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40,
-             40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41,
-             42, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
-             33, 34, 35, 36, 36, 36, 36, 37, 39, 40, 41, 41, 41, 41, 42, 42, 43,
-             43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36,
-             37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, 35,
-             35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 39, 40, 40,
-             40, 40, 40, 42, 43, 44, 45, 45, 45, 45, 45, 45, 46, 46, 37, 37, 38,
-             38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43,
-             43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38,
-             38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44,
-             45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38,
-             38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47,
-             47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39,
-             39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47,
-             47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41,
-             41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47,
-             47, 47, 48, 48, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
-             42, 43, 44, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48,
-             48, 48,
-             // Size 4x16
-             31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38, 31,
-             32, 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40, 33, 35,
-             40, 42, 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45, 38, 40, 45,
-             47, 38, 40, 45, 47, 39, 41, 45, 47, 42, 43, 46, 47,
-             // Size 16x4
-             31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 38, 38, 39, 42, 31,
-             31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 35, 35,
-             36, 36, 36, 37, 37, 38, 40, 40, 40, 43, 45, 45, 45, 46, 37, 38, 38,
-             38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47,
-             // Size 8x32
-             32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39, 31,
-             31, 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31,
-             31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31,
-             31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32,
-             34, 39, 39, 40, 30, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35,
-             40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40,
-             40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40,
-             41, 32, 33, 34, 34, 36, 41, 41, 42, 33, 34, 35, 35, 37, 42, 42, 43,
-             33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 33,
-             35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 34, 36,
-             37, 37, 39, 44, 44, 45, 35, 37, 38, 38, 41, 45, 45, 46, 36, 38, 39,
-             39, 42, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40,
-             43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43,
-             47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 40, 41, 42, 42, 44, 47,
-             47, 47, 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47,
-             48,
-             // Size 32x8
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33,
-             33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35,
-             35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 31, 31, 31, 31,
-             31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36,
-             37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38,
-             39, 40, 40, 40, 40, 41, 42, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34,
-             34, 34, 35, 35, 35, 35, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43,
-             43, 43, 43, 43, 44, 44, 44, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39,
-             40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47,
-             47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40,
-             40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47,
-             47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41,
-             41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48,
-             48},
-        },
-        // Quantizer level 13.
-        {
-            {// Luma
-             // Size 4x4
-             31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
-             // Size 8x8
-             31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31,
-             32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
-             32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
-             // Size 16x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31,
-             31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
-             33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
-             33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
-             33,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
-             31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
-             33, 33, 33, 33, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
-             33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33,
-             33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
-             33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
-             33, 33, 33, 33,
-             // Size 4x8
-             31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31,
-             32, 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33,
-             // Size 8x4
-             31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
-             // Size 8x16
-             32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31,
-             31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
-             32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32,
-             32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
-             32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32,
-             32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32,
-             33, 33, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33,
-             34, 32, 32, 32, 32, 32, 32, 33, 34,
-             // Size 16x8
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 34, 34, 34,
-             // Size 16x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
-             33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
-             34, 34,
-             // Size 32x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
-             33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
-             34, 34, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34,
-             34, 34,
-             // Size 4x16
-             31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31,
-             32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
-             32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32,
-             33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33,
-             // Size 16x4
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
-             // Size 8x32
-             32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31,
-             31, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
-             32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32,
-             32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
-             32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32,
-             32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
-             32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32,
-             32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
-             31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31,
-             32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
-             32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32,
-             32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
-             32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32,
-             32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32,
-             33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33,
-             34,
-             // Size 32x8
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31,
-             31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
-             33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34,
-             34},
-            {// Chroma
-             // Size 4x4
-             31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 32, 35, 34, 35, 35, 39,
-             // Size 8x8
-             31, 31, 31, 31, 30, 31, 33, 33, 31, 31, 31, 31, 31, 32, 34, 34, 31,
-             31, 31, 31, 31, 32, 34, 34, 31, 31, 31, 31, 31, 32, 35, 35, 30, 31,
-             31, 31, 32, 32, 35, 35, 31, 32, 32, 32, 32, 33, 36, 36, 33, 34, 34,
-             35, 35, 36, 39, 39, 33, 34, 34, 35, 35, 36, 39, 39,
-             // Size 16x16
-             32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31,
-             31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 33, 34, 35, 35, 35, 30, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 33, 34, 35, 35, 35, 30, 30, 31, 31, 31, 31, 31, 31,
-             32, 32, 32, 33, 34, 35, 35, 35, 30, 30, 31, 31, 31, 31, 31, 31, 32,
-             32, 32, 33, 34, 35, 35, 35, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32,
-             32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
-             34, 36, 37, 37, 37, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36,
-             37, 38, 38, 38, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38,
-             39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39,
-             39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39,
-             39,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
-             30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30,
-             30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30,
-             31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
-             33, 33, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
-             34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34,
-             34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34,
-             34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34,
-             35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
-             32, 33, 34, 35, 35, 35, 35, 35, 35, 35, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
-             34, 35, 35, 35, 35, 35, 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35,
-             35, 35, 35, 35, 35, 36, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 35, 35, 35,
-             35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35,
-             35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
-             30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30,
-             30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33,
-             34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35,
-             36, 36, 36, 36, 36, 36, 37, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 37, 37,
-             37, 37, 37, 37, 37, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33,
-             33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37, 37,
-             37, 37, 38, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34,
-             34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 38, 38, 38, 38, 38, 38,
-             39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35,
-             35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33,
-             33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35,
-             35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34,
-             34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35,
-             35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34,
-             34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36,
-             37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34,
-             34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37,
-             38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34,
-             34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39,
-             39, 39, 39, 39, 39, 40, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35,
-             35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40,
-             40, 40, 40, 40,
-             // Size 4x8
-             31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31,
-             32, 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40,
-             // Size 8x4
-             31, 31, 31, 31, 31, 31, 34, 34, 31, 31, 31, 32, 32, 33, 36, 36, 31,
-             31, 31, 32, 32, 33, 36, 36, 34, 35, 35, 36, 36, 37, 40, 40,
-             // Size 8x16
-             32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36, 31,
-             31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31,
-             31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31,
-             32, 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32,
-             32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32,
-             32, 35, 38, 31, 32, 33, 33, 33, 33, 36, 39, 33, 34, 34, 35, 35, 35,
-             37, 40, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38,
-             41, 33, 34, 35, 36, 36, 36, 38, 41,
-             // Size 16x8
-             32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31,
-             31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 31, 31,
-             31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31,
-             31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 33, 33, 34, 34, 34, 34,
-             34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 35, 36, 36, 37, 37, 37, 37,
-             38, 38, 38, 38, 39, 40, 41, 41, 41,
-             // Size 16x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33,
-             34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34,
-             36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 36,
-             38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38,
-             40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40,
-             30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30,
-             31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32,
-             32, 33, 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33,
-             33, 33, 33, 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34,
-             34, 34, 34, 34, 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35,
-             35, 35, 35, 35, 36, 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36,
-             36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36,
-             36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36,
-             36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36,
-             37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37,
-             38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38,
-             40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 36, 38, 39, 40,
-             42, 44,
-             // Size 32x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30,
-             30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33,
-             33, 34, 35, 35, 35, 35, 35, 35, 35, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34,
-             35, 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36,
-             36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36,
-             36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36,
-             36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 32, 32, 32, 32, 32,
-             33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34,
-             35, 35, 36, 37, 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 34, 34, 34,
-             34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37,
-             37, 38, 38, 38, 38, 38, 38, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35,
-             35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40,
-             40, 40, 40, 40, 40, 40, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37,
-             37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41,
-             41, 41, 41, 42, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39,
-             39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 42, 43, 43, 43, 43, 43,
-             43, 44,
-             // Size 4x16
-             31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31,
-             31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36, 31, 32,
-             32, 36, 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37, 33, 35, 35,
-             39, 34, 36, 36, 40, 34, 36, 36, 40, 34, 36, 36, 40,
-             // Size 16x4
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31,
-             31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31,
-             31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 34, 34, 35,
-             35, 35, 35, 36, 36, 36, 36, 36, 37, 39, 40, 40, 40,
-             // Size 8x32
-             32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35, 31,
-             31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31,
-             31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31,
-             31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31,
-             31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31,
-             31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32,
-             34, 37, 31, 31, 31, 32, 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34,
-             38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
-             30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30,
-             31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 31, 31,
-             32, 33, 33, 33, 35, 38, 31, 32, 33, 33, 33, 33, 36, 39, 32, 33, 34,
-             34, 34, 34, 37, 40, 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36,
-             36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36,
-             36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36,
-             38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, 36, 36, 39,
-             42,
-             // Size 32x8
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30,
-             30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
-             34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35,
-             36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36,
-             36, 36, 36, 36, 36, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34,
-             34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38,
-             38, 38, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37,
-             38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41,
-             42},
-        },
-        // Quantizer level 14.
-        {
-            {// Luma
-             // Size 4x4
-             31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
-             // Size 8x8
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
-             32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32,
-             32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
-             // Size 16x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
-             31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
-             31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32,
-             // Size 4x8
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31,
-             32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
-             // Size 8x4
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31,
-             31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
-             // Size 8x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31,
-             31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32,
-             32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
-             32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
-             32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
-             32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
-             32, 31, 31, 32, 32, 32, 32, 32, 32,
-             // Size 16x8
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32,
-             // Size 16x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
-             31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
-             31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32,
-             // Size 32x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
-             31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
-             31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32,
-             // Size 4x16
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31,
-             32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
-             32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32,
-             32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
-             // Size 16x4
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             // Size 8x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 32,
-             32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
-             32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
-             32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
-             32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
-             31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31,
-             31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
-             32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32,
-             32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
-             32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
-             32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
-             32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
-             32,
-             // Size 32x8
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-             32},
-            {// Chroma
-             // Size 4x4
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             // Size 8x8
-             31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31,
-             // Size 16x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             32,
-             // Size 32x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30,
-             30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 32, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 32, 32,
-             // Size 4x8
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32,
-             // Size 8x4
-             31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32,
-             // Size 8x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
-             32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32,
-             32, 30, 31, 31, 31, 32, 32, 32, 32,
-             // Size 16x8
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 32, 32, 32, 32,
-             // Size 16x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             32, 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
-             32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32,
-             // Size 32x16
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
-             32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
-             32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
-             32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
-             32, 32,
-             // Size 4x16
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             32, 31, 31, 32, 32, 31, 31, 32, 32, 30, 31, 32, 32,
-             // Size 16x4
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             // Size 8x32
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31,
-             31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31,
-             32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32,
-             32, 32, 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32,
-             32,
-             // Size 32x8
-             32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
-             32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
-             32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
-             31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
-             32},
-        },
-};
+constexpr uint8_t kQuantizerMatrix4x8
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][32] = {
+        {{32,  42,  75, 91,  33,  42,  69,  86,  37,  58, 84,
+          91,  49,  71, 103, 110, 65,  84,  125, 128, 80, 97,
+          142, 152, 91, 100, 145, 178, 104, 112, 146, 190},
+         {31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64,  48, 61, 75, 73,
+          54, 65, 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105}},
+        {{32,  42,  69, 88, 33,  42,  64, 83,  36,  56, 77,
+          88,  46,  67, 93, 105, 60,  79, 112, 122, 75, 92,
+          130, 144, 86, 95, 136, 167, 98, 105, 136, 177},
+         {31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72,
+          52, 64, 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102}},
+        {{32,  38,  62, 86, 32,  40,  58, 80, 34,  51, 68,
+          85,  44,  61, 85, 101, 54,  69, 98, 117, 72, 84,
+          118, 136, 82, 89, 129, 157, 92, 98, 127, 165},
+         {31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71,
+          50, 59, 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99}},
+        {{32,  35,  59, 83, 32,  36,  57, 78, 34,  47, 65,
+          82,  41,  53, 78, 97,  51,  61, 92, 111, 65, 73,
+          108, 129, 75, 81, 117, 148, 86, 92, 119, 154},
+         {31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70,
+          49, 55, 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95}},
+        {{32, 35, 51, 77,  32, 36, 50, 72,  34, 42, 54,  75,  38, 51, 67,  87,
+          48, 59, 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144},
+         {31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65,
+          47, 54, 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93}},
+        {{32, 35, 51, 75, 32, 36, 50, 71,  34, 42, 54, 73,  37, 50, 65,  84,
+          45, 56, 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136},
+         {31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64,
+          46, 54, 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90}},
+        {{32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58,  35, 43, 54, 68,
+          41, 48, 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111},
+         {31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57,
+          45, 49, 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79}},
+        {{32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59,
+          38, 40, 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97},
+         {31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53,
+          46, 47, 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73}},
+        {{32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54,
+          35, 38, 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83},
+         {31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50,
+          47, 48, 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67}},
+        {{31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42,
+          34, 37, 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67},
+         {31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45,
+          43, 47, 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59}},
+        {{31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40,
+          33, 34, 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56},
+         {31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46,
+          40, 44, 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55}},
+        {{31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36,
+          32, 33, 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48},
+         {31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47,
+          37, 39, 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53}},
+        {{31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          32, 32, 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36},
+         {31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40,
+          34, 36, 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47}},
+        {{31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+          31, 32, 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33},
+         {31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36,
+          31, 32, 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40}},
+        {{31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32,
+          31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32},
+         {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix4x16
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][64] = {
+        {{31,  44,  79,  96,  32,  41,  72,  90,  32,  42,  71,  86,  34,
+          48,  73,  83,  34,  54,  78,  89,  41,  63,  90,  95,  45,  67,
+          96,  102, 54,  75,  110, 111, 60,  79,  118, 123, 72,  90,  133,
+          135, 75,  92,  136, 149, 83,  100, 142, 160, 88,  100, 140, 173,
+          94,  101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197},
+         {31, 49, 63, 69,  32, 45, 57, 65,  36, 46, 56, 62,  43, 49, 57, 60,
+          46, 53, 60, 63,  45, 58, 67, 66,  46, 59, 71, 70,  50, 62, 78, 74,
+          52, 64, 82, 80,  57, 67, 89, 85,  59, 68, 90, 91,  62, 71, 91, 96,
+          63, 69, 89, 101, 65, 68, 89, 103, 67, 70, 86, 105, 69, 72, 88, 107}},
+        {{31,  44, 73,  93,  32,  41,  67,  87,  32,  42,  65,  83,  33,
+          44,  66, 81,  34,  54,  74,  86,  37,  58,  79,  92,  44,  66,
+          90,  98, 49,  71,  99,  107, 56,  77,  107, 117, 65,  84,  119,
+          129, 72, 90,  127, 141, 78,  95,  133, 151, 84,  95,  132, 163,
+          89,  95, 136, 169, 95,  101, 132, 175, 101, 108, 141, 183},
+         {31, 49, 61, 69, 32, 45, 55, 64,  36, 46, 54, 61,  41, 47, 54, 59,
+          46, 53, 59, 62, 46, 56, 62, 65,  46, 59, 68, 68,  48, 61, 73, 73,
+          51, 63, 77, 78, 54, 65, 82, 84,  57, 67, 86, 89,  60, 69, 88, 93,
+          62, 67, 86, 98, 64, 66, 87, 100, 65, 68, 83, 102, 67, 70, 86, 103}},
+        {{31,  39, 65,  90,  32,  38,  60,  84,  32,  39,  59,  81,  33,
+          40,  58, 78,  34,  47,  65,  83,  37,  54,  73,  89,  41,  58,
+          79,  94, 46,  62,  86,  102, 53,  68,  97,  112, 60,  73,  105,
+          123, 65, 78,  111, 134, 74,  85,  120, 143, 79,  90,  125, 154,
+          84,  90, 128, 158, 89,  95,  124, 164, 94,  101, 131, 170},
+         {31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58,
+          44, 51, 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71,
+          49, 58, 73, 77, 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91,
+          60, 66, 84, 95, 62, 64, 84, 97, 64, 66, 81, 99, 65, 68, 83, 100}},
+        {{31,  36, 62,  88,  32,  35, 58,  82,  32,  36,  57,  79,  33,
+          38,  56, 76,  34,  42,  61, 81,  34,  48,  66,  85,  39,  51,
+          74,  91, 44,  56,  82,  98, 49,  60,  90,  107, 54,  63,  95,
+          117, 60, 68,  102, 127, 68, 75,  110, 135, 75,  81,  117, 145,
+          79,  85, 120, 148, 84,  89, 116, 153, 88,  94,  123, 159},
+         {31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57,
+          43, 50, 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70,
+          48, 54, 70, 75, 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89,
+          58, 61, 81, 93, 60, 63, 82, 94, 62, 64, 79, 96, 63, 66, 81, 97}},
+        {{31,  36, 53,  81,  32,  35, 51,  76,  32,  35, 49,  73,  32,
+          37,  49, 71,  33,  41,  53, 74,  34,  48,  60, 80,  37,  50,
+          65,  85, 41,  53,  71,  91, 45,  56,  76,  98, 49,  60,  82,
+          105, 54, 63,  87,  112, 61, 69,  93,  121, 68, 75,  100, 130,
+          74,  80, 105, 137, 78,  84, 109, 142, 83,  88, 114, 148},
+         {31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56,
+          42, 49, 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68,
+          46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83,
+          56, 59, 73, 87, 58, 61, 75, 90, 60, 62, 76, 92, 62, 64, 78, 94}},
+        {{31, 36, 53, 79,  32, 35, 51, 75,  32, 34, 49,  72,  32, 36, 50,  71,
+          33, 38, 49, 69,  34, 42, 54, 73,  34, 48, 60,  78,  37, 50, 65,  84,
+          41, 53, 71, 90,  45, 56, 76, 96,  49, 60, 82,  103, 54, 63, 87,  110,
+          60, 68, 92, 118, 65, 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136},
+         {31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56,
+          40, 47, 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64,
+          45, 53, 61, 67, 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78,
+          52, 56, 70, 82, 54, 58, 72, 85, 57, 60, 75, 89, 59, 61, 75, 90}},
+        {{31, 34, 44, 65, 32, 34, 43, 62,  32, 33, 41, 59,  32, 35, 43, 59,
+          32, 37, 43, 58, 34, 39, 48, 63,  34, 42, 53, 67,  36, 44, 57, 71,
+          39, 46, 60, 76, 42, 48, 64, 81,  45, 51, 67, 85,  50, 54, 72, 92,
+          54, 58, 76, 98, 60, 63, 80, 105, 66, 68, 85, 111, 73, 74, 91, 118},
+         {31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51,
+          40, 47, 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58,
+          46, 49, 57, 61, 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71,
+          50, 52, 63, 73, 52, 53, 64, 76, 55, 55, 66, 79, 58, 58, 68, 82}},
+        {{31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53,
+          32, 34, 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63,
+          37, 40, 57, 67, 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79,
+          50, 50, 71, 86, 54, 53, 74, 90, 57, 56, 77, 93, 61, 58, 79, 97},
+         {31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49,
+          37, 44, 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55,
+          46, 47, 55, 58, 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65,
+          48, 47, 61, 68, 50, 48, 62, 70, 51, 49, 63, 71, 53, 50, 64, 73}},
+        {{31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49,
+          32, 34, 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54,
+          35, 38, 49, 60, 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71,
+          45, 45, 58, 75, 47, 47, 60, 77, 51, 50, 63, 82, 55, 54, 67, 87},
+         {31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48,
+          35, 43, 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50,
+          47, 48, 53, 54, 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61,
+          47, 46, 55, 63, 48, 47, 55, 64, 49, 47, 56, 66, 51, 49, 57, 68}},
+        {{31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41,
+          32, 34, 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44,
+          34, 37, 42, 48, 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58,
+          40, 41, 51, 60, 42, 43, 53, 63, 45, 45, 56, 66, 46, 46, 56, 67},
+         {31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45,
+          34, 42, 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47,
+          42, 47, 50, 49, 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56,
+          47, 46, 53, 57, 46, 46, 53, 58, 48, 46, 54, 59, 48, 46, 54, 59}},
+        {{31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37,
+          32, 32, 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40,
+          32, 34, 37, 40, 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51,
+          35, 36, 43, 51, 38, 39, 45, 54, 38, 39, 45, 54, 42, 42, 48, 58},
+         {31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45,
+          31, 35, 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46,
+          38, 43, 47, 46, 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53,
+          48, 47, 50, 53, 47, 46, 50, 54, 47, 46, 50, 54, 47, 45, 49, 56}},
+        {{31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35,
+          32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36,
+          32, 32, 34, 37, 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41,
+          34, 34, 37, 42, 34, 34, 37, 44, 35, 34, 38, 48, 35, 34, 38, 48},
+         {31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46,
+          31, 32, 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47,
+          36, 37, 44, 47, 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49,
+          42, 43, 47, 50, 44, 44, 47, 51, 48, 46, 48, 53, 48, 46, 48, 53}},
+        {{31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33,
+          32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35,
+          32, 33, 34, 35, 32, 33, 34, 35, 33, 33, 35, 36, 34, 34, 36, 37},
+         {31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38,
+          31, 32, 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40,
+          33, 35, 40, 42, 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45,
+          38, 40, 45, 47, 38, 40, 45, 47, 39, 41, 45, 47, 42, 43, 46, 47}},
+        {{31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+          31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+          31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+          32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33},
+         {31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35,
+          31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36,
+          31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37,
+          33, 35, 35, 39, 34, 36, 36, 40, 34, 36, 36, 40, 34, 36, 36, 40}},
+        {{31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+          31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+          31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+          31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32},
+         {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 30, 31, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix8x16
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][128] = {
+        {{32,  32,  36,  53,  65,  87,  93,  99,  31,  33,  34,  49,  59,
+          78,  86,  93,  32,  34,  36,  50,  59,  77,  82,  89,  34,  37,
+          42,  54,  63,  79,  80,  88,  36,  38,  48,  60,  68,  84,  86,
+          90,  44,  43,  53,  71,  79,  95,  94,  97,  48,  46,  56,  76,
+          85,  102, 105, 105, 58,  54,  63,  87,  98,  116, 112, 115, 65,
+          58,  68,  92,  105, 124, 122, 124, 79,  70,  79,  104, 118, 141,
+          135, 135, 82,  72,  81,  106, 121, 144, 149, 146, 91,  80,  88,
+          106, 130, 148, 162, 159, 97,  86,  94,  107, 128, 157, 167, 171,
+          103, 93,  98,  114, 131, 150, 174, 186, 110, 100, 101, 117, 138,
+          161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203},
+         {32, 37, 48, 52, 57, 66, 68,  71,  30, 40, 46, 48, 52, 60, 63,  66,
+          33, 43, 47, 47, 51, 59, 60,  63,  42, 47, 50, 50, 53, 60, 59,  62,
+          49, 48, 53, 54, 57, 62, 62,  62,  49, 46, 53, 61, 64, 69, 66,  66,
+          50, 46, 54, 64, 67, 73, 72,  70,  54, 49, 55, 68, 73, 80, 76,  75,
+          57, 50, 56, 70, 76, 84, 80,  79,  63, 55, 60, 75, 82, 92, 87,  84,
+          64, 56, 61, 75, 83, 93, 93,  89,  68, 59, 64, 74, 86, 94, 98,  94,
+          70, 62, 66, 73, 83, 96, 99,  98,  72, 64, 66, 75, 83, 92, 101, 104,
+          74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82, 91, 101, 109}},
+        {{32,  32,  36,  47,  65,  79,  90,  96,  31,  32,  35,  44,  60,
+          72,  84,  90,  32,  34,  36,  45,  59,  71,  80,  87,  32,  35,
+          40,  47,  60,  71,  78,  85,  36,  37,  48,  56,  68,  78,  83,
+          87,  39,  40,  50,  60,  73,  84,  91,  94,  47,  45,  56,  69,
+          84,  95,  101, 101, 53,  50,  60,  75,  92,  103, 108, 110, 61,
+          56,  65,  81,  100, 113, 116, 118, 71,  64,  73,  89,  111, 125,
+          129, 129, 79,  70,  79,  95,  118, 133, 142, 138, 86,  76,  84,
+          100, 124, 140, 153, 150, 92,  82,  89,  101, 121, 148, 157, 161,
+          98,  88,  93,  108, 124, 141, 163, 174, 104, 94,  95,  110, 129,
+          151, 171, 181, 110, 100, 98,  111, 127, 147, 169, 188},
+         {32, 35, 48, 50, 57, 63, 68,  70,  30, 38, 46, 46, 52, 58, 63, 65,
+          33, 41, 47, 46, 51, 56, 60,  63,  39, 46, 48, 47, 51, 55, 58, 61,
+          49, 48, 53, 54, 57, 60, 61,  61,  48, 46, 53, 56, 60, 64, 65, 65,
+          50, 46, 54, 61, 66, 70, 71,  69,  52, 47, 54, 63, 71, 75, 75, 74,
+          55, 49, 56, 65, 74, 79, 79,  78,  60, 53, 58, 68, 79, 85, 85, 82,
+          63, 55, 60, 70, 82, 89, 91,  87,  66, 58, 62, 72, 84, 91, 95, 91,
+          68, 60, 64, 71, 81, 94, 97,  96,  70, 62, 65, 73, 81, 89, 98, 101,
+          72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79, 89, 98, 105}},
+        {{32,  32,  36,  44,  58,  79,  88,  93,  31,  32,  35,  41,  54,
+          73,  81,  88,  32,  33,  36,  42,  53,  71,  78,  84,  32,  34,
+          38,  42,  52,  69,  76,  82,  34,  36,  44,  50,  59,  75,  81,
+          84,  39,  39,  50,  58,  68,  84,  88,  90,  44,  42,  53,  63,
+          74,  90,  97,  97,  49,  46,  57,  67,  81,  97,  104, 105, 57,
+          53,  63,  74,  90,  108, 111, 113, 65,  59,  68,  79,  97,  118,
+          123, 122, 71,  64,  73,  84,  102, 125, 135, 131, 81,  72,  80,
+          91,  110, 135, 145, 141, 87,  77,  85,  96,  114, 140, 148, 151,
+          92,  83,  88,  102, 117, 133, 153, 163, 98,  88,  89,  103, 121,
+          141, 160, 169, 103, 94,  92,  103, 119, 137, 158, 175},
+         {32, 34, 48, 49, 54, 63, 67, 69,  31, 36, 46, 46, 50, 58, 62, 65,
+          33, 40, 47, 46, 49, 56, 59, 62,  37, 44, 47, 45, 48, 54, 57, 60,
+          44, 46, 51, 51, 53, 59, 60, 61,  48, 46, 53, 56, 58, 64, 64, 64,
+          49, 45, 53, 58, 62, 67, 70, 68,  51, 47, 54, 60, 65, 71, 73, 72,
+          54, 49, 55, 62, 70, 77, 77, 76,  57, 51, 56, 64, 73, 82, 83, 81,
+          60, 53, 58, 65, 75, 85, 89, 85,  64, 57, 61, 68, 78, 89, 93, 89,
+          66, 59, 63, 69, 79, 91, 94, 93,  68, 61, 63, 71, 79, 87, 96, 98,
+          70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86, 95, 102}},
+        {{32, 31, 35,  44,  53,  65,  82,  90, 31, 32, 34,  41,  50,  61,  76,
+          85, 31, 33,  35,  42,  49,  59,  73, 81, 32, 34,  37,  42,  49,  58,
+          71, 79, 34,  35,  41,  48,  54,  63, 76, 81, 36,  36,  46,  54,  60,
+          68, 80, 87,  41,  40,  49,  60,  67, 76, 88, 93,  47,  44,  53,  66,
+          75, 84, 97,  101, 53,  50,  57,  71, 82, 92, 106, 108, 58,  54,  61,
+          75, 87, 98,  112, 116, 65,  59,  66, 79, 92, 105, 120, 124, 74,  67,
+          73, 86, 100, 113, 131, 134, 82,  73, 79, 92, 105, 120, 139, 142, 87,
+          78, 83, 96,  110, 125, 144, 153, 92, 83, 84, 97,  114, 132, 150, 157,
+          97, 88, 86,  97,  111, 128, 147, 163},
+         {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64,
+          33, 37, 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59,
+          42, 44, 49, 49, 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63,
+          48, 46, 51, 57, 59, 61, 66, 67, 50, 46, 52, 59, 63, 66, 71, 71,
+          52, 47, 53, 61, 66, 71, 75, 74, 54, 49, 54, 62, 68, 73, 79, 79,
+          57, 51, 55, 64, 70, 76, 83, 83, 61, 55, 58, 66, 73, 80, 87, 87,
+          64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69, 77, 84, 93, 95,
+          68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83, 92, 98}},
+        {{32, 31, 33, 40,  51,  65,  79,  87, 31, 32, 33, 39,  49,  61,  74,
+          82, 31, 32, 34,  38,  47,  59,  71, 79, 32, 33, 36,  40,  48,  58,
+          69, 77, 33, 34,  38,  44,  52,  62, 72, 78, 36, 35,  42,  51,  58,
+          68, 78, 84, 39,  38,  44,  54,  63, 73, 84, 89, 44,  41,  46,  59,
+          69, 79, 90, 96,  48,  45,  50,  62, 74, 85, 96, 103, 53,  49,  53,
+          66, 79, 92, 103, 111, 58,  54,  57, 70, 84, 98, 110, 118, 66,  60,
+          63, 75, 90, 106, 119, 126, 74,  67, 69, 81, 97, 113, 128, 134, 81,
+          73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147,
+          91, 82, 80, 90,  103, 119, 137, 151},
+         {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63,
+          31, 35, 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58,
+          41, 43, 48, 49, 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62,
+          48, 46, 49, 54, 57, 60, 64, 65, 49, 45, 48, 56, 61, 64, 67, 69,
+          50, 46, 49, 57, 63, 67, 71, 73, 52, 48, 50, 58, 65, 71, 75, 77,
+          54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 53, 61, 69, 77, 82, 85,
+          61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65, 73, 82, 89, 92,
+          66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81, 89, 95}},
+        {{32, 31, 32, 36, 44, 53,  65,  79,  31, 32, 32, 35, 42, 51,  62,  75,
+          31, 32, 33, 34, 41, 49,  59,  72,  32, 32, 34, 36, 42, 50,  59,  71,
+          32, 33, 35, 38, 42, 49,  58,  69,  34, 34, 37, 42, 48, 54,  63,  73,
+          36, 34, 38, 48, 54, 60,  68,  78,  39, 37, 40, 50, 58, 65,  73,  84,
+          44, 41, 43, 53, 63, 71,  79,  90,  48, 45, 46, 56, 67, 76,  85,  96,
+          53, 49, 50, 60, 71, 82,  92,  103, 58, 54, 54, 63, 75, 87,  98,  110,
+          65, 60, 58, 68, 79, 92,  105, 118, 71, 65, 63, 73, 84, 97,  111, 125,
+          79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136},
+         {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60,
+          30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56,
+          37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57,
+          49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64,
+          49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+          52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78,
+          57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85,
+          63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90}},
+        {{32, 31, 32, 36, 44, 53, 62,  73,  31, 32, 32, 35, 42, 51,  59,  69,
+          31, 32, 33, 34, 41, 49, 57,  66,  32, 32, 34, 36, 42, 50,  57,  65,
+          32, 33, 35, 38, 42, 49, 56,  64,  34, 34, 37, 42, 48, 54,  61,  69,
+          35, 34, 38, 47, 52, 59, 65,  73,  38, 36, 40, 49, 56, 63,  69,  77,
+          41, 39, 41, 51, 60, 67, 74,  81,  44, 42, 43, 54, 64, 72,  79,  86,
+          48, 45, 46, 56, 67, 76, 83,  91,  53, 49, 50, 60, 71, 82,  90,  99,
+          58, 54, 54, 63, 75, 87, 95,  105, 65, 60, 58, 68, 79, 92,  102, 112,
+          71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127},
+         {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57,
+          30, 32, 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54,
+          37, 40, 47, 47, 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56,
+          47, 46, 48, 52, 53, 53, 55, 58, 48, 46, 47, 53, 55, 56, 58, 61,
+          48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 62, 64, 66,
+          50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 70, 73,
+          54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56, 64, 70, 75, 79,
+          60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75, 80, 86}},
+        {{32, 31, 32, 35, 39, 44, 53, 65,  31, 32, 32, 35, 38, 42, 51, 62,
+          31, 32, 33, 34, 37, 41, 49, 59,  31, 32, 34, 35, 38, 42, 49, 59,
+          32, 32, 34, 36, 39, 42, 49, 58,  32, 33, 35, 37, 40, 42, 49, 58,
+          34, 34, 37, 41, 44, 48, 54, 63,  36, 34, 38, 46, 50, 54, 60, 68,
+          38, 37, 40, 47, 52, 57, 64, 72,  41, 39, 41, 49, 54, 60, 67, 76,
+          44, 41, 43, 51, 57, 63, 71, 79,  48, 45, 46, 54, 60, 67, 76, 85,
+          53, 49, 50, 57, 64, 71, 82, 92,  57, 53, 53, 60, 67, 74, 86, 97,
+          61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105},
+         {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54,
+          30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51,
+          35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50,
+          42, 43, 47, 49, 50, 49, 50, 53, 49, 46, 48, 52, 53, 53, 54, 57,
+          48, 46, 47, 51, 54, 55, 57, 59, 48, 45, 46, 51, 54, 57, 59, 61,
+          49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 64, 67,
+          52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73,
+          55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76}},
+        {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51,
+          31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49,
+          32, 32, 34, 34, 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49,
+          32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54,
+          36, 34, 37, 40, 48, 54, 56, 60, 38, 36, 39, 41, 49, 56, 58, 63,
+          39, 37, 40, 42, 50, 58, 60, 65, 44, 41, 42, 45, 53, 63, 66, 71,
+          47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48, 57, 67, 71, 77,
+          53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75, 79, 87},
+         {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50,
+          30, 32, 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48,
+          33, 36, 41, 44, 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47,
+          39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50,
+          49, 46, 48, 49, 53, 53, 54, 54, 48, 46, 47, 48, 53, 55, 55, 56,
+          48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 45, 47, 53, 58, 59, 61,
+          50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48, 54, 60, 61, 64,
+          52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62, 65, 68}},
+        {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45,
+          31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44,
+          31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45,
+          32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47,
+          34, 34, 35, 37, 41, 42, 48, 50, 35, 34, 36, 38, 45, 47, 52, 55,
+          36, 34, 36, 38, 46, 48, 54, 56, 39, 37, 39, 40, 48, 50, 58, 60,
+          41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66,
+          47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70},
+         {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48,
+          31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46,
+          33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46,
+          37, 40, 43, 47, 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47,
+          42, 43, 44, 47, 49, 50, 49, 50, 47, 46, 46, 48, 51, 52, 53, 53,
+          49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 51, 53, 56, 56,
+          48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59,
+          50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61}},
+        {{32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42,
+          31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41,
+          31, 32, 32, 33, 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42,
+          32, 32, 32, 34, 34, 36, 36, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+          32, 33, 33, 35, 35, 38, 38, 42, 34, 34, 34, 37, 37, 42, 42, 48,
+          34, 34, 34, 37, 37, 42, 42, 48, 36, 34, 34, 38, 38, 48, 48, 54,
+          36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40, 40, 50, 50, 58,
+          39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53, 53, 63},
+         {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47,
+          31, 31, 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45,
+          30, 32, 32, 40, 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46,
+          33, 36, 36, 43, 43, 47, 47, 46, 37, 40, 40, 47, 47, 47, 47, 45,
+          37, 40, 40, 47, 47, 47, 47, 45, 42, 43, 43, 47, 47, 50, 50, 49,
+          42, 43, 43, 47, 47, 50, 50, 49, 49, 46, 46, 48, 48, 53, 53, 53,
+          49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 56,
+          48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53, 53, 58}},
+        {{32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35,
+          31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35,
+          31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34,
+          31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+          32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 34, 35, 35, 37, 38,
+          32, 32, 33, 34, 35, 35, 37, 38, 33, 33, 33, 35, 36, 36, 40, 41,
+          34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44,
+          36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48},
+         {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47,
+          31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46,
+          30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+          33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+          35, 37, 37, 40, 44, 44, 46, 47, 37, 39, 40, 43, 47, 47, 47, 47,
+          37, 39, 40, 43, 47, 47, 47, 47, 41, 42, 42, 44, 47, 47, 49, 49,
+          42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51,
+          49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53}},
+        {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33,
+          31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+          31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+          31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+          31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+          32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 35, 35, 35,
+          32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+          32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37, 37, 38},
+         {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39,
+          31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+          31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41,
+          30, 31, 32, 32, 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41,
+          33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+          33, 35, 36, 36, 38, 43, 43, 44, 35, 37, 38, 38, 41, 45, 45, 46,
+          37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+          39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48}},
+        {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 33,
+          31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34,
+          32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34},
+         {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36,
+          31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37,
+          31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+          31, 31, 31, 32, 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38,
+          30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+          30, 31, 32, 32, 32, 32, 35, 38, 31, 32, 33, 33, 33, 33, 36, 39,
+          33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41,
+          33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+          31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+          30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix8x32
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][256] = {
+        {{32,  32,  36,  53,  65,  87,  93,  99,  31,  32,  35,  51,  62,  82,
+          88,  94,  31,  33,  34,  49,  59,  78,  86,  93,  31,  33,  35,  49,
+          59,  78,  84,  90,  32,  34,  36,  50,  59,  77,  82,  89,  32,  35,
+          38,  49,  58,  75,  82,  89,  34,  37,  42,  54,  63,  79,  80,  88,
+          35,  37,  45,  57,  65,  82,  84,  87,  36,  38,  48,  60,  68,  84,
+          86,  90,  39,  40,  50,  65,  73,  89,  91,  93,  44,  43,  53,  71,
+          79,  95,  94,  97,  46,  44,  55,  73,  82,  98,  98,  99,  48,  46,
+          56,  76,  85,  102, 105, 105, 53,  50,  60,  82,  92,  109, 107, 107,
+          58,  54,  63,  87,  98,  116, 112, 115, 61,  56,  66,  89,  101, 120,
+          119, 116, 65,  58,  68,  92,  105, 124, 122, 124, 71,  63,  73,  97,
+          111, 132, 130, 127, 79,  70,  79,  104, 118, 141, 135, 135, 81,  71,
+          80,  105, 119, 142, 140, 139, 82,  72,  81,  106, 121, 144, 149, 146,
+          88,  77,  85,  108, 126, 149, 153, 152, 91,  80,  88,  106, 130, 148,
+          162, 159, 94,  83,  91,  105, 131, 153, 165, 166, 97,  86,  94,  107,
+          128, 157, 167, 171, 100, 89,  97,  111, 127, 152, 173, 182, 103, 93,
+          98,  114, 131, 150, 174, 186, 107, 96,  100, 117, 136, 155, 177, 191,
+          110, 100, 101, 117, 138, 161, 183, 193, 114, 104, 103, 117, 137, 159,
+          185, 201, 118, 107, 105, 118, 136, 157, 182, 203, 122, 111, 107, 119,
+          136, 156, 179, 204},
+         {32, 37, 48, 52, 57, 66, 68,  71,  31, 38, 47, 50, 54, 63, 65,  67,
+          30, 40, 46, 48, 52, 60, 63,  66,  32, 41, 46, 48, 51, 59, 62,  64,
+          33, 43, 47, 47, 51, 59, 60,  63,  37, 47, 47, 47, 50, 57, 60,  62,
+          42, 47, 50, 50, 53, 60, 59,  62,  45, 47, 51, 52, 55, 61, 61,  61,
+          49, 48, 53, 54, 57, 62, 62,  62,  48, 47, 53, 57, 60, 66, 65,  64,
+          49, 46, 53, 61, 64, 69, 66,  66,  49, 46, 53, 62, 65, 71, 68,  67,
+          50, 46, 54, 64, 67, 73, 72,  70,  52, 47, 54, 66, 71, 77, 73,  71,
+          54, 49, 55, 68, 73, 80, 76,  75,  55, 49, 56, 69, 75, 82, 79,  76,
+          57, 50, 56, 70, 76, 84, 80,  79,  60, 52, 58, 72, 79, 88, 84,  81,
+          63, 55, 60, 75, 82, 92, 87,  84,  64, 55, 61, 75, 82, 92, 89,  86,
+          64, 56, 61, 75, 83, 93, 93,  89,  67, 58, 63, 76, 85, 95, 94,  91,
+          68, 59, 64, 74, 86, 94, 98,  94,  69, 60, 65, 72, 85, 95, 99,  97,
+          70, 62, 66, 73, 83, 96, 99,  98,  71, 63, 67, 74, 82, 93, 102, 102,
+          72, 64, 66, 75, 83, 92, 101, 104, 73, 65, 66, 75, 84, 93, 102, 106,
+          74, 67, 66, 74, 84, 94, 103, 106, 75, 68, 66, 74, 83, 93, 103, 109,
+          76, 69, 67, 73, 82, 91, 101, 109, 77, 70, 67, 73, 81, 90, 99,  108}},
+        {{32,  32,  36,  47,  65,  79,  90,  96,  31,  32,  35,  45,  62,  75,
+          86,  91,  31,  32,  35,  44,  60,  72,  84,  90,  31,  33,  35,  44,
+          59,  71,  82,  87,  32,  34,  36,  45,  59,  71,  80,  87,  32,  35,
+          38,  45,  58,  69,  80,  86,  32,  35,  40,  47,  60,  71,  78,  85,
+          34,  36,  42,  50,  63,  73,  82,  84,  36,  37,  48,  56,  68,  78,
+          83,  87,  38,  39,  49,  58,  71,  81,  88,  90,  39,  40,  50,  60,
+          73,  84,  91,  94,  44,  42,  53,  66,  79,  90,  94,  96,  47,  45,
+          56,  69,  84,  95,  101, 101, 49,  47,  57,  71,  86,  97,  103, 102,
+          53,  50,  60,  75,  92,  103, 108, 110, 58,  54,  63,  79,  98,  110,
+          114, 111, 61,  56,  65,  81,  100, 113, 116, 118, 65,  59,  68,  84,
+          105, 118, 124, 121, 71,  64,  73,  89,  111, 125, 129, 129, 76,  68,
+          76,  92,  115, 130, 134, 132, 79,  70,  79,  95,  118, 133, 142, 138,
+          82,  73,  81,  97,  121, 136, 145, 144, 86,  76,  84,  100, 124, 140,
+          153, 150, 89,  79,  87,  99,  124, 145, 156, 156, 92,  82,  89,  101,
+          121, 148, 157, 161, 95,  85,  92,  105, 120, 143, 163, 171, 98,  88,
+          93,  108, 124, 141, 163, 174, 101, 91,  94,  110, 128, 146, 166, 179,
+          104, 94,  95,  110, 129, 151, 171, 181, 107, 97,  96,  110, 128, 149,
+          173, 188, 110, 100, 98,  111, 127, 147, 169, 188, 114, 104, 100, 111,
+          127, 145, 166, 190},
+         {32, 35, 48, 50, 57, 63, 68,  70,  31, 37, 47, 48, 54, 60, 64,  66,
+          30, 38, 46, 46, 52, 58, 63,  65,  31, 38, 46, 46, 52, 57, 61,  63,
+          33, 41, 47, 46, 51, 56, 60,  63,  37, 45, 47, 46, 50, 54, 59,  62,
+          39, 46, 48, 47, 51, 55, 58,  61,  42, 46, 50, 50, 53, 57, 60,  60,
+          49, 48, 53, 54, 57, 60, 61,  61,  48, 47, 53, 55, 58, 62, 64,  63,
+          48, 46, 53, 56, 60, 64, 65,  65,  49, 45, 53, 59, 64, 67, 67,  66,
+          50, 46, 54, 61, 66, 70, 71,  69,  51, 47, 54, 61, 68, 71, 72,  70,
+          52, 47, 54, 63, 71, 75, 75,  74,  54, 49, 55, 65, 73, 78, 78,  74,
+          55, 49, 56, 65, 74, 79, 79,  78,  57, 50, 56, 66, 76, 82, 83,  79,
+          60, 53, 58, 68, 79, 85, 85,  82,  62, 54, 60, 69, 81, 87, 87,  84,
+          63, 55, 60, 70, 82, 89, 91,  87,  64, 56, 61, 71, 83, 90, 92,  89,
+          66, 58, 62, 72, 84, 91, 95,  91,  67, 59, 63, 71, 83, 93, 96,  94,
+          68, 60, 64, 71, 81, 94, 97,  96,  69, 61, 65, 72, 80, 91, 99,  100,
+          70, 62, 65, 73, 81, 89, 98,  101, 71, 64, 65, 73, 82, 90, 99,  103,
+          72, 65, 65, 72, 82, 92, 100, 103, 73, 66, 65, 72, 81, 90, 100, 105,
+          74, 67, 65, 71, 79, 89, 98,  105, 75, 68, 65, 71, 78, 87, 96,  105}},
+        {{32,  32,  36,  44,  58,  79,  88,  93,  31,  32,  35,  42,  55,  75,
+          83,  88,  31,  32,  35,  41,  54,  73,  81,  88,  31,  32,  34,  41,
+          53,  72,  79,  84,  32,  33,  36,  42,  53,  71,  78,  84,  32,  34,
+          37,  42,  53,  70,  77,  83,  32,  34,  38,  42,  52,  69,  76,  82,
+          34,  35,  42,  48,  57,  73,  79,  81,  34,  36,  44,  50,  59,  75,
+          81,  84,  36,  37,  48,  54,  63,  78,  85,  86,  39,  39,  50,  58,
+          68,  84,  88,  90,  40,  40,  51,  59,  70,  85,  91,  92,  44,  42,
+          53,  63,  74,  90,  97,  97,  47,  45,  56,  66,  79,  95,  99,  98,
+          49,  46,  57,  67,  81,  97,  104, 105, 53,  50,  60,  71,  86,  103,
+          109, 106, 57,  53,  63,  74,  90,  108, 111, 113, 59,  54,  64,  75,
+          91,  111, 119, 115, 65,  59,  68,  79,  97,  118, 123, 122, 69,  62,
+          71,  83,  100, 122, 127, 125, 71,  64,  73,  84,  102, 125, 135, 131,
+          79,  71,  79,  90,  109, 133, 137, 136, 81,  72,  80,  91,  110, 135,
+          145, 141, 82,  73,  81,  92,  111, 136, 147, 147, 87,  77,  85,  96,
+          114, 140, 148, 151, 90,  80,  87,  99,  113, 135, 153, 160, 92,  83,
+          88,  102, 117, 133, 153, 163, 95,  85,  88,  103, 120, 137, 155, 168,
+          98,  88,  89,  103, 121, 141, 160, 169, 100, 91,  90,  103, 120, 139,
+          161, 175, 103, 94,  92,  103, 119, 137, 158, 175, 106, 97,  93,  104,
+          118, 135, 155, 176},
+         {32, 34, 48, 49, 54, 63, 67, 69,  31, 35, 47, 47, 51, 60, 63, 65,
+          31, 36, 46, 46, 50, 58, 62, 65,  30, 36, 46, 45, 49, 57, 60, 62,
+          33, 40, 47, 46, 49, 56, 59, 62,  35, 42, 47, 45, 48, 55, 58, 61,
+          37, 44, 47, 45, 48, 54, 57, 60,  42, 45, 50, 49, 51, 57, 59, 59,
+          44, 46, 51, 51, 53, 59, 60, 61,  49, 47, 53, 53, 55, 60, 63, 62,
+          48, 46, 53, 56, 58, 64, 64, 64,  48, 46, 53, 56, 59, 65, 66, 65,
+          49, 45, 53, 58, 62, 67, 70, 68,  50, 46, 54, 59, 65, 70, 70, 68,
+          51, 47, 54, 60, 65, 71, 73, 72,  52, 47, 54, 61, 68, 75, 76, 73,
+          54, 49, 55, 62, 70, 77, 77, 76,  54, 49, 55, 62, 70, 78, 81, 77,
+          57, 51, 56, 64, 73, 82, 83, 81,  59, 52, 58, 65, 74, 84, 85, 82,
+          60, 53, 58, 65, 75, 85, 89, 85,  63, 56, 60, 67, 77, 89, 90, 87,
+          64, 57, 61, 68, 78, 89, 93, 89,  64, 57, 61, 68, 78, 90, 94, 92,
+          66, 59, 63, 69, 79, 91, 94, 93,  67, 60, 63, 70, 78, 88, 96, 97,
+          68, 61, 63, 71, 79, 87, 96, 98,  69, 62, 63, 71, 80, 88, 96, 100,
+          70, 63, 63, 70, 80, 89, 97, 100, 71, 64, 63, 70, 78, 88, 97, 102,
+          72, 65, 63, 69, 77, 86, 95, 102, 73, 66, 63, 69, 76, 84, 93, 101}},
+        {{32,  31,  35,  44,  53,  65,  82,  90,  31,  32,  35,  42,  51,  62,
+          78,  86,  31,  32,  34,  41,  50,  61,  76,  85,  31,  32,  34,  41,
+          49,  59,  74,  82,  31,  33,  35,  42,  49,  59,  73,  81,  32,  33,
+          36,  42,  50,  59,  73,  80,  32,  34,  37,  42,  49,  58,  71,  79,
+          32,  34,  39,  44,  51,  60,  73,  78,  34,  35,  41,  48,  54,  63,
+          76,  81,  35,  36,  45,  52,  59,  67,  79,  83,  36,  36,  46,  54,
+          60,  68,  80,  87,  39,  39,  48,  58,  65,  73,  86,  88,  41,  40,
+          49,  60,  67,  76,  88,  93,  44,  42,  51,  63,  71,  79,  92,  94,
+          47,  44,  53,  66,  75,  84,  97,  101, 48,  45,  54,  67,  76,  85,
+          98,  101, 53,  50,  57,  71,  82,  92,  106, 108, 55,  51,  59,  72,
+          84,  94,  108, 110, 58,  54,  61,  75,  87,  98,  112, 116, 63,  58,
+          65,  78,  91,  103, 118, 119, 65,  59,  66,  79,  92,  105, 120, 124,
+          71,  64,  71,  84,  97,  111, 127, 129, 74,  67,  73,  86,  100, 113,
+          131, 134, 79,  71,  77,  90,  104, 118, 136, 139, 82,  73,  79,  92,
+          105, 120, 139, 142, 82,  74,  79,  92,  106, 121, 139, 150, 87,  78,
+          83,  96,  110, 125, 144, 153, 89,  81,  83,  97,  113, 128, 145, 157,
+          92,  83,  84,  97,  114, 132, 150, 157, 94,  85,  85,  97,  112, 130,
+          151, 163, 97,  88,  86,  97,  111, 128, 147, 163, 99,  91,  87,  97,
+          110, 126, 144, 163},
+         {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64,
+          31, 34, 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61,
+          33, 37, 46, 45, 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60,
+          37, 43, 47, 45, 47, 50, 55, 59, 39, 43, 48, 47, 48, 51, 56, 58,
+          42, 44, 49, 49, 50, 53, 58, 60, 47, 46, 51, 53, 53, 56, 61, 61,
+          49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 56, 57, 60, 64, 64,
+          48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58, 61, 64, 68, 67,
+          50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67, 71, 71,
+          52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75,
+          54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80,
+          57, 51, 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85,
+          61, 55, 58, 66, 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89,
+          64, 57, 60, 68, 75, 83, 91, 91, 64, 58, 60, 68, 75, 83, 91, 94,
+          66, 59, 61, 69, 77, 84, 93, 95, 67, 60, 61, 69, 78, 85, 93, 97,
+          68, 61, 61, 68, 77, 86, 94, 97, 69, 62, 61, 68, 76, 85, 94, 99,
+          70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, 74, 82, 90, 98}},
+        {{32,  31,  33,  40,  51,  65,  79,  87,  31,  32,  33,  39,  49,  62,
+          75,  83,  31,  32,  33,  39,  49,  61,  74,  82,  31,  32,  33,  38,
+          47,  59,  72,  79,  31,  32,  34,  38,  47,  59,  71,  79,  32,  33,
+          35,  39,  48,  59,  71,  78,  32,  33,  36,  40,  48,  58,  69,  77,
+          32,  33,  36,  41,  48,  58,  69,  75,  33,  34,  38,  44,  52,  62,
+          72,  78,  34,  34,  39,  45,  53,  63,  73,  80,  36,  35,  42,  51,
+          58,  68,  78,  84,  36,  35,  42,  51,  59,  68,  79,  85,  39,  38,
+          44,  54,  63,  73,  84,  89,  40,  39,  45,  56,  65,  75,  85,  90,
+          44,  41,  46,  59,  69,  79,  90,  96,  46,  43,  48,  60,  72,  82,
+          93,  97,  48,  45,  50,  62,  74,  85,  96,  103, 52,  48,  52,  65,
+          78,  90,  101, 105, 53,  49,  53,  66,  79,  92,  103, 111, 58,  53,
+          57,  69,  83,  97,  109, 113, 58,  54,  57,  70,  84,  98,  110, 118,
+          65,  59,  62,  74,  89,  105, 118, 122, 66,  60,  63,  75,  90,  106,
+          119, 126, 71,  65,  67,  79,  94,  111, 125, 131, 74,  67,  69,  81,
+          97,  113, 128, 134, 79,  72,  73,  85,  101, 118, 133, 141, 81,  73,
+          75,  86,  102, 120, 135, 143, 82,  74,  75,  87,  103, 121, 136, 147,
+          86,  78,  78,  90,  106, 124, 140, 147, 88,  80,  80,  90,  105, 122,
+          140, 152, 91,  82,  80,  90,  103, 119, 137, 151, 93,  85,  81,  90,
+          103, 117, 134, 152},
+         {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63,
+          31, 33, 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60,
+          31, 35, 43, 46, 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59,
+          35, 39, 46, 46, 47, 50, 55, 58, 37, 41, 47, 46, 46, 50, 54, 57,
+          41, 43, 48, 49, 49, 52, 57, 59, 42, 43, 48, 49, 50, 53, 57, 60,
+          49, 47, 50, 53, 54, 57, 60, 62, 49, 47, 50, 53, 54, 57, 61, 63,
+          48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55, 58, 61, 65, 66,
+          49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65, 69, 70,
+          50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74,
+          52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78,
+          54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83,
+          57, 52, 53, 61, 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87,
+          61, 55, 56, 63, 72, 80, 86, 88, 63, 57, 57, 64, 73, 82, 89, 92,
+          64, 58, 58, 65, 73, 82, 89, 92, 64, 58, 58, 65, 74, 83, 90, 94,
+          66, 59, 59, 66, 75, 84, 91, 94, 67, 60, 59, 66, 74, 82, 91, 96,
+          68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, 71, 79, 87, 95}},
+        {{32, 31, 32, 36, 44, 53,  65,  79,  31, 32, 32, 35, 42, 51,  62,  75,
+          31, 32, 32, 35, 42, 51,  62,  75,  31, 32, 33, 34, 41, 49,  59,  72,
+          31, 32, 33, 34, 41, 49,  59,  72,  32, 32, 34, 36, 42, 50,  59,  71,
+          32, 32, 34, 36, 42, 50,  59,  71,  32, 33, 35, 38, 42, 49,  58,  69,
+          32, 33, 35, 38, 42, 49,  58,  69,  34, 34, 37, 42, 48, 54,  63,  73,
+          34, 34, 37, 42, 48, 54,  63,  73,  36, 34, 38, 48, 54, 60,  68,  78,
+          36, 34, 38, 48, 54, 60,  68,  78,  39, 37, 40, 50, 58, 65,  73,  84,
+          39, 37, 40, 50, 58, 65,  73,  84,  44, 41, 43, 53, 63, 71,  79,  90,
+          44, 41, 43, 53, 63, 71,  79,  90,  48, 45, 46, 56, 67, 76,  85,  96,
+          48, 45, 46, 56, 67, 76,  85,  96,  53, 49, 50, 60, 71, 82,  92,  103,
+          53, 49, 50, 60, 71, 82,  92,  103, 58, 54, 54, 63, 75, 87,  98,  110,
+          58, 54, 54, 63, 75, 87,  98,  110, 65, 60, 58, 68, 79, 92,  105, 118,
+          65, 60, 58, 68, 79, 92,  105, 118, 71, 65, 63, 73, 84, 97,  111, 125,
+          71, 65, 63, 73, 84, 97,  111, 125, 79, 72, 70, 79, 90, 104, 118, 133,
+          79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136,
+          82, 75, 72, 81, 92, 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141},
+         {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60,
+          31, 31, 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57,
+          30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56,
+          33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47, 45, 47, 50, 54,
+          37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57,
+          42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54, 57, 60,
+          49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64,
+          48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67,
+          49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+          50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75,
+          52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78,
+          54, 50, 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82,
+          57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85,
+          60, 54, 52, 58, 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89,
+          63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90,
+          64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, 69, 77, 84, 92}},
+        {{32, 31, 32, 36, 44, 53,  62,  73,  31, 32, 32, 35, 42, 51,  60,  70,
+          31, 32, 32, 35, 42, 51,  59,  69,  31, 32, 32, 35, 41, 50,  58,  67,
+          31, 32, 33, 34, 41, 49,  57,  66,  31, 32, 33, 35, 41, 49,  57,  66,
+          32, 32, 34, 36, 42, 50,  57,  65,  32, 32, 34, 37, 42, 49,  56,  65,
+          32, 33, 35, 38, 42, 49,  56,  64,  32, 33, 35, 39, 43, 50,  56,  64,
+          34, 34, 37, 42, 48, 54,  61,  69,  34, 34, 37, 42, 48, 54,  61,  69,
+          35, 34, 38, 47, 52, 59,  65,  73,  36, 34, 38, 48, 54, 60,  66,  74,
+          38, 36, 40, 49, 56, 63,  69,  77,  39, 37, 40, 50, 58, 65,  71,  79,
+          41, 39, 41, 51, 60, 67,  74,  81,  44, 41, 43, 53, 63, 71,  78,  85,
+          44, 42, 43, 54, 64, 72,  79,  86,  48, 45, 46, 56, 67, 76,  83,  91,
+          48, 45, 46, 56, 67, 76,  83,  91,  53, 49, 49, 59, 71, 81,  89,  98,
+          53, 49, 50, 60, 71, 82,  90,  99,  57, 52, 52, 62, 74, 85,  94,  103,
+          58, 54, 54, 63, 75, 87,  95,  105, 61, 57, 56, 66, 77, 89,  98,  108,
+          65, 60, 58, 68, 79, 92,  102, 112, 67, 61, 60, 69, 81, 94,  103, 114,
+          71, 65, 63, 73, 84, 97,  108, 119, 72, 66, 64, 73, 85, 98,  108, 119,
+          79, 72, 70, 79, 90, 104, 115, 127, 79, 72, 70, 79, 90, 104, 115, 127},
+         {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58,
+          31, 31, 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56,
+          30, 32, 40, 46, 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54,
+          33, 36, 43, 47, 46, 47, 50, 54, 34, 37, 44, 47, 45, 47, 50, 53,
+          37, 40, 47, 47, 45, 47, 49, 52, 37, 40, 47, 48, 46, 47, 49, 53,
+          42, 43, 47, 50, 49, 50, 53, 56, 42, 43, 47, 50, 49, 50, 53, 56,
+          47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53, 53, 54, 56, 59,
+          48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57, 59, 62,
+          48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66,
+          49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69,
+          50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72,
+          52, 48, 47, 54, 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75,
+          54, 50, 49, 55, 62, 68, 72, 76, 55, 51, 49, 56, 63, 69, 74, 78,
+          57, 52, 50, 56, 64, 70, 75, 79, 58, 53, 51, 57, 64, 71, 76, 80,
+          60, 54, 52, 58, 65, 72, 77, 82, 60, 55, 53, 59, 65, 73, 78, 83,
+          63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, 67, 75, 80, 86}},
+        {{32, 31, 32, 35, 39, 44, 53, 65,  31, 32, 32, 35, 38, 42, 52, 63,
+          31, 32, 32, 35, 38, 42, 51, 62,  31, 32, 32, 34, 37, 41, 50, 61,
+          31, 32, 33, 34, 37, 41, 49, 59,  31, 32, 33, 34, 37, 41, 49, 59,
+          31, 32, 34, 35, 38, 42, 49, 59,  32, 32, 34, 36, 38, 42, 50, 59,
+          32, 32, 34, 36, 39, 42, 49, 58,  32, 33, 35, 37, 40, 42, 49, 58,
+          32, 33, 35, 37, 40, 42, 49, 58,  33, 33, 36, 40, 43, 46, 53, 62,
+          34, 34, 37, 41, 44, 48, 54, 63,  34, 34, 37, 43, 46, 50, 56, 65,
+          36, 34, 38, 46, 50, 54, 60, 68,  36, 34, 38, 46, 50, 54, 60, 68,
+          38, 37, 40, 47, 52, 57, 64, 72,  39, 37, 40, 48, 53, 58, 65, 73,
+          41, 39, 41, 49, 54, 60, 67, 76,  44, 41, 43, 51, 57, 63, 71, 79,
+          44, 41, 43, 51, 57, 63, 71, 79,  47, 44, 45, 53, 59, 66, 75, 84,
+          48, 45, 46, 54, 60, 67, 76, 85,  50, 46, 47, 55, 61, 68, 78, 88,
+          53, 49, 50, 57, 64, 71, 82, 92,  53, 49, 50, 57, 64, 71, 82, 92,
+          57, 53, 53, 60, 67, 74, 86, 97,  58, 54, 54, 61, 68, 75, 87, 98,
+          61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105,
+          65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, 70, 76, 83, 96, 109},
+         {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55,
+          31, 31, 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53,
+          30, 32, 40, 44, 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52,
+          33, 35, 42, 46, 46, 45, 47, 51, 33, 36, 43, 46, 46, 46, 47, 51,
+          35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50,
+          37, 40, 47, 47, 47, 45, 47, 50, 41, 42, 47, 49, 49, 48, 50, 52,
+          42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50, 51, 51, 52, 54,
+          49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53, 54, 57,
+          48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60,
+          48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64,
+          49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66,
+          50, 46, 46, 52, 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68,
+          52, 48, 47, 53, 57, 61, 66, 71, 52, 48, 47, 53, 57, 61, 66, 71,
+          54, 49, 48, 54, 58, 62, 68, 73, 54, 50, 49, 54, 58, 62, 68, 73,
+          55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76,
+          57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57, 61, 65, 72, 78}},
+        {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52,
+          31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51,
+          31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49,
+          31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 33, 34, 36, 42, 45, 49,
+          32, 32, 34, 34, 36, 42, 45, 50, 32, 32, 34, 35, 37, 42, 45, 49,
+          32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49,
+          32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54,
+          34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50, 53, 57,
+          36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60,
+          38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65,
+          39, 37, 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69,
+          44, 41, 42, 45, 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72,
+          47, 44, 45, 47, 56, 66, 69, 75, 48, 45, 46, 48, 56, 67, 70, 76,
+          49, 46, 47, 48, 57, 67, 71, 77, 53, 49, 49, 51, 59, 71, 74, 81,
+          53, 49, 50, 51, 60, 71, 75, 82, 55, 51, 51, 53, 61, 72, 76, 83,
+          58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, 63, 75, 79, 87},
+         {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50,
+          31, 31, 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50,
+          30, 32, 38, 40, 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48,
+          31, 33, 38, 41, 46, 45, 46, 48, 33, 35, 41, 43, 47, 45, 46, 47,
+          33, 36, 41, 44, 47, 46, 46, 47, 34, 37, 42, 45, 47, 45, 46, 47,
+          37, 40, 45, 47, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47,
+          39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50,
+          42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51, 52, 52,
+          49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54,
+          48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57,
+          48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60,
+          49, 45, 45, 47, 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61,
+          50, 46, 46, 48, 54, 59, 61, 63, 50, 46, 46, 48, 54, 59, 61, 64,
+          51, 47, 47, 48, 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66,
+          52, 48, 47, 48, 54, 61, 63, 66, 53, 48, 48, 49, 54, 61, 63, 67,
+          54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, 55, 62, 65, 68}},
+        {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46,
+          31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45,
+          31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44,
+          31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 35, 41, 44,
+          31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45,
+          32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 35, 37, 37, 42, 45,
+          32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45,
+          32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50,
+          34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51,
+          35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56,
+          36, 34, 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58,
+          39, 37, 39, 40, 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60,
+          41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66,
+          44, 41, 42, 43, 51, 53, 63, 66, 44, 42, 42, 43, 51, 54, 64, 67,
+          47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70,
+          48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, 56, 58, 69, 73},
+         {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49,
+          31, 31, 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48,
+          31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46,
+          30, 32, 35, 40, 44, 46, 45, 46, 31, 33, 35, 40, 45, 46, 45, 46,
+          33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46,
+          33, 36, 38, 43, 46, 47, 46, 46, 35, 38, 41, 45, 47, 47, 45, 46,
+          37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46,
+          39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50,
+          42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50,
+          47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54,
+          49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55,
+          48, 46, 46, 47, 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56,
+          48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59,
+          49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 52, 53, 58, 60,
+          50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61,
+          50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, 52, 54, 60, 62}},
+        {{32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43,
+          31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42,
+          31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41,
+          31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41,
+          31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 35, 35, 41,
+          32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42,
+          32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 37, 37, 42,
+          32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+          32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45,
+          34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48,
+          34, 34, 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50,
+          36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54,
+          36, 34, 34, 38, 38, 48, 48, 54, 37, 36, 36, 39, 39, 49, 49, 56,
+          39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58,
+          39, 37, 37, 40, 40, 50, 50, 58, 41, 39, 39, 42, 42, 52, 52, 60,
+          44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, 43, 53, 53, 63},
+         {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48,
+          31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47,
+          31, 31, 31, 38, 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46,
+          30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45,
+          30, 32, 32, 40, 40, 46, 46, 45, 32, 34, 34, 41, 41, 46, 46, 45,
+          33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46,
+          33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45, 45, 47, 47, 45,
+          37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45,
+          37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47,
+          42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49,
+          42, 43, 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51,
+          49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53,
+          49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 54,
+          48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56,
+          48, 46, 46, 47, 47, 53, 53, 56, 48, 45, 45, 46, 46, 53, 53, 57,
+          49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, 46, 53, 53, 58}},
+        {{32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35,
+          31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35,
+          31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35,
+          31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 32, 34, 35,
+          31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34,
+          31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 33, 33, 35, 35,
+          31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+          32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+          32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38,
+          32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38,
+          32, 32, 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40,
+          33, 33, 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42,
+          34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 41, 42,
+          34, 34, 34, 35, 37, 37, 43, 44, 35, 34, 34, 36, 38, 38, 45, 47,
+          36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48,
+          36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, 39, 39, 46, 49},
+         {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48,
+          31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47,
+          31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47,
+          31, 31, 32, 34, 39, 39, 45, 46, 30, 31, 32, 34, 39, 39, 44, 46,
+          30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+          30, 32, 32, 35, 40, 40, 44, 46, 31, 33, 33, 36, 41, 41, 45, 46,
+          33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+          33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+          35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47,
+          37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47,
+          37, 39, 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48,
+          41, 42, 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50,
+          42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, 49, 50,
+          44, 44, 44, 45, 47, 47, 50, 51, 47, 46, 46, 46, 48, 48, 51, 52,
+          49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53,
+          49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 47, 47, 52, 53}},
+        {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33,
+          31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+          31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+          31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+          31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+          31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+          31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+          31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33, 33, 34,
+          31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+          32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+          32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35,
+          32, 32, 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36,
+          32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+          32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+          32, 33, 33, 33, 34, 36, 36, 36, 33, 33, 33, 33, 34, 36, 36, 37,
+          34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, 35, 37, 37, 38},
+         {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39,
+          31, 31, 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40,
+          31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+          31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+          31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 34, 39, 39, 40,
+          30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41,
+          30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41,
+          31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41, 41, 42,
+          33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+          33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44,
+          33, 35, 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45,
+          35, 37, 38, 38, 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47,
+          37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+          37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+          39, 40, 41, 41, 43, 47, 47, 47, 40, 41, 42, 42, 44, 47, 47, 47,
+          42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48}},
+        {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+          31, 31, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+          31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33,
+          31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34,
+          32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+          32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+          32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34},
+         {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35,
+          31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36,
+          31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37,
+          31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+          31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+          31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+          31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37,
+          30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+          30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+          30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+          30, 31, 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38,
+          31, 32, 33, 33, 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40,
+          33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41,
+          33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+          33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+          33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, 36, 36, 39, 42}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+          31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+          31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+          31, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32,
+          30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32,
+          30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}};
+constexpr uint8_t kQuantizerMatrix16x32
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][512] = {
+        {{32,  31,  32,  34,  36,  44,  53,  59,  65,  79,  87,  90,  93,  96,
+          99,  102, 31,  32,  32,  34,  35,  42,  51,  56,  62,  75,  82,  85,
+          88,  91,  94,  97,  31,  32,  33,  33,  34,  41,  49,  54,  59,  72,
+          78,  82,  86,  90,  93,  97,  31,  32,  33,  34,  35,  41,  49,  54,
+          59,  71,  78,  81,  84,  87,  90,  93,  32,  32,  34,  35,  36,  42,
+          50,  54,  59,  71,  77,  80,  82,  86,  89,  93,  32,  33,  35,  37,
+          38,  42,  49,  53,  58,  69,  75,  78,  82,  86,  89,  92,  34,  34,
+          37,  39,  42,  48,  54,  58,  63,  73,  79,  78,  80,  83,  88,  92,
+          35,  34,  37,  41,  45,  50,  57,  61,  65,  76,  82,  83,  84,  84,
+          87,  90,  36,  34,  38,  43,  48,  54,  60,  64,  68,  78,  84,  87,
+          86,  89,  90,  90,  39,  37,  40,  45,  50,  58,  65,  69,  73,  84,
+          89,  89,  91,  91,  93,  96,  44,  41,  43,  48,  53,  63,  71,  75,
+          79,  90,  95,  93,  94,  95,  97,  97,  46,  43,  44,  49,  55,  65,
+          73,  78,  82,  93,  98,  100, 98,  100, 99,  103, 48,  45,  46,  51,
+          56,  67,  76,  80,  85,  96,  102, 102, 105, 102, 105, 104, 53,  49,
+          50,  54,  60,  71,  82,  87,  92,  103, 109, 107, 107, 110, 107, 111,
+          58,  54,  54,  58,  63,  75,  87,  92,  98,  110, 116, 115, 112, 111,
+          115, 112, 61,  57,  56,  60,  66,  77,  89,  95,  101, 114, 120, 118,
+          119, 118, 116, 120, 65,  60,  58,  63,  68,  79,  92,  98,  105, 118,
+          124, 123, 122, 123, 124, 121, 71,  65,  63,  68,  73,  84,  97,  103,
+          111, 125, 132, 132, 130, 128, 127, 130, 79,  72,  70,  74,  79,  90,
+          104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81,  74,  71,  75,
+          80,  91,  105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82,  75,
+          72,  76,  81,  92,  106, 113, 121, 136, 144, 151, 149, 149, 146, 143,
+          88,  80,  77,  80,  85,  97,  108, 115, 126, 142, 149, 153, 153, 152,
+          152, 154, 91,  83,  80,  81,  88,  100, 106, 114, 130, 142, 148, 155,
+          162, 160, 159, 155, 94,  85,  83,  82,  91,  100, 105, 118, 131, 137,
+          153, 160, 165, 167, 166, 168, 97,  88,  86,  85,  94,  100, 107, 123,
+          128, 140, 157, 161, 167, 173, 171, 169, 100, 91,  89,  87,  97,  100,
+          111, 121, 127, 145, 152, 164, 173, 178, 182, 181, 103, 94,  93,  90,
+          98,  101, 114, 120, 131, 144, 150, 170, 174, 180, 186, 183, 107, 97,
+          96,  93,  100, 104, 117, 119, 136, 142, 155, 168, 177, 187, 191, 198,
+          110, 101, 100, 97,  101, 108, 117, 123, 138, 141, 161, 165, 183, 188,
+          193, 200, 114, 104, 104, 100, 103, 112, 117, 127, 137, 146, 159, 167,
+          185, 190, 201, 206, 118, 108, 107, 103, 105, 115, 118, 131, 136, 151,
+          157, 172, 182, 197, 203, 208, 122, 111, 111, 107, 107, 119, 119, 136,
+          136, 156, 156, 178, 179, 203, 204, 217},
+         {32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67,  68,  69,  71,  72,
+          31, 31, 38, 42, 47, 47, 50, 52, 54, 60, 63, 64,  65,  66,  67,  68,
+          30, 32, 40, 42, 46, 45, 48, 50, 52, 57, 60, 62,  63,  65,  66,  68,
+          32, 34, 41, 44, 46, 45, 48, 49, 51, 57, 59, 61,  62,  63,  64,  65,
+          33, 36, 43, 45, 47, 46, 47, 49, 51, 56, 59, 60,  60,  62,  63,  65,
+          37, 40, 47, 47, 47, 45, 47, 48, 50, 54, 57, 58,  60,  61,  62,  63,
+          42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58,  59,  60,  62,  63,
+          45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61,  61,  60,  61,  61,
+          49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63,  62,  63,  62,  62,
+          48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65,  65,  64,  64,  65,
+          49, 45, 46, 49, 53, 58, 61, 62, 64, 67, 69, 67,  66,  66,  66,  65,
+          49, 46, 46, 49, 53, 59, 62, 64, 65, 69, 71, 70,  68,  68,  67,  68,
+          50, 46, 46, 50, 54, 59, 64, 65, 67, 71, 73, 72,  72,  70,  70,  69,
+          52, 48, 47, 50, 54, 61, 66, 68, 71, 75, 77, 74,  73,  73,  71,  72,
+          54, 50, 49, 52, 55, 62, 68, 71, 73, 78, 80, 78,  76,  74,  75,  73,
+          55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80,  79,  78,  76,  77,
+          57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82,  80,  80,  79,  77,
+          60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86,  84,  82,  81,  81,
+          63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88,  87,  85,  84,  81,
+          64, 58, 55, 58, 61, 68, 75, 78, 82, 89, 92, 90,  89,  87,  86,  86,
+          64, 59, 56, 58, 61, 68, 75, 79, 83, 90, 93, 95,  93,  91,  89,  87,
+          67, 61, 58, 60, 63, 69, 76, 79, 85, 92, 95, 96,  94,  92,  91,  91,
+          68, 62, 59, 60, 64, 71, 74, 78, 86, 91, 94, 96,  98,  96,  94,  91,
+          69, 62, 60, 60, 65, 70, 72, 79, 85, 88, 95, 98,  99,  98,  97,  96,
+          70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97,  99,  101, 98,  97,
+          71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98,  102, 102, 102, 101,
+          72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101, 103, 104, 102,
+          73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98,  102, 105, 106, 107,
+          74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96,  103, 105, 106, 107,
+          75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96,  103, 105, 109, 109,
+          76, 69, 69, 66, 67, 72, 73, 80, 82, 88, 91, 97,  101, 107, 109, 110,
+          77, 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99,  99,  108, 108, 113}},
+        {{32,  31,  32,  32,  36,  44,  47,  53,  65,  73,  79,  87,  90,  93,
+          96,  99,  31,  32,  32,  33,  35,  42,  45,  51,  62,  69,  75,  83,
+          86,  88,  91,  94,  31,  32,  32,  33,  35,  41,  44,  49,  60,  67,
+          72,  80,  84,  87,  90,  94,  31,  32,  33,  33,  35,  41,  44,  49,
+          59,  66,  71,  79,  82,  84,  87,  90,  32,  32,  34,  34,  36,  42,
+          45,  50,  59,  65,  71,  78,  80,  83,  87,  90,  32,  33,  35,  36,
+          38,  42,  45,  49,  58,  64,  69,  76,  80,  83,  86,  88,  32,  33,
+          35,  36,  40,  44,  47,  51,  60,  66,  71,  76,  78,  81,  85,  89,
+          34,  34,  36,  38,  42,  48,  50,  54,  63,  69,  73,  80,  82,  81,
+          84,  86,  36,  34,  37,  40,  48,  54,  56,  60,  68,  74,  78,  84,
+          83,  86,  87,  87,  38,  36,  39,  41,  49,  56,  58,  63,  71,  77,
+          81,  86,  88,  88,  90,  93,  39,  37,  40,  42,  50,  58,  60,  65,
+          73,  79,  84,  90,  91,  92,  94,  93,  44,  41,  42,  45,  53,  63,
+          66,  71,  79,  85,  90,  96,  94,  96,  96,  99,  47,  44,  45,  47,
+          56,  66,  69,  75,  84,  90,  95,  99,  101, 98,  101, 99,  49,  46,
+          47,  48,  57,  67,  71,  77,  86,  93,  97,  103, 103, 105, 102, 106,
+          53,  49,  50,  51,  60,  71,  75,  82,  92,  99,  103, 111, 108, 107,
+          110, 107, 58,  54,  54,  55,  63,  75,  79,  87,  98,  105, 110, 114,
+          114, 113, 111, 115, 61,  56,  56,  57,  65,  77,  81,  89,  100, 107,
+          113, 118, 116, 117, 118, 116, 65,  60,  59,  60,  68,  79,  84,  92,
+          105, 112, 118, 126, 124, 122, 121, 124, 71,  65,  64,  65,  73,  84,
+          89,  97,  111, 119, 125, 130, 129, 129, 129, 125, 76,  69,  68,  69,
+          76,  88,  92,  101, 115, 123, 130, 134, 134, 131, 132, 135, 79,  72,
+          70,  71,  79,  90,  95,  104, 118, 127, 133, 143, 142, 141, 138, 136,
+          82,  75,  73,  74,  81,  92,  97,  106, 121, 130, 136, 146, 145, 144,
+          144, 145, 86,  78,  76,  77,  84,  95,  100, 109, 124, 133, 140, 147,
+          153, 151, 150, 146, 89,  81,  79,  78,  87,  95,  99,  112, 124, 130,
+          145, 152, 156, 157, 156, 158, 92,  84,  82,  80,  89,  95,  101, 116,
+          121, 132, 148, 151, 157, 163, 161, 159, 95,  86,  85,  83,  92,  95,
+          105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98,  89,  88,  85,
+          93,  95,  108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92,
+          91,  88,  94,  98,  110, 112, 128, 133, 146, 158, 166, 175, 179, 185,
+          104, 95,  94,  91,  95,  101, 110, 115, 129, 132, 151, 154, 171, 175,
+          181, 186, 107, 98,  97,  94,  96,  105, 110, 119, 128, 136, 149, 156,
+          173, 177, 188, 192, 110, 101, 100, 97,  98,  108, 111, 123, 127, 141,
+          147, 161, 169, 183, 188, 193, 114, 104, 104, 100, 100, 111, 111, 126,
+          127, 145, 145, 166, 166, 189, 190, 201},
+         {32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68,  69,  70,  71,
+          31, 31, 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64,  65,  66,  67,
+          30, 32, 38, 40, 46, 45, 46, 48, 52, 55, 58, 61, 63,  64,  65,  67,
+          31, 33, 38, 41, 46, 45, 46, 48, 52, 55, 57, 60, 61,  62,  63,  64,
+          33, 36, 41, 44, 47, 46, 46, 47, 51, 54, 56, 59, 60,  61,  63,  64,
+          37, 40, 45, 47, 47, 45, 46, 47, 50, 52, 54, 57, 59,  61,  62,  62,
+          39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57, 58,  59,  61,  62,
+          42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60,  59,  60,  60,
+          49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61,  62,  61,  61,
+          48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64,  63,  63,  64,
+          48, 46, 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65,  65,  65,  64,
+          49, 45, 45, 47, 53, 58, 59, 61, 64, 66, 67, 69, 67,  67,  66,  67,
+          50, 46, 46, 48, 54, 59, 61, 63, 66, 68, 70, 71, 71,  68,  69,  67,
+          51, 47, 47, 48, 54, 60, 61, 64, 68, 70, 71, 73, 72,  72,  70,  71,
+          52, 48, 47, 48, 54, 61, 63, 66, 71, 73, 75, 77, 75,  73,  74,  71,
+          54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, 78,  76,  74,  75,
+          55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79,  78,  78,  75,
+          57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83,  81,  79,  79,
+          60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85,  84,  82,  80,
+          62, 56, 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87,  85,  84,  84,
+          63, 57, 55, 56, 60, 67, 70, 75, 82, 86, 89, 92, 91,  89,  87,  84,
+          64, 59, 56, 57, 61, 68, 71, 75, 83, 87, 90, 93, 92,  90,  89,  89,
+          66, 60, 58, 58, 62, 69, 72, 76, 84, 88, 91, 94, 95,  93,  91,  89,
+          67, 61, 59, 58, 63, 68, 71, 78, 83, 86, 93, 96, 96,  96,  94,  94,
+          68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95, 97,  98,  96,  94,
+          69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99,  99,  100, 98,
+          70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98,  100, 101, 99,
+          71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99,  102, 103, 104,
+          72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104,
+          73, 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106,
+          74, 67, 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98,  103, 105, 106,
+          75, 68, 68, 65, 65, 71, 71, 78, 78, 87, 87, 96, 96,  105, 105, 109}},
+        {{32,  31,  32,  32,  36,  39,  44,  53,  58,  65,  79,  81,  88,  90,
+          93,  96,  31,  32,  32,  32,  35,  38,  42,  51,  55,  62,  75,  77,
+          83,  86,  88,  91,  31,  32,  32,  32,  35,  38,  41,  50,  54,  60,
+          73,  75,  81,  84,  88,  91,  31,  32,  32,  33,  34,  37,  41,  49,
+          53,  59,  72,  74,  79,  82,  84,  87,  32,  32,  33,  34,  36,  39,
+          42,  50,  53,  59,  71,  72,  78,  81,  84,  87,  32,  32,  34,  34,
+          37,  40,  42,  49,  53,  58,  70,  71,  77,  80,  83,  85,  32,  33,
+          34,  35,  38,  40,  42,  49,  52,  58,  69,  70,  76,  78,  82,  86,
+          34,  34,  35,  37,  42,  45,  48,  54,  57,  63,  73,  75,  79,  79,
+          81,  83,  34,  34,  36,  37,  44,  47,  50,  56,  59,  65,  75,  77,
+          81,  83,  84,  84,  36,  34,  37,  38,  48,  51,  54,  60,  63,  68,
+          78,  80,  85,  85,  86,  89,  39,  37,  39,  40,  50,  54,  58,  65,
+          68,  73,  84,  85,  88,  89,  90,  89,  40,  38,  40,  41,  51,  55,
+          59,  67,  70,  75,  85,  87,  91,  92,  92,  95,  44,  41,  42,  43,
+          53,  58,  63,  71,  74,  79,  90,  91,  97,  94,  97,  95,  47,  44,
+          45,  46,  56,  61,  66,  75,  79,  85,  95,  97,  99,  101, 98,  102,
+          49,  46,  46,  47,  57,  62,  67,  77,  81,  86,  97,  99,  104, 102,
+          105, 102, 53,  49,  50,  50,  60,  65,  71,  82,  86,  92,  103, 105,
+          109, 108, 106, 110, 57,  53,  53,  53,  63,  68,  74,  86,  90,  97,
+          108, 110, 111, 112, 113, 110, 59,  54,  54,  54,  64,  69,  75,  87,
+          91,  98,  111, 112, 119, 117, 115, 118, 65,  60,  59,  58,  68,  73,
+          79,  92,  97,  105, 118, 119, 123, 123, 122, 119, 69,  63,  62,  62,
+          71,  76,  83,  96,  100, 109, 122, 124, 127, 125, 125, 128, 71,  65,
+          64,  63,  73,  78,  84,  97,  102, 111, 125, 127, 135, 134, 131, 129,
+          79,  72,  71,  70,  79,  84,  90,  104, 109, 118, 133, 135, 137, 136,
+          136, 137, 81,  74,  72,  71,  80,  85,  91,  105, 110, 120, 135, 137,
+          145, 143, 141, 138, 82,  75,  73,  72,  81,  86,  92,  106, 111, 121,
+          136, 139, 147, 148, 147, 149, 87,  79,  77,  76,  85,  90,  96,  110,
+          114, 125, 140, 143, 148, 154, 151, 149, 90,  82,  80,  78,  87,  89,
+          99,  108, 113, 129, 135, 146, 153, 157, 160, 159, 92,  84,  83,  81,
+          88,  90,  102, 106, 117, 128, 133, 150, 153, 158, 163, 160, 95,  87,
+          85,  83,  88,  92,  103, 105, 120, 125, 137, 148, 155, 164, 168, 173,
+          98,  89,  88,  85,  89,  95,  103, 108, 121, 124, 141, 144, 160, 164,
+          169, 174, 100, 92,  91,  88,  90,  98,  103, 111, 120, 127, 139, 146,
+          161, 165, 175, 179, 103, 94,  94,  90,  92,  101, 103, 114, 119, 131,
+          137, 150, 158, 170, 175, 180, 106, 97,  97,  93,  93,  104, 104, 118,
+          118, 135, 135, 154, 155, 175, 176, 187},
+         {32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68,  69,  69,
+          31, 31, 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64,  65,  66,
+          31, 32, 36, 39, 46, 46, 46, 48, 50, 53, 58, 59, 62, 63,  65,  66,
+          30, 32, 36, 40, 46, 45, 45, 48, 49, 52, 57, 58, 60, 61,  62,  63,
+          33, 36, 40, 43, 47, 46, 46, 47, 49, 51, 56, 57, 59, 60,  62,  63,
+          35, 38, 42, 45, 47, 46, 45, 47, 48, 50, 55, 56, 58, 60,  61,  61,
+          37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55, 57, 58,  60,  61,
+          42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58,  59,  59,
+          44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61,  61,  60,
+          49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62,  62,  63,
+          48, 46, 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64,  64,  63,
+          48, 45, 46, 46, 53, 55, 56, 58, 59, 61, 65, 65, 66, 66,  65,  66,
+          49, 45, 45, 46, 53, 56, 58, 61, 62, 64, 67, 68, 70, 67,  68,  66,
+          50, 46, 46, 46, 54, 56, 59, 63, 65, 66, 70, 71, 70, 71,  68,  70,
+          51, 47, 47, 47, 54, 57, 60, 64, 65, 68, 71, 72, 73, 71,  72,  70,
+          52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, 76, 75,  73,  73,
+          54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77,  76,  74,
+          54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79,  77,  78,
+          57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82,  81,  78,
+          59, 54, 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83,  82,  82,
+          60, 54, 53, 52, 58, 62, 65, 72, 75, 79, 85, 86, 89, 87,  85,  82,
+          63, 57, 56, 55, 60, 64, 67, 75, 77, 82, 89, 90, 90, 88,  87,  86,
+          64, 58, 57, 55, 61, 64, 68, 75, 78, 82, 89, 90, 93, 91,  89,  87,
+          64, 59, 57, 56, 61, 65, 68, 75, 78, 83, 90, 91, 94, 93,  92,  91,
+          66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93, 94, 95,  93,  91,
+          67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97,  97,  95,
+          68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97,  98,  96,
+          69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99,  100, 101,
+          70, 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99,  100, 101,
+          71, 65, 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99,  102, 103,
+          72, 65, 65, 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103,
+          73, 66, 66, 63, 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105}},
+        {{32,  31,  31,  32,  35,  36,  44,  47,  53,  62,  65,  79,  82,  88,
+          90,  93,  31,  32,  32,  32,  35,  35,  42,  45,  51,  59,  62,  75,
+          78,  83,  86,  88,  31,  32,  32,  32,  34,  35,  41,  45,  50,  58,
+          61,  74,  76,  82,  85,  88,  31,  32,  32,  33,  34,  34,  41,  44,
+          49,  57,  59,  72,  74,  79,  82,  84,  31,  32,  33,  34,  35,  36,
+          42,  44,  49,  57,  59,  71,  73,  79,  81,  84,  32,  32,  33,  34,
+          36,  36,  42,  45,  50,  57,  59,  71,  73,  78,  80,  82,  32,  33,
+          34,  35,  37,  38,  42,  45,  49,  56,  58,  69,  71,  76,  79,  83,
+          32,  33,  34,  36,  39,  40,  44,  47,  51,  58,  60,  71,  73,  76,
+          78,  80,  34,  34,  35,  37,  41,  42,  48,  50,  54,  61,  63,  73,
+          76,  81,  81,  80,  35,  34,  36,  38,  45,  47,  52,  55,  59,  65,
+          67,  77,  79,  82,  83,  86,  36,  34,  36,  38,  46,  48,  54,  56,
+          60,  66,  68,  78,  80,  85,  87,  86,  39,  37,  39,  40,  48,  50,
+          58,  60,  65,  71,  73,  84,  86,  89,  88,  91,  41,  39,  40,  41,
+          49,  51,  60,  62,  67,  74,  76,  86,  88,  91,  93,  91,  44,  41,
+          42,  43,  51,  53,  63,  66,  71,  78,  79,  90,  92,  97,  94,  97,
+          47,  44,  44,  45,  53,  56,  66,  69,  75,  82,  84,  95,  97,  98,
+          101, 98,  48,  45,  45,  46,  54,  56,  67,  70,  76,  83,  85,  96,
+          98,  104, 101, 105, 53,  49,  50,  50,  57,  60,  71,  75,  82,  90,
+          92,  103, 106, 107, 108, 105, 55,  51,  51,  51,  59,  61,  72,  77,
+          84,  92,  94,  106, 108, 111, 110, 112, 58,  54,  54,  54,  61,  63,
+          75,  79,  87,  95,  98,  110, 112, 117, 116, 113, 63,  58,  58,  57,
+          65,  67,  78,  83,  91,  100, 103, 116, 118, 119, 119, 121, 65,  60,
+          59,  58,  66,  68,  79,  84,  92,  102, 105, 118, 120, 127, 124, 122,
+          71,  65,  64,  63,  71,  73,  84,  89,  97,  108, 111, 125, 127, 129,
+          129, 130, 74,  68,  67,  66,  73,  75,  86,  91,  100, 110, 113, 128,
+          131, 135, 134, 130, 79,  72,  71,  70,  77,  79,  90,  95,  104, 115,
+          118, 133, 136, 140, 139, 140, 82,  75,  73,  72,  79,  81,  92,  97,
+          105, 117, 120, 136, 139, 145, 142, 140, 82,  75,  74,  72,  79,  81,
+          92,  97,  106, 117, 121, 136, 139, 148, 150, 149, 87,  79,  78,  76,
+          83,  85,  96,  100, 110, 120, 125, 141, 144, 148, 153, 150, 89,  82,
+          81,  78,  83,  87,  97,  99,  113, 118, 128, 139, 145, 153, 157, 161,
+          92,  84,  83,  80,  84,  89,  97,  101, 114, 116, 132, 135, 150, 153,
+          157, 162, 94,  86,  85,  82,  85,  92,  97,  104, 112, 119, 130, 136,
+          151, 154, 163, 166, 97,  88,  88,  85,  86,  94,  97,  107, 111, 123,
+          128, 140, 147, 159, 163, 167, 99,  91,  91,  87,  87,  97,  97,  110,
+          110, 126, 126, 144, 144, 163, 163, 173},
+         {32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31,
+          31, 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32,
+          34, 39, 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35,
+          40, 44, 46, 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42,
+          46, 47, 45, 46, 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46,
+          47, 46, 46, 47, 50, 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47,
+          45, 46, 47, 49, 50, 54, 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47,
+          47, 48, 50, 51, 55, 56, 57, 58, 59, 42, 43, 44, 47, 49, 50, 49, 50,
+          50, 53, 53, 57, 58, 60, 60, 59, 47, 46, 46, 48, 51, 52, 53, 53, 53,
+          55, 56, 60, 61, 61, 61, 62, 49, 46, 47, 48, 52, 53, 53, 54, 54, 56,
+          57, 60, 61, 63, 63, 62, 48, 46, 46, 47, 51, 53, 56, 56, 57, 59, 60,
+          64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, 57, 57, 59, 61, 61, 65,
+          66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, 61, 63, 64, 67, 68,
+          70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, 66, 70, 71, 70,
+          71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71, 71, 73, 71,
+          72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, 74, 72,
+          53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, 54,
+          50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51,
+          51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51,
+          50, 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52,
+          57, 58, 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58,
+          59, 66, 69, 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60,
+          67, 70, 75, 80, 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68,
+          71, 75, 81, 83, 90, 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71,
+          75, 81, 83, 90, 91, 94, 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77,
+          82, 84, 92, 93, 94, 95, 93, 67, 61, 60, 58, 61, 63, 69, 70, 78, 80,
+          85, 90, 93, 96, 97, 97, 68, 62, 61, 59, 61, 64, 68, 71, 77, 79, 86,
+          88, 94, 96, 97, 98, 69, 63, 62, 59, 61, 65, 68, 72, 76, 80, 85, 88,
+          94, 95, 99, 99, 70, 63, 63, 60, 61, 66, 67, 73, 75, 81, 83, 89, 92,
+          97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, 74, 82, 82, 90, 90, 98,
+          98, 102}},
+        {{32,  31,  31,  32,  33,  36,  40,  44,  51,  53,  65,  66,  79,  81,
+          87,  90,  31,  32,  32,  32,  33,  35,  39,  42,  49,  51,  62,  63,
+          75,  77,  83,  85,  31,  32,  32,  32,  33,  35,  39,  42,  49,  51,
+          61,  62,  74,  76,  82,  85,  31,  32,  32,  33,  33,  34,  38,  41,
+          47,  49,  59,  60,  72,  74,  79,  81,  31,  32,  32,  33,  34,  35,
+          38,  41,  47,  49,  59,  60,  71,  73,  79,  81,  32,  32,  33,  34,
+          35,  36,  39,  42,  48,  50,  59,  60,  71,  72,  78,  80,  32,  32,
+          33,  35,  36,  37,  40,  42,  48,  49,  58,  59,  69,  71,  77,  80,
+          32,  33,  33,  35,  36,  38,  41,  42,  48,  49,  58,  59,  69,  70,
+          75,  77,  33,  33,  34,  36,  38,  41,  44,  46,  52,  53,  62,  63,
+          72,  74,  78,  78,  34,  34,  34,  37,  39,  42,  45,  48,  53,  54,
+          63,  64,  73,  75,  80,  83,  36,  34,  35,  38,  42,  48,  51,  54,
+          58,  60,  68,  69,  78,  80,  84,  83,  36,  35,  35,  38,  42,  48,
+          51,  54,  59,  60,  68,  69,  79,  80,  85,  87,  39,  37,  38,  40,
+          44,  50,  54,  58,  63,  65,  73,  74,  84,  85,  89,  88,  40,  38,
+          39,  41,  45,  51,  56,  59,  65,  67,  75,  76,  85,  87,  90,  93,
+          44,  41,  41,  43,  46,  53,  59,  63,  69,  71,  79,  80,  90,  91,
+          96,  93,  46,  43,  43,  44,  48,  55,  60,  65,  72,  73,  82,  83,
+          93,  94,  97,  100, 48,  45,  45,  46,  50,  56,  62,  67,  74,  76,
+          85,  86,  96,  98,  103, 100, 52,  48,  48,  49,  52,  59,  65,  70,
+          78,  80,  90,  91,  101, 103, 105, 107, 53,  49,  49,  50,  53,  60,
+          66,  71,  79,  82,  92,  93,  103, 105, 111, 107, 58,  53,  53,  53,
+          57,  63,  69,  74,  83,  86,  97,  98,  109, 111, 113, 115, 58,  54,
+          54,  54,  57,  63,  70,  75,  84,  87,  98,  99,  110, 112, 118, 115,
+          65,  60,  59,  58,  62,  68,  74,  79,  89,  92,  105, 106, 118, 119,
+          122, 123, 66,  61,  60,  59,  63,  69,  75,  80,  90,  93,  106, 107,
+          119, 121, 126, 123, 71,  65,  65,  63,  67,  73,  79,  84,  94,  97,
+          111, 112, 125, 127, 131, 132, 74,  68,  67,  66,  69,  75,  81,  86,
+          97,  100, 113, 115, 128, 130, 134, 132, 79,  72,  72,  70,  73,  79,
+          85,  90,  101, 104, 118, 119, 133, 135, 141, 140, 81,  74,  73,  71,
+          75,  80,  86,  91,  102, 105, 120, 121, 135, 137, 143, 140, 82,  75,
+          74,  72,  75,  81,  87,  92,  103, 106, 121, 122, 136, 139, 147, 151,
+          86,  78,  78,  75,  78,  84,  90,  95,  106, 109, 124, 125, 140, 142,
+          147, 151, 88,  81,  80,  77,  80,  86,  90,  98,  105, 112, 122, 127,
+          140, 144, 152, 155, 91,  83,  82,  79,  80,  88,  90,  100, 103, 114,
+          119, 130, 137, 148, 151, 155, 93,  85,  85,  81,  81,  90,  90,  102,
+          103, 117, 117, 134, 134, 151, 152, 160},
+         {32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31,
+          31, 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31,
+          33, 38, 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33,
+          40, 42, 46, 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41,
+          43, 46, 46, 45, 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44,
+          47, 46, 46, 47, 47, 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47,
+          46, 45, 47, 47, 50, 51, 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46,
+          45, 46, 47, 50, 50, 54, 55, 57, 58, 41, 42, 43, 47, 48, 49, 49, 48,
+          49, 50, 52, 53, 57, 57, 59, 58, 42, 43, 43, 47, 48, 50, 49, 49, 50,
+          50, 53, 54, 57, 58, 60, 61, 49, 46, 47, 48, 50, 53, 53, 53, 54, 54,
+          57, 57, 60, 61, 62, 61, 49, 46, 47, 48, 50, 53, 53, 54, 54, 55, 57,
+          57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, 54, 56, 57, 57, 60, 60,
+          64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, 58, 58, 61, 61, 65,
+          65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, 64, 64, 67, 68,
+          69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, 69, 69, 70,
+          70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, 73, 71,
+          51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, 52,
+          48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50,
+          49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50,
+          49, 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50,
+          52, 56, 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53,
+          57, 61, 64, 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58,
+          62, 65, 71, 72, 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63,
+          66, 72, 73, 80, 81, 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67,
+          73, 75, 82, 82, 89, 90, 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73,
+          75, 82, 83, 89, 90, 92, 90, 64, 59, 58, 56, 58, 61, 65, 68, 74, 75,
+          83, 83, 90, 91, 94, 95, 66, 60, 59, 57, 59, 62, 66, 69, 75, 76, 84,
+          85, 91, 92, 94, 95, 67, 61, 60, 58, 59, 63, 66, 70, 74, 77, 82, 85,
+          91, 93, 96, 96, 68, 62, 61, 58, 59, 64, 65, 71, 72, 78, 81, 86, 89,
+          94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, 71, 79, 79, 87, 87, 95,
+          95, 98}},
+        {{32,  31,  31,  32,  32,  36,  36,  44,  44,  53,  53,  65,  65,  79,
+          79,  87,  31,  32,  32,  32,  32,  35,  35,  42,  42,  51,  51,  62,
+          62,  75,  75,  82,  31,  32,  32,  32,  32,  35,  35,  42,  42,  51,
+          51,  62,  62,  75,  75,  82,  31,  32,  32,  33,  33,  34,  34,  41,
+          41,  49,  49,  59,  59,  72,  72,  78,  31,  32,  32,  33,  33,  34,
+          34,  41,  41,  49,  49,  59,  59,  72,  72,  78,  32,  32,  32,  34,
+          34,  36,  36,  42,  42,  50,  50,  59,  59,  71,  71,  77,  32,  32,
+          32,  34,  34,  36,  36,  42,  42,  50,  50,  59,  59,  71,  71,  77,
+          32,  33,  33,  35,  35,  38,  38,  42,  42,  49,  49,  58,  58,  69,
+          69,  75,  32,  33,  33,  35,  35,  38,  38,  42,  42,  49,  49,  58,
+          58,  69,  69,  75,  34,  34,  34,  37,  37,  42,  42,  48,  48,  54,
+          54,  63,  63,  73,  73,  79,  34,  34,  34,  37,  37,  42,  42,  48,
+          48,  54,  54,  63,  63,  73,  73,  79,  36,  34,  34,  38,  38,  48,
+          48,  54,  54,  60,  60,  68,  68,  78,  78,  84,  36,  34,  34,  38,
+          38,  48,  48,  54,  54,  60,  60,  68,  68,  78,  78,  84,  39,  37,
+          37,  40,  40,  50,  50,  58,  58,  65,  65,  73,  73,  84,  84,  89,
+          39,  37,  37,  40,  40,  50,  50,  58,  58,  65,  65,  73,  73,  84,
+          84,  89,  44,  41,  41,  43,  43,  53,  53,  63,  63,  71,  71,  79,
+          79,  90,  90,  95,  44,  41,  41,  43,  43,  53,  53,  63,  63,  71,
+          71,  79,  79,  90,  90,  95,  48,  45,  45,  46,  46,  56,  56,  67,
+          67,  76,  76,  85,  85,  96,  96,  102, 48,  45,  45,  46,  46,  56,
+          56,  67,  67,  76,  76,  85,  85,  96,  96,  102, 53,  49,  49,  50,
+          50,  60,  60,  71,  71,  82,  82,  92,  92,  103, 103, 109, 53,  49,
+          49,  50,  50,  60,  60,  71,  71,  82,  82,  92,  92,  103, 103, 109,
+          58,  54,  54,  54,  54,  63,  63,  75,  75,  87,  87,  98,  98,  110,
+          110, 116, 58,  54,  54,  54,  54,  63,  63,  75,  75,  87,  87,  98,
+          98,  110, 110, 116, 65,  60,  60,  58,  58,  68,  68,  79,  79,  92,
+          92,  105, 105, 118, 118, 124, 65,  60,  60,  58,  58,  68,  68,  79,
+          79,  92,  92,  105, 105, 118, 118, 124, 71,  65,  65,  63,  63,  73,
+          73,  84,  84,  97,  97,  111, 111, 125, 125, 132, 71,  65,  65,  63,
+          63,  73,  73,  84,  84,  97,  97,  111, 111, 125, 125, 132, 79,  72,
+          72,  70,  70,  79,  79,  90,  90,  104, 104, 118, 118, 133, 133, 141,
+          79,  72,  72,  70,  70,  79,  79,  90,  90,  104, 104, 118, 118, 133,
+          133, 141, 82,  75,  75,  72,  72,  81,  81,  92,  92,  106, 106, 121,
+          121, 136, 136, 144, 82,  75,  75,  72,  72,  81,  81,  92,  92,  106,
+          106, 121, 121, 136, 136, 144, 87,  79,  79,  76,  76,  84,  84,  96,
+          96,  109, 109, 124, 124, 141, 141, 149},
+         {32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31,
+          31, 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31,
+          31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32,
+          40, 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40,
+          40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43,
+          47, 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47,
+          47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47,
+          45, 45, 47, 47, 50, 50, 54, 54, 57, 37, 40, 40, 47, 47, 47, 47, 45,
+          45, 47, 47, 50, 50, 54, 54, 57, 42, 43, 43, 47, 47, 50, 50, 49, 49,
+          50, 50, 53, 53, 57, 57, 60, 42, 43, 43, 47, 47, 50, 50, 49, 49, 50,
+          50, 53, 53, 57, 57, 60, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54,
+          57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54, 57,
+          57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60,
+          64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60, 64,
+          64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67,
+          69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67, 69,
+          50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 50,
+          46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48,
+          48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48,
+          47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49,
+          49, 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49,
+          55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56,
+          56, 64, 64, 70, 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56,
+          64, 64, 70, 70, 76, 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65,
+          65, 72, 72, 79, 79, 85, 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65,
+          72, 72, 79, 79, 85, 85, 88, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75,
+          75, 82, 82, 89, 89, 92, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75,
+          82, 82, 89, 89, 92, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83,
+          83, 90, 90, 93, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83, 83,
+          90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, 69, 77, 77, 84, 84, 92,
+          92, 95}},
+        {{32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53,  53,  62,  65,  73,  79,
+          31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51,  51,  60,  62,  70,  75,
+          31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51,  51,  59,  62,  69,  75,
+          31, 32, 32, 32, 32, 33, 35, 36, 41, 42, 50,  50,  58,  60,  67,  73,
+          31, 32, 32, 32, 33, 33, 34, 36, 41, 41, 49,  49,  57,  59,  66,  72,
+          31, 32, 32, 33, 33, 34, 35, 37, 41, 42, 49,  49,  57,  59,  66,  71,
+          32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50,  50,  57,  59,  65,  71,
+          32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49,  49,  56,  59,  65,  70,
+          32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49,  49,  56,  58,  64,  69,
+          32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50,  50,  56,  58,  64,  69,
+          34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54,  54,  61,  63,  69,  73,
+          34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54,  54,  61,  63,  69,  73,
+          35, 34, 34, 37, 38, 42, 47, 48, 52, 53, 59,  59,  65,  67,  73,  77,
+          36, 35, 34, 37, 38, 43, 48, 49, 54, 54, 60,  60,  66,  68,  74,  78,
+          38, 36, 36, 38, 40, 44, 49, 51, 56, 57, 63,  63,  69,  71,  77,  81,
+          39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65,  65,  71,  73,  79,  84,
+          41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67,  67,  74,  76,  81,  86,
+          44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71,  71,  78,  79,  85,  90,
+          44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72,  72,  79,  81,  86,  91,
+          48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76,  76,  83,  85,  91,  96,
+          48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76,  76,  83,  85,  91,  96,
+          53, 49, 49, 49, 49, 54, 59, 62, 71, 71, 81,  81,  89,  91,  98,  103,
+          53, 50, 49, 50, 50, 54, 60, 63, 71, 72, 82,  82,  90,  92,  99,  103,
+          57, 53, 52, 52, 52, 57, 62, 65, 74, 75, 85,  85,  94,  96,  103, 108,
+          58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87,  87,  95,  98,  105, 110,
+          61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89,  89,  98,  101, 108, 114,
+          65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92,  92,  102, 105, 112, 118,
+          67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94,  94,  103, 106, 114, 120,
+          71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97,  97,  108, 111, 119, 125,
+          72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98,  98,  108, 111, 119, 125,
+          79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133,
+          79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133},
+         {32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31,
+          31, 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31,
+          31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32,
+          37, 39, 42, 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37,
+          40, 42, 46, 46, 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41,
+          44, 46, 46, 45, 45, 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45,
+          47, 46, 46, 46, 47, 47, 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47,
+          47, 45, 46, 47, 47, 50, 51, 53, 55, 37, 40, 40, 45, 47, 47, 47, 47,
+          45, 46, 47, 47, 49, 50, 52, 54, 37, 40, 40, 45, 47, 47, 48, 47, 46,
+          46, 47, 47, 49, 50, 53, 55, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49,
+          50, 50, 53, 53, 56, 57, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49, 50,
+          50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50, 52, 52, 53, 53, 53, 53,
+          55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 54, 56,
+          57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, 56, 56, 58, 58,
+          61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, 59, 60, 62,
+          64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, 63, 65,
+          49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, 49,
+          46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47,
+          46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46,
+          46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47,
+          47, 50, 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47,
+          50, 54, 56, 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51,
+          55, 57, 62, 62, 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55,
+          57, 62, 63, 68, 68, 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58,
+          63, 63, 69, 69, 74, 75, 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64,
+          64, 70, 70, 75, 76, 79, 82, 58, 53, 53, 51, 51, 54, 57, 59, 64, 65,
+          71, 71, 76, 77, 80, 83, 60, 55, 54, 53, 52, 55, 58, 60, 65, 66, 72,
+          72, 77, 79, 82, 85, 60, 55, 55, 53, 53, 55, 59, 60, 65, 66, 73, 73,
+          78, 79, 83, 85, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80,
+          82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80, 82,
+          86, 89}},
+        {{32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58,  65,  65,
+          31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56,  63,  63,
+          31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 51, 55,  62,  62,
+          31, 32, 32, 32, 32, 32, 34, 35, 37, 41, 41, 48, 50, 54,  61,  61,
+          31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53,  59,  59,
+          31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53,  59,  59,
+          31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48, 49, 53,  59,  59,
+          32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53,  59,  59,
+          32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53,  58,  58,
+          32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52,  58,  58,
+          32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52,  58,  58,
+          33, 33, 33, 35, 36, 36, 40, 41, 43, 46, 46, 52, 53, 56,  62,  62,
+          34, 34, 34, 35, 37, 37, 41, 42, 44, 48, 48, 53, 54, 57,  63,  63,
+          34, 34, 34, 35, 37, 37, 43, 44, 46, 50, 50, 55, 56, 59,  65,  65,
+          36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63,  68,  68,
+          36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63,  68,  68,
+          38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67,  72,  72,
+          39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68,  73,  73,
+          41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70,  76,  76,
+          44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74,  79,  79,
+          44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74,  79,  79,
+          47, 44, 44, 44, 45, 45, 53, 56, 59, 66, 66, 73, 75, 78,  84,  84,
+          48, 45, 45, 45, 46, 46, 54, 56, 60, 67, 67, 74, 76, 79,  85,  85,
+          50, 47, 46, 47, 47, 47, 55, 58, 61, 68, 68, 76, 78, 82,  88,  88,
+          53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86,  92,  92,
+          53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86,  92,  92,
+          57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90,  97,  97,
+          58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91,  98,  98,
+          61, 57, 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93,  100, 100,
+          65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97,  105, 105,
+          65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97,  105, 105,
+          70, 65, 64, 63, 62, 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109},
+         {32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31,
+          31, 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31,
+          31, 34, 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32,
+          34, 39, 39, 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35,
+          40, 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40,
+          40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42,
+          46, 47, 46, 45, 45, 47, 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46,
+          47, 46, 46, 46, 47, 47, 49, 51, 51, 35, 37, 37, 40, 44, 44, 46, 47,
+          46, 45, 45, 47, 47, 48, 51, 51, 37, 39, 40, 43, 47, 47, 47, 47, 47,
+          45, 45, 46, 47, 48, 50, 50, 37, 39, 40, 43, 47, 47, 47, 47, 47, 45,
+          45, 46, 47, 48, 50, 50, 41, 42, 42, 44, 47, 47, 49, 49, 49, 48, 48,
+          49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, 49, 50, 50, 49, 49, 50,
+          50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, 51, 51, 51, 52, 52,
+          53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55,
+          57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55, 57,
+          57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58, 59, 59,
+          48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, 48,
+          46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46,
+          45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45,
+          45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46,
+          46, 46, 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46,
+          46, 52, 54, 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47,
+          52, 54, 56, 60, 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53,
+          54, 57, 61, 61, 65, 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54,
+          57, 61, 61, 65, 66, 68, 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58,
+          62, 62, 67, 68, 70, 73, 73, 54, 51, 50, 49, 49, 49, 54, 55, 58, 62,
+          62, 67, 68, 70, 73, 73, 55, 51, 51, 50, 49, 49, 54, 56, 58, 63, 63,
+          68, 69, 71, 74, 74, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69,
+          70, 73, 76, 76, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69, 70,
+          73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, 61, 65, 65, 70, 72, 74,
+          78, 78}},
+        {{32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31,
+          32, 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32,
+          32, 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32,
+          32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32,
+          32, 32, 33, 34, 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32,
+          33, 33, 34, 34, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33,
+          33, 35, 35, 36, 41, 41, 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34,
+          36, 36, 38, 42, 42, 45, 49, 49, 54, 32, 32, 32, 33, 34, 34, 34, 36,
+          36, 38, 42, 42, 45, 50, 50, 54, 32, 32, 32, 33, 34, 34, 35, 37, 37,
+          38, 42, 42, 45, 49, 49, 54, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39,
+          42, 42, 45, 49, 49, 53, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39, 42,
+          42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, 36, 39, 40, 41, 44, 44,
+          47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50,
+          54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50, 54,
+          54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, 53, 57, 57,
+          61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64,
+          36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, 38,
+          37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38,
+          37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37,
+          38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40,
+          42, 42, 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42,
+          43, 45, 52, 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43,
+          45, 52, 54, 56, 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47,
+          54, 56, 58, 66, 66, 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55,
+          56, 59, 67, 67, 70, 76, 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57,
+          60, 67, 67, 71, 77, 77, 81, 53, 50, 49, 49, 49, 49, 51, 58, 59, 62,
+          71, 71, 74, 81, 81, 86, 53, 51, 49, 49, 50, 50, 51, 59, 60, 63, 71,
+          71, 75, 82, 82, 87, 55, 52, 51, 51, 51, 51, 53, 60, 61, 64, 72, 72,
+          76, 83, 83, 88, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79,
+          87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79, 87,
+          87, 92},
+         {32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31,
+          31, 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31,
+          31, 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31,
+          32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32,
+          38, 39, 40, 45, 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38,
+          40, 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40,
+          41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43,
+          46, 47, 46, 45, 45, 46, 47, 47, 49, 33, 35, 36, 36, 41, 43, 44, 46,
+          47, 46, 46, 46, 46, 47, 47, 49, 34, 36, 37, 37, 42, 44, 45, 47, 47,
+          47, 45, 45, 46, 47, 47, 49, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47,
+          45, 45, 46, 47, 47, 48, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47, 45,
+          45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, 47, 48, 48, 48, 47, 47,
+          47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50,
+          50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50, 50,
+          50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, 52, 52, 52,
+          54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55,
+          49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, 48,
+          47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46,
+          46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46,
+          46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45,
+          46, 46, 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45,
+          46, 47, 52, 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46,
+          47, 52, 53, 55, 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48,
+          53, 54, 55, 59, 59, 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53,
+          54, 55, 59, 59, 61, 64, 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54,
+          55, 60, 60, 61, 64, 64, 66, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56,
+          61, 61, 63, 66, 66, 68, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61,
+          61, 63, 66, 66, 68, 53, 50, 48, 48, 48, 48, 49, 54, 54, 56, 61, 61,
+          63, 67, 67, 69, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65,
+          68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65, 68,
+          68, 71}},
+        {{32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31,
+          31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32,
+          32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32,
+          32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32,
+          32, 32, 32, 33, 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32,
+          33, 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33,
+          33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33,
+          33, 34, 35, 35, 38, 41, 41, 44, 49, 31, 32, 32, 32, 33, 34, 34, 34,
+          35, 36, 36, 39, 42, 42, 44, 49, 32, 32, 32, 32, 33, 34, 34, 34, 36,
+          36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 34, 34, 34, 36, 36,
+          36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 35, 35, 35, 37, 37, 37,
+          40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41,
+          42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41, 42,
+          42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, 40, 42, 44, 44,
+          47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50,
+          54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50, 54,
+          34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55, 35,
+          35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35,
+          34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34,
+          34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36,
+          37, 40, 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39,
+          40, 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40,
+          40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41,
+          43, 49, 51, 51, 56, 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45,
+          51, 53, 53, 59, 63, 63, 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51,
+          53, 53, 59, 63, 63, 66, 71, 44, 43, 42, 42, 42, 43, 43, 45, 51, 54,
+          54, 59, 64, 64, 67, 72, 47, 45, 44, 44, 44, 45, 45, 47, 53, 56, 56,
+          61, 66, 66, 69, 75, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62,
+          67, 67, 70, 76, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62, 67,
+          67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, 56, 58, 58, 64, 69, 69,
+          73, 79},
+         {32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31,
+          31, 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31,
+          31, 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31,
+          31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32,
+          34, 39, 39, 40, 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35,
+          40, 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40,
+          40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40,
+          41, 45, 46, 46, 45, 45, 45, 46, 48, 33, 34, 35, 35, 37, 42, 42, 43,
+          46, 47, 47, 46, 45, 45, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46,
+          47, 47, 46, 46, 46, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46, 47,
+          47, 46, 46, 46, 46, 47, 35, 37, 38, 38, 41, 45, 45, 46, 47, 47, 47,
+          46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46,
+          45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46, 45,
+          45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, 48, 47, 47, 47,
+          47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50,
+          50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50, 50,
+          43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, 47,
+          46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47,
+          46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46,
+          46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46,
+          46, 47, 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46,
+          47, 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47,
+          47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46,
+          47, 51, 53, 53, 55, 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47,
+          51, 53, 53, 56, 58, 58, 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51,
+          53, 53, 56, 58, 58, 59, 61, 49, 47, 45, 45, 45, 46, 46, 47, 52, 53,
+          53, 56, 58, 58, 60, 62, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54,
+          57, 59, 59, 61, 63, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57,
+          59, 59, 61, 64, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59,
+          59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, 52, 54, 54, 58, 60, 60,
+          62, 65}},
+        {{32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31,
+          31, 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32,
+          32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32,
+          32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 34, 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32,
+          33, 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33,
+          33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33,
+          33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 33, 33, 33, 33,
+          34, 35, 35, 35, 38, 41, 41, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35,
+          36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36,
+          36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36,
+          36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 36, 37, 37, 37,
+          40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40,
+          42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42,
+          42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42,
+          33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, 34,
+          34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34,
+          34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34,
+          34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34,
+          34, 36, 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34,
+          36, 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36,
+          38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38,
+          38, 38, 43, 48, 48, 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39,
+          39, 44, 49, 49, 49, 52, 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40,
+          45, 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45,
+          50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50,
+          50, 50, 54, 58, 58, 41, 40, 39, 39, 39, 40, 42, 42, 42, 46, 52, 52,
+          52, 56, 60, 60, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53,
+          58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53, 58,
+          63, 63},
+         {32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31,
+          31, 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31,
+          31, 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31,
+          31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31,
+          31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32,
+          35, 39, 39, 39, 42, 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35,
+          40, 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40,
+          40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40,
+          40, 42, 46, 46, 46, 45, 45, 45, 32, 33, 34, 34, 34, 37, 41, 41, 41,
+          44, 46, 46, 46, 46, 45, 45, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45,
+          47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47,
+          47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, 47,
+          47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, 45, 46, 47, 47, 47,
+          46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46,
+          45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45,
+          45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, 45,
+          39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47, 42,
+          42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42,
+          43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43,
+          43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44,
+          44, 46, 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46,
+          47, 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47,
+          48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48,
+          48, 48, 50, 53, 53, 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47,
+          47, 50, 53, 53, 53, 54, 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47,
+          50, 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50,
+          53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53,
+          53, 53, 54, 56, 56, 48, 47, 45, 45, 45, 46, 46, 46, 46, 49, 53, 53,
+          53, 55, 57, 57, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53,
+          56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53, 56,
+          58, 58}},
+        {{32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31,
+          31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+          33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+          33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+          33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+          34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+          35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36,
+          36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36,
+          36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36,
+          36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+          32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, 32,
+          32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32,
+          32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32,
+          33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33,
+          33, 33, 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33,
+          33, 35, 36, 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34,
+          35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35,
+          36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36,
+          37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37,
+          37, 37, 40, 43, 44, 44, 44, 35, 35, 34, 34, 34, 34, 36, 37, 38, 38,
+          38, 41, 45, 47, 47, 47, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38,
+          42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42,
+          46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46,
+          48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, 39, 39, 39, 42, 46, 49,
+          49, 49},
+         {32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31,
+          31, 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31,
+          31, 31, 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31,
+          31, 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31,
+          31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31,
+          31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32,
+          34, 37, 39, 39, 39, 41, 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34,
+          38, 39, 39, 39, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38,
+          40, 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40,
+          40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40,
+          40, 42, 44, 46, 46, 46, 31, 32, 33, 33, 33, 33, 36, 39, 41, 41, 41,
+          43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, 37, 40, 42, 42, 42, 44,
+          46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46,
+          47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47,
+          47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, 47,
+          47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47, 47, 47,
+          36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, 37,
+          38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38,
+          39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39,
+          40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41,
+          41, 41, 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42,
+          42, 44, 46, 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43,
+          44, 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44,
+          46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46,
+          47, 47, 47, 48, 49, 50, 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47,
+          47, 47, 49, 50, 51, 51, 51, 47, 46, 46, 46, 46, 46, 46, 47, 48, 48,
+          48, 49, 51, 52, 52, 52, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48,
+          50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50,
+          52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52,
+          53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 52, 53,
+          53, 53}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31,
+          31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+          33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+          33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+          33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34,
+          34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35,
+          32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32,
+          32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32,
+          32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32,
+          32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32,
+          32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32,
+          32, 32, 33, 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33,
+          33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33,
+          33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33,
+          34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34,
+          34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34,
+          35, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36,
+          36, 36, 36, 36, 38, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
+          36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+          37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37,
+          38, 39},
+         {32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31,
+          31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31,
+          31, 31, 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31,
+          31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31,
+          31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31,
+          31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31,
+          31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31,
+          32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 32, 32, 32, 32,
+          34, 36, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 32, 34,
+          37, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37,
+          40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40,
+          40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40,
+          40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40,
+          40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, 40, 40, 40, 40,
+          41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, 41, 41, 42,
+          44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, 43, 44,
+          33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33,
+          34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34,
+          35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35,
+          35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37,
+          37, 37, 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38,
+          38, 38, 39, 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39,
+          39, 40, 42, 44, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40,
+          41, 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41,
+          43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43,
+          45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45,
+          47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, 41, 42, 43, 45, 47,
+          47, 47, 47, 47, 48, 40, 41, 41, 42, 42, 42, 42, 42, 44, 45, 47, 47,
+          47, 47, 47, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47,
+          47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47,
+          48, 48}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+          33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+          34, 34},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33,
+          34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34,
+          36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 36,
+          38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38,
+          40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40,
+          30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30,
+          31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31,
+          31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32,
+          32, 33, 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33,
+          33, 33, 33, 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34,
+          34, 34, 34, 34, 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35,
+          35, 35, 35, 35, 36, 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36,
+          36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36,
+          36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36,
+          36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36,
+          37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37,
+          38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38,
+          40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 36, 38, 39, 40,
+          42, 44}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+          31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+          31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          32, 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+          32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+          32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+          32, 32}}};
+constexpr uint8_t
+    kQuantizerMatrix4x4[kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes]
+                       [10] = {{{32, 43, 67, 73, 94, 137, 97, 110, 150, 200},
+                                {35, 46, 60, 57, 69, 90, 66, 71, 90, 109}},
+                               {{32, 41, 63, 69, 88, 127, 92, 103, 140, 184},
+                                {33, 45, 58, 56, 66, 86, 64, 69, 87, 105}},
+                               {{32, 38, 56, 63, 78, 113, 86, 97, 130, 169},
+                                {32, 45, 55, 53, 62, 80, 63, 67, 84, 101}},
+                               {{32, 37, 54, 58, 72, 102, 81, 91, 121, 156},
+                                {32, 45, 54, 51, 59, 75, 61, 65, 81, 97}},
+                               {{32, 34, 49, 53, 64, 91, 75, 81, 112, 140},
+                                {32, 46, 53, 49, 55, 70, 58, 62, 78, 91}},
+                               {{32, 34, 48, 49, 60, 82, 72, 79, 104, 134},
+                                {32, 46, 53, 47, 54, 66, 57, 60, 75, 89}},
+                               {{32, 33, 39, 45, 51, 71, 62, 64, 87, 108},
+                                {31, 42, 48, 47, 50, 61, 53, 54, 67, 78}},
+                               {{32, 33, 38, 42, 46, 63, 55, 57, 75, 92},
+                                {31, 41, 48, 46, 48, 58, 51, 51, 62, 71}},
+                               {{32, 32, 35, 38, 40, 54, 51, 49, 64, 81},
+                                {31, 38, 47, 47, 46, 54, 49, 46, 57, 66}},
+                               {{32, 32, 34, 35, 37, 48, 43, 43, 54, 65},
+                                {31, 37, 44, 47, 47, 53, 47, 45, 53, 59}},
+                               {{32, 32, 33, 34, 35, 39, 38, 39, 45, 54},
+                                {31, 34, 39, 42, 45, 48, 47, 46, 49, 54}},
+                               {{32, 32, 32, 32, 33, 35, 35, 35, 38, 46},
+                                {31, 32, 34, 38, 41, 47, 46, 46, 47, 52}},
+                               {{31, 32, 32, 32, 32, 33, 32, 33, 34, 35},
+                                {31, 31, 32, 34, 35, 39, 38, 40, 43, 47}},
+                               {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33},
+                                {31, 31, 31, 31, 31, 32, 34, 35, 35, 39}},
+                               {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32},
+                                {31, 31, 31, 31, 31, 31, 31, 31, 31, 31}}};
+constexpr uint8_t kQuantizerMatrix8x8
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][36] = {
+        {{32,  32,  35,  38,  40,  54,  51,  49,  65,  82,  68,  63,
+          78,  97,  117, 84,  76,  91,  111, 134, 152, 95,  89,  98,
+          113, 138, 159, 183, 109, 102, 106, 121, 142, 168, 199, 220},
+         {31, 38, 47, 47,  46, 54, 50, 47, 57, 66, 57,  52,
+          61, 72, 82, 63,  57, 66, 77, 88, 96, 67, 62,  67,
+          75, 86, 95, 104, 71, 67, 68, 75, 84, 95, 107, 113}},
+        {{32,  32,  35,  37,  39,  51, 47,  46,  60,  73,  62,  58,
+          71,  87,  105, 78,  72,  84, 100, 121, 140, 90,  84,  93,
+          106, 129, 148, 169, 102, 96, 100, 113, 132, 155, 183, 201},
+         {31, 38, 47, 47,  47, 53, 48, 46, 55, 62, 54,  50,
+          58, 67, 76, 61,  55, 63, 72, 83, 91, 66, 61,  65,
+          73, 84, 92, 101, 69, 65, 66, 73, 82, 92, 103, 109}},
+        {{32,  32,  34,  35,  37, 48, 46, 45,  56,  70,  57,  54,
+          64,  80,  93,  76,  70, 79, 96, 111, 134, 85,  79,  87,
+          100, 121, 138, 156, 96, 90, 93, 105, 122, 144, 168, 184},
+         {31, 36, 43, 47, 47, 53, 48, 46, 54, 61, 52, 49,
+          55, 65, 71, 60, 55, 60, 70, 78, 89, 64, 59, 63,
+          71, 81, 89, 97, 67, 63, 64, 71, 79, 89, 99, 104}},
+        {{32, 32,  33,  35,  36, 46, 42, 42,  52,  63,  53,  51,
+          60, 73,  86,  68,  64, 72, 84, 100, 117, 78,  74,  80,
+          92, 109, 128, 140, 90, 84, 87, 98,  114, 133, 155, 168},
+         {31, 34, 39, 46, 47, 52, 47, 45, 52, 58, 50, 48,
+          54, 62, 68, 57, 53, 58, 65, 73, 82, 61, 57, 61,
+          68, 77, 86, 91, 65, 61, 62, 68, 76, 86, 95, 100}},
+        {{32, 32,  33,  34,  35, 39, 39, 40, 46,  56,  50,  48,
+          53, 65,  78,  62,  59, 63, 75, 90, 105, 76,  71,  74,
+          86, 101, 118, 134, 84, 79, 81, 92, 106, 123, 142, 153},
+         {31, 34, 39, 42, 45, 48, 47, 46, 49, 55, 49, 47,
+          50, 58, 65, 54, 51, 53, 61, 69, 76, 60, 56, 57,
+          65, 73, 82, 89, 64, 59, 60, 66, 74, 83, 92, 96}},
+        {{32, 32, 33,  34,  35, 39, 38, 39, 45, 54,  46,  45,
+          51, 61, 71,  56,  54, 58, 69, 80, 92, 68,  64,  68,
+          78, 90, 103, 117, 78, 74, 76, 86, 99, 113, 128, 140},
+         {31, 34, 39, 42, 45, 48, 47, 46, 49, 54, 48, 46,
+          50, 56, 61, 52, 49, 52, 58, 65, 71, 57, 53, 55,
+          61, 68, 75, 82, 61, 57, 58, 64, 71, 79, 86, 91}},
+        {{31, 32, 32, 32, 33, 35, 35, 35, 38, 48, 42,  41,
+          43, 54, 63, 51, 49, 49, 59, 71, 81, 59, 56,  56,
+          66, 77, 89, 98, 69, 65, 64, 73, 85, 97, 108, 119},
+         {31, 32, 35, 38, 42, 47, 48, 47, 48, 53, 47, 45,
+          45, 53, 58, 50, 47, 47, 54, 61, 66, 53, 50, 49,
+          56, 63, 69, 73, 57, 54, 52, 58, 65, 72, 77, 82}},
+        {{31, 32, 32, 32, 32, 35, 34, 34, 37, 42, 38, 37,
+          40, 47, 54, 46, 44, 45, 52, 60, 69, 52, 49, 49,
+          56, 65, 75, 82, 63, 59, 58, 65, 73, 84, 92, 105},
+         {31, 31, 32, 38, 40, 47, 44, 44, 47, 50, 47, 45,
+          46, 51, 54, 48, 46, 46, 51, 56, 61, 50, 47, 47,
+          52, 57, 63, 66, 55, 52, 50, 54, 60, 66, 70, 76}},
+        {{31, 32, 32, 32, 32, 34, 34, 33, 35, 39, 35, 34,
+          37, 42, 48, 41, 40, 41, 47, 53, 60, 47, 44, 45,
+          51, 57, 65, 71, 53, 50, 51, 55, 61, 70, 77, 85},
+         {31, 31, 32, 35, 36, 41, 42, 42, 45, 48, 48, 46,
+          47, 50, 53, 47, 45, 45, 49, 53, 57, 49, 46, 46,
+          50, 54, 59, 61, 51, 48, 48, 51, 54, 60, 64, 68}},
+        {{31, 31, 32, 32, 32, 33, 32, 32, 34, 35, 34, 34,
+          35, 37, 41, 37, 36, 38, 39, 45, 51, 43, 41, 42,
+          42, 49, 56, 63, 47, 44, 45, 46, 52, 59, 67, 71},
+         {31, 31, 32, 34, 35, 39, 37, 40, 43, 47, 43, 43,
+          45, 47, 49, 48, 46, 46, 47, 50, 53, 47, 45, 45,
+          45, 50, 55, 58, 49, 46, 46, 46, 50, 55, 60, 61}},
+        {{31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 33, 33,
+          34, 35, 37, 34, 34, 35, 36, 39, 43, 37, 36, 37,
+          38, 41, 46, 51, 41, 39, 40, 41, 44, 49, 54, 58},
+         {31, 31, 31, 32, 33, 35, 35, 37, 39, 43, 39, 41,
+          42, 45, 47, 45, 44, 45, 47, 48, 50, 48, 46, 46,
+          47, 48, 51, 53, 48, 46, 45, 46, 47, 51, 54, 56}},
+        {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33, 32, 32,
+          32, 34, 35, 32, 33, 33, 34, 35, 36, 34, 34, 33,
+          35, 36, 38, 39, 35, 35, 34, 36, 38, 40, 42, 48},
+         {31, 31, 31, 30, 31, 32, 34, 34, 35, 39, 36, 37,
+          39, 42, 46, 39, 40, 41, 44, 47, 47, 42, 42, 42,
+          45, 47, 48, 48, 48, 47, 46, 47, 47, 49, 50, 53}},
+        {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 32, 32,
+          32, 32, 33, 32, 32, 32, 32, 33, 34, 32, 32, 32,
+          32, 34, 34, 35, 33, 33, 33, 33, 35, 35, 36, 38},
+         {31, 31, 31, 31, 31, 31, 30, 31, 31, 32, 34, 34,
+          35, 35, 39, 35, 35, 36, 36, 40, 41, 37, 38, 39,
+          40, 43, 44, 47, 40, 41, 41, 42, 44, 45, 47, 48}},
+        {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32,
+          32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 33, 33},
+         {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31,
+          31, 31, 32, 31, 32, 32, 32, 32, 33, 33, 34, 34,
+          35, 35, 36, 39, 33, 34, 34, 35, 35, 36, 39, 39}},
+        {{31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+          32, 32, 32, 31, 31, 32, 32, 32, 32, 31, 31, 32,
+          32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+         {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31}}};
+constexpr uint8_t kQuantizerMatrix32x32
+    [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][528] = {
+        {{32,  31,  32,  31,  32,  32,  31,  32,  32,  32,  31,  32,  32,  33,
+          33,  32,  32,  32,  33,  34,  35,  34,  34,  33,  34,  35,  37,  39,
+          35,  34,  34,  35,  36,  37,  41,  43,  36,  35,  34,  35,  36,  38,
+          42,  45,  48,  39,  38,  37,  38,  39,  40,  45,  47,  50,  54,  44,
+          42,  41,  41,  42,  42,  47,  50,  54,  58,  63,  46,  44,  42,  43,
+          44,  44,  49,  52,  55,  59,  65,  67,  48,  46,  44,  45,  45,  46,
+          51,  53,  57,  61,  67,  69,  71,  54,  51,  49,  49,  50,  49,  54,
+          57,  60,  65,  71,  74,  76,  82,  59,  56,  54,  54,  54,  53,  58,
+          61,  64,  69,  75,  78,  80,  87,  92,  62,  59,  56,  56,  56,  55,
+          60,  63,  66,  71,  77,  80,  83,  89,  95,  98,  65,  62,  59,  59,
+          59,  58,  63,  65,  68,  73,  79,  82,  85,  92,  98,  101, 105, 71,
+          68,  65,  64,  64,  63,  68,  70,  73,  78,  84,  87,  90,  97,  103,
+          107, 111, 117, 80,  76,  72,  72,  71,  69,  74,  76,  79,  84,  90,
+          93,  96,  104, 110, 114, 118, 125, 134, 81,  77,  73,  73,  72,  70,
+          75,  77,  80,  85,  91,  94,  97,  105, 111, 115, 119, 126, 135, 137,
+          83,  78,  75,  74,  74,  72,  76,  79,  81,  86,  92,  95,  99,  106,
+          113, 117, 121, 128, 137, 138, 140, 88,  84,  80,  79,  78,  76,  80,
+          82,  85,  91,  95,  98,  103, 111, 115, 119, 126, 134, 139, 144, 147,
+          152, 91,  86,  83,  82,  81,  79,  81,  84,  88,  92,  95,  100, 107,
+          110, 115, 123, 127, 132, 140, 147, 151, 154, 159, 94,  89,  86,  85,
+          84,  82,  82,  86,  90,  92,  97,  103, 105, 111, 119, 121, 128, 136,
+          139, 146, 156, 158, 161, 166, 97,  92,  90,  88,  86,  85,  84,  89,
+          91,  95,  100, 102, 108, 114, 116, 125, 130, 133, 143, 148, 152, 163,
+          166, 168, 174, 101, 95,  93,  91,  89,  89,  87,  91,  93,  98,  101,
+          105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171, 174, 176,
+          183, 104, 99,  97,  94,  93,  93,  90,  92,  96,  100, 102, 108, 111,
+          116, 122, 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191,
+          107, 102, 101, 97,  96,  96,  93,  93,  99,  101, 105, 110, 113, 120,
+          122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193, 200,
+          111, 105, 104, 101, 100, 99,  97,  96,  102, 103, 109, 111, 117, 120,
+          125, 131, 135, 143, 146, 156, 158, 168, 173, 180, 189, 195, 200, 202,
+          210, 115, 109, 108, 104, 104, 102, 101, 100, 103, 106, 111, 113, 119,
+          121, 129, 131, 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204,
+          210, 212, 220, 119, 113, 112, 107, 107, 106, 105, 103, 105, 110, 112,
+          117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179, 183, 193,
+          197, 210, 214, 220, 222, 231, 123, 116, 116, 111, 111, 109, 110, 107,
+          107, 114, 114, 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176,
+          177, 190, 191, 204, 206, 222, 224, 230, 232, 242},
+         {32,  31,  31,  30,  31,  32,  32,  33,  33,  35,  33,  34,  35,  37,
+          39,  36,  38,  40,  41,  43,  47,  41,  42,  42,  43,  45,  47,  48,
+          45,  45,  44,  45,  46,  47,  49,  50,  49,  47,  46,  47,  47,  48,
+          50,  51,  53,  48,  47,  45,  46,  46,  46,  49,  51,  53,  54,  49,
+          47,  45,  45,  45,  45,  49,  51,  53,  55,  58,  50,  47,  45,  46,
+          46,  46,  49,  51,  54,  56,  59,  60,  50,  48,  46,  46,  46,  46,
+          50,  52,  54,  56,  60,  60,  61,  52,  50,  47,  47,  47,  47,  50,
+          52,  54,  57,  61,  62,  63,  66,  54,  52,  49,  49,  49,  48,  52,
+          53,  55,  58,  62,  64,  65,  68,  71,  56,  53,  51,  50,  50,  49,
+          52,  54,  56,  59,  63,  64,  66,  69,  72,  73,  57,  54,  52,  51,
+          51,  50,  53,  55,  56,  60,  63,  65,  67,  70,  73,  75,  76,  60,
+          57,  54,  54,  53,  52,  55,  57,  58,  61,  65,  67,  68,  72,  75,
+          77,  79,  82,  63,  60,  57,  57,  56,  54,  57,  59,  60,  63,  67,
+          69,  71,  75,  78,  80,  82,  85,  89,  64,  61,  58,  57,  57,  55,
+          58,  59,  61,  64,  67,  69,  71,  75,  78,  80,  82,  85,  89,  90,
+          65,  61,  58,  58,  57,  55,  58,  60,  61,  64,  68,  70,  71,  75,
+          79,  81,  83,  86,  90,  91,  91,  67,  63,  61,  60,  59,  57,  60,
+          61,  63,  66,  69,  70,  73,  77,  79,  81,  85,  88,  90,  92,  94,
+          96,  68,  64,  62,  61,  60,  58,  59,  61,  64,  66,  67,  71,  74,
+          75,  78,  82,  84,  86,  90,  93,  94,  96,  98,  69,  65,  63,  62,
+          61,  59,  59,  62,  64,  65,  68,  71,  72,  75,  79,  80,  83,  87,
+          89,  92,  96,  97,  98,  100, 70,  66,  64,  63,  62,  61,  60,  63,
+          64,  66,  69,  70,  73,  76,  77,  81,  84,  85,  89,  92,  93,  98,
+          99,  100, 102, 71,  67,  66,  64,  63,  62,  61,  63,  64,  67,  68,
+          70,  74,  75,  78,  81,  83,  86,  88,  91,  94,  95,  100, 101, 102,
+          104, 72,  68,  67,  65,  64,  64,  61,  63,  65,  67,  68,  71,  73,
+          75,  78,  79,  84,  85,  88,  91,  93,  97,  98,  102, 103, 104, 106,
+          73,  69,  68,  66,  65,  65,  63,  63,  66,  67,  69,  71,  73,  76,
+          77,  81,  82,  85,  88,  90,  94,  95,  99,  101, 104, 105, 106, 109,
+          74,  70,  70,  67,  66,  66,  64,  63,  66,  67,  70,  71,  74,  75,
+          78,  80,  82,  86,  87,  91,  92,  96,  98,  101, 104, 106, 108, 108,
+          111, 75,  71,  71,  68,  68,  67,  66,  64,  66,  68,  70,  71,  74,
+          75,  79,  79,  84,  84,  88,  90,  93,  95,  98,  101, 103, 107, 108,
+          110, 111, 113, 76,  72,  72,  69,  69,  68,  67,  65,  66,  69,  70,
+          72,  74,  76,  78,  81,  83,  85,  88,  90,  93,  95,  98,  100, 104,
+          105, 109, 111, 112, 113, 116, 78,  74,  74,  70,  70,  69,  69,  66,
+          66,  70,  70,  74,  74,  77,  78,  82,  82,  86,  87,  92,  92,  96,
+          97,  102, 102, 107, 107, 112, 113, 115, 115, 118}},
+        {{32,  31,  32,  31,  32,  32,  31,  32,  32,  32,  31,  32,  32,  32,
+          33,  32,  32,  32,  33,  34,  35,  32,  33,  33,  33,  34,  36,  36,
+          34,  34,  33,  34,  35,  37,  38,  39,  36,  35,  34,  35,  36,  38,
+          40,  42,  48,  38,  37,  36,  36,  38,  39,  41,  44,  50,  51,  39,
+          38,  37,  38,  39,  40,  42,  45,  50,  52,  54,  44,  42,  41,  41,
+          42,  42,  44,  47,  54,  56,  58,  63,  47,  45,  44,  44,  45,  45,
+          47,  50,  56,  58,  60,  66,  69,  49,  47,  46,  45,  46,  46,  48,
+          51,  57,  60,  62,  68,  71,  73,  54,  51,  50,  49,  50,  49,  51,
+          54,  60,  63,  65,  71,  75,  77,  82,  59,  56,  54,  54,  54,  53,
+          55,  58,  64,  67,  69,  75,  79,  81,  87,  92,  61,  58,  56,  56,
+          56,  55,  57,  60,  65,  68,  70,  77,  81,  83,  89,  94,  97,  65,
+          62,  60,  59,  59,  58,  60,  63,  68,  71,  73,  79,  84,  87,  92,
+          98,  101, 105, 71,  68,  65,  65,  64,  63,  65,  68,  73,  76,  78,
+          84,  89,  92,  97,  103, 106, 111, 117, 76,  72,  70,  69,  68,  66,
+          68,  71,  76,  79,  81,  88,  92,  95,  101, 107, 110, 115, 122, 127,
+          80,  76,  73,  72,  71,  69,  71,  74,  79,  82,  84,  90,  95,  98,
+          104, 110, 113, 118, 125, 130, 134, 83,  78,  76,  75,  74,  72,  73,
+          76,  81,  84,  86,  92,  97,  100, 106, 113, 116, 121, 128, 133, 137,
+          140, 86,  82,  79,  78,  77,  74,  76,  79,  84,  87,  89,  95,  100,
+          103, 109, 116, 119, 124, 131, 136, 140, 144, 147, 89,  85,  82,  81,
+          79,  78,  78,  82,  86,  87,  92,  97,  100, 105, 112, 114, 120, 128,
+          131, 136, 146, 147, 150, 155, 92,  88,  85,  84,  82,  81,  80,  85,
+          86,  90,  95,  97,  102, 107, 110, 117, 122, 125, 134, 138, 142, 152,
+          154, 156, 162, 95,  90,  88,  86,  85,  84,  82,  86,  88,  93,  95,
+          99,  105, 106, 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163,
+          169, 98,  93,  91,  89,  88,  87,  85,  87,  90,  94,  96,  102, 104,
+          109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, 176,
+          101, 96,  95,  92,  91,  90,  88,  88,  93,  95,  99,  103, 106, 112,
+          114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176, 178, 184,
+          104, 99,  98,  95,  94,  93,  91,  90,  95,  96,  102, 103, 109, 112,
+          117, 122, 125, 133, 136, 145, 146, 156, 160, 167, 174, 180, 184, 186,
+          193, 108, 102, 101, 98,  97,  96,  95,  93,  97,  100, 104, 106, 111,
+          113, 121, 122, 130, 132, 140, 143, 150, 155, 162, 169, 174, 183, 188,
+          192, 194, 201, 111, 105, 105, 101, 100, 99,  98,  96,  98,  103, 105,
+          109, 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178,
+          181, 193, 196, 201, 202, 210, 114, 109, 109, 104, 104, 102, 102, 99,
+          100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140, 151, 151, 162,
+          162, 175, 176, 187, 188, 203, 204, 210, 211, 219},
+         {32,  31,  31,  30, 31, 31,  31, 32, 32, 33,  33,  34,  35,  36,  39,
+          36,  38,  39,  40, 43, 47,  38, 40, 41, 41,  44,  47,  47,  41,  42,
+          42,  43,  45,  47, 48, 48,  49, 47, 46, 46,  47,  48,  49,  50,  53,
+          49,  47,  46,  46, 46, 47,  48, 50, 53, 53,  48,  47,  46,  45,  46,
+          46,  48,  49,  53, 54, 54,  49, 47, 45, 45,  45,  45,  47,  49,  53,
+          55,  55,  58,  50, 48, 46,  46, 46, 46, 47,  50,  54,  55,  56,  59,
+          61,  51,  48,  47, 46, 47,  46, 47, 50, 54,  55,  56,  60,  61,  62,
+          52,  50,  48,  47, 47, 47,  48, 50, 54, 56,  57,  61,  63,  64,  66,
+          54,  52,  50,  49, 49, 48,  49, 52, 55, 57,  58,  62,  64,  66,  68,
+          71,  55,  53,  51, 50, 50,  49, 50, 52, 56,  58,  59,  63,  65,  66,
+          69,  72,  73,  57, 54, 52,  51, 51, 50, 51,  53,  56,  58,  60,  63,
+          66,  67,  70,  73, 74, 76,  60, 57, 55, 54,  53,  52,  53,  55,  58,
+          60,  61,  65,  68, 69, 72,  75, 77, 79, 82,  62,  59,  57,  56,  55,
+          53,  54,  56,  59, 61, 63,  66, 69, 70, 74,  77,  78,  80,  84,  86,
+          63,  60,  58,  57, 56, 54,  55, 57, 60, 62,  63,  67,  70,  71,  75,
+          78,  79,  82,  85, 87, 89,  65, 61, 59, 58,  57,  55,  56,  58,  61,
+          63,  64,  68,  71, 72, 75,  79, 80, 83, 86,  88,  90,  91,  66,  63,
+          60,  59,  58,  56, 58, 59,  62, 64, 65, 69,  72,  73,  76,  80,  81,
+          84,  87,  90,  91, 93, 94,  67, 64, 62, 61,  59,  58,  58,  60,  63,
+          64,  66,  69,  71, 73, 77,  78, 81, 85, 86,  89,  93,  94,  95,  97,
+          68,  65,  63,  62, 60, 59,  58, 61, 62, 64,  67,  68,  71,  74,  75,
+          79,  81,  83,  87, 89, 91,  95, 96, 97, 99,  69,  66,  64,  63,  61,
+          61,  59,  61,  62, 65, 66,  68, 72, 73, 76,  78,  80,  84,  85,  88,
+          91,  92,  97,  98, 98, 101, 70, 67, 65, 63,  62,  62,  60,  61,  63,
+          65,  66,  69,  71, 73, 76,  77, 81, 83, 85,  88,  90,  94,  95,  99,
+          100, 100, 103, 71, 67, 67,  64, 63, 63, 61,  61,  64,  65,  67,  69,
+          71,  74,  75,  78, 80, 83,  85, 87, 91, 92,  95,  97,  100, 102, 102,
+          105, 72,  68,  68, 65, 65,  64, 62, 62, 64,  65,  68,  69,  72,  73,
+          76,  78,  80,  83, 84, 88,  89, 93, 95, 97,  100, 102, 104, 104, 107,
+          73,  69,  69,  66, 66, 65,  64, 63, 64, 66,  68,  69,  72,  73,  77,
+          77,  81,  82,  86, 87, 90,  92, 95, 97, 99,  103, 104, 106, 106, 109,
+          74,  70,  70,  67, 67, 66,  65, 63, 64, 67,  68,  70,  72,  74,  76,
+          78,  80,  82,  85, 87, 90,  91, 95, 96, 100, 101, 105, 106, 108, 108,
+          111, 75,  71,  71, 68, 68,  66, 66, 64, 64,  68,  68,  71,  71,  75,
+          75,  79,  79,  83, 84, 88,  89, 93, 93, 98,  98,  102, 103, 108, 108,
+          110, 110, 113}},
+        {{32,  31,  32,  31,  32,  32,  31,  32,  32,  32,  31,  32,  32,  32,
+          33,  32,  32,  32,  32,  33,  34,  32,  32,  32,  32,  34,  34,  35,
+          34,  34,  33,  33,  35,  36,  37,  39,  34,  34,  34,  34,  36,  36,
+          37,  41,  42,  36,  35,  34,  34,  36,  37,  38,  42,  45,  48,  39,
+          38,  38,  37,  39,  40,  40,  45,  47,  50,  54,  41,  39,  39,  38,
+          40,  40,  41,  46,  48,  51,  55,  56,  44,  42,  41,  41,  42,  42,
+          42,  47,  50,  54,  58,  59,  63,  48,  46,  45,  44,  45,  45,  45,
+          50,  53,  56,  61,  62,  66,  70,  49,  47,  46,  45,  46,  46,  46,
+          51,  53,  57,  62,  63,  68,  71,  73,  54,  51,  50,  49,  50,  49,
+          49,  54,  56,  60,  65,  67,  71,  76,  77,  82,  58,  55,  54,  53,
+          53,  53,  52,  57,  59,  63,  68,  70,  74,  79,  81,  86,  90,  59,
+          57,  55,  54,  54,  54,  54,  59,  61,  64,  69,  71,  75,  80,  82,
+          87,  91,  93,  65,  62,  60,  59,  59,  58,  58,  63,  65,  68,  73,
+          75,  79,  85,  87,  92,  97,  99,  105, 69,  66,  64,  63,  63,  62,
+          61,  66,  68,  71,  76,  78,  83,  88,  90,  96,  100, 102, 109, 113,
+          71,  68,  66,  65,  64,  63,  63,  68,  70,  73,  78,  80,  84,  90,
+          92,  97,  102, 104, 111, 115, 117, 80,  76,  73,  72,  71,  70,  69,
+          74,  76,  79,  84,  86,  90,  96,  98,  104, 109, 111, 118, 123, 125,
+          134, 81,  77,  75,  74,  73,  72,  71,  75,  77,  80,  85,  87,  91,
+          97,  99,  105, 110, 112, 120, 125, 127, 136, 137, 83,  78,  76,  75,
+          74,  73,  72,  76,  78,  81,  86,  88,  92,  98,  100, 106, 111, 113,
+          121, 126, 128, 137, 139, 140, 87,  83,  81,  79,  78,  77,  75,  80,
+          82,  85,  90,  91,  96,  101, 103, 110, 114, 117, 125, 129, 133, 142,
+          143, 145, 150, 90,  85,  83,  81,  80,  79,  78,  81,  83,  87,  89,
+          93,  98,  100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151,
+          156, 93,  88,  86,  84,  83,  82,  80,  82,  85,  89,  90,  96,  98,
+          102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, 156, 157, 163,
+          95,  90,  89,  86,  85,  85,  83,  83,  88,  89,  93,  97,  99,  105,
+          106, 113, 116, 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169,
+          98,  93,  92,  89,  88,  87,  86,  85,  89,  90,  96,  97,  102, 105,
+          109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169, 170,
+          176, 101, 96,  95,  91,  91,  90,  89,  87,  90,  93,  97,  99,  104,
+          105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155, 160, 168, 172,
+          176, 177, 184, 104, 99,  98,  94,  94,  92,  92,  90,  92,  96,  98,
+          102, 104, 109, 112, 116, 121, 125, 130, 135, 141, 144, 152, 155, 163,
+          166, 177, 179, 184, 185, 191, 107, 101, 101, 97,  97,  95,  95,  93,
+          93,  99,  99,  105, 105, 112, 112, 120, 120, 129, 129, 139, 140, 149,
+          149, 161, 161, 172, 172, 185, 186, 191, 192, 199},
+         {32,  31,  31, 30, 31, 31, 30, 31, 31, 32, 33, 34,  35,  35,  39,
+          35,  36,  37, 37, 41, 43, 36, 38, 39, 40, 43, 45,  47,  41,  42,
+          42,  42,  45, 46, 47, 48, 44, 44, 44, 44, 46, 46,  47,  49,  50,
+          49,  47,  47, 46, 47, 47, 48, 50, 51, 53, 48, 47,  46,  45,  46,
+          46,  46,  49, 51, 53, 54, 48, 47, 46, 45, 46, 46,  46,  49,  51,
+          53,  54,  55, 49, 47, 46, 45, 45, 45, 45, 49, 51,  53,  55,  56,
+          58,  50,  48, 47, 46, 46, 46, 46, 50, 51, 54, 56,  57,  59,  61,
+          51,  48,  47, 46, 47, 46, 46, 50, 51, 54, 56, 57,  60,  62,  62,
+          52,  50,  48, 47, 47, 47, 47, 50, 52, 54, 57, 58,  61,  63,  64,
+          66,  54,  51, 50, 49, 49, 48, 48, 51, 53, 55, 58,  59,  62,  64,
+          65,  68,  70, 55, 52, 51, 50, 49, 49, 48, 52, 53,  55,  59,  60,
+          62,  65,  66, 68, 70, 71, 57, 54, 53, 52, 51, 50,  50,  53,  54,
+          56,  60,  61, 63, 66, 67, 70, 73, 73, 76, 59, 56,  54,  53,  53,
+          52,  51,  54, 56, 58, 61, 62, 65, 68, 69, 72, 74,  75,  78,  80,
+          60,  57,  55, 54, 53, 53, 52, 55, 56, 58, 61, 63,  65,  68,  69,
+          72,  75,  76, 79, 81, 82, 63, 60, 58, 57, 56, 55,  54,  57,  59,
+          60,  63,  65, 67, 70, 71, 75, 77, 78, 82, 84, 85,  89,  64,  61,
+          59,  58,  57, 56, 55, 58, 59, 61, 64, 65, 68, 71,  72,  75,  78,
+          79,  82,  85, 86, 89, 90, 65, 61, 60, 58, 57, 56,  55,  58,  59,
+          61,  64,  65, 68, 71, 72, 75, 78, 79, 83, 85, 86,  90,  91,  91,
+          67,  63,  61, 60, 59, 58, 57, 60, 61, 63, 65, 66,  69,  72,  73,
+          77,  79,  80, 84, 86, 88, 92, 93, 93, 95, 68, 64,  63,  61,  60,
+          59,  58,  60, 61, 63, 65, 67, 70, 71, 74, 76, 78,  81,  83,  86,
+          88,  89,  94, 94, 95, 97, 68, 65, 64, 62, 61, 60,  58,  59,  61,
+          64,  64,  68, 69, 71, 74, 75, 79, 80, 83, 86, 87,  91,  92,  95,
+          96,  97,  99, 69, 66, 65, 63, 62, 61, 59, 59, 62,  63,  65,  67,
+          69,  72,  72, 76, 78, 80, 83, 84, 88, 89, 92, 94,  97,  98,  99,
+          101, 70,  67, 66, 63, 63, 62, 61, 60, 63, 63, 66,  67,  69,  71,
+          73,  76,  77, 81, 82, 85, 86, 90, 91, 94, 96, 99,  100, 100, 103,
+          71,  67,  67, 64, 64, 63, 62, 61, 62, 64, 66, 67,  70,  71,  74,
+          74,  78,  79, 83, 84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104,
+          72,  68,  68, 65, 65, 64, 63, 61, 62, 65, 66, 68,  69,  71,  73,
+          75,  77,  79, 82, 84, 87, 88, 92, 93, 96, 97, 101, 102, 104, 104,
+          106, 73,  69, 69, 66, 66, 64, 64, 62, 62, 66, 66,  69,  69,  72,
+          73,  76,  77, 81, 81, 85, 85, 89, 90, 94, 94, 99,  99,  104, 104,
+          106, 106, 108}},
+        {{32,  31,  32,  31,  32,  32,  31,  32,  32,  32,  31,  32,  32,  32,
+          33,  31,  32,  32,  32,  33,  33,  32,  32,  32,  32,  33,  34,  35,
+          32,  33,  33,  33,  34,  34,  36,  36,  34,  34,  34,  33,  35,  35,
+          37,  38,  39,  35,  35,  34,  34,  36,  36,  38,  39,  42,  46,  36,
+          35,  35,  34,  36,  36,  38,  40,  42,  47,  48,  39,  38,  38,  37,
+          39,  39,  40,  42,  45,  49,  50,  54,  41,  40,  39,  38,  40,  40,
+          41,  43,  46,  50,  52,  55,  57,  44,  42,  42,  41,  42,  42,  42,
+          44,  47,  52,  54,  58,  60,  63,  47,  45,  45,  44,  44,  45,  45,
+          47,  50,  55,  56,  60,  62,  66,  69,  48,  46,  45,  44,  45,  45,
+          46,  47,  51,  55,  57,  61,  63,  67,  70,  71,  54,  51,  50,  49,
+          49,  50,  49,  51,  54,  59,  60,  65,  67,  71,  75,  76,  82,  56,
+          53,  52,  51,  51,  51,  51,  53,  56,  60,  61,  66,  69,  73,  77,
+          78,  84,  86,  59,  56,  55,  54,  54,  54,  53,  55,  58,  62,  64,
+          69,  71,  75,  79,  80,  87,  89,  92,  64,  61,  60,  58,  58,  58,
+          57,  59,  62,  66,  67,  72,  75,  79,  83,  84,  91,  93,  97,  102,
+          65,  62,  61,  59,  59,  59,  58,  60,  63,  67,  68,  73,  75,  79,
+          84,  85,  92,  94,  98,  103, 105, 71,  68,  67,  65,  64,  64,  63,
+          65,  68,  72,  73,  78,  80,  84,  89,  90,  97,  100, 103, 109, 111,
+          117, 74,  71,  69,  68,  67,  67,  65,  67,  70,  74,  75,  80,  83,
+          86,  91,  93,  100, 102, 106, 112, 114, 120, 123, 80,  76,  74,  72,
+          71,  71,  69,  71,  74,  78,  79,  84,  86,  90,  95,  96,  104, 106,
+          110, 116, 118, 125, 128, 134, 82,  78,  76,  74,  73,  73,  71,  73,
+          76,  79,  80,  86,  88,  92,  97,  98,  106, 108, 112, 118, 120, 127,
+          131, 136, 139, 83,  78,  77,  75,  74,  74,  72,  73,  76,  80,  81,
+          86,  89,  92,  97,  99,  106, 109, 113, 119, 121, 128, 131, 137, 139,
+          140, 87,  83,  81,  79,  78,  78,  75,  77,  80,  83,  85,  90,  92,
+          96,  100, 102, 110, 112, 117, 122, 125, 133, 135, 142, 144, 145, 150,
+          90,  85,  84,  81,  80,  80,  78,  78,  82,  84,  87,  91,  93,  98,
+          99,  106, 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156,
+          92,  88,  87,  84,  83,  82,  80,  80,  84,  85,  90,  91,  95,  98,
+          102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152, 155, 156,
+          162, 95,  90,  89,  86,  85,  84,  83,  82,  85,  87,  91,  92,  97,
+          98,  105, 105, 112, 114, 121, 123, 129, 133, 138, 143, 147, 155, 158,
+          161, 162, 168, 97,  92,  92,  88,  88,  86,  86,  84,  85,  90,  91,
+          95,  97,  101, 104, 108, 112, 116, 121, 125, 130, 133, 140, 143, 150,
+          152, 162, 164, 168, 168, 174, 100, 95,  95,  90,  90,  89,  89,  86,
+          86,  92,  92,  97,  98,  104, 104, 111, 111, 119, 119, 128, 129, 137,
+          137, 147, 148, 157, 158, 169, 170, 174, 175, 181},
+         {32,  31,  31, 31, 31, 31, 30, 31, 31, 32, 33, 34, 34, 34,  37,
+          33,  34,  35, 35, 38, 39, 36, 38, 39, 40, 42, 43, 47, 38,  40,
+          40,  41,  43, 44, 47, 47, 41, 42, 42, 42, 44, 45, 47, 48,  48,
+          47,  46,  46, 45, 46, 47, 47, 48, 50, 52, 49, 47, 47, 46,  47,
+          47,  48,  49, 50, 52, 53, 48, 47, 46, 45, 46, 46, 46, 48,  49,
+          52,  53,  54, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53,  55,
+          55,  49,  47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57,  58,
+          50,  48,  47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59,  61,
+          50,  48,  47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60,  61,
+          61,  52,  50, 49, 47, 47, 47, 47, 48, 50, 53, 54, 57, 59,  61,
+          63,  63,  66, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55,  58,
+          59,  62,  64, 64, 67, 68, 54, 52, 51, 49, 49, 49, 48, 49,  52,
+          55,  55,  58, 60, 62, 64, 65, 68, 69, 71, 56, 54, 53, 51,  51,
+          51,  49,  51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73,  75,
+          57,  54,  53, 52, 51, 51, 50, 51, 53, 56, 56, 60, 61, 63,  66,
+          67,  70,  71, 73, 76, 76, 60, 57, 56, 54, 53, 53, 52, 53,  55,
+          58,  58,  61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 61,  58,
+          57,  55,  55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69,  73,
+          74,  76,  79, 80, 83, 84, 63, 60, 59, 57, 56, 56, 54, 55,  57,
+          60,  60,  63, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86,  89,
+          64,  61,  60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68,  70,
+          71,  75,  77, 79, 82, 82, 86, 87, 90, 91, 65, 61, 60, 58,  57,
+          57,  55,  56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79,  82,
+          83,  86,  88, 90, 91, 91, 67, 63, 62, 60, 59, 59, 57, 58,  60,
+          62,  63,  66, 67, 69, 72, 73, 77, 78, 80, 83, 84, 88, 89,  92,
+          93,  93,  95, 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63,  65,
+          67,  70,  70, 74, 75, 78, 80, 81, 85, 86, 89, 91, 93, 94,  95,
+          97,  68,  65, 64, 62, 61, 60, 59, 58, 61, 61, 64, 65, 67,  69,
+          71,  73,  75, 78, 79, 83, 83, 87, 88, 91, 93, 95, 96, 97,  99,
+          69,  65,  65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68,  72,
+          72,  76,  76, 80, 81, 84, 86, 88, 90, 92, 95, 96, 98, 98,  100,
+          70,  66,  66, 63, 63, 62, 61, 60, 60, 63, 64, 66, 67, 69,  71,
+          73,  75,  77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, 100, 100,
+          102, 71,  67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67,  70,
+          70,  74,  74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100,
+          101, 101, 104}},
+        {{32,  31,  32,  31,  32,  32,  31,  32,  32,  32,  31,  32,  32,  32,
+          32,  31,  32,  32,  32,  33,  33,  32,  32,  32,  32,  33,  33,  34,
+          32,  32,  32,  32,  33,  34,  35,  35,  33,  33,  33,  33,  34,  35,
+          36,  36,  38,  34,  34,  34,  33,  34,  35,  36,  37,  39,  39,  36,
+          35,  35,  34,  35,  36,  37,  38,  42,  42,  48,  36,  35,  35,  34,
+          35,  36,  38,  38,  42,  43,  48,  49,  39,  38,  38,  37,  38,  39,
+          40,  40,  44,  45,  50,  51,  54,  41,  39,  39,  38,  39,  40,  40,
+          41,  45,  46,  51,  52,  55,  56,  44,  42,  42,  41,  41,  42,  42,
+          42,  46,  47,  54,  54,  58,  59,  63,  46,  44,  44,  42,  43,  44,
+          44,  44,  48,  49,  55,  55,  59,  61,  65,  67,  48,  46,  46,  44,
+          45,  45,  45,  46,  50,  51,  57,  57,  61,  63,  67,  69,  71,  52,
+          50,  49,  48,  48,  48,  48,  48,  52,  53,  59,  59,  64,  65,  70,
+          72,  74,  78,  54,  51,  51,  49,  49,  50,  49,  49,  53,  54,  60,
+          60,  65,  67,  71,  74,  76,  80,  82,  58,  56,  55,  53,  53,  53,
+          53,  53,  57,  58,  63,  64,  68,  70,  75,  77,  80,  84,  86,  91,
+          59,  56,  56,  54,  54,  54,  53,  53,  57,  58,  64,  64,  69,  70,
+          75,  78,  80,  85,  87,  91,  92,  65,  62,  61,  59,  59,  59,  58,
+          58,  62,  63,  68,  68,  73,  75,  79,  82,  85,  90,  92,  97,  98,
+          105, 66,  63,  63,  60,  60,  60,  59,  59,  63,  64,  69,  69,  74,
+          76,  80,  83,  86,  91,  93,  98,  99,  106, 107, 71,  68,  67,  65,
+          65,  64,  63,  63,  67,  68,  73,  73,  78,  80,  84,  87,  90,  95,
+          97,  103, 103, 111, 112, 117, 74,  71,  70,  68,  67,  67,  66,  65,
+          69,  70,  75,  75,  80,  82,  86,  89,  93,  97,  100, 105, 106, 114,
+          115, 120, 123, 80,  76,  75,  72,  72,  71,  70,  69,  73,  74,  79,
+          79,  84,  86,  90,  93,  96,  101, 104, 110, 110, 118, 119, 125, 128,
+          134, 81,  77,  77,  74,  73,  73,  71,  71,  74,  75,  80,  80,  85,
+          87,  91,  94,  98,  103, 105, 111, 112, 120, 121, 127, 130, 136, 137,
+          83,  78,  78,  75,  74,  74,  72,  72,  75,  76,  81,  81,  86,  88,
+          92,  95,  99,  104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140,
+          86,  82,  81,  78,  77,  77,  75,  74,  78,  79,  84,  84,  89,  91,
+          95,  98,  101, 106, 109, 115, 116, 124, 125, 131, 135, 140, 142, 144,
+          147, 89,  84,  84,  80,  80,  79,  78,  77,  79,  81,  85,  86,  91,
+          92,  97,  98,  104, 106, 112, 114, 119, 123, 128, 132, 135, 142, 145,
+          148, 149, 153, 91,  86,  86,  82,  82,  81,  80,  79,  80,  84,  85,
+          88,  91,  94,  97,  100, 104, 107, 112, 115, 120, 123, 129, 132, 138,
+          140, 148, 150, 153, 154, 159, 93,  88,  88,  84,  84,  83,  83,  80,
+          81,  86,  86,  91,  91,  96,  97,  103, 103, 110, 110, 118, 119, 126,
+          126, 135, 136, 144, 144, 155, 155, 159, 159, 164},
+         {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 31, 32, 32, 33, 34, 33, 34,
+          35, 35, 37, 39, 35, 37, 37, 38, 39, 41, 44, 36, 38, 39, 40, 41, 43,
+          46, 47, 40, 41, 41, 42, 43, 44, 46, 47, 48, 41, 42, 42, 42, 43, 45,
+          46, 47, 48, 48, 49, 47, 47, 46, 46, 47, 47, 48, 50, 50, 53, 49, 47,
+          47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 48, 47, 47, 45, 46, 46, 46,
+          46, 49, 49, 53, 53, 54, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53,
+          53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56,
+          58, 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56, 57, 59, 60,
+          50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60, 60, 61,
+          52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63,
+          65, 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62,
+          63, 65, 66, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 55, 58, 59,
+          62, 63, 65, 67, 68, 70, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55,
+          56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 57, 54, 54, 52, 51, 51, 50,
+          50, 52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 57, 55,
+          54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, 67, 70, 71,
+          73, 74, 77, 77, 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61,
+          63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82, 61, 58, 57, 55, 55, 54,
+          53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80,
+          83, 84, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67,
+          69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 64, 61, 60, 58, 57, 57,
+          56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83,
+          86, 87, 89, 90, 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64,
+          65, 68, 70, 71, 74, 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 66, 63,
+          62, 60, 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76,
+          79, 80, 84, 84, 87, 89, 91, 92, 93, 94, 67, 64, 63, 61, 60, 59, 58,
+          57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87,
+          89, 92, 93, 94, 94, 96, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62,
+          64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94,
+          96, 96, 98, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68,
+          68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97,
+          99}},
+        {{32,  31,  32,  31,  32,  32,  31,  32,  32,  32,  31,  32,  32,  32,
+          32,  31,  32,  32,  32,  32,  33,  31,  32,  32,  32,  32,  33,  33,
+          32,  32,  32,  32,  32,  34,  34,  35,  32,  32,  32,  32,  32,  34,
+          34,  35,  35,  34,  34,  34,  33,  33,  35,  35,  37,  37,  39,  34,
+          34,  34,  33,  33,  35,  35,  37,  37,  39,  39,  36,  35,  35,  34,
+          34,  36,  36,  38,  38,  42,  42,  48,  36,  35,  35,  34,  34,  36,
+          36,  38,  38,  42,  42,  48,  48,  39,  38,  38,  37,  37,  39,  39,
+          40,  40,  45,  45,  50,  50,  54,  39,  38,  38,  37,  37,  39,  39,
+          40,  40,  45,  45,  50,  50,  54,  54,  44,  42,  42,  41,  41,  42,
+          42,  42,  42,  47,  47,  54,  54,  58,  58,  63,  44,  42,  42,  41,
+          41,  42,  42,  42,  42,  47,  47,  54,  54,  58,  58,  63,  63,  48,
+          46,  46,  44,  44,  45,  45,  46,  46,  51,  51,  57,  57,  61,  61,
+          67,  67,  71,  48,  46,  46,  44,  44,  45,  45,  46,  46,  51,  51,
+          57,  57,  61,  61,  67,  67,  71,  71,  54,  51,  51,  49,  49,  50,
+          50,  49,  49,  54,  54,  60,  60,  65,  65,  71,  71,  76,  76,  82,
+          54,  51,  51,  49,  49,  50,  50,  49,  49,  54,  54,  60,  60,  65,
+          65,  71,  71,  76,  76,  82,  82,  59,  56,  56,  54,  54,  54,  54,
+          53,  53,  58,  58,  64,  64,  69,  69,  75,  75,  80,  80,  87,  87,
+          92,  59,  56,  56,  54,  54,  54,  54,  53,  53,  58,  58,  64,  64,
+          69,  69,  75,  75,  80,  80,  87,  87,  92,  92,  65,  62,  62,  59,
+          59,  59,  59,  58,  58,  63,  63,  68,  68,  73,  73,  79,  79,  85,
+          85,  92,  92,  98,  98,  105, 65,  62,  62,  59,  59,  59,  59,  58,
+          58,  63,  63,  68,  68,  73,  73,  79,  79,  85,  85,  92,  92,  98,
+          98,  105, 105, 71,  68,  68,  65,  65,  64,  64,  63,  63,  68,  68,
+          73,  73,  78,  78,  84,  84,  90,  90,  97,  97,  103, 103, 111, 111,
+          117, 71,  68,  68,  65,  65,  64,  64,  63,  63,  68,  68,  73,  73,
+          78,  78,  84,  84,  90,  90,  97,  97,  103, 103, 111, 111, 117, 117,
+          80,  76,  76,  72,  72,  71,  71,  69,  69,  74,  74,  79,  79,  84,
+          84,  90,  90,  96,  96,  104, 104, 110, 110, 118, 118, 125, 125, 134,
+          80,  76,  76,  72,  72,  71,  71,  69,  69,  74,  74,  79,  79,  84,
+          84,  90,  90,  96,  96,  104, 104, 110, 110, 118, 118, 125, 125, 134,
+          134, 83,  78,  78,  75,  75,  74,  74,  72,  72,  76,  76,  81,  81,
+          86,  86,  92,  92,  99,  99,  106, 106, 113, 113, 121, 121, 128, 128,
+          137, 137, 140, 83,  78,  78,  75,  75,  74,  74,  72,  72,  76,  76,
+          81,  81,  86,  86,  92,  92,  99,  99,  106, 106, 113, 113, 121, 121,
+          128, 128, 137, 137, 140, 140, 87,  83,  83,  79,  79,  77,  77,  75,
+          75,  80,  80,  84,  84,  90,  90,  96,  96,  102, 102, 109, 109, 116,
+          116, 124, 124, 132, 132, 141, 141, 144, 144, 149},
+         {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 30, 31, 31, 32, 32, 33, 34,
+          34, 35, 35, 39, 33, 34, 34, 35, 35, 39, 39, 36, 38, 38, 40, 40, 43,
+          43, 47, 36, 38, 38, 40, 40, 43, 43, 47, 47, 41, 42, 42, 42, 42, 45,
+          45, 47, 47, 48, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 49, 47,
+          47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 49, 47, 47, 46, 46, 47, 47,
+          48, 48, 50, 50, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49,
+          53, 53, 54, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54,
+          54, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58,
+          49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58,
+          50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60,
+          61, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60,
+          60, 61, 61, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57,
+          57, 61, 61, 63, 63, 66, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50,
+          54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 54, 52, 52, 49, 49, 49, 49,
+          48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 54, 52,
+          52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65,
+          68, 68, 71, 71, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56,
+          60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 57, 54, 54, 52, 52, 51,
+          51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73,
+          76, 76, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61,
+          65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 60, 57, 57, 54, 54, 53,
+          53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75,
+          79, 79, 82, 82, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60,
+          63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 63, 60,
+          60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71,
+          75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 65, 61, 61, 58, 58, 57, 57,
+          55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83,
+          83, 86, 86, 90, 90, 91, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58,
+          61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90,
+          90, 91, 91, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66,
+          66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93,
+          95}},
+        {{32,  31,  31,  31,  32,  32,  31,  32,  32,  32,  31,  32,  32,  32,
+          32,  31,  32,  32,  32,  32,  32,  31,  32,  32,  32,  32,  33,  33,
+          32,  32,  32,  32,  32,  33,  33,  34,  32,  32,  32,  32,  32,  33,
+          34,  34,  35,  32,  32,  32,  32,  33,  33,  34,  34,  35,  35,  34,
+          34,  34,  33,  33,  34,  35,  35,  37,  37,  39,  34,  34,  34,  33,
+          33,  34,  35,  35,  37,  37,  39,  39,  35,  35,  35,  34,  34,  35,
+          36,  36,  38,  38,  42,  42,  46,  36,  35,  35,  34,  34,  35,  36,
+          37,  38,  38,  42,  42,  47,  48,  38,  37,  37,  36,  36,  37,  38,
+          38,  39,  40,  44,  44,  48,  50,  51,  39,  38,  38,  38,  37,  38,
+          39,  39,  40,  41,  45,  45,  49,  50,  52,  54,  41,  40,  40,  39,
+          38,  39,  40,  40,  41,  41,  46,  46,  50,  52,  54,  55,  57,  44,
+          42,  42,  41,  41,  41,  42,  42,  42,  43,  47,  47,  52,  54,  56,
+          58,  60,  63,  45,  43,  43,  42,  41,  42,  42,  43,  43,  43,  48,
+          48,  53,  54,  57,  58,  60,  64,  65,  48,  46,  46,  45,  44,  45,
+          45,  45,  46,  46,  51,  51,  55,  57,  59,  61,  63,  67,  68,  71,
+          48,  46,  46,  45,  44,  45,  45,  45,  46,  46,  51,  51,  55,  57,
+          59,  61,  63,  67,  68,  71,  71,  53,  51,  51,  49,  49,  49,  49,
+          49,  49,  49,  54,  54,  58,  59,  62,  64,  67,  71,  72,  75,  75,
+          81,  54,  52,  51,  50,  49,  49,  50,  49,  49,  50,  54,  54,  59,
+          60,  63,  65,  67,  71,  72,  76,  76,  81,  82,  57,  55,  55,  53,
+          52,  52,  52,  52,  52,  52,  57,  57,  61,  62,  65,  67,  70,  74,
+          75,  79,  79,  85,  85,  89,  59,  56,  56,  54,  54,  54,  54,  54,
+          53,  54,  58,  58,  62,  64,  67,  69,  71,  75,  76,  80,  80,  86,
+          87,  90,  92,  62,  59,  59,  57,  56,  56,  56,  56,  55,  56,  60,
+          60,  64,  66,  69,  71,  73,  77,  78,  83,  83,  89,  89,  93,  95,
+          98,  65,  62,  62,  60,  59,  59,  59,  59,  58,  58,  63,  63,  67,
+          68,  71,  73,  75,  79,  81,  85,  85,  91,  92,  96,  98,  101, 105,
+          67,  64,  64,  62,  61,  61,  60,  60,  59,  60,  64,  64,  68,  69,
+          72,  74,  77,  81,  82,  87,  87,  93,  94,  98,  99,  103, 106, 108,
+          71,  68,  68,  66,  65,  64,  64,  64,  63,  63,  68,  68,  72,  73,
+          76,  78,  80,  84,  85,  90,  90,  97,  97,  102, 103, 107, 111, 113,
+          117, 72,  69,  69,  66,  65,  65,  65,  64,  63,  64,  68,  68,  72,
+          73,  76,  78,  81,  85,  86,  91,  91,  97,  98,  102, 104, 108, 111,
+          113, 118, 119, 80,  76,  76,  73,  72,  72,  71,  70,  69,  70,  74,
+          74,  78,  79,  82,  84,  86,  90,  91,  96,  96,  103, 104, 108, 110,
+          114, 118, 120, 125, 126, 134, 80,  76,  76,  73,  72,  72,  71,  70,
+          69,  70,  74,  74,  78,  79,  82,  84,  86,  90,  91,  96,  96,  103,
+          104, 108, 110, 114, 118, 120, 125, 126, 134, 134},
+         {32, 31, 31, 31, 31, 31, 30, 31, 31, 31, 30, 31, 31, 31, 32, 32, 32,
+          33, 33, 33, 35, 33, 34, 34, 35, 35, 37, 39, 34, 35, 35, 36, 36, 38,
+          40, 41, 36, 38, 38, 39, 40, 41, 43, 44, 47, 37, 38, 39, 40, 40, 42,
+          43, 44, 47, 47, 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 41, 42,
+          42, 42, 42, 43, 45, 45, 47, 47, 48, 48, 47, 46, 46, 46, 45, 46, 47,
+          47, 47, 48, 50, 50, 52, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50,
+          50, 52, 53, 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53,
+          53, 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52, 53, 54, 54,
+          49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54, 55, 55,
+          49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57,
+          58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56,
+          57, 59, 59, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54,
+          55, 56, 58, 60, 60, 61, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50,
+          50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 52, 50, 49, 48, 47, 47, 47,
+          47, 46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 52, 50,
+          50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61,
+          63, 63, 66, 66, 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54,
+          55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69, 54, 52, 52, 50, 49, 49,
+          49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68,
+          70, 71, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58,
+          59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 57, 54, 54, 52, 52, 51,
+          51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63, 64, 67, 67, 70, 70,
+          72, 73, 75, 76, 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56,
+          57, 59, 60, 62, 64, 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 60, 57,
+          57, 55, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66,
+          68, 68, 72, 72, 74, 75, 77, 79, 80, 82, 60, 57, 57, 55, 54, 54, 54,
+          53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75,
+          76, 77, 79, 80, 82, 82, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57,
+          57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83,
+          85, 85, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60,
+          62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89,
+          89}},
+        {{32,  31,  31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32,  32,  32,
+          31,  32,  32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33,  31,  32,
+          32,  32,  32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33,  33,  34,
+          32,  32,  32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32,  32,  32,
+          32,  33,  34, 34, 35, 35, 33, 33, 33, 33, 33, 33, 34,  35,  35,
+          36,  36,  38, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37,  37,  39,
+          39,  34,  34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40,  41,  42,
+          36,  35,  35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42,  45,  48,
+          36,  35,  35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42,  45,  48,
+          48,  38,  38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43,  44,  46,
+          50,  50,  52, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40,  40,  44,
+          45,  47,  50, 50, 53, 54, 41, 40, 40, 39, 38, 38, 40,  40,  40,
+          41,  41,  45, 46, 48, 52, 52, 54, 55, 57, 44, 42, 42,  42,  41,
+          41,  42,  42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58,  60,  63,
+          44,  42,  42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47,  50,  54,
+          54,  57,  58, 60, 63, 63, 47, 46, 45, 45, 44, 44, 44,  45,  45,
+          45,  45,  49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69,  48,  47,
+          46,  45,  44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57,  57,  60,
+          61,  63,  67, 67, 70, 71, 50, 49, 48, 47, 46, 46, 47,  47,  47,
+          47,  47,  51, 52, 54, 58, 58, 61, 62, 65, 68, 68, 72,  73,  75,
+          54,  52,  51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54,  56,  60,
+          60,  64,  65, 67, 71, 71, 75, 76, 78, 82, 54, 52, 51,  50,  49,
+          49,  49,  50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65,  67,  71,
+          71,  75,  76, 78, 82, 82, 58, 56, 55, 54, 53, 53, 53,  53,  53,
+          52,  52,  56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78,  79,  82,
+          86,  86,  90, 59, 57, 56, 55, 54, 54, 54, 54, 54, 53,  53,  57,
+          58,  60,  64, 64, 68, 69, 71, 75, 75, 79, 80, 83, 87,  87,  91,
+          92,  61,  59, 58, 57, 56, 56, 56, 56, 55, 55, 55, 59,  60,  62,
+          65,  65,  69, 70, 73, 77, 77, 81, 82, 85, 89, 89, 93,  94,  97,
+          65,  63,  62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63,  65,  68,
+          68,  72,  73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98,  101, 105,
+          65,  63,  62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63,  65,  68,
+          68,  72,  73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98,  101, 105,
+          105, 70,  67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66,  67,  69,
+          72,  72,  76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105,
+          109, 109, 114},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 32, 30, 31,
+          31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 37, 33, 34, 34, 35, 35, 35,
+          38, 39, 34, 36, 36, 36, 37, 37, 40, 40, 42, 36, 38, 38, 39, 40, 40,
+          42, 43, 45, 47, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 40, 41,
+          41, 41, 42, 42, 44, 44, 45, 47, 47, 48, 41, 42, 42, 42, 42, 42, 44,
+          45, 46, 47, 47, 48, 48, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47,
+          49, 49, 50, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51,
+          53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53,
+          48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54,
+          48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54,
+          54, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53,
+          54, 55, 55, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51,
+          53, 53, 55, 55, 57, 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45,
+          48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 50, 48, 48, 47, 46, 46, 46,
+          46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 50, 49,
+          48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 58,
+          60, 60, 61, 61, 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50,
+          51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63, 52, 50, 50, 49, 47, 47,
+          47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63,
+          65, 66, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54,
+          54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 54, 52, 51, 50, 49, 49,
+          49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65,
+          66, 68, 68, 70, 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52,
+          53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 55, 53,
+          53, 52, 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60,
+          63, 63, 65, 66, 67, 69, 69, 71, 72, 73, 57, 55, 54, 53, 52, 52, 51,
+          51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68,
+          70, 70, 73, 73, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50,
+          52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73,
+          74, 76, 76, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56,
+          58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78,
+          80}},
+        {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 32, 32, 32, 32, 31, 32,
+          32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+          32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+          33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 32, 32,
+          32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 32, 33, 33, 33, 33, 33, 33,
+          34, 34, 35, 36, 36, 36, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37,
+          37, 38, 39, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39,
+          39, 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39, 41, 41, 43,
+          36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
+          36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
+          48, 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47,
+          50, 50, 51, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45,
+          45, 47, 50, 50, 52, 54, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40,
+          40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 42, 41, 41, 41, 40, 40, 40,
+          41, 41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 44, 43,
+          42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, 54, 54, 56,
+          58, 58, 61, 63, 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45,
+          48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64, 47, 46, 45, 45, 44, 44,
+          44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66,
+          66, 69, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51,
+          53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 49, 48, 47, 47, 46, 45,
+          45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57, 60, 62, 62, 66, 68,
+          68, 71, 72, 73, 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51,
+          54, 54, 57, 59, 59, 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 54, 52,
+          51, 51, 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63,
+          65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 55, 53, 53, 52, 51, 50, 50,
+          51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73,
+          76, 77, 78, 83, 83, 85, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53,
+          53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86,
+          87, 88, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58,
+          58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92,
+          92},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 30, 31,
+          31, 31, 31, 32, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35,
+          35, 38, 33, 34, 34, 34, 35, 35, 36, 38, 39, 34, 35, 35, 36, 36, 36,
+          37, 40, 40, 41, 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 36, 38,
+          38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 38, 39, 40, 40, 41, 41, 41,
+          43, 44, 45, 47, 47, 47, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47,
+          47, 48, 48, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48,
+          48, 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48, 49, 49, 50,
+          49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
+          49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
+          53, 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51,
+          53, 53, 53, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49,
+          49, 51, 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46,
+          46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 49, 47, 47, 47, 45, 45, 45,
+          45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 49, 47,
+          47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55,
+          55, 55, 57, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47,
+          49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59, 50, 49, 48, 48, 46, 46,
+          46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59,
+          59, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50,
+          52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 51, 49, 48, 48, 47, 46,
+          46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60,
+          60, 61, 62, 62, 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48,
+          50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 52, 50,
+          50, 49, 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56,
+          57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 53, 51, 50, 50, 48, 48, 48,
+          48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62,
+          63, 64, 64, 67, 67, 68, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48,
+          48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68,
+          68, 69, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52,
+          52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71,
+          71}},
+        {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31,
+          32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+          32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+          32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32,
+          32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32,
+          33, 33, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+          35, 35, 35, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36,
+          36, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39,
+          34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39,
+          34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40,
+          41, 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42,
+          42, 43, 46, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38,
+          40, 42, 42, 44, 47, 48, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36,
+          37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 38, 37, 37, 37, 36, 36, 36,
+          36, 37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 39, 39,
+          38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49,
+          50, 50, 52, 54, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40,
+          40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 41, 40, 40, 40, 39, 38,
+          38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55,
+          55, 57, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44,
+          47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 44, 43, 42, 42, 42, 41,
+          41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58,
+          58, 60, 63, 63, 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43,
+          43, 45, 48, 48, 49, 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 47, 46,
+          45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55,
+          56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 48, 47, 46, 46, 45, 44, 44,
+          45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61,
+          63, 67, 67, 68, 70, 71, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45,
+          45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68,
+          70, 71, 71, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48,
+          50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74,
+          77},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31,
+          31, 31, 31, 32, 30, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32,
+          32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 37, 33, 34, 34, 34, 35, 35,
+          35, 36, 38, 39, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 35, 36,
+          37, 37, 37, 38, 38, 38, 41, 41, 41, 44, 36, 37, 38, 38, 39, 40, 40,
+          40, 42, 43, 43, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+          46, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47,
+          47, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48,
+          41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48,
+          43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49,
+          49, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50,
+          50, 50, 52, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48,
+          49, 50, 50, 51, 52, 53, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47,
+          47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46,
+          46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 48, 47,
+          47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52,
+          53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46,
+          46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 49, 47, 47, 47, 46, 45,
+          45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55,
+          55, 55, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47,
+          49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 49, 47, 47, 47, 46, 45,
+          45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55,
+          55, 57, 58, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45,
+          45, 47, 49, 49, 50, 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 50, 49,
+          48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53,
+          54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 50, 49, 48, 48, 47, 46, 46,
+          46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56,
+          58, 60, 60, 60, 61, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46,
+          46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60,
+          61, 61, 61, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46,
+          48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63,
+          64}},
+        {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31,
+          32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+          32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+          33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34,
+          35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35,
+          33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36,
+          37, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37,
+          37, 38, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36,
+          37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35,
+          35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 35, 34, 34, 34, 34, 34, 34,
+          34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 36, 35,
+          35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42,
+          42, 42, 45, 48, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36,
+          37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 36, 35, 35, 35, 35, 35,
+          34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48,
+          48, 48, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39,
+          39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 39, 39, 38, 38, 38, 38,
+          37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50,
+          50, 50, 52, 54, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39,
+          40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 39, 39,
+          38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45,
+          45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 41, 41, 40, 40, 40, 39, 39,
+          39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52,
+          52, 54, 56, 56, 56, 58, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42,
+          42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58,
+          58, 60, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42,
+          42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63,
+          63},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 32, 30, 31, 31, 31, 31, 31,
+          32, 32, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+          33, 33, 33, 35, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 33, 34,
+          34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 33, 34, 34, 34, 34, 35, 35,
+          35, 35, 37, 39, 39, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41,
+          41, 41, 43, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45,
+          47, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47,
+          36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47,
+          39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, 47, 47, 47,
+          47, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47,
+          47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46,
+          47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45,
+          45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44,
+          44, 44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 49, 48,
+          47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50,
+          50, 50, 51, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47,
+          47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 49, 48, 47, 47, 47, 47,
+          46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53,
+          53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47,
+          47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 48, 48, 47, 47, 47, 46,
+          45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53,
+          53, 53, 53, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46,
+          46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 48, 48,
+          47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49,
+          49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 49, 48, 47, 47, 47, 46, 45,
+          45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53,
+          53, 54, 55, 55, 55, 56, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45,
+          45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55,
+          55, 57, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45,
+          45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58,
+          58}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 32, 32, 31, 31,
+          31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+          32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+          33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+          35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+          34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+          34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 33, 33, 33, 33, 33,
+          33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 33, 33,
+          33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36,
+          36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35,
+          35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34,
+          34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39,
+          39, 39, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35,
+          35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34,
+          34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40,
+          41, 41, 41, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36,
+          36, 36, 36, 37, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 36, 35,
+          35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38,
+          38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 36, 35, 35, 35, 35, 35, 35,
+          34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42,
+          42, 42, 45, 47, 48, 48, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34,
+          35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47,
+          48, 48, 48, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37,
+          37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49,
+          50},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31,
+          31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 32, 30, 30, 31, 31, 31, 31,
+          31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32,
+          32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 33, 33, 33, 34, 34, 34, 34,
+          34, 34, 34, 34, 36, 37, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35,
+          37, 38, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39,
+          39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39,
+          34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40, 40, 42,
+          36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44,
+          46, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43,
+          45, 46, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43,
+          43, 43, 45, 46, 47, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40,
+          41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40,
+          41, 41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 40, 41,
+          41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47,
+          47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44,
+          45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42,
+          42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48,
+          48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45,
+          45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 44, 44, 44, 44, 44, 44,
+          44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49,
+          49, 49, 49, 50, 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46,
+          47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 49, 48,
+          48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48,
+          48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 49, 48, 48, 47, 47, 47, 47,
+          46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50,
+          50, 50, 51, 52, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46,
+          46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52,
+          53, 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47,
+          47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53,
+          53}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+          31, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+          32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+          33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+          33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+          35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+          35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+          33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 32, 32,
+          33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+          34, 34, 35, 35, 35, 36, 36, 36, 36, 36, 33, 33, 33, 33, 33, 33, 33,
+          33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+          36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33,
+          33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37,
+          38, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33,
+          34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39,
+          39},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 30, 30,
+          31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31,
+          31, 31, 31, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+          32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35,
+          33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37,
+          33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38,
+          39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37,
+          38, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
+          36, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35,
+          35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 34, 35, 35, 35, 35, 35, 35,
+          36, 36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 35, 36,
+          36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 41, 41, 41,
+          41, 41, 42, 44, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39,
+          39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46, 36, 37, 37, 38, 38, 38,
+          38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46,
+          47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40,
+          41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 36, 37, 37, 38, 38, 38,
+          38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46,
+          47, 47, 47, 47, 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40,
+          40, 41, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 38, 39,
+          39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44,
+          44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 40, 40, 40, 41, 41, 41, 41,
+          41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47,
+          47, 47, 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+          42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47,
+          48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+          43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48,
+          48}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 32, 31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 32,
+          32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32,
+          32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+          31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+          33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+          33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+          33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+          33},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+          30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+          30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+          32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+          32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 32, 32, 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
+          33, 33, 34, 34, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+          33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 33, 33, 33, 33, 33, 34,
+          34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36,
+          37, 37, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+          35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 33, 33, 34, 34, 34, 34,
+          34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37,
+          37, 38, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35,
+          35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 33, 33,
+          34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35,
+          35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34,
+          34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37,
+          38, 39, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+          35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39,
+          39, 39, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36,
+          36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40,
+          40}},
+        {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+          32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+          32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+          31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+          31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+          31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+          31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+          32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+          32},
+         {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+          30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+          31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+          32}}};
diff --git a/libgav1/src/reconstruction.cc b/libgav1/src/reconstruction.cc
index 97de9f0..1aa1233 100644
--- a/libgav1/src/reconstruction.cc
+++ b/libgav1/src/reconstruction.cc
@@ -14,6 +14,7 @@
 
 #include "src/reconstruction.h"
 
+#include <algorithm>
 #include <cassert>
 #include <cstdint>
 
@@ -48,6 +49,84 @@
   return static_cast<dsp::TransformSize1D>(size_log2 - 2);
 }
 
+// Returns the number of rows to process based on |non_zero_coeff_count|. The
+// transform loops process either 4 or a multiple of 8 rows. Use the
+// TransformClass derived from |tx_type| to determine the scan order.
+template <int tx_width>
+int GetNumRows(TransformType tx_type, int tx_height, int non_zero_coeff_count) {
+  const TransformClass tx_class = GetTransformClass(tx_type);
+
+  switch (tx_class) {
+    case kTransformClass2D:
+      if (tx_width == 4) {
+        if (non_zero_coeff_count <= 13) return 4;
+        if (non_zero_coeff_count <= 29) return 8;
+      }
+      if (tx_width == 8) {
+        if (non_zero_coeff_count <= 10) return 4;
+        if ((non_zero_coeff_count <= 14) & (tx_height > 8)) return 4;
+        if (non_zero_coeff_count <= 43) return 8;
+        if ((non_zero_coeff_count <= 107) & (tx_height > 16)) return 16;
+        if ((non_zero_coeff_count <= 171) & (tx_height > 16)) return 24;
+      }
+      if (tx_width == 16) {
+        if (non_zero_coeff_count <= 10) return 4;
+        if ((non_zero_coeff_count <= 14) & (tx_height > 16)) return 4;
+        if (non_zero_coeff_count <= 36) return 8;
+        if ((non_zero_coeff_count <= 44) & (tx_height > 16)) return 8;
+        if ((non_zero_coeff_count <= 151) & (tx_height > 16)) return 16;
+        if ((non_zero_coeff_count <= 279) & (tx_height > 16)) return 24;
+      }
+      if (tx_width == 32) {
+        if (non_zero_coeff_count <= 10) return 4;
+        if (non_zero_coeff_count <= 36) return 8;
+        if ((non_zero_coeff_count <= 136) & (tx_height > 16)) return 16;
+        if ((non_zero_coeff_count <= 300) & (tx_height > 16)) return 24;
+      }
+      break;
+
+    case kTransformClassHorizontal:
+      if (non_zero_coeff_count <= 4) return 4;
+      if (non_zero_coeff_count <= 8) return 8;
+      if ((non_zero_coeff_count <= 16) & (tx_height > 16)) return 16;
+      if ((non_zero_coeff_count <= 24) & (tx_height > 16)) return 24;
+      break;
+
+    default:
+      assert(tx_class == kTransformClassVertical);
+      if (tx_width == 4) {
+        if (non_zero_coeff_count <= 16) return 4;
+        if (non_zero_coeff_count <= 32) return 8;
+      }
+      if (tx_width == 8) {
+        if (non_zero_coeff_count <= 32) return 4;
+        if (non_zero_coeff_count <= 64) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 8x8: 63, 8x16: 127.
+        if (non_zero_coeff_count <= 128) return 16;
+        if (non_zero_coeff_count <= 192) return 24;
+      }
+      if (tx_width == 16) {
+        if (non_zero_coeff_count <= 64) return 4;
+        if (non_zero_coeff_count <= 128) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 16x8: 127, 16x16: 255.
+        if (non_zero_coeff_count <= 256) return 16;
+        if (non_zero_coeff_count <= 384) return 24;
+      }
+      if (tx_width == 32) {
+        if (non_zero_coeff_count <= 128) return 4;
+        if (non_zero_coeff_count <= 256) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 32x8: 255, 32x16: 511.
+        if (non_zero_coeff_count <= 512) return 16;
+        if (non_zero_coeff_count <= 768) return 24;
+      }
+      break;
+  }
+  return (tx_width >= 16) ? std::min(tx_height, 32) : tx_height;
+}
+
 }  // namespace
 
 template <typename Residual, typename Pixel>
@@ -59,17 +138,28 @@
   const int tx_width_log2 = kTransformWidthLog2[tx_size];
   const int tx_height_log2 = kTransformHeightLog2[tx_size];
 
+  int tx_height = (non_zero_coeff_count == 1) ? 1 : kTransformHeight[tx_size];
+  if (tx_height > 4) {
+    static constexpr int (*kGetNumRows[])(TransformType tx_type, int tx_height,
+                                          int non_zero_coeff_count) = {
+        &GetNumRows<4>, &GetNumRows<8>, &GetNumRows<16>, &GetNumRows<32>,
+        &GetNumRows<32>};
+    tx_height = kGetNumRows[tx_width_log2 - 2](tx_type, tx_height,
+                                               non_zero_coeff_count);
+  }
+  assert(tx_height <= 32);
+
   // Row transform.
   const dsp::TransformSize1D row_transform_size =
       Get1DTransformSize(tx_width_log2);
   const dsp::Transform1D row_transform =
       lossless ? dsp::k1DTransformWht : kRowTransform[tx_type];
   const dsp::InverseTransformAddFunc row_transform_func =
-      dsp.inverse_transforms[row_transform_size][row_transform];
+      dsp.inverse_transforms[row_transform][row_transform_size][dsp::kRow];
   assert(row_transform_func != nullptr);
 
-  row_transform_func(tx_type, tx_size, buffer, start_x, start_y, frame,
-                     /*is_row=*/true, non_zero_coeff_count);
+  row_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+                     frame);
 
   // Column transform.
   const dsp::TransformSize1D column_transform_size =
@@ -77,11 +167,12 @@
   const dsp::Transform1D column_transform =
       lossless ? dsp::k1DTransformWht : kColumnTransform[tx_type];
   const dsp::InverseTransformAddFunc column_transform_func =
-      dsp.inverse_transforms[column_transform_size][column_transform];
+      dsp.inverse_transforms[column_transform][column_transform_size]
+                            [dsp::kColumn];
   assert(column_transform_func != nullptr);
 
-  column_transform_func(tx_type, tx_size, buffer, start_x, start_y, frame,
-                        /*is_row=*/false, non_zero_coeff_count);
+  column_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+                        frame);
 }
 
 template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
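The new GetNumRows() helper above lets Reconstruct() run the row transform only over the rows that can still contain non-zero coefficients, and the call site dispatches to a width-specialized instantiation through the five-entry kGetNumRows table indexed by tx_width_log2 - 2 (widths 4 through 64, with width 64 reusing the <32> specialization). For the kTransformClassVertical branches the thresholds are simply tx_width * {4, 8, 16, 24}, which is why the in-code comments note that no extra tx_height check is needed. The sketch below is illustrative only and not part of the patch; it mirrors those vertical-class branches for the transform sizes AV1 allows:

```cpp
#include <algorithm>
#include <cstdio>

// Illustrative reimplementation of the kTransformClassVertical branches of
// GetNumRows(): pick the smallest processed height whose coefficient
// capacity (tx_width * rows) covers non_zero_coeff_count.
int VerticalClassRows(int tx_width, int tx_height, int non_zero_coeff_count) {
  static const int kRowSteps[] = {4, 8, 16, 24};
  for (const int rows : kRowSteps) {
    if (non_zero_coeff_count <= tx_width * rows) return rows;
  }
  // Fall back to the full (or capped) height, as in the last line of
  // GetNumRows().
  return (tx_width >= 16) ? std::min(tx_height, 32) : tx_height;
}

int main() {
  // An 8-wide column-only transform with 40 non-zero coefficients needs only
  // 8 rows (40 <= 8 * 8), matching the "<= 64 return 8" branch above.
  std::printf("%d\n", VerticalClassRows(8, 32, 40));
  return 0;
}
```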
diff --git a/libgav1/src/residual_buffer_pool.cc b/libgav1/src/residual_buffer_pool.cc
index e166392..44a842c 100644
--- a/libgav1/src/residual_buffer_pool.cc
+++ b/libgav1/src/residual_buffer_pool.cc
@@ -129,7 +129,8 @@
 }
 
 void ResidualBufferPool::Release(std::unique_ptr<ResidualBuffer> buffer) {
-  buffer->transform_parameters()->Reset();
+  buffer->transform_parameters()->Clear();
+  buffer->partition_tree_order()->Clear();
   std::lock_guard<std::mutex> lock(mutex_);
   buffers_.Push(std::move(buffer));
 }
diff --git a/libgav1/src/residual_buffer_pool.h b/libgav1/src/residual_buffer_pool.h
index f7bc75d..75924db 100644
--- a/libgav1/src/residual_buffer_pool.h
+++ b/libgav1/src/residual_buffer_pool.h
@@ -27,73 +27,11 @@
 #include "src/utils/compiler_attributes.h"
 #include "src/utils/constants.h"
 #include "src/utils/memory.h"
+#include "src/utils/queue.h"
 #include "src/utils/types.h"
 
 namespace libgav1 {
 
-// A simple fixed size queue implementation to hold the transform parameters
-// when |Tile::split_parse_and_decode_| is true. We don't have to do any
-// boundary checks since we always push data into the queue before accessing it.
-class TransformParameterQueue {
- public:
-  TransformParameterQueue() = default;
-
-  // Move only.
-  TransformParameterQueue(TransformParameterQueue&& other) = default;
-  TransformParameterQueue& operator=(TransformParameterQueue&& other) = default;
-
-  LIBGAV1_MUST_USE_RESULT bool Init(int max_size) {
-    max_size_ = max_size;
-    // No initialization is necessary since the data will be always written to
-    // before being read.
-    non_zero_coeff_count_.reset(new (std::nothrow) int16_t[max_size_]);
-    tx_type_.reset(new (std::nothrow) TransformType[max_size_]);
-    return non_zero_coeff_count_ != nullptr && tx_type_ != nullptr;
-  }
-
-  // Adds the |non_zero_coeff_count| and the |tx_type| to the back of the queue.
-  void Push(int non_zero_coeff_count, TransformType tx_type) {
-    assert(back_ < max_size_);
-    non_zero_coeff_count_[back_] = non_zero_coeff_count;
-    tx_type_[back_++] = tx_type;
-  }
-
-  // Returns the non_zero_coeff_count at the front of the queue.
-  int16_t NonZeroCoeffCount() const {
-    assert(front_ != back_);
-    return non_zero_coeff_count_[front_];
-  }
-
-  // Returns the tx_type at the front of the queue.
-  TransformType Type() const {
-    assert(front_ != back_);
-    return tx_type_[front_];
-  }
-
-  // Removes the |non_zero_coeff_count| and the |tx_type| from the front of the
-  // queue.
-  void Pop() {
-    assert(front_ != back_);
-    ++front_;
-  }
-
-  // Clears the queue.
-  void Reset() {
-    front_ = 0;
-    back_ = 0;
-  }
-
-  // Used only in the tests. Returns the number of elements in the queue.
-  int Size() const { return back_ - front_; }
-
- private:
-  int max_size_ = 0;
-  std::unique_ptr<int16_t[]> non_zero_coeff_count_;
-  std::unique_ptr<TransformType[]> tx_type_;
-  int front_ = 0;
-  int back_ = 0;
-};
-
 // This class is used for parsing and decoding a superblock. Members of this
 // class are populated in the "parse" step and consumed in the "decode" step.
 class ResidualBuffer : public Allocable {
@@ -104,7 +42,8 @@
     if (buffer != nullptr) {
       buffer->buffer_ = MakeAlignedUniquePtr<uint8_t>(32, buffer_size);
       if (buffer->buffer_ == nullptr ||
-          !buffer->transform_parameters_.Init(queue_size)) {
+          !buffer->transform_parameters_.Init(queue_size) ||
+          !buffer->partition_tree_order_.Init(queue_size)) {
         buffer = nullptr;
       }
     }
@@ -118,9 +57,14 @@
   // Buffer used to store the residual values.
   uint8_t* buffer() { return buffer_.get(); }
   // Queue used to store the transform parameters.
-  TransformParameterQueue* transform_parameters() {
+  Queue<TransformParameters>* transform_parameters() {
     return &transform_parameters_;
   }
+  // Queue used to store the block ordering in the partition tree of the
+  // superblocks.
+  Queue<PartitionTreeNode>* partition_tree_order() {
+    return &partition_tree_order_;
+  }
 
  private:
   friend class ResidualBufferStack;
@@ -128,7 +72,8 @@
   ResidualBuffer() = default;
 
   AlignedUniquePtr<uint8_t> buffer_;
-  TransformParameterQueue transform_parameters_;
+  Queue<TransformParameters> transform_parameters_;
+  Queue<PartitionTreeNode> partition_tree_order_;
   // Used by ResidualBufferStack to form a chain of ResidualBuffers.
   ResidualBuffer* next_ = nullptr;
 };
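The bespoke TransformParameterQueue removed above is replaced by the generic Queue<T> from src/utils/queue.h, used twice per ResidualBuffer (transform parameters and partition-tree order), and Release() now Clear()s both queues before the buffer is recycled. The removed class documents the contract such a queue has to keep: capacity fixed in Init(), pushes always happen before reads (so only asserts are needed), and Clear() just resets the indices. The real Queue<T> is not shown in this diff, so the following is only a minimal sketch of a queue with that contract:

```cpp
#include <cassert>
#include <memory>
#include <new>
#include <utility>

// Fixed-capacity FIFO mirroring the contract of the removed
// TransformParameterQueue: no bounds checks beyond asserts, because elements
// are always pushed before they are read.
template <typename T>
class SimpleQueue {
 public:
  bool Init(int max_size) {
    max_size_ = max_size;
    elements_.reset(new (std::nothrow) T[max_size]);
    return elements_ != nullptr;
  }
  void Push(T value) {
    assert(back_ < max_size_);
    elements_[back_++] = std::move(value);
  }
  const T& Front() const {
    assert(front_ != back_);
    return elements_[front_];
  }
  void Pop() {
    assert(front_ != back_);
    ++front_;
  }
  // Matches the Clear() calls made in ResidualBufferPool::Release().
  void Clear() { front_ = back_ = 0; }
  int Size() const { return back_ - front_; }

 private:
  std::unique_ptr<T[]> elements_;
  int max_size_ = 0;
  int front_ = 0;
  int back_ = 0;
};
```

Usage follows the parse/decode split: the parse step pushes one entry per transform block, the decode step reads Front() and Pop()s, and Release() clears both queues so a recycled buffer always starts empty.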
diff --git a/libgav1/src/symbol_decoder_context.cc b/libgav1/src/symbol_decoder_context.cc
index 159f25c..26a281e 100644
--- a/libgav1/src/symbol_decoder_context.cc
+++ b/libgav1/src/symbol_decoder_context.cc
@@ -319,20 +319,4 @@
   }
 }
 
-int SymbolDecoderContext::TxTypeIndex(TransformSet tx_set) {
-  assert(tx_set != kTransformSetDctOnly);
-  switch (tx_set) {
-    case kTransformSetInter1:
-    case kTransformSetIntra1:
-      return 0;
-    case kTransformSetInter2:
-    case kTransformSetIntra2:
-      return 1;
-    case kTransformSetInter3:
-      return 2;
-    default:
-      return -1;
-  }
-}
-
 }  // namespace libgav1
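TxTypeIndex() is only relocated here: the switch deleted from this .cc file reappears inline in symbol_decoder_context.h in the hunk below, so call sites keep the same mapping from a TransformSet to one of the three tx-type cdf array buckets. A hypothetical spot-check of that mapping (the assert harness is illustrative only; enumerator names are those used in the switch above):

```cpp
#include <cassert>

#include "src/symbol_decoder_context.h"

namespace libgav1 {

// Bucket 0 holds the Inter1/Intra1 sets, bucket 1 holds Inter2/Intra2, and
// bucket 2 holds Inter3, matching the sizes of the *_tx_type_cdf arrays.
void CheckTxTypeIndexMapping() {
  assert(SymbolDecoderContext::TxTypeIndex(kTransformSetInter1) == 0);
  assert(SymbolDecoderContext::TxTypeIndex(kTransformSetIntra1) == 0);
  assert(SymbolDecoderContext::TxTypeIndex(kTransformSetInter2) == 1);
  assert(SymbolDecoderContext::TxTypeIndex(kTransformSetIntra2) == 1);
  assert(SymbolDecoderContext::TxTypeIndex(kTransformSetInter3) == 2);
}

}  // namespace libgav1

int main() {
  libgav1::CheckTxTypeIndexMapping();
  return 0;
}
```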
diff --git a/libgav1/src/symbol_decoder_context.h b/libgav1/src/symbol_decoder_context.h
index 8713f5b..1bea76c 100644
--- a/libgav1/src/symbol_decoder_context.h
+++ b/libgav1/src/symbol_decoder_context.h
@@ -17,10 +17,12 @@
 #ifndef LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
 #define LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
 
+#include <cassert>
 #include <cstdint>
 
 #include "src/dsp/constants.h"
 #include "src/utils/constants.h"
+#include "src/utils/memory.h"
 
 namespace libgav1 {
 
@@ -101,7 +103,21 @@
 
   // Returns the cdf array index for inter_tx_type or intra_tx_type based on
   // |tx_set|.
-  static int TxTypeIndex(TransformSet tx_set);
+  static int TxTypeIndex(TransformSet tx_set) {
+    assert(tx_set != kTransformSetDctOnly);
+    switch (tx_set) {
+      case kTransformSetInter1:
+      case kTransformSetIntra1:
+        return 0;
+      case kTransformSetInter2:
+      case kTransformSetIntra2:
+        return 1;
+      case kTransformSetInter3:
+        return 2;
+      default:
+        return -1;
+    }
+  }
 
   // Resets the intra_frame_y_mode_cdf array to the default.
   void ResetIntraFrameYModeCdf();
@@ -110,117 +126,175 @@
   // the last used element in the innermost dimension of each of the CDF array.
   void ResetCounters();
 
-  uint16_t partition_cdf[kBlockWidthCount][kPartitionContexts]
-                        [kMaxPartitionTypes + 1];
-  uint16_t segment_id_cdf[kSegmentIdContexts][kMaxSegments + 1];
-  uint16_t use_predicted_segment_id_cdf[kUsePredictedSegmentIdContexts]
-                                       [kBooleanFieldCdfSize];
-  uint16_t skip_cdf[kSkipContexts][kBooleanFieldCdfSize];
-  uint16_t skip_mode_cdf[kSkipModeContexts][kBooleanFieldCdfSize];
-  uint16_t delta_q_cdf[kDeltaSymbolCount + 1];
-  uint16_t delta_lf_cdf[kDeltaSymbolCount + 1];
-  uint16_t delta_lf_multi_cdf[kFrameLfCount][kDeltaSymbolCount + 1];
-  uint16_t intra_block_copy_cdf[kBooleanFieldCdfSize];
-  uint16_t intra_frame_y_mode_cdf[kIntraModeContexts][kIntraModeContexts]
-                                 [kIntraPredictionModesY + 1];
-  uint16_t y_mode_cdf[kYModeContexts][kIntraPredictionModesY + 1];
-  uint16_t angle_delta_cdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1];
-  uint16_t uv_mode_cdf[kBooleanSymbolCount][kIntraPredictionModesY]
-                      [kIntraPredictionModesUV + 1];
-  uint16_t cfl_alpha_signs_cdf[kCflAlphaSignsSymbolCount + 1];
-  uint16_t cfl_alpha_cdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1];
-  uint16_t use_filter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
-  uint16_t filter_intra_mode_cdf[kNumFilterIntraPredictors + 1];
-  uint16_t tx_depth_cdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1];
-  uint16_t tx_split_cdf[kTxSplitContexts][kBooleanFieldCdfSize];
-  uint16_t all_zero_cdf[kNumSquareTransformSizes][kAllZeroContexts]
+  // Note kMaxAlignment allows for aligned instructions to be used in the
+  // copies done in Initialize().
+  alignas(kMaxAlignment) uint16_t
+      partition_cdf[kBlockWidthCount][kPartitionContexts]
+                   [kMaxPartitionTypes + 1];
+  alignas(kMaxAlignment) uint16_t
+      segment_id_cdf[kSegmentIdContexts][kMaxSegments + 1];
+  alignas(kMaxAlignment) uint16_t
+      use_predicted_segment_id_cdf[kUsePredictedSegmentIdContexts]
+                                  [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t skip_cdf[kSkipContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      skip_mode_cdf[kSkipModeContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t delta_q_cdf[kDeltaSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t delta_lf_cdf[kDeltaSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      delta_lf_multi_cdf[kFrameLfCount][kDeltaSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t intra_block_copy_cdf[kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      intra_frame_y_mode_cdf[kIntraModeContexts][kIntraModeContexts]
+                            [kIntraPredictionModesY + 1];
+  alignas(kMaxAlignment) uint16_t
+      y_mode_cdf[kYModeContexts][kIntraPredictionModesY + 1];
+  alignas(kMaxAlignment) uint16_t
+      angle_delta_cdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      uv_mode_cdf[kBooleanSymbolCount][kIntraPredictionModesY]
+                 [kIntraPredictionModesUV + 1];
+  alignas(kMaxAlignment) uint16_t
+      cfl_alpha_signs_cdf[kCflAlphaSignsSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      cfl_alpha_cdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      use_filter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      filter_intra_mode_cdf[kNumFilterIntraPredictors + 1];
+  alignas(kMaxAlignment) uint16_t
+      tx_depth_cdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      tx_split_cdf[kTxSplitContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      all_zero_cdf[kNumSquareTransformSizes][kAllZeroContexts]
+                  [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      inter_tx_type_cdf[3][kNumExtendedTransformSizes][kNumTransformTypes + 1];
+  alignas(kMaxAlignment) uint16_t
+      intra_tx_type_cdf[2][kNumExtendedTransformSizes][kIntraPredictionModesY]
+                       [kNumTransformTypes + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_pt_16_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt16SymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_pt_32_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt32SymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_pt_64_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt64SymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_pt_128_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt128SymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_pt_256_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt256SymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_pt_512_cdf[kNumPlaneTypes][kEobPt512SymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_pt_1024_cdf[kNumPlaneTypes][kEobPt1024SymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      eob_extra_cdf[kNumSquareTransformSizes][kNumPlaneTypes][kEobExtraContexts]
+                   [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      coeff_base_eob_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+                        [kCoeffBaseEobContexts][kCoeffBaseEobSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      coeff_base_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+                    [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      coeff_base_range_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+                          [kCoeffBaseRangeContexts]
+                          [kCoeffBaseRangeSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      dc_sign_cdf[kNumPlaneTypes][kDcSignContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      restoration_type_cdf[kRestorationTypeSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t use_wiener_cdf[kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t use_sgrproj_cdf[kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      has_palette_y_cdf[kPaletteBlockSizeContexts][kPaletteYModeContexts]
                        [kBooleanFieldCdfSize];
-  uint16_t inter_tx_type_cdf[3][kNumExtendedTransformSizes]
-                            [kNumTransformTypes + 1];
-  uint16_t intra_tx_type_cdf[2][kNumExtendedTransformSizes]
-                            [kIntraPredictionModesY][kNumTransformTypes + 1];
-  uint16_t eob_pt_16_cdf[kNumPlaneTypes][kEobPtContexts]
-                        [kEobPt16SymbolCount + 1];
-  uint16_t eob_pt_32_cdf[kNumPlaneTypes][kEobPtContexts]
-                        [kEobPt32SymbolCount + 1];
-  uint16_t eob_pt_64_cdf[kNumPlaneTypes][kEobPtContexts]
-                        [kEobPt64SymbolCount + 1];
-  uint16_t eob_pt_128_cdf[kNumPlaneTypes][kEobPtContexts]
-                         [kEobPt128SymbolCount + 1];
-  uint16_t eob_pt_256_cdf[kNumPlaneTypes][kEobPtContexts]
-                         [kEobPt256SymbolCount + 1];
-  uint16_t eob_pt_512_cdf[kNumPlaneTypes][kEobPt512SymbolCount + 1];
-  uint16_t eob_pt_1024_cdf[kNumPlaneTypes][kEobPt1024SymbolCount + 1];
-  uint16_t eob_extra_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
-                        [kEobExtraContexts][kBooleanFieldCdfSize];
-  uint16_t coeff_base_eob_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
-                             [kCoeffBaseEobContexts]
-                             [kCoeffBaseEobSymbolCount + 1];
-  uint16_t coeff_base_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
-                         [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1];
-  uint16_t coeff_base_range_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
-                               [kCoeffBaseRangeContexts]
-                               [kCoeffBaseRangeSymbolCount + 1];
-  uint16_t dc_sign_cdf[kNumPlaneTypes][kDcSignContexts][kBooleanFieldCdfSize];
-  uint16_t restoration_type_cdf[kRestorationTypeSymbolCount + 1];
-  uint16_t use_wiener_cdf[kBooleanFieldCdfSize];
-  uint16_t use_sgrproj_cdf[kBooleanFieldCdfSize];
-  uint16_t has_palette_y_cdf[kPaletteBlockSizeContexts][kPaletteYModeContexts]
-                            [kBooleanFieldCdfSize];
-  uint16_t palette_y_size_cdf[kPaletteBlockSizeContexts]
-                             [kPaletteSizeSymbolCount + 1];
-  uint16_t has_palette_uv_cdf[kPaletteUVModeContexts][kBooleanFieldCdfSize];
-  uint16_t palette_uv_size_cdf[kPaletteBlockSizeContexts]
-                              [kPaletteSizeSymbolCount + 1];
-  uint16_t palette_color_index_cdf[kNumPlaneTypes][kPaletteSizeSymbolCount]
-                                  [kPaletteColorIndexContexts]
-                                  [kPaletteColorIndexSymbolCount + 1];
-  uint16_t is_inter_cdf[kIsInterContexts][kBooleanFieldCdfSize];
-  uint16_t use_compound_reference_cdf[kUseCompoundReferenceContexts]
-                                     [kBooleanFieldCdfSize];
-  uint16_t compound_reference_type_cdf[kCompoundReferenceTypeContexts]
-                                      [kBooleanFieldCdfSize];
-  uint16_t compound_reference_cdf[kNumCompoundReferenceTypes]
-                                 [kReferenceContexts][3][kBooleanFieldCdfSize];
-  uint16_t compound_backward_reference_cdf[kReferenceContexts][2]
-                                          [kBooleanFieldCdfSize];
-  uint16_t single_reference_cdf[kReferenceContexts][6][kBooleanFieldCdfSize];
-  uint16_t compound_prediction_mode_cdf[kCompoundPredictionModeContexts]
-                                       [kNumCompoundInterPredictionModes + 1];
-  uint16_t new_mv_cdf[kNewMvContexts][kBooleanFieldCdfSize];
-  uint16_t zero_mv_cdf[kZeroMvContexts][kBooleanFieldCdfSize];
-  uint16_t reference_mv_cdf[kReferenceMvContexts][kBooleanFieldCdfSize];
-  uint16_t ref_mv_index_cdf[kRefMvIndexContexts][kBooleanFieldCdfSize];
-  uint16_t is_inter_intra_cdf[kInterIntraContexts][kBooleanFieldCdfSize];
-  uint16_t inter_intra_mode_cdf[kInterIntraContexts][kNumInterIntraModes + 1];
-  uint16_t is_wedge_inter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
-  uint16_t wedge_index_cdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1];
-  uint16_t use_obmc_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
-  uint16_t motion_mode_cdf[kMaxBlockSizes][kNumMotionModes + 1];
-  uint16_t is_explicit_compound_type_cdf[kIsExplicitCompoundTypeContexts]
-                                        [kBooleanFieldCdfSize];
-  uint16_t is_compound_type_average_cdf[kIsCompoundTypeAverageContexts]
-                                       [kBooleanFieldCdfSize];
-  uint16_t compound_type_cdf[kMaxBlockSizes]
-                            [kNumExplicitCompoundPredictionTypes + 1];
-  uint16_t interpolation_filter_cdf[kInterpolationFilterContexts]
-                                   [kNumExplicitInterpolationFilters + 1];
-  uint16_t mv_joint_cdf[kMvContexts][kNumMvJointTypes + 1];
-  uint16_t mv_sign_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize];
-  uint16_t mv_class_cdf[kMvContexts][kNumMvComponents][kMvClassSymbolCount + 1];
-  uint16_t mv_class0_bit_cdf[kMvContexts][kNumMvComponents]
-                            [kBooleanFieldCdfSize];
-  uint16_t mv_class0_fraction_cdf[kMvContexts][kNumMvComponents]
-                                 [kBooleanSymbolCount]
-                                 [kMvFractionSymbolCount + 1];
-  uint16_t mv_class0_high_precision_cdf[kMvContexts][kNumMvComponents]
-                                       [kBooleanFieldCdfSize];
-  uint16_t mv_bit_cdf[kMvContexts][kNumMvComponents][kMvBitSymbolCount]
-                     [kBooleanFieldCdfSize];
-  uint16_t mv_fraction_cdf[kMvContexts][kNumMvComponents]
-                          [kMvFractionSymbolCount + 1];
-  uint16_t mv_high_precision_cdf[kMvContexts][kNumMvComponents]
+  alignas(kMaxAlignment) uint16_t
+      palette_y_size_cdf[kPaletteBlockSizeContexts]
+                        [kPaletteSizeSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      has_palette_uv_cdf[kPaletteUVModeContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      palette_uv_size_cdf[kPaletteBlockSizeContexts]
+                         [kPaletteSizeSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      palette_color_index_cdf[kNumPlaneTypes][kPaletteSizeSymbolCount]
+                             [kPaletteColorIndexContexts]
+                             [kPaletteColorIndexSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      is_inter_cdf[kIsInterContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      use_compound_reference_cdf[kUseCompoundReferenceContexts]
                                 [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      compound_reference_type_cdf[kCompoundReferenceTypeContexts]
+                                 [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      compound_reference_cdf[kNumCompoundReferenceTypes][kReferenceContexts][3]
+                            [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      compound_backward_reference_cdf[kReferenceContexts][2]
+                                     [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      single_reference_cdf[kReferenceContexts][6][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      compound_prediction_mode_cdf[kCompoundPredictionModeContexts]
+                                  [kNumCompoundInterPredictionModes + 1];
+  alignas(kMaxAlignment) uint16_t
+      new_mv_cdf[kNewMvContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      zero_mv_cdf[kZeroMvContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      reference_mv_cdf[kReferenceMvContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      ref_mv_index_cdf[kRefMvIndexContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      is_inter_intra_cdf[kInterIntraContexts][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      inter_intra_mode_cdf[kInterIntraContexts][kNumInterIntraModes + 1];
+  alignas(kMaxAlignment) uint16_t
+      is_wedge_inter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      wedge_index_cdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      use_obmc_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      motion_mode_cdf[kMaxBlockSizes][kNumMotionModes + 1];
+  alignas(kMaxAlignment) uint16_t
+      is_explicit_compound_type_cdf[kIsExplicitCompoundTypeContexts]
+                                   [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      is_compound_type_average_cdf[kIsCompoundTypeAverageContexts]
+                                  [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      compound_type_cdf[kMaxBlockSizes]
+                       [kNumExplicitCompoundPredictionTypes + 1];
+  alignas(kMaxAlignment) uint16_t
+      interpolation_filter_cdf[kInterpolationFilterContexts]
+                              [kNumExplicitInterpolationFilters + 1];
+  alignas(kMaxAlignment) uint16_t
+      mv_joint_cdf[kMvContexts][kNumMvJointTypes + 1];
+  alignas(kMaxAlignment) uint16_t
+      mv_sign_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      mv_class_cdf[kMvContexts][kNumMvComponents][kMvClassSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      mv_class0_bit_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      mv_class0_fraction_cdf[kMvContexts][kNumMvComponents][kBooleanSymbolCount]
+                            [kMvFractionSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      mv_class0_high_precision_cdf[kMvContexts][kNumMvComponents]
+                                  [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t
+      mv_bit_cdf[kMvContexts][kNumMvComponents][kMvBitSymbolCount]
+                [kBooleanFieldCdfSize];
+  alignas(kMaxAlignment) uint16_t mv_fraction_cdf[kMvContexts][kNumMvComponents]
+                                                 [kMvFractionSymbolCount + 1];
+  alignas(kMaxAlignment) uint16_t
+      mv_high_precision_cdf[kMvContexts][kNumMvComponents]
+                           [kBooleanFieldCdfSize];
 };
 
 }  // namespace libgav1
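The hunks above add alignas(kMaxAlignment) to every CDF table in SymbolDecoderContext, presumably so the vectorized CDF-update paths can rely on aligned loads and stores. A minimal sketch, under the assumption of a 16-byte kMaxAlignment (the real constant is defined elsewhere in libgav1), of how per-member alignas forces each table to start on such a boundary; Cdfs and the asserts are illustrative, not libgav1 code:

    #include <cstddef>
    #include <cstdint>
    constexpr int kMaxAlignment = 16;  // assumed SIMD vector width in bytes
    struct Cdfs {
      alignas(kMaxAlignment) uint16_t first[3][4];   // 24 bytes of data
      alignas(kMaxAlignment) uint16_t second[5][6];  // padded up to the next boundary
    };
    static_assert(alignof(Cdfs) == kMaxAlignment,
                  "the struct inherits the member alignment");
    static_assert(offsetof(Cdfs, second) % kMaxAlignment == 0,
                  "each table starts at an aligned offset");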
diff --git a/libgav1/src/symbol_decoder_context_cdfs.inc b/libgav1/src/symbol_decoder_context_cdfs.inc
index 7f8f2c2..509286f 100644
--- a/libgav1/src/symbol_decoder_context_cdfs.inc
+++ b/libgav1/src/symbol_decoder_context_cdfs.inc
@@ -15,7 +15,7 @@
 // This file is just a convenience to separate out all the CDF constant
 // definitions from the symbol decoder context functions.
 
-constexpr uint16_t kDefaultPartitionCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultPartitionCdf
     [kBlockWidthCount][kPartitionContexts][kMaxPartitionTypes + 1] = {
         // width 8
         {{13636, 7258, 2376, 0, 0},
@@ -43,32 +43,34 @@
          {27339, 26092, 25646, 741, 541, 237, 186, 0, 0},
          {32057, 31802, 31596, 320, 230, 151, 104, 0, 0}}};
 
-constexpr uint16_t kDefaultSegmentIdCdf[kSegmentIdContexts][kMaxSegments + 1] =
-    {{27146, 24875, 16675, 14535, 4959, 4395, 235, 0, 0},
-     {18494, 14538, 10211, 7833, 2788, 1917, 424, 0, 0},
-     {5241, 4281, 4045, 3878, 371, 121, 89, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultSegmentIdCdf[kSegmentIdContexts][kMaxSegments + 1] = {
+        {27146, 24875, 16675, 14535, 4959, 4395, 235, 0, 0},
+        {18494, 14538, 10211, 7833, 2788, 1917, 424, 0, 0},
+        {5241, 4281, 4045, 3878, 371, 121, 89, 0, 0}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultUsePredictedSegmentIdCdf[kUsePredictedSegmentIdContexts]
                                     [kBooleanFieldCdfSize] = {{16384, 0, 0},
                                                               {16384, 0, 0},
                                                               {16384, 0, 0}};
 
-constexpr uint16_t kDefaultSkipCdf[kSkipContexts][kBooleanFieldCdfSize] = {
-    {1097, 0, 0}, {16253, 0, 0}, {28192, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultSkipCdf[kSkipContexts][kBooleanFieldCdfSize] = {
+        {1097, 0, 0}, {16253, 0, 0}, {28192, 0, 0}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultSkipModeCdf[kSkipModeContexts][kBooleanFieldCdfSize] = {
         {147, 0, 0}, {12060, 0, 0}, {24641, 0, 0}};
 
 // This constant is also used for DeltaLf and DeltaLfMulti.
-constexpr uint16_t kDefaultDeltaQCdf[kDeltaSymbolCount + 1] = {4608, 648, 91, 0,
-                                                               0};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultDeltaQCdf[kDeltaSymbolCount + 1] = {4608, 648, 91, 0, 0};
 
-constexpr uint16_t kDefaultIntraBlockCopyCdf[kBooleanFieldCdfSize] = {2237, 0,
-                                                                      0};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultIntraBlockCopyCdf[kBooleanFieldCdfSize] = {2237, 0, 0};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultIntraFrameYModeCdf[kIntraModeContexts][kIntraModeContexts]
                               [kIntraPredictionModesY + 1] = {
                                   {{17180, 15741, 13430, 12550, 12086, 11658,
@@ -122,7 +124,7 @@
                                    {25150, 24480, 22909, 22259, 17382, 14111,
                                     9865, 3992, 3588, 1413, 966, 175, 0, 0}}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultYModeCdf[kYModeContexts][kIntraPredictionModesY + 1] = {
         {9967, 9279, 8475, 8012, 7167, 6645, 6162, 5350, 4823, 3540, 3083, 2419,
          0, 0},
@@ -133,7 +135,7 @@
         {12613, 11467, 9930, 9590, 9507, 9235, 9065, 7964, 7416, 6193, 5752,
          4719, 0, 0}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultAngleDeltaCdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1] =
         {{30588, 27736, 25201, 9992, 5779, 2551, 0, 0},
          {30467, 27160, 23967, 9281, 5794, 2438, 0, 0},
@@ -144,7 +146,7 @@
          {30528, 21672, 17315, 12427, 10207, 3851, 0, 0},
          {29163, 22340, 20309, 15092, 11524, 2113, 0, 0}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultUVModeCdf[kBooleanSymbolCount][kIntraPredictionModesY]
                      [kIntraPredictionModesUV + 1] = {
                          // CFL not allowed.
@@ -202,24 +204,26 @@
                           {29624, 27681, 25386, 25264, 25175, 25078, 24967,
                            24704, 24536, 23520, 22893, 22247, 3720, 0, 0}}};
 
-constexpr uint16_t kDefaultCflAlphaSignsCdf[kCflAlphaSignsSymbolCount + 1] = {
-    31350, 30645, 19428, 14363, 5796, 4425, 474, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultCflAlphaSignsCdf[kCflAlphaSignsSymbolCount + 1] = {
+        31350, 30645, 19428, 14363, 5796, 4425, 474, 0, 0};
 
-constexpr uint16_t kDefaultCflAlphaCdf[kCflAlphaContexts][kCflAlphaSymbolCount +
-                                                          1] = {
-    {25131, 12049, 1367, 287, 111, 80, 76, 72, 68, 64, 60, 56, 52, 48, 44, 0,
-     0},
-    {18403, 9165, 4633, 1600, 601, 373, 281, 195, 148, 121, 100, 96, 92, 88, 84,
-     0, 0},
-    {21236, 10388, 4323, 1408, 419, 245, 184, 119, 95, 91, 87, 83, 79, 75, 71,
-     0, 0},
-    {5778, 1366, 486, 197, 76, 72, 68, 64, 60, 56, 52, 48, 44, 40, 36, 0, 0},
-    {15520, 6710, 3864, 2160, 1463, 891, 642, 447, 374, 304, 252, 208, 192, 175,
-     146, 0, 0},
-    {18030, 11090, 6989, 4867, 3744, 2466, 1788, 925, 624, 355, 248, 174, 146,
-     112, 108, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultCflAlphaCdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1] = {
+        {25131, 12049, 1367, 287, 111, 80, 76, 72, 68, 64, 60, 56, 52, 48, 44,
+         0, 0},
+        {18403, 9165, 4633, 1600, 601, 373, 281, 195, 148, 121, 100, 96, 92, 88,
+         84, 0, 0},
+        {21236, 10388, 4323, 1408, 419, 245, 184, 119, 95, 91, 87, 83, 79, 75,
+         71, 0, 0},
+        {5778, 1366, 486, 197, 76, 72, 68, 64, 60, 56, 52, 48, 44, 40, 36, 0,
+         0},
+        {15520, 6710, 3864, 2160, 1463, 891, 642, 447, 374, 304, 252, 208, 192,
+         175, 146, 0, 0},
+        {18030, 11090, 6989, 4867, 3744, 2466, 1788, 925, 624, 355, 248, 174,
+         146, 112, 108, 0, 0}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultUseFilterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
         {28147, 0, 0}, {26025, 0, 0}, {19998, 0, 0}, {26875, 0, 0},
         {24902, 0, 0}, {20217, 0, 0}, {12539, 0, 0}, {22400, 0, 0},
@@ -228,25 +232,28 @@
         {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
         {16384, 0, 0}, {16384, 0, 0}};
 
-constexpr uint16_t kDefaultFilterIntraModeCdf[kNumFilterIntraPredictors + 1] = {
-    23819, 19992, 15557, 3210, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultFilterIntraModeCdf[kNumFilterIntraPredictors + 1] = {
+        23819, 19992, 15557, 3210, 0, 0};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultTxDepthCdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1] = {
         {{12800, 0, 0}, {12800, 0, 0}, {8448, 0, 0}},
         {{20496, 2596, 0, 0}, {20496, 2596, 0, 0}, {14091, 1920, 0, 0}},
         {{19782, 17588, 0, 0}, {19782, 17588, 0, 0}, {8466, 7166, 0, 0}},
         {{26986, 21293, 0, 0}, {26986, 21293, 0, 0}, {15965, 10009, 0, 0}}};
 
-constexpr uint16_t kDefaultTxSplitCdf[kTxSplitContexts][kBooleanFieldCdfSize] =
-    {{4187, 0, 0},  {8922, 0, 0},  {11921, 0, 0}, {8453, 0, 0},  {14572, 0, 0},
-     {20635, 0, 0}, {13977, 0, 0}, {21881, 0, 0}, {21763, 0, 0}, {5589, 0, 0},
-     {12764, 0, 0}, {21487, 0, 0}, {6219, 0, 0},  {13460, 0, 0}, {18544, 0, 0},
-     {4753, 0, 0},  {11222, 0, 0}, {18368, 0, 0}, {4603, 0, 0},  {10367, 0, 0},
-     {16680, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultTxSplitCdf[kTxSplitContexts][kBooleanFieldCdfSize] = {
+        {4187, 0, 0},  {8922, 0, 0},  {11921, 0, 0}, {8453, 0, 0},
+        {14572, 0, 0}, {20635, 0, 0}, {13977, 0, 0}, {21881, 0, 0},
+        {21763, 0, 0}, {5589, 0, 0},  {12764, 0, 0}, {21487, 0, 0},
+        {6219, 0, 0},  {13460, 0, 0}, {18544, 0, 0}, {4753, 0, 0},
+        {11222, 0, 0}, {18368, 0, 0}, {4603, 0, 0},  {10367, 0, 0},
+        {16680, 0, 0}};
 
 /* clang-format off */
-constexpr uint16_t kDefaultAllZeroCdf[kCoefficientQuantizerContexts]
+alignas(kMaxAlignment) constexpr uint16_t kDefaultAllZeroCdf[kCoefficientQuantizerContexts]
                                  [kNumSquareTransformSizes][kAllZeroContexts]
                                  [kBooleanFieldCdfSize] = {
   {
@@ -320,7 +327,7 @@
 };
 /* clang-format on */
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultInterTxTypeCdf[3][kNumExtendedTransformSizes][kNumTransformTypes +
                                                           1] = {
         {{28310, 27208, 25073, 23059, 19438, 17979, 15231, 12502, 11264, 9920,
@@ -339,7 +346,7 @@
          {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
         {{16384, 0, 0}, {28601, 0, 0}, {30770, 0, 0}, {32020, 0, 0}}};
 
-constexpr uint16_t kDefaultIntraTxTypeCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultIntraTxTypeCdf
     [2][kNumExtendedTransformSizes][kIntraPredictionModesY]
     [kNumTransformTypes + 1] = {
         {{{31233, 24733, 23307, 20017, 9301, 4943, 0, 0},
@@ -408,26 +415,26 @@
           {32685, 27153, 20767, 15540, 0, 0},
           {30800, 27212, 20745, 14221, 0, 0}}}};
 
-constexpr uint16_t kDefaultEobPt16Cdf[kCoefficientQuantizerContexts]
-                                     [kNumPlaneTypes][kEobPtContexts]
-                                     [kEobPt16SymbolCount + 1] = {
-                                         {{{31928, 31729, 30788, 27873, 0, 0},
-                                           {32398, 32097, 30885, 28297, 0, 0}},
-                                          {{29521, 27818, 23080, 18205, 0, 0},
-                                           {30864, 29414, 25005, 18121, 0, 0}}},
-                                         {{{30643, 30217, 27603, 23822, 0, 0},
-                                           {32255, 32003, 30909, 26429, 0, 0}},
-                                          {{25131, 23270, 18509, 13660, 0, 0},
-                                           {30271, 28672, 23902, 15775, 0, 0}}},
-                                         {{{28752, 27871, 23887, 17800, 0, 0},
-                                           {32052, 31663, 30122, 22712, 0, 0}},
-                                          {{21629, 19498, 14527, 9202, 0, 0},
-                                           {29576, 27736, 22471, 13013, 0, 0}}},
-                                         {{{26060, 23810, 18022, 10635, 0, 0},
-                                           {31546, 30694, 27985, 17358, 0, 0}},
-                                          {{13193, 11002, 6724, 3059, 0, 0},
-                                           {25471, 22001, 13495, 4574, 0, 0}}}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultEobPt16Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+                      [kEobPtContexts][kEobPt16SymbolCount + 1] = {
+                          {{{31928, 31729, 30788, 27873, 0, 0},
+                            {32398, 32097, 30885, 28297, 0, 0}},
+                           {{29521, 27818, 23080, 18205, 0, 0},
+                            {30864, 29414, 25005, 18121, 0, 0}}},
+                          {{{30643, 30217, 27603, 23822, 0, 0},
+                            {32255, 32003, 30909, 26429, 0, 0}},
+                           {{25131, 23270, 18509, 13660, 0, 0},
+                            {30271, 28672, 23902, 15775, 0, 0}}},
+                          {{{28752, 27871, 23887, 17800, 0, 0},
+                            {32052, 31663, 30122, 22712, 0, 0}},
+                           {{21629, 19498, 14527, 9202, 0, 0},
+                            {29576, 27736, 22471, 13013, 0, 0}}},
+                          {{{26060, 23810, 18022, 10635, 0, 0},
+                            {31546, 30694, 27985, 17358, 0, 0}},
+                           {{13193, 11002, 6724, 3059, 0, 0},
+                            {25471, 22001, 13495, 4574, 0, 0}}}};
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultEobPt32Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
                       [kEobPtContexts][kEobPt32SymbolCount + 1] = {
                           {{{32368, 32248, 31791, 30666, 26226, 0, 0},
@@ -446,7 +453,7 @@
                             {31612, 31066, 29093, 23494, 12229, 0, 0}},
                            {{10682, 8486, 5758, 2998, 1025, 0, 0},
                             {25069, 21871, 11877, 5842, 1140, 0, 0}}}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultEobPt64Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
                       [kEobPtContexts][kEobPt64SymbolCount + 1] = {
                           {{{32439, 32270, 31667, 30984, 29503, 25010, 0, 0},
@@ -465,7 +472,7 @@
                             {31479, 30448, 28797, 24842, 18615, 8477, 0, 0}},
                            {{8556, 7060, 4500, 2733, 1461, 719, 0, 0},
                             {24042, 20390, 13359, 6318, 2730, 306, 0, 0}}}};
-constexpr uint16_t kDefaultEobPt128Cdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt128Cdf
     [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts]
     [kEobPt128SymbolCount + 1] = {
         {{{32549, 32286, 31628, 30677, 29088, 26740, 20182, 0, 0},
@@ -485,7 +492,7 @@
          {{8455, 6706, 4383, 2661, 1551, 870, 423, 0, 0},
           {23603, 19486, 11618, 2482, 874, 197, 56, 0, 0}}}};
 
-constexpr uint16_t kDefaultEobPt256Cdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt256Cdf
     [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts]
     [kEobPt256SymbolCount + 1] = {
         {{{32458, 32184, 30881, 29179, 26600, 24157, 21416, 17116, 0, 0},
@@ -505,7 +512,7 @@
          {{9658, 8171, 5628, 3874, 2601, 1841, 1376, 674, 0, 0},
           {22770, 15107, 7590, 4671, 1460, 730, 365, 73, 0, 0}}}};
 
-constexpr uint16_t kDefaultEobPt512Cdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt512Cdf
     [kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPt512SymbolCount + 1] =
         {{{32127, 31785, 29061, 27338, 22534, 17810, 13980, 9356, 6707, 0, 0},
           {27673, 26322, 22772, 19414, 16751, 14782, 11849, 6639, 3628, 0, 0}},
@@ -516,7 +523,7 @@
          {{26841, 24959, 21845, 18171, 13329, 8633, 4312, 1626, 708, 0, 0},
           {11675, 9725, 7026, 5110, 3671, 3052, 2695, 1948, 812, 0, 0}}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultEobPt1024Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
                         [kEobPt1024SymbolCount + 1] = {
                             {{32375, 32347, 32017, 31145, 29608, 26416, 19423,
@@ -537,7 +544,7 @@
                               2961, 198, 0, 0}}};
 
 /* clang-format off */
-constexpr uint16_t kDefaultEobExtraCdf[kCoefficientQuantizerContexts]
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobExtraCdf[kCoefficientQuantizerContexts]
                                   [kNumSquareTransformSizes][kNumPlaneTypes]
                                   [kEobExtraContexts][kBooleanFieldCdfSize] = {
   {
@@ -710,7 +717,7 @@
   }
 };
 
-constexpr uint16_t kDefaultCoeffBaseEobCdf[kCoefficientQuantizerContexts]
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseEobCdf[kCoefficientQuantizerContexts]
                                       [kNumSquareTransformSizes][kNumPlaneTypes]
                                       [kCoeffBaseEobContexts]
                                       [kCoeffBaseEobSymbolCount + 1] = {
@@ -845,7 +852,7 @@
 };
 /* clang-format on */
 
-constexpr uint16_t kDefaultCoeffBaseCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseCdf
     [kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes]
     [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1] = {
         {{{{28734, 23838, 20041, 0, 0}, {14686, 3027, 891, 0, 0},
@@ -1689,7 +1696,7 @@
            {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
            {24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}}};
 
-constexpr uint16_t kDefaultCoeffBaseRangeCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseRangeCdf
     [kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes]
     [kCoeffBaseRangeContexts][kCoeffBaseRangeSymbolCount + 1] = {
         {{{{18470, 12050, 8594, 0, 0},  {20232, 13167, 8979, 0, 0},
@@ -2134,7 +2141,7 @@
            {24576, 16384, 8192, 0, 0}}}}};
 
 /* clang-format off */
-constexpr uint16_t kDefaultDcSignCdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+alignas(kMaxAlignment) constexpr uint16_t kDefaultDcSignCdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
                                 [kDcSignContexts][kBooleanFieldCdfSize] = {
   {{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
     {15488, 0, 0}}},
@@ -2146,14 +2153,17 @@
     {15488, 0, 0}}}
 };
 /* clang-format on */
-constexpr uint16_t kDefaultRestorationTypeCdf[kRestorationTypeSymbolCount + 1] =
-    {23355, 10187, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultRestorationTypeCdf[kRestorationTypeSymbolCount + 1] = {23355, 10187,
+                                                                   0, 0};
 
-constexpr uint16_t kDefaultUseWienerCdf[kBooleanFieldCdfSize] = {21198, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultUseWienerCdf[kBooleanFieldCdfSize] = {21198, 0, 0};
 
-constexpr uint16_t kDefaultUseSgrProjCdf[kBooleanFieldCdfSize] = {15913, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultUseSgrProjCdf[kBooleanFieldCdfSize] = {15913, 0, 0};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultHasPaletteYCdf[kPaletteBlockSizeContexts][kPaletteYModeContexts]
                           [kBooleanFieldCdfSize] = {
                               {{1092, 0, 0}, {29349, 0, 0}, {31507, 0, 0}},
@@ -2164,7 +2174,7 @@
                               {{503, 0, 0}, {28753, 0, 0}, {31247, 0, 0}},
                               {{318, 0, 0}, {24822, 0, 0}, {32639, 0, 0}}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultPaletteYSizeCdf[kPaletteBlockSizeContexts]
                            [kPaletteSizeSymbolCount + 1] = {
                                {24816, 19768, 14619, 11290, 7241, 3527, 0, 0},
@@ -2175,11 +2185,11 @@
                                {23057, 17880, 15845, 11716, 7107, 4893, 0, 0},
                                {17828, 11971, 11090, 8582, 5735, 3769, 0, 0}};
 
-constexpr uint16_t kDefaultHasPaletteUVCdf[kPaletteUVModeContexts]
-                                          [kBooleanFieldCdfSize] = {
-                                              {307, 0, 0}, {11280, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultHasPaletteUVCdf[kPaletteUVModeContexts][kBooleanFieldCdfSize] = {
+        {307, 0, 0}, {11280, 0, 0}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultPaletteUVSizeCdf[kPaletteBlockSizeContexts]
                             [kPaletteSizeSymbolCount + 1] = {
                                 {24055, 12789, 5640, 3159, 1437, 496, 0, 0},
@@ -2191,7 +2201,7 @@
                                 {31499, 27333, 22335, 13805, 11068, 6903, 0,
                                  0}};
 
-constexpr uint16_t kDefaultPaletteColorIndexCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultPaletteColorIndexCdf
     [kNumPlaneTypes][kPaletteSizeSymbolCount][kPaletteColorIndexContexts]
     [kPaletteColorIndexSymbolCount + 1] = {
         {{{4058, 0, 0},
@@ -2265,10 +2275,11 @@
           {14803, 12684, 10536, 8794, 6494, 4366, 2378, 0, 0},
           {1578, 1439, 1252, 1089, 943, 742, 446, 0, 0}}}};
 
-constexpr uint16_t kDefaultIsInterCdf[kIsInterContexts][kBooleanFieldCdfSize] =
-    {{31962, 0, 0}, {16106, 0, 0}, {12582, 0, 0}, {6230, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultIsInterCdf[kIsInterContexts][kBooleanFieldCdfSize] = {
+        {31962, 0, 0}, {16106, 0, 0}, {12582, 0, 0}, {6230, 0, 0}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultUseCompoundReferenceCdf[kUseCompoundReferenceContexts]
                                    [kBooleanFieldCdfSize] = {{5940, 0, 0},
                                                              {8733, 0, 0},
@@ -2276,7 +2287,7 @@
                                                              {22128, 0, 0},
                                                              {29867, 0, 0}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultCompoundReferenceTypeCdf[kCompoundReferenceTypeContexts]
                                     [kBooleanFieldCdfSize] = {{31570, 0, 0},
                                                               {30698, 0, 0},
@@ -2284,7 +2295,7 @@
                                                               {25269, 0, 0},
                                                               {10293, 0, 0}};
 
-constexpr uint16_t kDefaultCompoundReferenceCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundReferenceCdf
     [kNumCompoundReferenceTypes][kReferenceContexts][3][kBooleanFieldCdfSize] =
         {{{{27484, 0, 0}, {28903, 0, 0}, {29640, 0, 0}},
           {{9616, 0, 0}, {18595, 0, 0}, {17498, 0, 0}},
@@ -2293,7 +2304,7 @@
           {{12877, 0, 0}, {10327, 0, 0}, {17608, 0, 0}},
           {{2037, 0, 0}, {1709, 0, 0}, {5224, 0, 0}}}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultCompoundBackwardReferenceCdf[kReferenceContexts][2]
                                         [kBooleanFieldCdfSize] = {
                                             {{30533, 0, 0}, {31345, 0, 0}},
@@ -2301,7 +2312,7 @@
                                             {{2162, 0, 0}, {2279, 0, 0}}};
 
 /* clang-format off */
-constexpr uint16_t kDefaultSingleReferenceCdf[kReferenceContexts][6]
+alignas(kMaxAlignment) constexpr uint16_t kDefaultSingleReferenceCdf[kReferenceContexts][6]
                                          [kBooleanFieldCdfSize] = {
   {{27871, 0, 0}, {31213, 0, 0}, {28532, 0, 0}, {24118, 0, 0}, {31864, 0, 0},
    {31324, 0, 0}},
@@ -2311,7 +2322,7 @@
    {2464, 0, 0}}};
 /* clang-format on */
 
-constexpr uint16_t kDefaultCompoundPredictionModeCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundPredictionModeCdf
     [kCompoundPredictionModeContexts][kNumCompoundInterPredictionModes + 1] = {
         {25008, 18945, 16960, 15127, 13612, 12102, 5877, 0, 0},
         {22038, 13316, 11623, 10019, 8729, 7637, 4044, 0, 0},
@@ -2322,35 +2333,37 @@
         {15643, 8495, 6954, 5276, 4554, 4064, 2176, 0, 0},
         {19722, 9554, 8263, 6826, 5333, 4326, 3438, 0, 0}};
 
-constexpr uint16_t kDefaultNewMvCdf[kNewMvContexts][kBooleanFieldCdfSize] = {
-    {8733, 0, 0},  {16138, 0, 0}, {17429, 0, 0},
-    {24382, 0, 0}, {20546, 0, 0}, {28092, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultNewMvCdf[kNewMvContexts][kBooleanFieldCdfSize] = {
+        {8733, 0, 0},  {16138, 0, 0}, {17429, 0, 0},
+        {24382, 0, 0}, {20546, 0, 0}, {28092, 0, 0}};
 
-constexpr uint16_t kDefaultZeroMvCdf[kZeroMvContexts][kBooleanFieldCdfSize] = {
-    {30593, 0, 0}, {31714, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultZeroMvCdf[kZeroMvContexts][kBooleanFieldCdfSize] = {{30593, 0, 0},
+                                                                {31714, 0, 0}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultReferenceMvCdf[kReferenceMvContexts][kBooleanFieldCdfSize] = {
         {8794, 0, 0}, {8580, 0, 0}, {14920, 0, 0},
         {4146, 0, 0}, {8456, 0, 0}, {12845, 0, 0}};
 
 // This is called drl_mode in the spec where DRL stands for Dynamic Reference
 // List.
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultRefMvIndexCdf[kRefMvIndexContexts][kBooleanFieldCdfSize] = {
         {19664, 0, 0}, {8208, 0, 0}, {13823, 0, 0}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultIsInterIntraCdf[kInterIntraContexts][kBooleanFieldCdfSize] = {
         {5881, 0, 0}, {5171, 0, 0}, {2531, 0, 0}};
 
-constexpr uint16_t kDefaultInterIntraModeCdf[kInterIntraContexts]
-                                            [kNumInterIntraModes + 1] = {
-                                                {30893, 21686, 5436, 0, 0},
-                                                {30295, 22772, 6380, 0, 0},
-                                                {28530, 21231, 6842, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultInterIntraModeCdf[kInterIntraContexts][kNumInterIntraModes + 1] = {
+        {30893, 21686, 5436, 0, 0},
+        {30295, 22772, 6380, 0, 0},
+        {28530, 21231, 6842, 0, 0}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultIsWedgeInterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
         {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
         {12732, 0, 0}, {7811, 0, 0},  {16384, 0, 0}, {16384, 0, 0},
@@ -2359,7 +2372,7 @@
         {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
         {16384, 0, 0}, {16384, 0, 0}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultWedgeIndexCdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1] = {
         {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
          10240, 8192, 6144, 4096, 2048, 0, 0},
@@ -2406,47 +2419,53 @@
         {30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
          10240, 8192, 6144, 4096, 2048, 0, 0}};
 
-constexpr uint16_t kDefaultUseObmcCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
-    {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {22331, 0, 0},
-    {23397, 0, 0}, {9104, 0, 0},  {16384, 0, 0}, {23467, 0, 0}, {15336, 0, 0},
-    {18345, 0, 0}, {8760, 0, 0},  {11867, 0, 0}, {17626, 0, 0}, {6951, 0, 0},
-    {9945, 0, 0},  {5889, 0, 0},  {10685, 0, 0}, {2640, 0, 0},  {1754, 0, 0},
-    {1208, 0, 0},  {130, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultUseObmcCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
+        {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+        {22331, 0, 0}, {23397, 0, 0}, {9104, 0, 0},  {16384, 0, 0},
+        {23467, 0, 0}, {15336, 0, 0}, {18345, 0, 0}, {8760, 0, 0},
+        {11867, 0, 0}, {17626, 0, 0}, {6951, 0, 0},  {9945, 0, 0},
+        {5889, 0, 0},  {10685, 0, 0}, {2640, 0, 0},  {1754, 0, 0},
+        {1208, 0, 0},  {130, 0, 0}};
 
-constexpr uint16_t kDefaultMotionModeCdf[kMaxBlockSizes][kNumMotionModes + 1] =
-    {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
-     {21845, 10923, 0, 0}, {25117, 8008, 0, 0},  {28030, 8003, 0, 0},
-     {3969, 1378, 0, 0},   {21845, 10923, 0, 0}, {27377, 7240, 0, 0},
-     {13349, 5958, 0, 0},  {27645, 9162, 0, 0},  {3795, 1174, 0, 0},
-     {6337, 1994, 0, 0},   {21162, 8460, 0, 0},  {6508, 3652, 0, 0},
-     {12408, 4706, 0, 0},  {3026, 1565, 0, 0},   {11089, 5938, 0, 0},
-     {3252, 2067, 0, 0},   {3870, 2371, 0, 0},   {1890, 1433, 0, 0},
-     {261, 210, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMotionModeCdf[kMaxBlockSizes][kNumMotionModes + 1] = {
+        {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+        {21845, 10923, 0, 0}, {25117, 8008, 0, 0},  {28030, 8003, 0, 0},
+        {3969, 1378, 0, 0},   {21845, 10923, 0, 0}, {27377, 7240, 0, 0},
+        {13349, 5958, 0, 0},  {27645, 9162, 0, 0},  {3795, 1174, 0, 0},
+        {6337, 1994, 0, 0},   {21162, 8460, 0, 0},  {6508, 3652, 0, 0},
+        {12408, 4706, 0, 0},  {3026, 1565, 0, 0},   {11089, 5938, 0, 0},
+        {3252, 2067, 0, 0},   {3870, 2371, 0, 0},   {1890, 1433, 0, 0},
+        {261, 210, 0, 0}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultIsExplicitCompoundTypeCdf[kIsExplicitCompoundTypeContexts]
                                      [kBooleanFieldCdfSize] = {
                                          {6161, 0, 0},  {9877, 0, 0},
                                          {13928, 0, 0}, {8174, 0, 0},
                                          {12834, 0, 0}, {10094, 0, 0}};
 
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
     kDefaultIsCompoundTypeAverageCdf[kIsCompoundTypeAverageContexts]
                                     [kBooleanFieldCdfSize] = {
                                         {14524, 0, 0}, {19903, 0, 0},
                                         {25715, 0, 0}, {19509, 0, 0},
                                         {23434, 0, 0}, {28124, 0, 0}};
 
-constexpr uint16_t kDefaultCompoundTypeCdf
-    [kMaxBlockSizes][kNumExplicitCompoundPredictionTypes + 1] = {
-        {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
-        {9337, 0, 0},  {19597, 0, 0}, {20948, 0, 0}, {16384, 0, 0},
-        {21298, 0, 0}, {22998, 0, 0}, {23668, 0, 0}, {16384, 0, 0},
-        {25067, 0, 0}, {24535, 0, 0}, {26596, 0, 0}, {16384, 0, 0},
-        {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
-        {16384, 0, 0}, {16384, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultCompoundTypeCdf[kMaxBlockSizes]
+                           [kNumExplicitCompoundPredictionTypes + 1] = {
+                               {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+                               {16384, 0, 0}, {9337, 0, 0},  {19597, 0, 0},
+                               {20948, 0, 0}, {16384, 0, 0}, {21298, 0, 0},
+                               {22998, 0, 0}, {23668, 0, 0}, {16384, 0, 0},
+                               {25067, 0, 0}, {24535, 0, 0}, {26596, 0, 0},
+                               {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+                               {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+                               {16384, 0, 0}};
 
-constexpr uint16_t kDefaultInterpolationFilterCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultInterpolationFilterCdf
     [kInterpolationFilterContexts][kNumExplicitInterpolationFilters + 1] = {
         {833, 48, 0, 0},      {27200, 49, 0, 0},    {32346, 29830, 0, 0},
         {4524, 160, 0, 0},    {1562, 815, 0, 0},    {27906, 647, 0, 0},
@@ -2455,30 +2474,36 @@
         {1746, 759, 0, 0},    {29805, 675, 0, 0},   {32167, 31825, 0, 0},
         {17799, 11370, 0, 0}};
 
-constexpr uint16_t kDefaultMvJointCdf[kNumMvJointTypes + 1] = {28672, 21504,
-                                                               13440, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvJointCdf[kNumMvJointTypes + 1] = {28672, 21504, 13440, 0, 0};
 
-constexpr uint16_t kDefaultMvSignCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvSignCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
 
-constexpr uint16_t kDefaultMvClassCdf[kMvClassSymbolCount + 1] = {
-    4096, 1792, 910, 448, 217, 112, 28, 11, 6, 1, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClassCdf[kMvClassSymbolCount + 1] = {
+        4096, 1792, 910, 448, 217, 112, 28, 11, 6, 1, 0};
 
-constexpr uint16_t kDefaultMvClass0BitCdf[kBooleanFieldCdfSize] = {5120, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClass0BitCdf[kBooleanFieldCdfSize] = {5120, 0, 0};
 
-constexpr uint16_t kDefaultMvClass0FractionCdf[kBooleanSymbolCount]
-                                              [kMvFractionSymbolCount + 1] = {
-                                                  {16384, 8192, 6144, 0, 0},
-                                                  {20480, 11520, 8640, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClass0FractionCdf[kBooleanSymbolCount][kMvFractionSymbolCount +
+                                                     1] = {
+        {16384, 8192, 6144, 0, 0}, {20480, 11520, 8640, 0, 0}};
 
-constexpr uint16_t kDefaultMvClass0HighPrecisionCdf[kBooleanFieldCdfSize] = {
-    12288, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvClass0HighPrecisionCdf[kBooleanFieldCdfSize] = {12288, 0, 0};
 
-constexpr uint16_t kDefaultMvBitCdf[kMvBitSymbolCount][kBooleanFieldCdfSize] = {
-    {15360, 0, 0}, {14848, 0, 0}, {13824, 0, 0}, {12288, 0, 0}, {10240, 0, 0},
-    {8192, 0, 0},  {4096, 0, 0},  {2816, 0, 0},  {2816, 0, 0},  {2048, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvBitCdf[kMvBitSymbolCount][kBooleanFieldCdfSize] = {
+        {15360, 0, 0}, {14848, 0, 0}, {13824, 0, 0}, {12288, 0, 0},
+        {10240, 0, 0}, {8192, 0, 0},  {4096, 0, 0},  {2816, 0, 0},
+        {2816, 0, 0},  {2048, 0, 0}};
 
-constexpr uint16_t kDefaultMvFractionCdf[kMvFractionSymbolCount + 1] = {
-    24576, 15360, 11520, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvFractionCdf[kMvFractionSymbolCount + 1] = {24576, 15360, 11520, 0,
+                                                         0};
 
-constexpr uint16_t kDefaultMvHighPrecisionCdf[kBooleanFieldCdfSize] = {16384, 0,
-                                                                       0};
+alignas(kMaxAlignment) constexpr uint16_t
+    kDefaultMvHighPrecisionCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
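A note on reading these tables, since every constant in this file follows the same convention (the AV1 spec's AOM_CDF values subtracted from 32768): entry k stores 32768 minus the cumulative probability through symbol k, so the entry for the final symbol is always 0, and the extra "+ 1" slot at the end is not a probability but the slot used at run time for adaptation bookkeeping. A small sketch plus a worked example using kDefaultMvJointCdf above; SymbolProbabilities is illustrative, not part of libgav1:

    #include <cstdint>
    #include <vector>
    // Convert one inverted-CDF row into per-symbol probabilities.
    std::vector<double> SymbolProbabilities(const uint16_t* cdf, int symbol_count) {
      std::vector<double> p(symbol_count);
      int previous = 32768;  // probability mass remaining before symbol 0
      for (int k = 0; k < symbol_count; ++k) {
        p[k] = (previous - cdf[k]) / 32768.0;
        previous = cdf[k];
      }
      return p;
    }
    // kDefaultMvJointCdf = {28672, 21504, 13440, 0, 0} with 4 symbols yields
    // {4096, 7168, 8064, 13440} / 32768 = {0.125, 0.219, 0.246, 0.410}; the
    // trailing 0 is the adaptation slot.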
diff --git a/libgav1/src/threading_strategy.cc b/libgav1/src/threading_strategy.cc
index 2864c34..17ce18f 100644
--- a/libgav1/src/threading_strategy.cc
+++ b/libgav1/src/threading_strategy.cc
@@ -27,7 +27,7 @@
 namespace {
 
 #if !defined(LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER)
-constexpr int kFrameParallelThresholdMultiplier = 4;
+constexpr int kFrameParallelThresholdMultiplier = 3;
 #else
 constexpr int kFrameParallelThresholdMultiplier =
     LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER;
@@ -36,24 +36,25 @@
 // Computes the number of frame threads to be used based on the following
 // heuristic:
 //   * If |thread_count| == 1, return 0.
-//   * If |thread_count| <= |tile_count| * 4, return 0.
+//   * If |thread_count| <= |tile_count| * kFrameParallelThresholdMultiplier,
+//     return 0.
 //   * Otherwise, return the largest value of i which satisfies the following
 //     condition: i + i * tile_columns <= thread_count. This ensures that there
 //     are at least |tile_columns| worker threads for each frame thread.
 //   * This function will never return 1 or a value > |thread_count|.
 //
-//  This heuristic is based empirical performance data. The in-frame threading
-//  model (combination of tile multithreading, superblock row multithreading and
-//  post filter multithreading) performs better than the frame parallel model
-//  until we reach the threshold of |thread_count| > |tile_count| *
-//  kFrameParallelThresholdMultiplier.
+//  This heuristic is based on empirical performance data. The in-frame
+//  threading model (combination of tile multithreading, superblock row
+//  multithreading and post filter multithreading) performs better than the
+//  frame parallel model until we reach the threshold of |thread_count| >
+//  |tile_count| * kFrameParallelThresholdMultiplier.
 //
 //  It is a function of |tile_count| since tile threading and superblock row
-//  multithreading will scale only as a factor of |tile_count|. The threshold 4
-//  is arrived at based on empirical data. The general idea is that superblock
-//  row multithreading plateaus at 4 * |tile_count| because in most practical
-//  cases there aren't more than that many superblock rows and columns available
-//  to work on in parallel.
+//  multithreading will scale only as a factor of |tile_count|. The threshold
+//  kFrameParallelThresholdMultiplier is arrived at based on empirical data.
+//  The general idea is that superblock row multithreading plateaus at 4 *
+//  |tile_count| because in most practical cases there aren't more than that
+//  many superblock rows and columns available to work on in parallel.
 int ComputeFrameThreadCount(int thread_count, int tile_count,
                             int tile_columns) {
   assert(thread_count > 0);
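A standalone sketch of the heuristic spelled out in the comment above, with the threshold multiplier hard-coded to the new default of 3; the name and body are illustrative, not the actual ComputeFrameThreadCount() implementation:

    // Returns 0 (use in-frame threading) or the number of frame threads.
    int ComputeFrameThreadCountSketch(int thread_count, int tile_count,
                                      int tile_columns) {
      constexpr int kMultiplier = 3;  // kFrameParallelThresholdMultiplier default
      // Covers thread_count == 1 as well, since tile_count >= 1.
      if (thread_count <= tile_count * kMultiplier) return 0;
      // Largest i with i + i * tile_columns <= thread_count, so that every frame
      // thread still gets |tile_columns| worker threads.
      const int i = thread_count / (1 + tile_columns);
      return (i > 1) ? i : 0;  // never return 1
    }
    // Example: 16 threads, 2 tiles in 2 columns -> 16 > 6, i = 16 / 3 = 5 frame
    // threads (5 + 5 * 2 = 15 <= 16); with only 6 threads it returns 0.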
@@ -132,7 +133,7 @@
     thread_count -= 2;
     if (thread_count <= 0) break;
   }
-#else  // !defined(__ANDROID__)
+#else   // !defined(__ANDROID__)
   // Assign the remaining threads to each Tile.
   for (int i = 0; i < tile_count; ++i) {
     const int count = thread_count / tile_count +
diff --git a/libgav1/src/tile.h b/libgav1/src/tile.h
index 065ef70..6bae2a0 100644
--- a/libgav1/src/tile.h
+++ b/libgav1/src/tile.h
@@ -48,7 +48,6 @@
 #include "src/utils/constants.h"
 #include "src/utils/entropy_decoder.h"
 #include "src/utils/memory.h"
-#include "src/utils/parameter_tree.h"
 #include "src/utils/segmentation_map.h"
 #include "src/utils/threadpool.h"
 #include "src/utils/types.h"
@@ -74,6 +73,7 @@
       const ObuFrameHeader& frame_header, RefCountedBuffer* const current_frame,
       const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer,
       const WedgeMaskArray& wedge_masks,
+      const QuantizerMatrix& quantizer_matrix,
       SymbolDecoderContext* const saved_symbol_decoder_context,
       const SegmentationMap* prev_segment_ids, PostFilter* const post_filter,
       const dsp::Dsp* const dsp, ThreadPool* const thread_pool,
@@ -81,9 +81,10 @@
       bool use_intra_prediction_buffer) {
     std::unique_ptr<Tile> tile(new (std::nothrow) Tile(
         tile_number, data, size, sequence_header, frame_header, current_frame,
-        state, frame_scratch_buffer, wedge_masks, saved_symbol_decoder_context,
-        prev_segment_ids, post_filter, dsp, thread_pool, pending_tiles,
-        frame_parallel, use_intra_prediction_buffer));
+        state, frame_scratch_buffer, wedge_masks, quantizer_matrix,
+        saved_symbol_decoder_context, prev_segment_ids, post_filter, dsp,
+        thread_pool, pending_tiles, frame_parallel,
+        use_intra_prediction_buffer));
     return (tile != nullptr && tile->Init()) ? std::move(tile) : nullptr;
   }
 
@@ -186,17 +187,6 @@
   int column4x4_end() const { return column4x4_end_; }
 
  private:
-  Tile(int tile_number, const uint8_t* data, size_t size,
-       const ObuSequenceHeader& sequence_header,
-       const ObuFrameHeader& frame_header, RefCountedBuffer* current_frame,
-       const DecoderState& state, FrameScratchBuffer* frame_scratch_buffer,
-       const WedgeMaskArray& wedge_masks,
-       SymbolDecoderContext* saved_symbol_decoder_context,
-       const SegmentationMap* prev_segment_ids, PostFilter* post_filter,
-       const dsp::Dsp* dsp, ThreadPool* thread_pool,
-       BlockingCounterWithStatus* pending_tiles, bool frame_parallel,
-       bool use_intra_prediction_buffer);
-
   // Stores the transform tree state when reading variable size transform trees
   // and when applying the transform tree. When applying the transform tree,
   // |depth| is not used.
@@ -248,6 +238,18 @@
   //    every transform block.
   using ResidualPtr = uint8_t*;
 
+  Tile(int tile_number, const uint8_t* data, size_t size,
+       const ObuSequenceHeader& sequence_header,
+       const ObuFrameHeader& frame_header, RefCountedBuffer* current_frame,
+       const DecoderState& state, FrameScratchBuffer* frame_scratch_buffer,
+       const WedgeMaskArray& wedge_masks,
+       const QuantizerMatrix& quantizer_matrix,
+       SymbolDecoderContext* saved_symbol_decoder_context,
+       const SegmentationMap* prev_segment_ids, PostFilter* post_filter,
+       const dsp::Dsp* dsp, ThreadPool* thread_pool,
+       BlockingCounterWithStatus* pending_tiles, bool frame_parallel,
+       bool use_intra_prediction_buffer);
+
   // Performs member initializations that may fail. Helper function used by
   // Create().
   LIBGAV1_MUST_USE_RESULT bool Init();
@@ -289,26 +291,25 @@
   // iteratively. It performs a DFS traversal over the partition tree to process
   // the blocks in the right order.
   bool ProcessPartition(
-      int row4x4_start, int column4x4_start, ParameterTree* root,
-      TileScratchBuffer* scratch_buffer,
+      int row4x4_start, int column4x4_start, TileScratchBuffer* scratch_buffer,
       ResidualPtr* residual);  // Iterative implementation of 5.11.4.
   bool ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
-                    ParameterTree* tree, TileScratchBuffer* scratch_buffer,
+                    TileScratchBuffer* scratch_buffer,
                     ResidualPtr* residual);   // 5.11.5.
   void ResetCdef(int row4x4, int column4x4);  // 5.11.55.
 
   // This function is used to decode a superblock when the parsing has already
   // been done for that superblock.
-  bool DecodeSuperBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer,
-                        ResidualPtr* residual);
+  bool DecodeSuperBlock(int sb_row_index, int sb_column_index,
+                        TileScratchBuffer* scratch_buffer);
   // Helper function used by DecodeSuperBlock(). Note that the decode_block()
   // function in the spec is equivalent to ProcessBlock() in the code.
-  bool DecodeBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer,
-                   ResidualPtr* residual);
+  bool DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
+                   TileScratchBuffer* scratch_buffer, ResidualPtr* residual);
 
   void ClearBlockDecoded(TileScratchBuffer* scratch_buffer, int row4x4,
                          int column4x4);  // 5.11.3.
-  bool ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
+  bool ProcessSuperBlock(int row4x4, int column4x4,
                          TileScratchBuffer* scratch_buffer,
                          ProcessingMode mode);
   void ResetLoopRestorationParams();
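ProcessPartition() above is documented as an iterative DFS over the partition tree, now driven directly rather than through the removed ParameterTree. A generic sketch of that traversal pattern with an explicit stack; Node and the visitor are placeholders, not the libgav1 types:

    #include <vector>
    struct Node {
      int row4x4;
      int column4x4;
      int block_size;  // placeholder for BlockSize
    };
    // visit() processes one node and returns its sub-partitions, already in
    // reverse order so that the first child ends up on top of the stack.
    template <typename Visitor>
    void VisitPartitionDepthFirst(const Node& root, Visitor visit) {
      std::vector<Node> stack = {root};
      while (!stack.empty()) {
        const Node node = stack.back();
        stack.pop_back();
        const std::vector<Node> children = visit(node);
        stack.insert(stack.end(), children.begin(), children.end());
      }
    }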
@@ -357,7 +358,12 @@
                                 const MvContexts& mode_contexts);
   void ReadRefMvIndex(const Block& block);
   void ReadInterIntraMode(const Block& block, bool is_compound);  // 5.11.28.
-  bool IsScaled(ReferenceFrameType type) const;  // Part of 5.11.27.
+  bool IsScaled(ReferenceFrameType type) const {  // Part of 5.11.27.
+    const int index =
+        frame_header_.reference_frame_index[type - kReferenceFrameLast];
+    return reference_frames_[index]->upscaled_width() != frame_header_.width ||
+           reference_frames_[index]->frame_height() != frame_header_.height;
+  }
   void ReadMotionMode(const Block& block, bool is_compound);  // 5.11.27.
   uint16_t* GetIsExplicitCompoundTypeCdf(const Block& block);
   uint16_t* GetIsCompoundTypeAverageCdf(const Block& block);
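The inlined IsScaled() above replaces the fixed-point version removed from mode_info.cc later in this change: instead of deriving x/y scale factors with kReferenceFrameScalePrecision and comparing them against kNoScale, it compares the stored reference dimensions with the current frame's directly. A worked check that the two forms agree, assuming kReferenceFrameScalePrecision is 14 (the spec's REF_SCALE_SHIFT), so kNoScale = 1 << 14 = 16384:

    // Current frame 1920x1080, reference stored at 960x540 (half size):
    //   old: x_scale = ((960 << 14) + 1920 / 2) / 1920
    //                = (15728640 + 960) / 1920 = 8192    != 16384 -> scaled
    //   new: 960 != 1920                                           -> scaled
    // Unscaled reference (1920x1080):
    //   old: x_scale = ((1920 << 14) + 960) / 1920 = 16384         -> not scaled
    //   new: widths and heights compare equal                      -> not scaled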
@@ -394,22 +400,28 @@
                          TransformSize tx_size);  // 5.11.47.
   template <typename ResidualType>
   void ReadCoeffBase2D(
-      const uint16_t* scan, PlaneType plane_type, TransformSize tx_size,
-      int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+      const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+      int eob,
       uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
-      ResidualType* quantized_buffer);
+      uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                   [kCoeffBaseRangeSymbolCount + 1],
+      ResidualType* quantized_buffer, uint8_t* level_buffer);
   template <typename ResidualType>
   void ReadCoeffBaseHorizontal(
-      const uint16_t* scan, PlaneType plane_type, TransformSize tx_size,
-      int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+      const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+      int eob,
       uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
-      ResidualType* quantized_buffer);
+      uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                   [kCoeffBaseRangeSymbolCount + 1],
+      ResidualType* quantized_buffer, uint8_t* level_buffer);
   template <typename ResidualType>
   void ReadCoeffBaseVertical(
-      const uint16_t* scan, PlaneType plane_type, TransformSize tx_size,
-      int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+      const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+      int eob,
       uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
-      ResidualType* quantized_buffer);
+      uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                   [kCoeffBaseRangeSymbolCount + 1],
+      ResidualType* quantized_buffer, uint8_t* level_buffer);
   int GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane);
   void SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
                           uint8_t coefficient_level, int8_t dc_category);
@@ -495,9 +507,8 @@
       const uint16_t* scan, int i, int q_value, const uint8_t* quantizer_matrix,
       int shift, int max_value, uint16_t* dc_sign_cdf, int8_t* dc_category,
       int* coefficient_level,
-      ResidualType* residual_buffer);  // Part of 5.11.39.
-  int ReadCoeffBaseRange(int clamped_tx_size_context, int cdf_context,
-                         int plane_type);  // Part of 5.11.39.
+      ResidualType* residual_buffer);     // Part of 5.11.39.
+  int ReadCoeffBaseRange(uint16_t* cdf);  // Part of 5.11.39.
   // Returns the number of non-zero coefficients that were read. |tx_type| is an
   // output parameter that stores the computed transform type for the plane
   // whose coefficients were read. Returns -1 on failure.
@@ -637,6 +648,7 @@
   TemporalMotionField& motion_field_;
   const std::array<uint8_t, kNumReferenceFrameTypes>& reference_order_hint_;
   const WedgeMaskArray& wedge_masks_;
+  const QuantizerMatrix& quantizer_matrix_;
   DaalaBitReader reader_;
   SymbolDecoderContext symbol_decoder_context_;
   SymbolDecoderContext* const saved_symbol_decoder_context_;
diff --git a/libgav1/src/tile/bitstream/mode_info.cc b/libgav1/src/tile/bitstream/mode_info.cc
index d73ebed..0b22eb0 100644
--- a/libgav1/src/tile/bitstream/mode_info.cc
+++ b/libgav1/src/tile/bitstream/mode_info.cc
@@ -44,7 +44,6 @@
 
 constexpr int kDeltaQSmall = 3;
 constexpr int kDeltaLfSmall = 3;
-constexpr int kNoScale = 1 << kReferenceFrameScalePrecision;
 
 constexpr uint8_t kIntraYModeContext[kIntraPredictionModesY] = {
     0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0};
@@ -510,9 +509,9 @@
   BlockParameters& bp = *block.bp;
   const int context =
       static_cast<int>(block.bp->prediction_parameters->use_intra_block_copy);
-  const auto mv_joint = static_cast<MvJointType>(
-      reader_.ReadSymbol(symbol_decoder_context_.mv_joint_cdf[context],
-                         static_cast<int>(kNumMvJointTypes)));
+  const auto mv_joint =
+      static_cast<MvJointType>(reader_.ReadSymbol<kNumMvJointTypes>(
+          symbol_decoder_context_.mv_joint_cdf[context]));
   if (mv_joint == kMvJointTypeHorizontalZeroVerticalNonZero ||
       mv_joint == kMvJointTypeNonZero) {
     bp.mv.mv[index].mv[0] = ReadMotionVectorComponent(block, 0);
@@ -1032,21 +1031,6 @@
   prediction_parameters.wedge_sign = 0;
 }
 
-bool Tile::IsScaled(ReferenceFrameType type) const {
-  const int index =
-      frame_header_.reference_frame_index[type - kReferenceFrameLast];
-  const int x_scale = ((reference_frames_[index]->upscaled_width()
-                        << kReferenceFrameScalePrecision) +
-                       DivideBy2(frame_header_.width)) /
-                      frame_header_.width;
-  if (x_scale != kNoScale) return true;
-  const int y_scale = ((reference_frames_[index]->frame_height()
-                        << kReferenceFrameScalePrecision) +
-                       DivideBy2(frame_header_.height)) /
-                      frame_header_.height;
-  return y_scale != kNoScale;
-}
-
 void Tile::ReadMotionMode(const Block& block, bool is_compound) {
   BlockParameters& bp = *block.bp;
   PredictionParameters& prediction_parameters =
diff --git a/libgav1/src/tile/bitstream/palette.cc b/libgav1/src/tile/bitstream/palette.cc
index 674d210..41b42d6 100644
--- a/libgav1/src/tile/bitstream/palette.cc
+++ b/libgav1/src/tile/bitstream/palette.cc
@@ -130,10 +130,10 @@
 
 void Tile::ReadPaletteModeInfo(const Block& block) {
   BlockParameters& bp = *block.bp;
+  bp.palette_mode_info.size[kPlaneTypeY] = 0;
+  bp.palette_mode_info.size[kPlaneTypeUV] = 0;
   if (IsBlockSmallerThan8x8(block.size) || block.size > kBlock64x64 ||
       !frame_header_.allow_screen_content_tools) {
-    bp.palette_mode_info.size[kPlaneTypeY] = 0;
-    bp.palette_mode_info.size[kPlaneTypeUV] = 0;
     return;
   }
   const int block_size_context =
@@ -156,7 +156,7 @@
       ReadPaletteColors(block, kPlaneY);
     }
   }
-  if (bp.uv_mode == kPredictionModeDc && block.HasChroma()) {
+  if (block.HasChroma() && bp.uv_mode == kPredictionModeDc) {
     const int context =
         static_cast<int>(bp.palette_mode_info.size[kPlaneTypeY] > 0);
     const bool has_palette_uv =
diff --git a/libgav1/src/tile/bitstream/partition.cc b/libgav1/src/tile/bitstream/partition.cc
index 60899a2..f3dbbb0 100644
--- a/libgav1/src/tile/bitstream/partition.cc
+++ b/libgav1/src/tile/bitstream/partition.cc
@@ -132,13 +132,13 @@
           reader_.ReadSymbol<kMaxPartitionTypes>(partition_cdf));
     }
   } else if (has_columns) {
-    uint16_t cdf[3] = {
-        PartitionCdfGatherVerticalAlike(partition_cdf, block_size), 0, 0};
+    const uint16_t cdf =
+        PartitionCdfGatherVerticalAlike(partition_cdf, block_size);
     *partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
                                                          : kPartitionHorizontal;
   } else {
-    uint16_t cdf[3] = {
-        PartitionCdfGatherHorizontalAlike(partition_cdf, block_size), 0, 0};
+    const uint16_t cdf =
+        PartitionCdfGatherHorizontalAlike(partition_cdf, block_size);
     *partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
                                                          : kPartitionVertical;
   }
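When only one split direction is possible, the partition read above collapses to a boolean, so the gathered probability no longer needs to be wrapped in a dummy three-entry CDF. A hedged sketch, under the inverted-CDF convention used by the tables in this change, of what gathering a subset of symbols into one boolean probability involves; GatherMass is illustrative and not the actual PartitionCdfGatherVerticalAlike/HorizontalAlike implementation:

    #include <cstdint>
    // Probability mass (in 1/32768 units) of symbols [begin, end) of an inverted
    // CDF; a non-adapting boolean read only needs this single 16-bit value.
    int GatherMass(const uint16_t* cdf, int begin, int end) {
      const int before = (begin == 0) ? 32768 : cdf[begin - 1];
      return before - cdf[end - 1];
    }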
diff --git a/libgav1/src/tile/bitstream/transform_size.cc b/libgav1/src/tile/bitstream/transform_size.cc
index c5ee757..b79851d 100644
--- a/libgav1/src/tile/bitstream/transform_size.cc
+++ b/libgav1/src/tile/bitstream/transform_size.cc
@@ -117,9 +117,11 @@
   const auto context = static_cast<int>(top_width >= max_tx_width) +
                        static_cast<int>(left_height >= max_tx_height);
   const int cdf_index = kTxDepthCdfIndex[block.size];
-  const int symbol_count = 3 - static_cast<int>(cdf_index == 0);
-  const int tx_depth = reader_.ReadSymbol(
-      symbol_decoder_context_.tx_depth_cdf[cdf_index][context], symbol_count);
+  uint16_t* const cdf =
+      symbol_decoder_context_.tx_depth_cdf[cdf_index][context];
+  const int tx_depth = (cdf_index == 0)
+                           ? static_cast<int>(reader_.ReadSymbol(cdf))
+                           : reader_.ReadSymbol<3>(cdf);
   assert(tx_depth < 3);
   TransformSize tx_size = max_rect_tx_size;
   if (tx_depth == 0) return tx_size;
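In the old call the symbol count was computed as 3 - static_cast<int>(cdf_index == 0), i.e. a two-symbol (boolean) read for cdf_index 0 and a three-symbol read otherwise; the new code makes that split explicit. This also matches the shape of kDefaultTxDepthCdf earlier in this change, whose first row holds boolean entries such as {12800, 0, 0} while the remaining rows hold three-symbol entries.

    // Equivalent formulations of the old symbol count:
    //   3 - static_cast<int>(cdf_index == 0)  ==  (cdf_index == 0) ? 2 : 3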
diff --git a/libgav1/src/tile/prediction.cc b/libgav1/src/tile/prediction.cc
index a234a19..c5560a6 100644
--- a/libgav1/src/tile/prediction.cc
+++ b/libgav1/src/tile/prediction.cc
@@ -45,6 +45,8 @@
 // Import all the constants in the anonymous namespace.
 #include "src/inter_intra_masks.inc"
 
+// Precision bits when scaling reference frames.
+constexpr int kReferenceScaleShift = 14;
 constexpr int kAngleStep = 3;
 constexpr int kPredictionModeToAngle[kIntraPredictionModesUV] = {
     0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0};
@@ -404,20 +406,13 @@
   const int subsampling_x = subsampling_x_[plane];
   const int subsampling_y = subsampling_y_[plane];
   if (block.top_available[plane]) {
-    const int row =
-        block.row4x4 - 1 -
-        static_cast<int>(subsampling_y != 0 && (block.row4x4 & 1) != 0);
-    const int column =
-        block.column4x4 +
-        static_cast<int>(subsampling_x != 0 && (block.column4x4 & 1) == 0);
+    const int row = block.row4x4 - 1 - (block.row4x4 & subsampling_y);
+    const int column = block.column4x4 + (~block.column4x4 & subsampling_x);
     if (IsSmoothPrediction(row, column, plane)) return 1;
   }
   if (block.left_available[plane]) {
-    const int row = block.row4x4 + static_cast<int>(subsampling_y != 0 &&
-                                                    (block.row4x4 & 1) == 0);
-    const int column =
-        block.column4x4 - 1 -
-        static_cast<int>(subsampling_x != 0 && (block.column4x4 & 1) != 0);
+    const int row = block.row4x4 + (~block.row4x4 & subsampling_y);
+    const int column = block.column4x4 - 1 - (block.column4x4 & subsampling_x);
     if (IsSmoothPrediction(row, column, plane)) return 1;
   }
   return 0;
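The rewritten neighbour coordinates above rely on subsampling_x and subsampling_y being either 0 or 1, so a single AND replaces each two-part condition: (row4x4 & subsampling_y) is 1 exactly when the plane is subsampled and the row is odd, while (~column4x4 & subsampling_x) is 1 exactly when the plane is subsampled and the column is even. A compile-time spot check of the equivalence; the helper names are illustrative:

    // v is a 4x4 coordinate, ss is a subsampling flag in {0, 1}.
    constexpr bool MatchesOldForm(int v, int ss) {
      return (v & ss) == static_cast<int>(ss != 0 && (v & 1) != 0);
    }
    constexpr bool MatchesOldNegatedForm(int v, int ss) {
      return (~v & ss) == static_cast<int>(ss != 0 && (v & 1) == 0);
    }
    static_assert(MatchesOldForm(6, 1) && MatchesOldForm(7, 1) &&
                      MatchesOldForm(7, 0) && MatchesOldNegatedForm(6, 1) &&
                      MatchesOldNegatedForm(7, 1) && MatchesOldNegatedForm(6, 0),
                  "bit trick matches the explicit condition");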
@@ -945,6 +940,68 @@
                                width, height, dest, dest_stride);
 }
 
+void Tile::ScaleMotionVector(const MotionVector& mv, const Plane plane,
+                             const int reference_frame_index, const int x,
+                             const int y, int* const start_x,
+                             int* const start_y, int* const step_x,
+                             int* const step_y) {
+  const int reference_upscaled_width =
+      (reference_frame_index == -1)
+          ? frame_header_.upscaled_width
+          : reference_frames_[reference_frame_index]->upscaled_width();
+  const int reference_height =
+      (reference_frame_index == -1)
+          ? frame_header_.height
+          : reference_frames_[reference_frame_index]->frame_height();
+  assert(2 * frame_header_.width >= reference_upscaled_width &&
+         2 * frame_header_.height >= reference_height &&
+         frame_header_.width <= 16 * reference_upscaled_width &&
+         frame_header_.height <= 16 * reference_height);
+  const bool is_scaled_x = reference_upscaled_width != frame_header_.width;
+  const bool is_scaled_y = reference_height != frame_header_.height;
+  const int half_sample = 1 << (kSubPixelBits - 1);
+  int orig_x = (x << kSubPixelBits) + ((2 * mv.mv[1]) >> subsampling_x_[plane]);
+  int orig_y = (y << kSubPixelBits) + ((2 * mv.mv[0]) >> subsampling_y_[plane]);
+  const int rounding_offset =
+      DivideBy2(1 << (kScaleSubPixelBits - kSubPixelBits));
+  if (is_scaled_x) {
+    const int scale_x = ((reference_upscaled_width << kReferenceScaleShift) +
+                         DivideBy2(frame_header_.width)) /
+                        frame_header_.width;
+    *step_x = RightShiftWithRoundingSigned(
+        scale_x, kReferenceScaleShift - kScaleSubPixelBits);
+    orig_x += half_sample;
+    // When the frame size is 4K or larger, orig_x can exceed 16 bits and
+    // scale_x can be up to 15 bits, so int64_t is used to hold base_x.
+    const int64_t base_x = static_cast<int64_t>(orig_x) * scale_x -
+                           (half_sample << kReferenceScaleShift);
+    *start_x =
+        RightShiftWithRoundingSigned(
+            base_x, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
+        rounding_offset;
+  } else {
+    *step_x = 1 << kScaleSubPixelBits;
+    *start_x = LeftShift(orig_x, 6) + rounding_offset;
+  }
+  if (is_scaled_y) {
+    const int scale_y = ((reference_height << kReferenceScaleShift) +
+                         DivideBy2(frame_header_.height)) /
+                        frame_header_.height;
+    *step_y = RightShiftWithRoundingSigned(
+        scale_y, kReferenceScaleShift - kScaleSubPixelBits);
+    orig_y += half_sample;
+    const int64_t base_y = static_cast<int64_t>(orig_y) * scale_y -
+                           (half_sample << kReferenceScaleShift);
+    *start_y =
+        RightShiftWithRoundingSigned(
+            base_y, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
+        rounding_offset;
+  } else {
+    *step_y = 1 << kScaleSubPixelBits;
+    *start_y = LeftShift(orig_y, 6) + rounding_offset;
+  }
+}
+
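ScaleMotionVector() produces a start position and per-pixel step in units of 1/(1 << kScaleSubPixelBits); the unscaled branch, which shifts a 1/16-pel orig_x left by 6, implies that constant is 10 (1/1024-pel units), but treat the concrete values below as assumptions of this sketch rather than statements about the library. A worked x-direction example for a reference frame stored at half the current width, illustrative only:

#include <cassert>

int main() {
  constexpr int kReferenceScaleShift = 14;  // From the hunk above.
  constexpr int kScaleSubPixelBits = 10;    // Assumed, see the note above.
  const int frame_width = 3840;
  const int reference_upscaled_width = 1920;  // Reference at half width.
  // Q14 ratio of reference width to current width, rounded to nearest.
  const int scale_x =
      ((reference_upscaled_width << kReferenceScaleShift) + frame_width / 2) /
      frame_width;
  assert(scale_x == 8192);  // 0.5 in Q14.
  // Per-output-pixel step, rounded from Q14 to 1/1024-pel units: the
  // reference advances half a pixel for every output pixel.
  const int shift = kReferenceScaleShift - kScaleSubPixelBits;  // 4
  const int step_x = (scale_x + (1 << (shift - 1))) >> shift;
  assert(step_x == 512);
  return 0;
}

The int64_t for base_x then guards the orig_x * scale_x product: a 1/16-pel coordinate in a 4K-or-wider frame multiplied by a ratio of up to 15 bits can overflow 32-bit arithmetic, as the comment in the function notes.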
 // static.
 bool Tile::GetReferenceBlockPosition(
     const int reference_frame_index, const bool is_scaled, const int width,
@@ -1007,12 +1064,9 @@
                     kScaleSubPixelBits) +
                    kSubPixelTaps;
   }
-  const int copy_start_x =
-      std::min(std::max(ref_block_start_x, ref_start_x), ref_last_x);
-  const int copy_end_x =
-      std::max(std::min(ref_block_end_x, ref_last_x), copy_start_x);
-  const int copy_start_y =
-      std::min(std::max(ref_block_start_y, ref_start_y), ref_last_y);
+  const int copy_start_x = Clip3(ref_block_start_x, ref_start_x, ref_last_x);
+  const int copy_start_y = Clip3(ref_block_start_y, ref_start_y, ref_last_y);
+  const int copy_end_x = Clip3(ref_block_end_x, copy_start_x, ref_last_x);
   const int block_width = copy_end_x - copy_start_x + 1;
   const bool extend_left = ref_block_start_x < ref_start_x;
   const bool extend_right = ref_block_end_x > ref_last_x;
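The Clip3() rewrite above is behavior-preserving: with Clip3(value, low, high) clamping value into [low, high] (its body is visible in the common.h hunk later in this patch), std::min(std::max(v, low), high) is exactly Clip3(v, low, high), and computing copy_end_x with copy_start_x as the lower bound matches the old max(min(...), copy_start_x) because copy_start_x has already been clamped to ref_last_x. A small illustrative check:

#include <algorithm>
#include <cassert>

// Same clamping behavior as the library's Clip3 helper.
int Clip3(int value, int low, int high) {
  return value < low ? low : (value > high ? high : value);
}

int main() {
  const int low = 0;
  const int high = 10;
  for (int v = -5; v <= 15; ++v) {
    assert(Clip3(v, low, high) == std::min(std::max(v, low), high));
  }
  return 0;
}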
@@ -1184,12 +1238,6 @@
                                    kConvolveBorderLeftTop * pixel_size);
   }
 
-  const int has_horizontal_filter = static_cast<int>(
-      ((mv.mv[MotionVector::kColumn] * (1 << (1 - subsampling_x))) &
-       kSubPixelMask) != 0);
-  const int has_vertical_filter = static_cast<int>(
-      ((mv.mv[MotionVector::kRow] * (1 << (1 - subsampling_y))) &
-       kSubPixelMask) != 0);
   void* const output =
       (is_compound || is_inter_intra) ? prediction : static_cast<void*>(dest);
   ptrdiff_t output_stride = (is_compound || is_inter_intra)
@@ -1214,14 +1262,17 @@
                   vertical_filter_index, start_x, start_y, step_x, step_y,
                   width, height, output, output_stride);
   } else {
+    const int horizontal_filter_id = (start_x >> 6) & kSubPixelMask;
+    const int vertical_filter_id = (start_y >> 6) & kSubPixelMask;
+
     dsp::ConvolveFunc convolve_func =
         dsp_.convolve[reference_frame_index == -1][is_compound]
-                     [has_vertical_filter][has_horizontal_filter];
+                     [vertical_filter_id != 0][horizontal_filter_id != 0];
     assert(convolve_func != nullptr);
 
     convolve_func(block_start, convolve_buffer_stride, horizontal_filter_index,
-                  vertical_filter_index, start_x, start_y, width, height,
-                  output, output_stride);
+                  vertical_filter_index, horizontal_filter_id,
+                  vertical_filter_id, width, height, output, output_stride);
   }
   return true;
 }
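Given start_x and start_y in 1/1024-pel units (see the scaling note earlier), shifting right by 6 yields a 1/16-pel position and masking keeps only its fractional phase, which is the sub-pixel filter index; the removed has_*_filter flags carried the same zero/non-zero information, derived from the raw motion vector instead. Illustrative arithmetic, assuming a 16-phase filter so that kSubPixelMask is 15:

#include <cassert>

int main() {
  constexpr int kSubPixelMask = 15;  // Assumed: 16 sub-pel phases.
  // 65312 in 1/1024-pel units is 63 pixels plus 800/1024 of a pixel.
  const int start_x = 65312;
  const int sixteenth_pel = start_x >> 6;  // 1020, i.e. 63 + 12/16 pixels.
  const int horizontal_filter_id = sixteenth_pel & kSubPixelMask;
  assert(horizontal_filter_id == 12);  // Phase 12/16: filtering required.
  // A whole-pixel position has phase 0, so no horizontal filtering is needed.
  assert((((64 << 10) >> 6) & kSubPixelMask) == 0);
  return 0;
}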
diff --git a/libgav1/src/tile/tile.cc b/libgav1/src/tile/tile.cc
index f79158f..9699517 100644
--- a/libgav1/src/tile/tile.cc
+++ b/libgav1/src/tile/tile.cc
@@ -40,11 +40,8 @@
 namespace {
 
 // Import all the constants in the anonymous namespace.
-#include "src/quantizer_tables.inc"
 #include "src/scan_tables.inc"
 
-// Precision bits when scaling reference frames.
-constexpr int kReferenceScaleShift = 14;
 // Range above kNumQuantizerBaseLevels which the exponential golomb coding
 // process is activated.
 constexpr int kQuantizerCoefficientBaseRange = 12;
@@ -422,6 +419,7 @@
            RefCountedBuffer* const current_frame, const DecoderState& state,
            FrameScratchBuffer* const frame_scratch_buffer,
            const WedgeMaskArray& wedge_masks,
+           const QuantizerMatrix& quantizer_matrix,
            SymbolDecoderContext* const saved_symbol_decoder_context,
            const SegmentationMap* prev_segment_ids,
            PostFilter* const post_filter, const dsp::Dsp* const dsp,
@@ -446,6 +444,7 @@
       motion_field_(frame_scratch_buffer->motion_field),
       reference_order_hint_(state.reference_order_hint),
       wedge_masks_(wedge_masks),
+      quantizer_matrix_(quantizer_matrix),
       reader_(data_, size_, frame_header_.enable_cdf_update),
       symbol_decoder_context_(frame_scratch_buffer->symbol_decoder_context),
       saved_symbol_decoder_context_(saved_symbol_decoder_context),
@@ -503,7 +502,7 @@
   memset(delta_lf_, 0, sizeof(delta_lf_));
   delta_lf_all_zero_ = true;
   const YuvBuffer& buffer = post_filter_.frame_buffer();
-  for (int plane = 0; plane < PlaneCount(); ++plane) {
+  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
     // Verify that the borders are big enough for Reconstruct(). max_tx_length
     // is the maximum value of tx_width and tx_height for the plane.
     const int max_tx_length = (plane == kPlaneY) ? 64 : 32;
@@ -543,12 +542,12 @@
                          buffer.stride(plane),
                          post_filter_.GetUnfilteredBuffer(plane));
     const int plane_height =
-        RightShiftWithRounding(frame_header_.height, subsampling_y_[plane]);
+        SubsampledValue(frame_header_.height, subsampling_y_[plane]);
     deblock_row_limit_[plane] =
         std::min(frame_header_.rows4x4, DivideBy4(plane_height + 3)
                                             << subsampling_y_[plane]);
     const int plane_width =
-        RightShiftWithRounding(frame_header_.width, subsampling_x_[plane]);
+        SubsampledValue(frame_header_.width, subsampling_x_[plane]);
     deblock_column_limit_[plane] =
         std::min(frame_header_.columns4x4, DivideBy4(plane_width + 3)
                                                << subsampling_x_[plane]);
@@ -610,7 +609,7 @@
   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
   for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_;
        column4x4 += block_width4x4) {
-    if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4, scratch_buffer,
+    if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer,
                            processing_mode)) {
       LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d",
                    row4x4, column4x4);
@@ -643,9 +642,6 @@
 }
 
 bool Tile::ParseAndDecode() {
-  // If this is the main thread, we build the loop filter bit masks when parsing
-  // so that it happens in the current thread. This ensures that the main thread
-  // does as much work as possible.
   if (split_parse_and_decode_) {
     if (!ThreadedParseAndDecode()) return false;
     SaveSymbolDecoderContext();
@@ -777,8 +773,8 @@
     for (int column4x4 = column4x4_start_, column_index = 0;
          column4x4 < column4x4_end_;
          column4x4 += block_width4x4, ++column_index) {
-      if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4,
-                             scratch_buffer.get(), kProcessingModeParseOnly)) {
+      if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
+                             kProcessingModeParseOnly)) {
         std::lock_guard<std::mutex> lock(threading_.mutex);
         threading_.abort = true;
         break;
@@ -863,8 +859,8 @@
       tile_scratch_buffer_pool_->Get();
   bool ok = scratch_buffer != nullptr;
   if (ok) {
-    ok = ProcessSuperBlock(row4x4, column4x4, block_width4x4,
-                           scratch_buffer.get(), kProcessingModeDecodeOnly);
+    ok = ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
+                           kProcessingModeDecodeOnly);
     tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
   }
   std::unique_lock<std::mutex> lock(threading_.mutex);
@@ -921,7 +917,7 @@
   const size_t pixel_size =
       (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
                                                    : sizeof(uint16_t));
-  for (int plane = 0; plane < PlaneCount(); ++plane) {
+  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
     const int row_to_copy =
         (MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1;
     const size_t pixels_to_copy =
@@ -1060,6 +1056,18 @@
     if (bp.is_inter) {
       cdf = symbol_decoder_context_
                 .inter_tx_type_cdf[cdf_index][cdf_tx_size_index];
+      switch (tx_set) {
+        case kTransformSetInter1:
+          tx_type = static_cast<TransformType>(reader_.ReadSymbol<16>(cdf));
+          break;
+        case kTransformSetInter2:
+          tx_type = static_cast<TransformType>(reader_.ReadSymbol<12>(cdf));
+          break;
+        default:
+          assert(tx_set == kTransformSetInter3);
+          tx_type = static_cast<TransformType>(reader_.ReadSymbol(cdf));
+          break;
+      }
     } else {
       const PredictionMode intra_direction =
           block.bp->prediction_parameters->use_filter_intra
@@ -1069,9 +1077,12 @@
       cdf =
           symbol_decoder_context_
               .intra_tx_type_cdf[cdf_index][cdf_tx_size_index][intra_direction];
+      assert(tx_set == kTransformSetIntra1 || tx_set == kTransformSetIntra2);
+      tx_type = static_cast<TransformType>((tx_set == kTransformSetIntra1)
+                                               ? reader_.ReadSymbol<7>(cdf)
+                                               : reader_.ReadSymbol<5>(cdf));
     }
-    tx_type = static_cast<TransformType>(
-        reader_.ReadSymbol(cdf, kNumTransformTypesInSet[tx_set]));
+
     // This array does not contain an entry for kTransformSetDctOnly, so the
     // first dimension needs to be offset by 1.
     tx_type = kInverseTransformTypeBySet[tx_set - 1][tx_type];
@@ -1089,49 +1100,57 @@
 // positions are still all 0s according to the diagonal scan order.
 template <typename ResidualType>
 void Tile::ReadCoeffBase2D(
-    const uint16_t* scan, PlaneType plane_type, TransformSize tx_size,
-    int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+    const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+    int eob,
     uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
-    ResidualType* const quantized_buffer) {
+    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                 [kCoeffBaseRangeSymbolCount + 1],
+    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
   const int tx_width = 1 << adjusted_tx_width_log2;
-  int i = eob - 2;
-  do {
-    constexpr auto threshold = static_cast<ResidualType>(3);
+  for (int i = eob - 2; i >= 1; --i) {
     const uint16_t pos = scan[i];
     const int row = pos >> adjusted_tx_width_log2;
     const int column = pos & (tx_width - 1);
     auto* const quantized = &quantized_buffer[pos];
-    int context;
-    if (pos == 0) {
-      context = 0;
-    } else {
-      context = std::min(
-          4, DivideBy2(
-                 1 + (std::min(quantized[1], threshold) +             // {0, 1}
-                      std::min(quantized[tx_width], threshold) +      // {1, 0}
-                      std::min(quantized[tx_width + 1], threshold) +  // {1, 1}
-                      std::min(quantized[2], threshold) +             // {0, 2}
-                      std::min(quantized[MultiplyBy2(tx_width)],
-                               threshold))));  // {2, 0}
-      context += kCoeffBaseContextOffset[tx_size][std::min(row, 4)]
-                                        [std::min(column, 4)];
-    }
+    auto* const levels = &level_buffer[pos];
+    const int neighbor_sum = 1 + levels[1] + levels[tx_width] +
+                             levels[tx_width + 1] + levels[2] +
+                             levels[MultiplyBy2(tx_width)];
+    const int context =
+        ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+        kCoeffBaseContextOffset[tx_size][std::min(row, 4)][std::min(column, 4)];
     int level =
         reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+    levels[0] = level;
     if (level > kNumQuantizerBaseLevels) {
       // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
       // + 1, because we clip the overall output to 6 and the unclipped
       // quantized values will always result in an output of greater than 6.
-      context = std::min(6, DivideBy2(1 + quantized[1] +          // {0, 1}
-                                      quantized[tx_width] +       // {1, 0}
-                                      quantized[tx_width + 1]));  // {1, 1}
-      if (pos != 0) {
-        context += 14 >> static_cast<int>((row | column) < 2);
-      }
-      level += ReadCoeffBaseRange(clamped_tx_size_context, context, plane_type);
+      int context = std::min(6, DivideBy2(1 + quantized[1] +          // {0, 1}
+                                          quantized[tx_width] +       // {1, 0}
+                                          quantized[tx_width + 1]));  // {1, 1}
+      context += 14 >> static_cast<int>((row | column) < 2);
+      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
     }
     quantized[0] = level;
-  } while (--i >= 0);
+  }
+  // Read position 0.
+  {
+    auto* const quantized = &quantized_buffer[0];
+    int level = reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[0]);
+    level_buffer[0] = level;
+    if (level > kNumQuantizerBaseLevels) {
+      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+      // + 1, because we clip the overall output to 6 and the unclipped
+      // quantized values will always result in an output of greater than 6.
+      const int context =
+          std::min(6, DivideBy2(1 + quantized[1] +          // {0, 1}
+                                quantized[tx_width] +       // {1, 0}
+                                quantized[tx_width + 1]));  // {1, 1}
+      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+    }
+    quantized[0] = level;
+  }
 }
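The dropped std::min(..., 3) clamps above are safe because level_buffer stores only the coeff_base symbol: the coeff_base_range contribution is added to the local level but never written back to levels[0], and that contribution was only ever read once the base symbol had already reached the old clamp threshold of 3 (the level > kNumQuantizerBaseLevels guard), so min(quantized, 3) and the stored symbol agree. The branchless context select is also unchanged in value, as this standalone check of the identity shows (illustrative only):

#include <algorithm>
#include <cassert>

int main() {
  // neighbor_sum is 1 plus up to five base levels, each at most 3, so it
  // lies in [1, 16].
  for (int neighbor_sum = 0; neighbor_sum <= 16; ++neighbor_sum) {
    const int branchless = (neighbor_sum > 7) ? 4 : (neighbor_sum >> 1);
    assert(branchless == std::min(4, neighbor_sum >> 1));
  }
  return 0;
}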
 
 // Section 8.3.2 in the spec, under coeff_base and coeff_br.
@@ -1148,41 +1167,41 @@
 // we always do the boundary check for its fourth right neighbor.
 template <typename ResidualType>
 void Tile::ReadCoeffBaseHorizontal(
-    const uint16_t* scan, PlaneType plane_type, TransformSize /*tx_size*/,
-    int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+    const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+    int eob,
     uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
-    ResidualType* const quantized_buffer) {
+    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                 [kCoeffBaseRangeSymbolCount + 1],
+    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
   const int tx_width = 1 << adjusted_tx_width_log2;
   int i = eob - 2;
   do {
-    constexpr auto threshold = static_cast<ResidualType>(3);
     const uint16_t pos = scan[i];
     const int column = pos & (tx_width - 1);
     auto* const quantized = &quantized_buffer[pos];
-    int context = std::min(
-        4,
-        DivideBy2(1 +
-                  (std::min(quantized[1], threshold) +         // {0, 1}
-                   std::min(quantized[tx_width], threshold) +  // {1, 0}
-                   std::min(quantized[2], threshold) +         // {0, 2}
-                   std::min(quantized[3], threshold) +         // {0, 3}
-                   std::min(quantized[4],
-                            static_cast<ResidualType>(
-                                (column + 4 < tx_width) ? 3 : 0)))));  // {0, 4}
-    context += kCoeffBasePositionContextOffset[column];
+    auto* const levels = &level_buffer[pos];
+    const int neighbor_sum =
+        1 + (levels[1] +                                  // {0, 1}
+             levels[tx_width] +                           // {1, 0}
+             levels[2] +                                  // {0, 2}
+             levels[3] +                                  // {0, 3}
+             ((column + 4 < tx_width) ? levels[4] : 0));  // {0, 4}
+    const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+                        kCoeffBasePositionContextOffset[column];
     int level =
         reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+    levels[0] = level;
     if (level > kNumQuantizerBaseLevels) {
       // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
       // + 1, because we clip the overall output to 6 and the unclipped
       // quantized values will always result in an output of greater than 6.
-      context = std::min(6, DivideBy2(1 + quantized[1] +     // {0, 1}
-                                      quantized[tx_width] +  // {1, 0}
-                                      quantized[2]));        // {0, 2}
+      int context = std::min(6, DivideBy2(1 + quantized[1] +     // {0, 1}
+                                          quantized[tx_width] +  // {1, 0}
+                                          quantized[2]));        // {0, 2}
       if (pos != 0) {
         context += 14 >> static_cast<int>(column == 0);
       }
-      level += ReadCoeffBaseRange(clamped_tx_size_context, context, plane_type);
+      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
     }
     quantized[0] = level;
   } while (--i >= 0);
@@ -1193,36 +1212,36 @@
 // Right boundary check is performed explicitly.
 template <typename ResidualType>
 void Tile::ReadCoeffBaseVertical(
-    const uint16_t* scan, PlaneType plane_type, TransformSize /*tx_size*/,
-    int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+    const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+    int eob,
     uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
-    ResidualType* const quantized_buffer) {
+    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                 [kCoeffBaseRangeSymbolCount + 1],
+    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
   const int tx_width = 1 << adjusted_tx_width_log2;
   int i = eob - 2;
   do {
-    constexpr auto threshold = static_cast<ResidualType>(3);
     const uint16_t pos = scan[i];
     const int row = pos >> adjusted_tx_width_log2;
     const int column = pos & (tx_width - 1);
     auto* const quantized = &quantized_buffer[pos];
-    const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0;
-    int context =
-        std::min(4, DivideBy2(1 + (std::min(quantized_column1, 3) +  // {0, 1}
-                                   std::min(quantized[tx_width],
-                                            threshold) +  // {1, 0}
-                                   std::min(quantized[MultiplyBy2(tx_width)],
-                                            threshold) +  // {2, 0}
-                                   std::min(quantized[tx_width * 3],
-                                            threshold) +  // {3, 0}
-                                   std::min(quantized[MultiplyBy4(tx_width)],
-                                            threshold))));  // {4, 0}
-    context += kCoeffBasePositionContextOffset[row];
+    auto* const levels = &level_buffer[pos];
+    const int neighbor_sum =
+        1 + (((column + 1 < tx_width) ? levels[1] : 0) +  // {0, 1}
+             levels[tx_width] +                           // {1, 0}
+             levels[MultiplyBy2(tx_width)] +              // {2, 0}
+             levels[tx_width * 3] +                       // {3, 0}
+             levels[MultiplyBy4(tx_width)]);              // {4, 0}
+    const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+                        kCoeffBasePositionContextOffset[row];
     int level =
         reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+    levels[0] = level;
     if (level > kNumQuantizerBaseLevels) {
       // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
       // + 1, because we clip the overall output to 6 and the unclipped
       // quantized values will always result in an output of greater than 6.
+      const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0;
       int context =
           std::min(6, DivideBy2(1 + quantized_column1 +              // {0, 1}
                                 quantized[tx_width] +                // {1, 0}
@@ -1230,7 +1249,7 @@
       if (pos != 0) {
         context += 14 >> static_cast<int>(row == 0);
       }
-      level += ReadCoeffBaseRange(clamped_tx_size_context, context, plane_type);
+      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
     }
     quantized[0] = level;
   } while (--i >= 0);
@@ -1272,68 +1291,6 @@
          num_left_elements);
 }
 
-void Tile::ScaleMotionVector(const MotionVector& mv, const Plane plane,
-                             const int reference_frame_index, const int x,
-                             const int y, int* const start_x,
-                             int* const start_y, int* const step_x,
-                             int* const step_y) {
-  const int reference_upscaled_width =
-      (reference_frame_index == -1)
-          ? frame_header_.upscaled_width
-          : reference_frames_[reference_frame_index]->upscaled_width();
-  const int reference_height =
-      (reference_frame_index == -1)
-          ? frame_header_.height
-          : reference_frames_[reference_frame_index]->frame_height();
-  assert(2 * frame_header_.width >= reference_upscaled_width &&
-         2 * frame_header_.height >= reference_height &&
-         frame_header_.width <= 16 * reference_upscaled_width &&
-         frame_header_.height <= 16 * reference_height);
-  const bool is_scaled_x = reference_upscaled_width != frame_header_.width;
-  const bool is_scaled_y = reference_height != frame_header_.height;
-  const int half_sample = 1 << (kSubPixelBits - 1);
-  int orig_x = (x << kSubPixelBits) + ((2 * mv.mv[1]) >> subsampling_x_[plane]);
-  int orig_y = (y << kSubPixelBits) + ((2 * mv.mv[0]) >> subsampling_y_[plane]);
-  const int rounding_offset =
-      DivideBy2(1 << (kScaleSubPixelBits - kSubPixelBits));
-  if (is_scaled_x) {
-    const int scale_x = ((reference_upscaled_width << kReferenceScaleShift) +
-                         DivideBy2(frame_header_.width)) /
-                        frame_header_.width;
-    *step_x = RightShiftWithRoundingSigned(
-        scale_x, kReferenceScaleShift - kScaleSubPixelBits);
-    orig_x += half_sample;
-    // When frame size is 4k and above, orig_x can be above 16 bits, scale_x can
-    // be up to 15 bits. So we use int64_t to hold base_x.
-    const int64_t base_x = static_cast<int64_t>(orig_x) * scale_x -
-                           (half_sample << kReferenceScaleShift);
-    *start_x =
-        RightShiftWithRoundingSigned(
-            base_x, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
-        rounding_offset;
-  } else {
-    *step_x = 1 << kScaleSubPixelBits;
-    *start_x = LeftShift(orig_x, 6) + rounding_offset;
-  }
-  if (is_scaled_y) {
-    const int scale_y = ((reference_height << kReferenceScaleShift) +
-                         DivideBy2(frame_header_.height)) /
-                        frame_header_.height;
-    *step_y = RightShiftWithRoundingSigned(
-        scale_y, kReferenceScaleShift - kScaleSubPixelBits);
-    orig_y += half_sample;
-    const int64_t base_y = static_cast<int64_t>(orig_y) * scale_y -
-                           (half_sample << kReferenceScaleShift);
-    *start_y =
-        RightShiftWithRoundingSigned(
-            base_y, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
-        rounding_offset;
-  } else {
-    *step_y = 1 << kScaleSubPixelBits;
-    *start_y = LeftShift(orig_y, 6) + rounding_offset;
-  }
-}
-
 template <typename ResidualType, bool is_dc_coefficient>
 bool Tile::ReadSignAndApplyDequantization(
     const uint16_t* const scan, int i, int q_value,
@@ -1395,13 +1352,11 @@
   return true;
 }
 
-int Tile::ReadCoeffBaseRange(int clamped_tx_size_context, int cdf_context,
-                             int plane_type) {
+int Tile::ReadCoeffBaseRange(uint16_t* cdf) {
   int level = 0;
   for (int j = 0; j < kCoeffBaseRangeMaxIterations; ++j) {
-    const int coeff_base_range = reader_.ReadSymbol<kCoeffBaseRangeSymbolCount>(
-        symbol_decoder_context_.coeff_base_range_cdf[clamped_tx_size_context]
-                                                    [plane_type][cdf_context]);
+    const int coeff_base_range =
+        reader_.ReadSymbol<kCoeffBaseRangeSymbolCount>(cdf);
     level += coeff_base_range;
     if (coeff_base_range < (kCoeffBaseRangeSymbolCount - 1)) break;
   }
@@ -1442,6 +1397,11 @@
   // Clear padding to avoid bottom boundary checks when parsing quantized
   // coefficients.
   memset(residual, 0, (tx_width * tx_height + tx_padding) * residual_size_);
+  uint8_t level_buffer[(32 + kResidualPaddingVertical) * 32];
+  memset(
+      level_buffer, 0,
+      kTransformWidth[adjusted_tx_size] * kTransformHeight[adjusted_tx_size] +
+          tx_padding);
   const int clamped_tx_height = std::min(tx_height, 32);
   if (plane == kPlaneY) {
     ReadTransformType(block, x4, y4, tx_size);
@@ -1452,33 +1412,38 @@
   const PlaneType plane_type = GetPlaneType(plane);
   const TransformClass tx_class = GetTransformClass(*tx_type);
   context = static_cast<int>(tx_class != kTransformClass2D);
-  uint16_t* cdf;
+  int eob_pt = 1;
   switch (eob_multi_size) {
     case 0:
-      cdf = symbol_decoder_context_.eob_pt_16_cdf[plane_type][context];
+      eob_pt += reader_.ReadSymbol<kEobPt16SymbolCount>(
+          symbol_decoder_context_.eob_pt_16_cdf[plane_type][context]);
       break;
     case 1:
-      cdf = symbol_decoder_context_.eob_pt_32_cdf[plane_type][context];
+      eob_pt += reader_.ReadSymbol<kEobPt32SymbolCount>(
+          symbol_decoder_context_.eob_pt_32_cdf[plane_type][context]);
       break;
     case 2:
-      cdf = symbol_decoder_context_.eob_pt_64_cdf[plane_type][context];
+      eob_pt += reader_.ReadSymbol<kEobPt64SymbolCount>(
+          symbol_decoder_context_.eob_pt_64_cdf[plane_type][context]);
       break;
     case 3:
-      cdf = symbol_decoder_context_.eob_pt_128_cdf[plane_type][context];
+      eob_pt += reader_.ReadSymbol<kEobPt128SymbolCount>(
+          symbol_decoder_context_.eob_pt_128_cdf[plane_type][context]);
       break;
     case 4:
-      cdf = symbol_decoder_context_.eob_pt_256_cdf[plane_type][context];
+      eob_pt += reader_.ReadSymbol<kEobPt256SymbolCount>(
+          symbol_decoder_context_.eob_pt_256_cdf[plane_type][context]);
       break;
     case 5:
-      cdf = symbol_decoder_context_.eob_pt_512_cdf[plane_type];
+      eob_pt += reader_.ReadSymbol<kEobPt512SymbolCount>(
+          symbol_decoder_context_.eob_pt_512_cdf[plane_type]);
       break;
     case 6:
     default:
-      cdf = symbol_decoder_context_.eob_pt_1024_cdf[plane_type];
+      eob_pt += reader_.ReadSymbol<kEobPt1024SymbolCount>(
+          symbol_decoder_context_.eob_pt_1024_cdf[plane_type]);
       break;
   }
-  const int eob_pt =
-      1 + reader_.ReadSymbol(cdf, kEobPt16SymbolCount + eob_multi_size);
   int eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1);
   if (eob_pt >= 3) {
     context = eob_pt - 3;
@@ -1496,20 +1461,22 @@
   }
   const uint16_t* scan = kScan[tx_class][tx_size];
   const int clamped_tx_size_context = std::min(tx_size_context, 3);
+  auto coeff_base_range_cdf =
+      symbol_decoder_context_
+          .coeff_base_range_cdf[clamped_tx_size_context][plane_type];
   // Read the last coefficient.
   {
     context = GetCoeffBaseContextEob(tx_size, eob - 1);
     const uint16_t pos = scan[eob - 1];
     int level =
-        1 + reader_.ReadSymbol(
+        1 + reader_.ReadSymbol<kCoeffBaseEobSymbolCount>(
                 symbol_decoder_context_
-                    .coeff_base_eob_cdf[tx_size_context][plane_type][context],
-                kCoeffBaseEobSymbolCount);
+                    .coeff_base_eob_cdf[tx_size_context][plane_type][context]);
+    level_buffer[pos] = level;
     if (level > kNumQuantizerBaseLevels) {
-      level += ReadCoeffBaseRange(
-          clamped_tx_size_context,
-          GetCoeffBaseRangeContextEob(adjusted_tx_width_log2, pos, tx_class),
-          plane_type);
+      level +=
+          ReadCoeffBaseRange(coeff_base_range_cdf[GetCoeffBaseRangeContextEob(
+              adjusted_tx_width_log2, pos, tx_class)]);
     }
     residual[pos] = level;
   }
@@ -1518,18 +1485,19 @@
     // Lookup used to call the right variant of ReadCoeffBase*() based on the
     // transform class.
     static constexpr void (Tile::*kGetCoeffBaseFunc[])(
-        const uint16_t* scan, PlaneType plane_type, TransformSize tx_size,
-        int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+        const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+        int eob,
         uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
-        ResidualType* quantized_buffer) = {
-        &Tile::ReadCoeffBase2D<ResidualType>,
-        &Tile::ReadCoeffBaseHorizontal<ResidualType>,
-        &Tile::ReadCoeffBaseVertical<ResidualType>};
+        uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+                                     [kCoeffBaseRangeSymbolCount + 1],
+        ResidualType* quantized_buffer,
+        uint8_t* level_buffer) = {&Tile::ReadCoeffBase2D<ResidualType>,
+                                  &Tile::ReadCoeffBaseHorizontal<ResidualType>,
+                                  &Tile::ReadCoeffBaseVertical<ResidualType>};
     (this->*kGetCoeffBaseFunc[tx_class])(
-        scan, plane_type, tx_size, clamped_tx_size_context,
-        adjusted_tx_width_log2, eob,
+        scan, tx_size, adjusted_tx_width_log2, eob,
         symbol_decoder_context_.coeff_base_cdf[tx_size_context][plane_type],
-        residual);
+        coeff_base_range_cdf, residual, level_buffer);
   }
   const int max_value = (1 << (7 + sequence_header_.color_config.bitdepth)) - 1;
   const int current_quantizer_index = GetQIndex(
@@ -1542,8 +1510,9 @@
        *tx_type < kTransformTypeIdentityIdentity &&
        !frame_header_.segmentation.lossless[bp.segment_id] &&
        frame_header_.quantizer.matrix_level[plane] < 15)
-          ? &kQuantizerMatrix[frame_header_.quantizer.matrix_level[plane]]
-                             [plane_type][kQuantizerMatrixOffset[tx_size]]
+          ? quantizer_matrix_[frame_header_.quantizer.matrix_level[plane]]
+                             [plane_type][adjusted_tx_size]
+                                 .get()
           : nullptr;
   int coefficient_level = 0;
   int8_t dc_category = 0;
@@ -1657,11 +1626,12 @@
     const int sb_row_index = SuperBlockRowIndex(block.row4x4);
     const int sb_column_index = SuperBlockColumnIndex(block.column4x4);
     if (mode == kProcessingModeDecodeOnly) {
-      TransformParameterQueue& tx_params =
+      Queue<TransformParameters>& tx_params =
           *residual_buffer_threaded_[sb_row_index][sb_column_index]
                ->transform_parameters();
       ReconstructBlock(block, plane, start_x, start_y, tx_size,
-                       tx_params.Type(), tx_params.NonZeroCoeffCount());
+                       tx_params.Front().type,
+                       tx_params.Front().non_zero_coeff_count);
       tx_params.Pop();
     } else {
       TransformType tx_type;
@@ -1684,7 +1654,7 @@
         assert(mode == kProcessingModeParseOnly);
         residual_buffer_threaded_[sb_row_index][sb_column_index]
             ->transform_parameters()
-            ->Push(non_zero_coeff_count, tx_type);
+            ->Push(TransformParameters(tx_type, non_zero_coeff_count));
       }
     }
   }
@@ -1793,8 +1763,9 @@
   const BlockParameters& bp = *block.bp;
   for (int chunk_y = 0; chunk_y < height_chunks; ++chunk_y) {
     for (int chunk_x = 0; chunk_x < width_chunks; ++chunk_x) {
-      for (int plane = 0; plane < (block.HasChroma() ? PlaneCount() : 1);
-           ++plane) {
+      const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+      int plane = kPlaneY;
+      do {
         const int subsampling_x = subsampling_x_[plane];
         const int subsampling_y = subsampling_y_[plane];
         // For Y Plane, when lossless is true |bp.transform_size| is always
@@ -1833,7 +1804,7 @@
             }
           }
         }
-      }
+      } while (++plane < num_planes);
     }
   }
   return true;
@@ -1913,6 +1884,7 @@
   GetClampParameters(block, min, max);
   BlockParameters& bp = *block.bp;
   const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+  bp.mv.mv64 = 0;
   if (is_compound) {
     for (int i = 0; i < 2; ++i) {
       const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode);
@@ -1975,6 +1947,7 @@
   BlockParameters& bp = *block.bp;
   const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
   const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0);
+  bp.mv.mv64 = 0;
   ReadMotionVector(block, 0);
   if (ref_mv_0.mv32 == 0) {
     const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1);
@@ -1998,7 +1971,9 @@
 }
 
 void Tile::ResetEntropyContext(const Block& block) {
-  for (int plane = 0; plane < (block.HasChroma() ? PlaneCount() : 1); ++plane) {
+  const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+  int plane = kPlaneY;
+  do {
     const int subsampling_x = subsampling_x_[plane];
     const int start_x = block.column4x4 >> subsampling_x;
     const int end_x =
@@ -2017,7 +1992,7 @@
            end_y - start_y);
     memset(&dc_categories_[kEntropyContextLeft][plane][start_y], 0,
            end_y - start_y);
-  }
+  } while (++plane < num_planes);
 }
 
 bool Tile::ComputePrediction(const Block& block) {
@@ -2036,7 +2011,7 @@
   bool is_local_valid = false;
   // Local warping parameters, similar usage as is_local_valid.
   GlobalMotion local_warp_params;
-  int plane = 0;
+  int plane = kPlaneY;
   do {
     const int8_t subsampling_x = subsampling_x_[plane];
     const int8_t subsampling_y = subsampling_y_[plane];
@@ -2147,7 +2122,6 @@
 }
 
 bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
-                        ParameterTree* const tree,
                         TileScratchBuffer* const scratch_buffer,
                         ResidualPtr* residual) {
   // Do not process the block if the starting point is beyond the visible frame.
@@ -2158,8 +2132,24 @@
       column4x4 >= frame_header_.columns4x4) {
     return true;
   }
-  BlockParameters& bp = *tree->parameters();
-  block_parameters_holder_.FillCache(row4x4, column4x4, block_size, &bp);
+
+  if (split_parse_and_decode_) {
+    // Push block ordering info to the queue. DecodeBlock() will use this queue
+    // to decode the blocks in the correct order.
+    const int sb_row_index = SuperBlockRowIndex(row4x4);
+    const int sb_column_index = SuperBlockColumnIndex(column4x4);
+    residual_buffer_threaded_[sb_row_index][sb_column_index]
+        ->partition_tree_order()
+        ->Push(PartitionTreeNode(row4x4, column4x4, block_size));
+  }
+
+  BlockParameters* bp_ptr =
+      block_parameters_holder_.Get(row4x4, column4x4, block_size);
+  if (bp_ptr == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to get BlockParameters.");
+    return false;
+  }
+  BlockParameters& bp = *bp_ptr;
   Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
   bp.size = block_size;
   bp.prediction_parameters =
@@ -2211,16 +2201,13 @@
   return true;
 }
 
-bool Tile::DecodeBlock(ParameterTree* const tree,
+bool Tile::DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
                        TileScratchBuffer* const scratch_buffer,
                        ResidualPtr* residual) {
-  const int row4x4 = tree->row4x4();
-  const int column4x4 = tree->column4x4();
   if (row4x4 >= frame_header_.rows4x4 ||
       column4x4 >= frame_header_.columns4x4) {
     return true;
   }
-  const BlockSize block_size = tree->block_size();
   Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
   if (!ComputePrediction(block) ||
       !Residual(block, kProcessingModeDecodeOnly)) {
@@ -2231,27 +2218,22 @@
 }
 
 bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
-                            ParameterTree* const root,
                             TileScratchBuffer* const scratch_buffer,
                             ResidualPtr* residual) {
-  Stack<ParameterTree*, kDfsStackSize> stack;
+  Stack<PartitionTreeNode, kDfsStackSize> stack;
 
   // Set up the first iteration.
-  ParameterTree* node = root;
-  int row4x4 = row4x4_start;
-  int column4x4 = column4x4_start;
-  BlockSize block_size = SuperBlockSize();
+  stack.Push(
+      PartitionTreeNode(row4x4_start, column4x4_start, SuperBlockSize()));
 
   // DFS loop. If it sees a terminal node (leaf node), ProcessBlock is invoked.
   // Otherwise, the children are pushed into the stack for future processing.
   do {
-    if (!stack.Empty()) {
-      // Set up subsequent iterations.
-      node = stack.Pop();
-      row4x4 = node->row4x4();
-      column4x4 = node->column4x4();
-      block_size = node->block_size();
-    }
+    PartitionTreeNode node = stack.Pop();
+    int row4x4 = node.row4x4;
+    int column4x4 = node.column4x4;
+    BlockSize block_size = node.block_size;
+
     if (row4x4 >= frame_header_.rows4x4 ||
         column4x4 >= frame_header_.columns4x4) {
       continue;
@@ -2287,13 +2269,13 @@
           sequence_header_.color_config.subsampling_y);
       return false;
     }
-    if (!node->SetPartitionType(partition)) {
-      LIBGAV1_DLOG(ERROR, "node->SetPartitionType() failed.");
-      return false;
-    }
+
+    const int quarter_block4x4 = half_block4x4 >> 1;
+    const BlockSize split_size = kSubSize[kPartitionSplit][block_size];
+    assert(partition == kPartitionNone || sub_size != kBlockInvalid);
     switch (partition) {
       case kPartitionNone:
-        if (!ProcessBlock(row4x4, column4x4, sub_size, node, scratch_buffer,
+        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                           residual)) {
           return false;
         }
@@ -2301,28 +2283,82 @@
       case kPartitionSplit:
         // The children must be added in reverse order since a stack is being
         // used.
-        for (int i = 3; i >= 0; --i) {
-          ParameterTree* const child = node->children(i);
-          assert(child != nullptr);
-          stack.Push(child);
-        }
+        stack.Push(PartitionTreeNode(row4x4 + half_block4x4,
+                                     column4x4 + half_block4x4, sub_size));
+        stack.Push(
+            PartitionTreeNode(row4x4 + half_block4x4, column4x4, sub_size));
+        stack.Push(
+            PartitionTreeNode(row4x4, column4x4 + half_block4x4, sub_size));
+        stack.Push(PartitionTreeNode(row4x4, column4x4, sub_size));
         break;
       case kPartitionHorizontal:
+        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+                          residual) ||
+            !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
+                          scratch_buffer, residual)) {
+          return false;
+        }
+        break;
       case kPartitionVertical:
+        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+                          residual) ||
+            !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
+                          scratch_buffer, residual)) {
+          return false;
+        }
+        break;
       case kPartitionHorizontalWithTopSplit:
+        if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
+                          residual) ||
+            !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
+                          scratch_buffer, residual) ||
+            !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
+                          scratch_buffer, residual)) {
+          return false;
+        }
+        break;
       case kPartitionHorizontalWithBottomSplit:
+        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+                          residual) ||
+            !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
+                          scratch_buffer, residual) ||
+            !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
+                          split_size, scratch_buffer, residual)) {
+          return false;
+        }
+        break;
       case kPartitionVerticalWithLeftSplit:
+        if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
+                          residual) ||
+            !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
+                          scratch_buffer, residual) ||
+            !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
+                          scratch_buffer, residual)) {
+          return false;
+        }
+        break;
       case kPartitionVerticalWithRightSplit:
+        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+                          residual) ||
+            !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
+                          scratch_buffer, residual) ||
+            !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
+                          split_size, scratch_buffer, residual)) {
+          return false;
+        }
+        break;
       case kPartitionHorizontal4:
+        for (int i = 0; i < 4; ++i) {
+          if (!ProcessBlock(row4x4 + i * quarter_block4x4, column4x4, sub_size,
+                            scratch_buffer, residual)) {
+            return false;
+          }
+        }
+        break;
       case kPartitionVertical4:
         for (int i = 0; i < 4; ++i) {
-          ParameterTree* const child = node->children(i);
-          // Once a null child is seen, all the subsequent children will also be
-          // null.
-          if (child == nullptr) break;
-          if (!ProcessBlock(child->row4x4(), child->column4x4(),
-                            child->block_size(), child, scratch_buffer,
-                            residual)) {
+          if (!ProcessBlock(row4x4, column4x4 + i * quarter_block4x4, sub_size,
+                            scratch_buffer, residual)) {
             return false;
           }
         }
@@ -2367,7 +2403,7 @@
          sizeof(scratch_buffer->block_decoded));
   // Set specific edge cases to true.
   const int sb_size4 = sequence_header_.use_128x128_superblock ? 32 : 16;
-  for (int plane = 0; plane < PlaneCount(); ++plane) {
+  for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
     const int subsampling_x = subsampling_x_[plane];
     const int subsampling_y = subsampling_y_[plane];
     const int sb_width4 = (column4x4_end_ - column4x4) >> subsampling_x;
@@ -2395,7 +2431,7 @@
   }
 }
 
-bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
+bool Tile::ProcessSuperBlock(int row4x4, int column4x4,
                              TileScratchBuffer* const scratch_buffer,
                              ProcessingMode mode) {
   const bool parsing =
@@ -2413,13 +2449,10 @@
   if (parsing) {
     ReadLoopRestorationCoefficients(row4x4, column4x4, block_size);
   }
-  const int row = row4x4 / block_width4x4;
-  const int column = column4x4 / block_width4x4;
   if (parsing && decoding) {
     uint8_t* residual_buffer = residual_buffer_.get();
-    if (!ProcessPartition(row4x4, column4x4,
-                          block_parameters_holder_.Tree(row, column),
-                          scratch_buffer, &residual_buffer)) {
+    if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
+                          &residual_buffer)) {
       LIBGAV1_DLOG(ERROR, "Error decoding partition row: %d column: %d", row4x4,
                    column4x4);
       return false;
@@ -2437,18 +2470,14 @@
     }
     uint8_t* residual_buffer =
         residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
-    if (!ProcessPartition(row4x4, column4x4,
-                          block_parameters_holder_.Tree(row, column),
-                          scratch_buffer, &residual_buffer)) {
+    if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
+                          &residual_buffer)) {
       LIBGAV1_DLOG(ERROR, "Error parsing partition row: %d column: %d", row4x4,
                    column4x4);
       return false;
     }
   } else {
-    uint8_t* residual_buffer =
-        residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
-    if (!DecodeSuperBlock(block_parameters_holder_.Tree(row, column),
-                          scratch_buffer, &residual_buffer)) {
+    if (!DecodeSuperBlock(sb_row_index, sb_column_index, scratch_buffer)) {
       LIBGAV1_DLOG(ERROR, "Error decoding superblock row: %d column: %d",
                    row4x4, column4x4);
       return false;
@@ -2459,26 +2488,23 @@
   return true;
 }
 
-bool Tile::DecodeSuperBlock(ParameterTree* const tree,
-                            TileScratchBuffer* const scratch_buffer,
-                            ResidualPtr* residual) {
-  Stack<ParameterTree*, kDfsStackSize> stack;
-  stack.Push(tree);
-  do {
-    ParameterTree* const node = stack.Pop();
-    if (node->partition() != kPartitionNone) {
-      for (int i = 3; i >= 0; --i) {
-        if (node->children(i) == nullptr) continue;
-        stack.Push(node->children(i));
-      }
-      continue;
-    }
-    if (!DecodeBlock(node, scratch_buffer, residual)) {
+bool Tile::DecodeSuperBlock(int sb_row_index, int sb_column_index,
+                            TileScratchBuffer* const scratch_buffer) {
+  uint8_t* residual_buffer =
+      residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
+  Queue<PartitionTreeNode>& partition_tree_order =
+      *residual_buffer_threaded_[sb_row_index][sb_column_index]
+           ->partition_tree_order();
+  while (!partition_tree_order.Empty()) {
+    PartitionTreeNode block = partition_tree_order.Front();
+    if (!DecodeBlock(block.row4x4, block.column4x4, block.block_size,
+                     scratch_buffer, &residual_buffer)) {
       LIBGAV1_DLOG(ERROR, "Error decoding block row: %d column: %d",
-                   node->row4x4(), node->column4x4());
+                   block.row4x4, block.column4x4);
       return false;
     }
-  } while (!stack.Empty());
+    partition_tree_order.Pop();
+  }
   return true;
 }
 
diff --git a/libgav1/src/utils/array_2d.h b/libgav1/src/utils/array_2d.h
index 2df6241..df2da9f 100644
--- a/libgav1/src/utils/array_2d.h
+++ b/libgav1/src/utils/array_2d.h
@@ -120,7 +120,7 @@
   const T* operator[](int row) const { return data_view_[row]; }
 
  private:
-  std::unique_ptr<T[]> data_ = nullptr;
+  std::unique_ptr<T[]> data_;
   size_t allocated_size_ = 0;
   size_t size_ = 0;
   Array2DView<T> data_view_;
diff --git a/libgav1/src/utils/block_parameters_holder.cc b/libgav1/src/utils/block_parameters_holder.cc
index 79bb2b8..3bb9f1e 100644
--- a/libgav1/src/utils/block_parameters_holder.cc
+++ b/libgav1/src/utils/block_parameters_holder.cc
@@ -19,53 +19,29 @@
 #include "src/utils/common.h"
 #include "src/utils/constants.h"
 #include "src/utils/logging.h"
-#include "src/utils/parameter_tree.h"
 #include "src/utils/types.h"
 
 namespace libgav1 {
 
-namespace {
-
-// Returns the number of super block rows/columns for |value4x4| where value4x4
-// is either rows4x4 or columns4x4.
-int RowsOrColumns4x4ToSuperBlocks(int value4x4, bool use_128x128_superblock) {
-  return use_128x128_superblock ? DivideBy128(MultiplyBy4(value4x4) + 127)
-                                : DivideBy64(MultiplyBy4(value4x4) + 63);
-}
-
-}  // namespace
-
-bool BlockParametersHolder::Reset(int rows4x4, int columns4x4,
-                                  bool use_128x128_superblock) {
+bool BlockParametersHolder::Reset(int rows4x4, int columns4x4) {
   rows4x4_ = rows4x4;
   columns4x4_ = columns4x4;
-  use_128x128_superblock_ = use_128x128_superblock;
-  if (!block_parameters_cache_.Reset(rows4x4_, columns4x4_)) {
-    LIBGAV1_DLOG(ERROR, "block_parameters_cache_.Reset() failed.");
-    return false;
+  index_ = 0;
+  return block_parameters_cache_.Reset(rows4x4_, columns4x4_) &&
+         block_parameters_.Resize(rows4x4_ * columns4x4_);
+}
+
+BlockParameters* BlockParametersHolder::Get(int row4x4, int column4x4,
+                                            BlockSize block_size) {
+  const size_t index = index_.fetch_add(1, std::memory_order_relaxed);
+  if (index >= block_parameters_.size()) return nullptr;
+  auto& bp = block_parameters_.get()[index];
+  if (bp == nullptr) {
+    bp.reset(new (std::nothrow) BlockParameters);
+    if (bp == nullptr) return nullptr;
   }
-  const int rows =
-      RowsOrColumns4x4ToSuperBlocks(rows4x4_, use_128x128_superblock_);
-  const int columns =
-      RowsOrColumns4x4ToSuperBlocks(columns4x4_, use_128x128_superblock_);
-  const BlockSize sb_size =
-      use_128x128_superblock_ ? kBlock128x128 : kBlock64x64;
-  const int multiplier = kNum4x4BlocksWide[sb_size];
-  if (!trees_.Reset(rows, columns)) {
-    LIBGAV1_DLOG(ERROR, "trees_.Reset() failed.");
-    return false;
-  }
-  for (int i = 0; i < rows; ++i) {
-    for (int j = 0; j < columns; ++j) {
-      trees_[i][j] =
-          ParameterTree::Create(i * multiplier, j * multiplier, sb_size);
-      if (trees_[i][j] == nullptr) {
-        LIBGAV1_DLOG(ERROR, "Allocation of trees_[%d][%d] failed.", i, j);
-        return false;
-      }
-    }
-  }
-  return true;
+  FillCache(row4x4, column4x4, block_size, bp.get());
+  return bp.get();
 }
 
 void BlockParametersHolder::FillCache(int row4x4, int column4x4,
diff --git a/libgav1/src/utils/block_parameters_holder.h b/libgav1/src/utils/block_parameters_holder.h
index 35543c3..ca36907 100644
--- a/libgav1/src/utils/block_parameters_holder.h
+++ b/libgav1/src/utils/block_parameters_holder.h
@@ -17,18 +17,18 @@
 #ifndef LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
 #define LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
 
+#include <atomic>
 #include <memory>
 
 #include "src/utils/array_2d.h"
 #include "src/utils/compiler_attributes.h"
 #include "src/utils/constants.h"
-#include "src/utils/parameter_tree.h"
+#include "src/utils/dynamic_buffer.h"
 #include "src/utils/types.h"
 
 namespace libgav1 {
 
-// Holds a 2D array of |ParameterTree| objects. Each tree stores the parameters
-// corresponding to a superblock.
+// Holds the BlockParameters pointer for each 4x4 block in the frame.
 class BlockParametersHolder {
  public:
   BlockParametersHolder() = default;
@@ -37,10 +37,13 @@
   BlockParametersHolder(const BlockParametersHolder&) = delete;
   BlockParametersHolder& operator=(const BlockParametersHolder&) = delete;
 
-  // If |use_128x128_superblock| is true, 128x128 superblocks will be used,
-  // otherwise 64x64 superblocks will be used.
-  LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4,
-                                     bool use_128x128_superblock);
+  LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4);
+
+  // Returns a pointer to a BlockParameters object that can be used safely until
+  // the next call to Reset(). Returns nullptr on memory allocation failure. It
+  // also fills the cache matrix for the block starting at |row4x4|, |column4x4|
+  // of size |block_size| with the returned pointer.
+  BlockParameters* Get(int row4x4, int column4x4, BlockSize block_size);
 
   // Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This
   // is done as a simple look up of the |block_parameters_cache_| matrix.
@@ -59,20 +62,24 @@
 
   int columns4x4() const { return columns4x4_; }
 
-  // Returns the ParameterTree corresponding to superblock starting at (|row|,
-  // |column|).
-  ParameterTree* Tree(int row, int column) { return trees_[row][column].get(); }
+ private:
+  // Needs access to FillCache for testing Cdef.
+  template <int bitdepth, typename Pixel>
+  friend class PostFilterApplyCdefTest;
 
-  // Fills the cache matrix for the block starting at |row4x4|, |column4x4| of
-  // size |block_size| with the pointer |bp|.
   void FillCache(int row4x4, int column4x4, BlockSize block_size,
                  BlockParameters* bp);
 
- private:
   int rows4x4_ = 0;
   int columns4x4_ = 0;
-  bool use_128x128_superblock_ = false;
-  Array2D<std::unique_ptr<ParameterTree>> trees_;
+
+  // Owns the memory of BlockParameters pointers for the entire frame. It can
+  // hold up to |rows4x4_| * |columns4x4_| objects. Each object will be allocated
+  // on demand and re-used across frames.
+  DynamicBuffer<std::unique_ptr<BlockParameters>> block_parameters_;
+
+  // Points to the next available index of |block_parameters_|.
+  std::atomic<int> index_;
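The replacement for the ParameterTree array is a flat pool of lazily allocated BlockParameters claimed through an atomic counter; the relaxed fetch_add suggests the only coordination needed between parsing threads is handing out distinct indices, while the objects themselves survive Reset() for reuse on later frames. A stripped-down sketch of that pattern, with hypothetical SlotPool/BlockParams names rather than the real classes:

#include <atomic>
#include <cstddef>
#include <memory>
#include <new>
#include <vector>

struct BlockParams {
  int size = 0;  // Stand-in for the real per-block fields.
};

class SlotPool {
 public:
  // Grows the pool if needed and rewinds the claim index. Previously
  // allocated objects are kept so later frames can reuse them.
  void Reset(std::size_t size) {
    if (slots_.size() < size) slots_.resize(size);
    index_.store(0, std::memory_order_relaxed);
  }

  // Safe to call from multiple threads: each caller receives a distinct
  // slot. Returns nullptr when the pool is exhausted or allocation fails.
  BlockParams* Get() {
    const std::size_t index = index_.fetch_add(1, std::memory_order_relaxed);
    if (index >= slots_.size()) return nullptr;
    auto& slot = slots_[index];
    if (slot == nullptr) slot.reset(new (std::nothrow) BlockParams);
    return slot.get();
  }

 private:
  std::vector<std::unique_ptr<BlockParams>> slots_;
  std::atomic<std::size_t> index_{0};
};

int main() {
  SlotPool pool;
  pool.Reset(4);
  BlockParams* a = pool.Get();
  BlockParams* b = pool.Get();
  return (a != nullptr && b != nullptr && a != b) ? 0 : 1;
}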
 
   // This is a 2d array of size |rows4x4_| * |columns4x4_|. This is filled in by
   // FillCache() and used by Find() to perform look ups using exactly one look
diff --git a/libgav1/src/utils/common.h b/libgav1/src/utils/common.h
index 8caad2e..2e599f0 100644
--- a/libgav1/src/utils/common.h
+++ b/libgav1/src/utils/common.h
@@ -30,12 +30,12 @@
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
-#include <cstdlib>
 #include <cstring>
 #include <type_traits>
 
 #include "src/utils/bit_mask_set.h"
 #include "src/utils/constants.h"
+#include "src/utils/memory.h"
 #include "src/utils/types.h"
 
 namespace libgav1 {
@@ -58,6 +58,17 @@
   return value < low ? low : (value > high ? high : value);
 }
 
+template <typename Pixel>
+void ExtendLine(void* const line_start, const int width, const int left,
+                const int right) {
+  auto* const start = static_cast<Pixel*>(line_start);
+  const Pixel* src = start;
+  Pixel* dst = start - left;
+  // Copy to left and right borders.
+  Memset(dst, src[0], left);
+  Memset(dst + left + width, src[width - 1], right);
+}
+
 // The following 2 templates set a block of data with uncontiguous memory to
 // |value|. The compilers usually generate several branches to handle different
 // cases of |columns| when inlining memset() and std::fill(), and these branches
@@ -110,7 +121,7 @@
   const unsigned char bit_set = _BitScanReverse(&first_set_bit, n);
   assert(bit_set != 0);
   static_cast<void>(bit_set);
-  return 31 - static_cast<int>(first_set_bit);
+  return 31 ^ static_cast<int>(first_set_bit);
 }
 
 inline int CountLeadingZeros(uint64_t n) {
@@ -119,20 +130,20 @@
 #if defined(HAVE_BITSCANREVERSE64)
   const unsigned char bit_set =
       _BitScanReverse64(&first_set_bit, static_cast<unsigned __int64>(n));
-#else  // !defined(HAVE_BITSCANREVERSE64)
+#else   // !defined(HAVE_BITSCANREVERSE64)
   const auto n_hi = static_cast<unsigned long>(n >> 32);  // NOLINT(runtime/int)
   if (n_hi != 0) {
     const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi);
     assert(bit_set != 0);
     static_cast<void>(bit_set);
-    return 31 - static_cast<int>(first_set_bit);
+    return 31 ^ static_cast<int>(first_set_bit);
   }
   const unsigned char bit_set = _BitScanReverse(
       &first_set_bit, static_cast<unsigned long>(n));  // NOLINT(runtime/int)
 #endif  // defined(HAVE_BITSCANREVERSE64)
   assert(bit_set != 0);
   static_cast<void>(bit_set);
-  return 63 - static_cast<int>(first_set_bit);
+  return 63 ^ static_cast<int>(first_set_bit);
 }
 
 #undef HAVE_BITSCANREVERSE64
@@ -185,22 +196,22 @@
 
 inline int FloorLog2(int32_t n) {
   assert(n > 0);
-  return 31 - CountLeadingZeros(static_cast<uint32_t>(n));
+  return 31 ^ CountLeadingZeros(static_cast<uint32_t>(n));
 }
 
 inline int FloorLog2(uint32_t n) {
   assert(n > 0);
-  return 31 - CountLeadingZeros(n);
+  return 31 ^ CountLeadingZeros(n);
 }
 
 inline int FloorLog2(int64_t n) {
   assert(n > 0);
-  return 63 - CountLeadingZeros(static_cast<uint64_t>(n));
+  return 63 ^ CountLeadingZeros(static_cast<uint64_t>(n));
 }
 
 inline int FloorLog2(uint64_t n) {
   assert(n > 0);
-  return 63 - CountLeadingZeros(n);
+  return 63 ^ CountLeadingZeros(n);
 }
 
 inline int CeilLog2(unsigned int n) {
@@ -211,8 +222,9 @@
   return (n < 2) ? 0 : FloorLog2(n - 1) + 1;
 }
 
-constexpr int Ceil(int dividend, int divisor) {
-  return dividend / divisor + static_cast<int>(dividend % divisor != 0);
+inline int RightShiftWithCeiling(int value, int bits) {
+  assert(bits > 0);
+  return (value + (1 << bits) - 1) >> bits;
 }
 
 inline int32_t RightShiftWithRounding(int32_t value, int bits) {
@@ -363,7 +375,7 @@
 // behavior and result apply to other CPUs' SIMD instructions.
 inline int GetRelativeDistance(const unsigned int a, const unsigned int b,
                                const unsigned int order_hint_shift_bits) {
-  const int diff = a - b;
+  const int diff = static_cast<int>(a) - static_cast<int>(b);
   assert(order_hint_shift_bits <= 31);
   if (order_hint_shift_bits == 0) {
     assert(a == 0);
@@ -510,6 +522,8 @@
   return filter_index;
 }
 
+// This produces results identical to RightShiftWithRounding since
+// |subsampling| can only be 0 or 1.
 constexpr int SubsampledValue(int value, int subsampling) {
   return (value + subsampling) >> subsampling;
 }
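The 31 ^ ... form relies on CountLeadingZeros returning a value in [0, 31] for a nonzero 32-bit input; since 31 has all five low bits set, the XOR equals the subtraction. RightShiftWithCeiling covers the power-of-two-divisor case of the removed Ceil(). A standalone check of both identities, assuming GCC/Clang builtins (not from this change):

#include <cassert>
#include <cstdint>

// Illustrative helpers mirroring the shapes above; not the library definitions.
int FloorLog2Xor(uint32_t n) { return 31 ^ __builtin_clz(n); }  // requires n > 0
int FloorLog2Sub(uint32_t n) { return 31 - __builtin_clz(n); }  // requires n > 0
int CeilShift(int value, int bits) { return (value + (1 << bits) - 1) >> bits; }

int main() {
  for (uint32_t n = 1; n < 4096; ++n) {
    assert(FloorLog2Xor(n) == FloorLog2Sub(n));
  }
  assert(CeilShift(17, 2) == 5);  // ceil(17 / 4)
  assert(CeilShift(16, 2) == 4);  // exact multiple
  return 0;
}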
diff --git a/libgav1/src/utils/constants.cc b/libgav1/src/utils/constants.cc
index 97959fa..80d7acb 100644
--- a/libgav1/src/utils/constants.cc
+++ b/libgav1/src/utils/constants.cc
@@ -871,311 +871,4 @@
 const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes] = {
     {0, 1}, {2, 2}, {3, 3}};
 
-const int8_t kMaskIdLookup[4][kMaxBlockSizes] = {
-    // transform size 4x4.
-    {0,  1,  13, 2, 3,  4,  15, 14, 5,  6,  7,
-     17, 16, 8,  9, 10, 18, 11, 12, -1, -1, -1},
-    // transform size 8x8.
-    {-1, -1, -1, -1, 19, 20, 29, -1, 21, 22, 23,
-     31, 30, 24, 25, 26, 32, 27, 28, -1, -1, -1},
-    // transform size 16x16.
-    {-1, -1, -1, -1, -1, -1, -1, -1, -1, 33, 34,
-     40, -1, 35, 36, 37, 41, 38, 39, -1, -1, -1},
-    // transform size 32x32.
-    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-     -1, -1, -1, 42, 43, -1, 44, 45, -1, -1, -1},
-};
-
-const int8_t kVerticalBorderMaskIdLookup[kMaxBlockSizes] = {
-    0,  47, 61, 49, 19, 51, 63, 62, 53, 33, 55,
-    65, 64, 57, 42, 59, 66, 60, 46, -1, -1, -1};
-
-const uint64_t kTopMaskLookup[67][4] = {
-    // transform size 4X4
-    {0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 4X4, transform size 4X4
-    {0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 4X8, transform size 4X4
-    {0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X4, transform size 4X4
-    {0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X8, transform size 4X4
-    {0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X16, transform size 4X4
-    {0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X8, transform size 4X4
-    {0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X16, transform size 4X4
-    {0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X32, transform size 4X4
-    {0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X16, transform size 4X4
-    {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X32, transform size 4X4
-    {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
-     0x00ff00ff00ff00ffULL},  // block size 32X64, transform size 4X4
-    {0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X32, transform size 4X4
-    {0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
-     0xffffffffffffffffULL},  // block size 64X64, transform size 4x4
-    {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 4X16, transform size 4X4
-    {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X4, transform size 4X4
-    {0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X32, transform size 4X4
-    {0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X8, transform size 4X4
-    {0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
-     0x000f000f000f000fULL},  // block size 16X64, transform size 4X4
-    {0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X16, transform size 4X4
-    // transform size 8X8
-    {0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X8, transform size 8X8
-    {0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X16, transform size 8X8
-    {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X8, transform size 8X8
-    {0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X16, transform size 8X8
-    {0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X32, transform size 8X8
-    {0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X16, transform size 8X8
-    {0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X32, transform size 8X8
-    {0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
-     0x000000ff000000ffULL},  // block size 32X64, transform size 8X8
-    {0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X32, transform size 8X8
-    {0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
-     0x0000ffff0000ffffULL},  // block size 64X64, transform size 8X8
-    {0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X32, transform size 8X8
-    {0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X8, transform size 8X8
-    {0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
-     0x0000000f0000000fULL},  // block size 16X64, transform size 8X8
-    {0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X16, transform size 8X8
-    // transform size 16X16
-    {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X16, transform size 16X16
-    {0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X32, transform size 16X16
-    {0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X16, transform size 16X16
-    {0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X32, transform size 16X16
-    {0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
-     0x00000000000000ffULL},  // block size 32X64, transform size 16X16
-    {0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X32, transform size 16X16
-    {0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
-     0x000000000000ffffULL},  // block size 64X64, transform size 16X16
-    {0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
-     0x000000000000000fULL},  // block size 16X64, transform size 16X16
-    {0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X16, transform size 16X16
-    // transform size 32X32
-    {0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X32, transform size 32X32
-    {0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
-     0x0000000000000000ULL},  // block size 32X64, transform size 32X32
-    {0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X32, transform size 32X32
-    {0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
-     0x0000000000000000ULL},  // block size 64X64, transform size 32X32
-    // transform size 64X64
-    {0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X64, transform size 64X64
-    // 2:1, 1:2 transform sizes.
-    {0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 4X8, transform size 4X8
-    {0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 4X16, transform size 4X8
-    {0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X4, transform size 8X4
-    {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X4, transform size 8X4
-    {0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X16, transform size 8X16
-    {0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X32, transform size 8X16
-    {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X8, transform size 16X8
-    {0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X8, transform size 16X8
-    {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X32, transform size 16X32
-    {0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
-     0x0000000000000000ULL},  // block size 16X64, transform size 16X32
-    {0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X16, transform size 32X16
-    {0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X16, transform size 32X16
-    {0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X64, transform size 32X64
-    {0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X32, transform size 64X32
-    // 4:1, 1:4 transform sizes.
-    {0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 4X16, transform size 4X16
-    {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X4, transform size 16X4
-    {0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X32, transform size 8X32
-    {0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X8, transform size 32X8
-    {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X64, transform size 16X64
-    {0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X16, transform size 64X16
-};
-
-const uint64_t kLeftMaskLookup[67][4] = {
-    // transform size 4X4
-    {0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 4X4, transform size 4X4
-    {0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 4X8, transform size 4X4
-    {0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X4, transform size 4X4
-    {0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X8, transform size 4X4
-    {0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X16, transform size 4X4
-    {0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X8, transform size 4X4
-    {0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X16, transform size 4X4
-    {0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X32, transform size 4X4
-    {0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X16, transform size 4X4
-    {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X32, transform size 4X4
-    {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
-     0x00ff00ff00ff00ffULL},  // block size 32X64, transform size 4X4
-    {0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X32, transform size 4X4
-    {0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
-     0xffffffffffffffffULL},  // block size 64X64, transform size 4X4
-    {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 4X16, transform size 4X4
-    {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X4, transform size 4X4
-    {0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X32, transform size 4X4
-    {0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X8, transform size 4X4
-    {0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
-     0x000f000f000f000fULL},  // block size 16X64, transform size 4X4
-    {0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X16, transform size 4X4
-    // transform size 8X8
-    {0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X8, transform size 8X8
-    {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X16, transform size 8X8
-    {0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X8, transform size 8X8
-    {0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X16, transform size 8X8
-    {0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X32, transform size 8X8
-    {0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X16, transform size 8X8
-    {0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X32, transform size 8X8
-    {0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
-     0x0055005500550055ULL},  // block size 32X64, transform size 8X8
-    {0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X32, transform size 8X8
-    {0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
-     0x5555555555555555ULL},  // block size 64X64, transform size 8X8
-    {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X32, transform size 8X8
-    {0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X8, transform size 8X8
-    {0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
-     0x0005000500050005ULL},  // block size 16X64, transform size 8X8
-    {0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X16, transform size 8X8
-    // transform size 16X16
-    {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X16, transform size 16X16
-    {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X32, transform size 16X16
-    {0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X16, transform size 16X16
-    {0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X32, transform size 16X16
-    {0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
-     0x0011001100110011ULL},  // block size 32X64, transform size 16X16
-    {0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X32, transform size 16X16
-    {0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
-     0x1111111111111111ULL},  // block size 64X64, transform size 16X16
-    {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-     0x0001000100010001ULL},  // block size 16X64, transform size 16X16
-    {0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X16, transform size 16X16
-    // transform size 32X32
-    {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X32, transform size 32X32
-    {0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
-     0x0101010101010101ULL},  // block size 32X64, transform size 32X32
-    {0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X32, transform size 32X32
-    {0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
-     0x0101010101010101ULL},  // block size 64X64, transform size 32X32
-    // transform size 64X64
-    {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-     0x0001000100010001ULL},  // block size 64X64, transform size 64X64
-    // 2:1, 1:2 transform sizes.
-    {0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 4X8, transform size 4X8
-    {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 4X16, transform size 4X8
-    {0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X4, transform size 8X4
-    {0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X4, transform size 8X4
-    {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X16, transform size 8X16
-    {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X32, transform size 8X16
-    {0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X8, transform size 16X8
-    {0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X8, transform size 16X8
-    {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X32, transform size 16X32
-    {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-     0x0001000100010001ULL},  // block size 16X64, transform size 16X32
-    {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X16, transform size 32X16
-    {0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X16, transform size 32X16
-    {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-     0x0001000100010001ULL},  // block size 32X64, transform size 32X64
-    {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X32, transform size 64X32
-    // 4:1, 1:4 transform sizes.
-    {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 4X16, transform size 4X16
-    {0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 16X4, transform size 16X4
-    {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 8X32, transform size 8X32
-    {0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 32X8, transform size 32X8
-    {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-     0x0001000100010001ULL},  // block size 16X64, transform size 16X64
-    {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-     0x0000000000000000ULL},  // block size 64X16, transform size 64X16
-};
-
 }  // namespace libgav1
diff --git a/libgav1/src/utils/constants.h b/libgav1/src/utils/constants.h
index ce987b4..a2076c5 100644
--- a/libgav1/src/utils/constants.h
+++ b/libgav1/src/utils/constants.h
@@ -44,6 +44,8 @@
   kMinQuantizer = 0,
   kMinLossyQuantizer = 1,
   kMaxQuantizer = 255,
+  // Quantizer matrix is used only when level < 15.
+  kNumQuantizerLevelsForQuantizerMatrix = 15,
   kFrameLfCount = 4,
   kMaxLoopFilterValue = 63,
   kNum4x4In64x64 = 256,
@@ -106,6 +108,7 @@
   kMaxScaledSuperBlockSizeInPixels = 128 * 2,
   kMaxSuperBlockSizeSquareInPixels = 128 * 128,
   kNum4x4InLoopFilterUnit = 16,
+  kNum4x4InLoopRestorationUnit = 16,
   kProjectionMvClamp = (1 << 14) - 1,  // == 16383
   kProjectionMvMaxHorizontalOffset = 8,
   kCdefUnitSize = 64,
@@ -124,11 +127,12 @@
   kSuperResScaleBits = 14,
   kSuperResExtraBits = kSuperResScaleBits - kSuperResFilterBits,
   kSuperResScaleMask = (1 << 14) - 1,
-  kSuperResHorizontalBorder = 8,
+  kSuperResHorizontalBorder = 4,
   kSuperResVerticalBorder = 1,
-  // The SIMD implementations of superres calculate up to 4 extra upscaled
-  // pixels which will over-read 2 downscaled pixels in the end of each row.
-  kSuperResHorizontalPadding = 2,
+  // The SIMD implementations of superres calculate up to 15 extra upscaled
+  // pixels which may over-read up to 15 downscaled pixels at the end of each
+  // row. Set the padding to 16 for alignment purposes.
+  kSuperResHorizontalPadding = 16,
   // TODO(chengchen): consider merging these constants:
   // kFilterBits, kWienerFilterBits, and kSgrProjPrecisionBits, which are all 7,
   // They are designed to match AV1 convolution, which increases coeff
@@ -625,6 +629,52 @@
   abort();
 }
 
+inline const char* ToString(const TransformSize size) {
+  switch (size) {
+    case kTransformSize4x4:
+      return "kTransformSize4x4";
+    case kTransformSize4x8:
+      return "kTransformSize4x8";
+    case kTransformSize4x16:
+      return "kTransformSize4x16";
+    case kTransformSize8x4:
+      return "kTransformSize8x4";
+    case kTransformSize8x8:
+      return "kTransformSize8x8";
+    case kTransformSize8x16:
+      return "kTransformSize8x16";
+    case kTransformSize8x32:
+      return "kTransformSize8x32";
+    case kTransformSize16x4:
+      return "kTransformSize16x4";
+    case kTransformSize16x8:
+      return "kTransformSize16x8";
+    case kTransformSize16x16:
+      return "kTransformSize16x16";
+    case kTransformSize16x32:
+      return "kTransformSize16x32";
+    case kTransformSize16x64:
+      return "kTransformSize16x64";
+    case kTransformSize32x8:
+      return "kTransformSize32x8";
+    case kTransformSize32x16:
+      return "kTransformSize32x16";
+    case kTransformSize32x32:
+      return "kTransformSize32x32";
+    case kTransformSize32x64:
+      return "kTransformSize32x64";
+    case kTransformSize64x16:
+      return "kTransformSize64x16";
+    case kTransformSize64x32:
+      return "kTransformSize64x32";
+    case kTransformSize64x64:
+      return "kTransformSize64x64";
+    case kNumTransformSizes:
+      return "kNumTransformSizes";
+  }
+  abort();
+}
+
 inline const char* ToString(const TransformType type) {
   switch (type) {
     case kTransformTypeDctDct:
@@ -735,14 +785,6 @@
 
 extern const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes];
 
-extern const int8_t kMaskIdLookup[4][kMaxBlockSizes];
-
-extern const int8_t kVerticalBorderMaskIdLookup[kMaxBlockSizes];
-
-extern const uint64_t kTopMaskLookup[67][4];
-
-extern const uint64_t kLeftMaskLookup[67][4];
-
 }  // namespace libgav1
 
 #endif  // LIBGAV1_SRC_UTILS_CONSTANTS_H_
diff --git a/libgav1/src/utils/cpu.cc b/libgav1/src/utils/cpu.cc
index a6b7057..b3c51da 100644
--- a/libgav1/src/utils/cpu.cc
+++ b/libgav1/src/utils/cpu.cc
@@ -39,7 +39,7 @@
   __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx));
   return (static_cast<uint64_t>(edx) << 32) | eax;
 }
-#else  // _MSC_VER
+#else   // _MSC_VER
 void CpuId(int leaf, uint32_t info[4]) {
   __cpuidex(reinterpret_cast<int*>(info), leaf, 0 /*ecx=subleaf*/);
 }
diff --git a/libgav1/src/utils/cpu.h b/libgav1/src/utils/cpu.h
index d098f1d..aefc2df 100644
--- a/libgav1/src/utils/cpu.h
+++ b/libgav1/src/utils/cpu.h
@@ -21,19 +21,58 @@
 
 namespace libgav1 {
 
-#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#if defined(__i386__) || defined(__x86_64__)
+#define LIBGAV1_X86
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#define LIBGAV1_X86
 #define LIBGAV1_X86_MSVC
 #endif
 
-#if !defined(LIBGAV1_ENABLE_SSE4_1)
-#if defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC)
-#define LIBGAV1_ENABLE_SSE4_1 1
-#else
-#define LIBGAV1_ENABLE_SSE4_1 0
-#endif
-#endif  // !defined(LIBGAV1_ENABLE_SSE4_1)
+#if defined(LIBGAV1_X86)
 
-#undef LIBGAV1_X86_MSVC
+#if !defined(LIBGAV1_ENABLE_SSE4_1)
+#define LIBGAV1_ENABLE_SSE4_1 1
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+#if !defined(LIBGAV1_ENABLE_AVX2)
+#define LIBGAV1_ENABLE_AVX2 1
+#endif  // !defined(LIBGAV1_ENABLE_AVX2)
+#else   // !LIBGAV1_ENABLE_SSE4_1
+// Disable AVX2 when SSE4.1 is disabled as it may rely on shared components.
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#endif  // LIBGAV1_ENABLE_SSE4_1
+
+#else  // !LIBGAV1_X86
+
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#undef LIBGAV1_ENABLE_SSE4_1
+#define LIBGAV1_ENABLE_SSE4_1 0
+
+#endif  // LIBGAV1_X86
+
+// For x86, LIBGAV1_TARGETING_* indicates that the source being built targets
+// (at least) that instruction set. This prevents disabling other instruction
+// sets when the current instruction set isn't a global target, e.g., building
+// *_avx2.cc with -mavx2 while the remaining files are built without the flag.
+#if LIBGAV1_ENABLE_AVX2 && defined(__AVX2__)
+#define LIBGAV1_TARGETING_AVX2 1
+#else
+#define LIBGAV1_TARGETING_AVX2 0
+#endif
+
+// Note: LIBGAV1_X86_MSVC isn't completely correct for Visual Studio, but there
+// is no equivalent to __SSE4_1__. LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS will be
+// enabled in dsp.h to compensate for this.
+#if LIBGAV1_ENABLE_SSE4_1 && (defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC))
+#define LIBGAV1_TARGETING_SSE4_1 1
+#else
+#define LIBGAV1_TARGETING_SSE4_1 0
+#endif
+
+#undef LIBGAV1_X86
 
 #if !defined(LIBGAV1_ENABLE_NEON)
 // TODO(jzern): add support for _M_ARM64.
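A minimal sketch (not a file from this change) of how a per-file targeting macro such as LIBGAV1_TARGETING_AVX2 can gate an AVX2-only translation unit without disturbing files built without -mavx2; the function name InitCdefAvx2 is hypothetical.

#include "src/utils/cpu.h"

#if LIBGAV1_TARGETING_AVX2
// Compiled only when this translation unit itself is built with AVX2 enabled
// (e.g., an *_avx2.cc file built with -mavx2), even if the rest of the build
// is not.
void InitCdefAvx2() {
  // ... register AVX2 specializations here ...
}
#else
void InitCdefAvx2() {}  // No-op when AVX2 isn't targeted for this file.
#endif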
diff --git a/libgav1/src/utils/dynamic_buffer.h b/libgav1/src/utils/dynamic_buffer.h
index 5e2f644..40ece26 100644
--- a/libgav1/src/utils/dynamic_buffer.h
+++ b/libgav1/src/utils/dynamic_buffer.h
@@ -28,6 +28,7 @@
 class DynamicBuffer {
  public:
   T* get() { return buffer_.get(); }
+  const T* get() const { return buffer_.get(); }
 
   // Resizes the buffer so that it can hold at least |size| elements. Existing
   // contents will be destroyed when resizing to a larger size.
@@ -45,6 +46,8 @@
     return true;
   }
 
+  size_t size() const { return size_; }
+
  private:
   std::unique_ptr<T[]> buffer_;
   size_t size_ = 0;
diff --git a/libgav1/src/utils/entropy_decoder.cc b/libgav1/src/utils/entropy_decoder.cc
index dfe3bba..bf21199 100644
--- a/libgav1/src/utils/entropy_decoder.cc
+++ b/libgav1/src/utils/entropy_decoder.cc
@@ -20,6 +20,7 @@
 #include "src/utils/common.h"
 #include "src/utils/compiler_attributes.h"
 #include "src/utils/constants.h"
+#include "src/utils/cpu.h"
 
 #if defined(__ARM_NEON__) || defined(__aarch64__) || \
     (defined(_MSC_VER) && defined(_M_ARM))
@@ -32,24 +33,20 @@
 #include <arm_neon.h>
 #endif
 
-#if defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC)
-#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 1
+#if defined(__SSE2__) || defined(LIBGAV1_X86_MSVC)
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 1
 #else
-#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 0
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 0
 #endif
 
-#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
-#include <smmintrin.h>
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#include <emmintrin.h>
 #endif
 
 namespace libgav1 {
 namespace {
 
 constexpr uint32_t kReadBitMask = ~255;
-// This constant is used to set the value of |bits_| so that bits can be read
-// after end of stream without trying to refill the buffer for a reasonably long
-// time.
-constexpr int kLargeBitCount = 0x4000;
 constexpr int kCdfPrecision = 6;
 constexpr int kMinimumProbabilityPerSymbol = 4;
 
@@ -78,10 +75,12 @@
   //  count >> 4 is 2 for count == 31.
   // Now, the equation becomes:
   //  4 + (count >> 4) + (symbol_count > 3).
-  // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
-  // with bitwise or. So the final equation is:
-  // (4 | (count >> 4)) + (symbol_count > 3).
-  const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count > 3);
+  // Since (count >> 4) can only be 0, 1, or 2, the addition could be replaced
+  // with a bitwise or:
+  //  (4 | (count >> 4)) + (symbol_count > 3).
+  // However, using addition allows the compiler to eliminate an operation when
+  // symbol_count is known and this function is inlined.
+  const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count > 3);
   // Hints for further optimizations:
   //
   // 1. clang can vectorize this for loop with width 4, even though the loop
@@ -103,13 +102,15 @@
   // signed integer and right-shifted. This requires the right shift of a
   // signed integer be an arithmetic shift, which is true for clang, gcc, and
   // Visual C++.
-  for (int i = 0; i < symbol_count - 1; ++i) {
+  assert(symbol_count - 1 > 0);
+  int i = 0;
+  do {
     if (i < symbol) {
       cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
     } else {
       cdf[i] -= cdf[i] >> rate;
     }
-  }
+  } while (++i < symbol_count - 1);
   cdf[symbol_count] += static_cast<uint16_t>(count < 32);
 }
 
@@ -146,8 +147,9 @@
 //     cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
 //   }
 //
-// The following ARM NEON implementations use the second form, which seems
-// slightly faster.
+// The following ARM NEON implementations use a modified version of the first
+// form, using the comparison mask and unsigned rollover to avoid the need to
+// calculate rounding.
 //
 // The cdf array has symbol_count + 1 elements. The first symbol_count elements
 // are the CDF. The last element is a count that is initialized to 0 and may
@@ -169,42 +171,47 @@
 void UpdateCdf5(uint16_t* const cdf, const int symbol) {
   uint16x4_t cdf_vec = vld1_u16(cdf);
   const uint16_t count = cdf[5];
-  const int rate = (4 | (count >> 4)) + 1;
-  const uint16x4_t zero = vdup_n_u16(0);
-  const uint16x4_t cdf_max_probability =
-      vdup_n_u16(kCdfMaxProbability + 1 - (1 << rate));
+  const int rate = (count >> 4) + 5;
+  const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
   const uint16x4_t index = vcreate_u16(0x0003000200010000);
   const uint16x4_t symbol_vec = vdup_n_u16(symbol);
-  const uint16x4_t mask = vclt_u16(index, symbol_vec);
-  const uint16x4_t a = vbsl_u16(mask, cdf_max_probability, zero);
-  const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(cdf_vec, a));
+  const uint16x4_t mask = vcge_u16(index, symbol_vec);
+  // i < symbol: 32768, i >= symbol: 65535.
+  const uint16x4_t a = vorr_u16(mask, cdf_max_probability);
+  // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+  const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(a, cdf_vec));
+  // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+  const uint16x4_t cdf_offset = vsub_u16(cdf_vec, mask);
   const int16x4_t negative_rate = vdup_n_s16(-rate);
+  // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
   const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
-  cdf_vec = vsub_u16(cdf_vec, delta);
+  // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+  // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+  cdf_vec = vadd_u16(cdf_offset, delta);
   vst1_u16(cdf, cdf_vec);
   cdf[5] = count + static_cast<uint16_t>(count < 32);
 }
 
 // This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
 template <int symbol_count>
 void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
   static_assert(symbol_count >= 7 && symbol_count <= 9, "");
   uint16x8_t cdf_vec = vld1q_u16(cdf);
   const uint16_t count = cdf[symbol_count];
-  const int rate = (4 | (count >> 4)) + 1;
-  const uint16x8_t zero = vdupq_n_u16(0);
-  const uint16x8_t cdf_max_probability =
-      vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
   const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
                                         vcreate_u16(0x0007000600050004));
   const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
-  const uint16x8_t mask = vcltq_u16(index, symbol_vec);
-  const uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
-  const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
+  const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
   const int16x8_t negative_rate = vdupq_n_s16(-rate);
   const uint16x8_t delta =
       vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
-  cdf_vec = vsubq_u16(cdf_vec, delta);
+  cdf_vec = vaddq_u16(cdf_offset, delta);
   vst1q_u16(cdf, cdf_vec);
   cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
 }
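A scalar restatement (not from the source) of the per-lane arithmetic used by the NEON and SSE2 UpdateCdf* helpers in this file, assuming kCdfMaxProbability == 1 << 15: the mask selects 32768 (i < symbol) or 65535 (i >= symbol), and (cdf - mask) + ((a - cdf) >> rate) reproduces the plain forms cdf += (32768 - cdf) >> rate and cdf -= cdf >> rate via 16-bit rollover.

#include <cstdint>

// Illustration only: one uint16_t lane of the vector UpdateCdf* helpers.
// kMaxProb stands in for kCdfMaxProbability.
constexpr uint16_t kMaxProb = 1 << 15;

uint16_t UpdateOneCdfEntry(uint16_t cdf, bool index_ge_symbol, int rate) {
  const uint16_t mask = index_ge_symbol ? 0xFFFFu : 0;  // vcge/cmpgt result
  const uint16_t a = mask | kMaxProb;                   // 32768 or 65535
  const auto diff = static_cast<int16_t>(a - cdf);      // signed, for the arithmetic shift
  const uint16_t offset = cdf - mask;                   // cdf, or cdf + 1 via rollover
  return static_cast<uint16_t>(offset + (diff >> rate));
}
// Equivalent to cdf += (32768 - cdf) >> rate when the index is below the
// symbol, and to cdf -= cdf >> rate otherwise.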
@@ -217,27 +224,31 @@
   UpdateCdf7To9<8>(cdf, symbol);
 }
 
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
 void UpdateCdf11(uint16_t* const cdf, const int symbol) {
   uint16x8_t cdf_vec = vld1q_u16(cdf + 2);
   const uint16_t count = cdf[11];
   cdf[11] = count + static_cast<uint16_t>(count < 32);
-  const int rate = (4 | (count >> 4)) + 1;
+  const int rate = (count >> 4) + 5;
   if (symbol > 1) {
     cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
     cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
-    const uint16x8_t zero = vdupq_n_u16(0);
-    const uint16x8_t cdf_max_probability =
-        vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
+    const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
     const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
     const int16x8_t negative_rate = vdupq_n_s16(-rate);
     const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002),
                                           vcreate_u16(0x0009000800070006));
-    const uint16x8_t mask = vcltq_u16(index, symbol_vec);
-    const uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
-    const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
+    const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+    const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+    const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+    const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
     const uint16x8_t delta =
         vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
-    cdf_vec = vsubq_u16(cdf_vec, delta);
+    cdf_vec = vaddq_u16(cdf_offset, delta);
     vst1q_u16(cdf + 2, cdf_vec);
   } else {
     if (symbol != 0) {
@@ -254,65 +265,67 @@
   }
 }
 
+// See UpdateCdf5 for implementation details.
 void UpdateCdf13(uint16_t* const cdf, const int symbol) {
   uint16x8_t cdf_vec0 = vld1q_u16(cdf);
   uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4);
   const uint16_t count = cdf[13];
-  const int rate = (4 | (count >> 4)) + 1;
-  const uint16x8_t zero = vdupq_n_u16(0);
-  const uint16x8_t cdf_max_probability =
-      vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
   const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
   const int16x8_t negative_rate = vdupq_n_s16(-rate);
 
   uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
                                   vcreate_u16(0x0007000600050004));
-  uint16x8_t mask = vcltq_u16(index, symbol_vec);
-  uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
-  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec0, a));
+  uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec0));
+  uint16x8_t cdf_offset = vsubq_u16(cdf_vec0, mask);
   uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
-  cdf_vec0 = vsubq_u16(cdf_vec0, delta);
+  cdf_vec0 = vaddq_u16(cdf_offset, delta);
   vst1q_u16(cdf, cdf_vec0);
 
   index = vcombine_u16(vcreate_u16(0x0007000600050004),
                        vcreate_u16(0x000b000a00090008));
-  mask = vcltq_u16(index, symbol_vec);
-  a = vbslq_u16(mask, cdf_max_probability, zero);
-  diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec1, a));
+  mask = vcgeq_u16(index, symbol_vec);
+  a = vorrq_u16(mask, cdf_max_probability);
+  diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec1));
+  cdf_offset = vsubq_u16(cdf_vec1, mask);
   delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
-  cdf_vec1 = vsubq_u16(cdf_vec1, delta);
+  cdf_vec1 = vaddq_u16(cdf_offset, delta);
   vst1q_u16(cdf + 4, cdf_vec1);
 
   cdf[13] = count + static_cast<uint16_t>(count < 32);
 }
 
+// See UpdateCdf5 for implementation details.
 void UpdateCdf16(uint16_t* const cdf, const int symbol) {
   uint16x8_t cdf_vec = vld1q_u16(cdf);
   const uint16_t count = cdf[16];
-  const int rate = (4 | (count >> 4)) + 1;
-  const uint16x8_t zero = vdupq_n_u16(0);
-  const uint16x8_t cdf_max_probability =
-      vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
+  const int rate = (count >> 4) + 5;
+  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
   const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
   const int16x8_t negative_rate = vdupq_n_s16(-rate);
 
   uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
                                   vcreate_u16(0x0007000600050004));
-  uint16x8_t mask = vcltq_u16(index, symbol_vec);
-  uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
-  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
+  uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+  uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
   uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
-  cdf_vec = vsubq_u16(cdf_vec, delta);
+  cdf_vec = vaddq_u16(cdf_offset, delta);
   vst1q_u16(cdf, cdf_vec);
 
   cdf_vec = vld1q_u16(cdf + 8);
   index = vcombine_u16(vcreate_u16(0x000b000a00090008),
                        vcreate_u16(0x000f000e000d000c));
-  mask = vcltq_u16(index, symbol_vec);
-  a = vbslq_u16(mask, cdf_max_probability, zero);
-  diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
+  mask = vcgeq_u16(index, symbol_vec);
+  a = vorrq_u16(mask, cdf_max_probability);
+  diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+  cdf_offset = vsubq_u16(cdf_vec, mask);
   delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
-  cdf_vec = vsubq_u16(cdf_vec, delta);
+  cdf_vec = vaddq_u16(cdf_offset, delta);
   vst1q_u16(cdf + 8, cdf_vec);
 
   cdf[16] = count + static_cast<uint16_t>(count < 32);
@@ -320,7 +333,7 @@
 
 #else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
 
-#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
 
 inline __m128i LoadLo8(const void* a) {
   return _mm_loadl_epi64(static_cast<const __m128i*>(a));
@@ -341,39 +354,47 @@
 void UpdateCdf5(uint16_t* const cdf, const int symbol) {
   __m128i cdf_vec = LoadLo8(cdf);
   const uint16_t count = cdf[5];
-  const int rate = (4 | (count >> 4)) + 1;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i cdf_max_probability = _mm_shufflelo_epi16(
-      _mm_cvtsi32_si128(kCdfMaxProbability + 1 - (1 << rate)), 0);
-  const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00030002, 0x00010000);
+  const int rate = (count >> 4) + 5;
+  const __m128i cdf_max_probability =
+      _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+  const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
   const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
-  const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
-  const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
-  const __m128i diff = _mm_sub_epi16(cdf_vec, a);
+  // i >= symbol.
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  // i < symbol: 32768, i >= symbol: 65535.
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+  // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+  // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
-  cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+  // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+  // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+  cdf_vec = _mm_add_epi16(cdf_offset, delta);
   StoreLo8(cdf, cdf_vec);
   cdf[5] = count + static_cast<uint16_t>(count < 32);
 }
 
 // This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
 template <int symbol_count>
 void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
   static_assert(symbol_count >= 7 && symbol_count <= 9, "");
   __m128i cdf_vec = LoadUnaligned16(cdf);
   const uint16_t count = cdf[symbol_count];
-  const int rate = (4 | (count >> 4)) + 1;
-  const __m128i zero = _mm_setzero_si128();
+  const int rate = (count >> 4) + 5;
   const __m128i cdf_max_probability =
-      _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
+      _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
   const __m128i index =
-      _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
-  const __m128i symbol_vec = _mm_set1_epi16(symbol);
-  const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
-  const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
-  const __m128i diff = _mm_sub_epi16(cdf_vec, a);
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+  const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
-  cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+  cdf_vec = _mm_add_epi16(cdf_offset, delta);
   StoreUnaligned16(cdf, cdf_vec);
   cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
 }
@@ -386,25 +407,30 @@
   UpdateCdf7To9<8>(cdf, symbol);
 }
 
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+  UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
 void UpdateCdf11(uint16_t* const cdf, const int symbol) {
   __m128i cdf_vec = LoadUnaligned16(cdf + 2);
   const uint16_t count = cdf[11];
   cdf[11] = count + static_cast<uint16_t>(count < 32);
-  const int rate = (4 | (count >> 4)) + 1;
+  const int rate = (count >> 4) + 5;
   if (symbol > 1) {
     cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
     cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
-    const __m128i zero = _mm_setzero_si128();
     const __m128i cdf_max_probability =
-        _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
+        _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
     const __m128i index =
-        _mm_set_epi32(0x00090008, 0x00070006, 0x00050004, 0x00030002);
-    const __m128i symbol_vec = _mm_set1_epi16(symbol);
-    const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
-    const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
-    const __m128i diff = _mm_sub_epi16(cdf_vec, a);
+        _mm_set_epi32(0x000a0009, 0x00080007, 0x00060005, 0x00040003);
+    const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+    const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+    const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+    const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+    const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
     const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
-    cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+    cdf_vec = _mm_add_epi16(cdf_offset, delta);
     StoreUnaligned16(cdf + 2, cdf_vec);
   } else {
     if (symbol != 0) {
@@ -420,32 +446,33 @@
   }
 }
 
+// See UpdateCdf5 for implementation details.
 void UpdateCdf13(uint16_t* const cdf, const int symbol) {
-  __m128i cdf_vec0 = LoadUnaligned16(cdf);
+  __m128i cdf_vec0 = LoadLo8(cdf);
   __m128i cdf_vec1 = LoadUnaligned16(cdf + 4);
   const uint16_t count = cdf[13];
-  const int rate = (4 | (count >> 4)) + 1;
-  const __m128i zero = _mm_setzero_si128();
+  const int rate = (count >> 4) + 5;
   const __m128i cdf_max_probability =
-      _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
-  const __m128i symbol_vec = _mm_set1_epi16(symbol);
+      _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+  const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
 
-  const __m128i index =
-      _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
-  const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
-  const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
-  const __m128i diff = _mm_sub_epi16(cdf_vec0, a);
+  const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
-  cdf_vec0 = _mm_sub_epi16(cdf_vec0, delta);
-  StoreUnaligned16(cdf, cdf_vec0);
+  cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
+  StoreLo8(cdf, cdf_vec0);
 
   const __m128i index1 =
-      _mm_set_epi32(0x000b000a, 0x00090008, 0x00070006, 0x00050004);
-  const __m128i mask1 = _mm_cmplt_epi16(index1, symbol_vec);
-  const __m128i a1 = _mm_blendv_epi8(zero, cdf_max_probability, mask1);
-  const __m128i diff1 = _mm_sub_epi16(cdf_vec1, a1);
+      _mm_set_epi32(0x000c000b, 0x000a0009, 0x00080007, 0x00060005);
+  const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+  const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+  const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+  const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
   const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
-  cdf_vec1 = _mm_sub_epi16(cdf_vec1, delta1);
+  cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
   StoreUnaligned16(cdf + 4, cdf_vec1);
 
   cdf[13] = count + static_cast<uint16_t>(count < 32);
@@ -454,35 +481,36 @@
 void UpdateCdf16(uint16_t* const cdf, const int symbol) {
   __m128i cdf_vec0 = LoadUnaligned16(cdf);
   const uint16_t count = cdf[16];
-  const int rate = (4 | (count >> 4)) + 1;
-  const __m128i zero = _mm_setzero_si128();
+  const int rate = (count >> 4) + 5;
   const __m128i cdf_max_probability =
-      _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
-  const __m128i symbol_vec = _mm_set1_epi16(symbol);
+      _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+  const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
 
   const __m128i index =
-      _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
-  const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
-  const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
-  const __m128i diff = _mm_sub_epi16(cdf_vec0, a);
+      _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+  const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+  const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+  const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
+  const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
-  cdf_vec0 = _mm_sub_epi16(cdf_vec0, delta);
+  cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
   StoreUnaligned16(cdf, cdf_vec0);
 
   __m128i cdf_vec1 = LoadUnaligned16(cdf + 8);
   const __m128i index1 =
-      _mm_set_epi32(0x000f000e, 0x000d000c, 0x000b000a, 0x00090008);
-  const __m128i mask1 = _mm_cmplt_epi16(index1, symbol_vec);
-  const __m128i a1 = _mm_blendv_epi8(zero, cdf_max_probability, mask1);
-  const __m128i diff1 = _mm_sub_epi16(cdf_vec1, a1);
+      _mm_set_epi32(0x0010000f, 0x000e000d, 0x000c000b, 0x000a0009);
+  const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+  const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+  const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+  const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
   const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
-  cdf_vec1 = _mm_sub_epi16(cdf_vec1, delta1);
+  cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
   StoreUnaligned16(cdf + 8, cdf_vec1);
 
   cdf[16] = count + static_cast<uint16_t>(count < 32);
 }
 
-#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
 
 void UpdateCdf5(uint16_t* const cdf, const int symbol) {
   UpdateCdf(cdf, 5, symbol);
@@ -496,6 +524,10 @@
   UpdateCdf(cdf, 8, symbol);
 }
 
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+  UpdateCdf(cdf, 9, symbol);
+}
+
 void UpdateCdf11(uint16_t* const cdf, const int symbol) {
   UpdateCdf(cdf, 11, symbol);
 }
@@ -508,9 +540,28 @@
   UpdateCdf(cdf, 16, symbol);
 }
 
-#endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
+#endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
 #endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
 
+inline DaalaBitReader::WindowSize HostToBigEndian(
+    const DaalaBitReader::WindowSize x) {
+  static_assert(sizeof(x) == 4 || sizeof(x) == 8, "");
+#if defined(__GNUC__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return (sizeof(x) == 8) ? __builtin_bswap64(x) : __builtin_bswap32(x);
+#else
+  return x;
+#endif
+#elif defined(_WIN32)
+  // Note Windows targets are assumed to be little endian.
+  return static_cast<DaalaBitReader::WindowSize>(
+      (sizeof(x) == 8) ? _byteswap_uint64(static_cast<unsigned __int64>(x))
+                       : _byteswap_ulong(static_cast<unsigned long>(x)));
+#else
+#error Unknown compiler!
+#endif  // defined(__GNUC__)
+}
+
 }  // namespace
 
 #if !LIBGAV1_CXX17
@@ -520,11 +571,33 @@
 DaalaBitReader::DaalaBitReader(const uint8_t* data, size_t size,
                                bool allow_update_cdf)
     : data_(data),
-      size_(size),
-      data_index_(0),
-      allow_update_cdf_(allow_update_cdf) {
-  window_diff_ = (WindowSize{1} << (kWindowSize - 1)) - 1;
-  values_in_range_ = kCdfMaxProbability;
+      data_end_(data + size),
+      data_memcpy_end_((size >= sizeof(WindowSize))
+                           ? data + size - sizeof(WindowSize) + 1
+                           : data),
+      allow_update_cdf_(allow_update_cdf),
+      values_in_range_(kCdfMaxProbability) {
+  if (data_ < data_memcpy_end_) {
+    // This is a simplified version of PopulateBits() which loads 8 extra bits
+    // and skips the unnecessary shifts of value and window_diff_.
+    WindowSize value;
+    memcpy(&value, data_, sizeof(value));
+    data_ += sizeof(value);
+    window_diff_ = HostToBigEndian(value) ^ -1;
+    // Note the initial value of bits_ is larger than kMaxCachedBits as it's
+    // used to restore the most significant 0 bit that would be present after
+    // PopulateBits() when we extract the first symbol value.
+    // As shown in Section 8.2.2 Initialization process for symbol decoder,
+    // which uses a fixed offset to read the symbol values, the most
+    // significant bit is always 0:
+    //   The variable numBits is set equal to Min( sz * 8, 15).
+    //   The variable buf is read using the f(numBits) parsing process.
+    //   The variable paddedBuf is set equal to ( buf << (15 - numBits) ).
+    //   The variable SymbolValue is set to ((1 << 15) - 1) ^ paddedBuf.
+    bits_ = kWindowSize - 15;
+    return;
+  }
+  window_diff_ = 0;
   bits_ = -15;
   PopulateBits();
 }
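A standalone sketch (not library code) of the window-load step the constructor performs: memcpy sizeof(WindowSize) bytes and byte-swap on little-endian hosts so the first stream byte lands in the most significant bits. It assumes a 64-bit window and GCC/Clang builtins.

#include <cstdint>
#include <cstring>

uint64_t LoadWindowBigEndian(const uint8_t* data) {
  uint64_t value;
  std::memcpy(&value, data, sizeof(value));
#if defined(__GNUC__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
  value = __builtin_bswap64(value);  // First byte of |data| becomes the MSB.
#endif
  return value;
}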
@@ -537,12 +610,11 @@
 int DaalaBitReader::ReadBit() {
   const uint32_t curr =
       ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol;
-  const WindowSize zero_threshold = static_cast<WindowSize>(curr)
-                                    << (kWindowSize - 16);
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
   int bit = 1;
-  if (window_diff_ >= zero_threshold) {
+  if (symbol_value >= curr) {
     values_in_range_ -= curr;
-    window_diff_ -= zero_threshold;
+    window_diff_ -= static_cast<WindowSize>(curr) << bits_;
     bit = 0;
   } else {
     values_in_range_ = curr;
@@ -580,7 +652,8 @@
 }
 
 bool DaalaBitReader::ReadSymbol(uint16_t* cdf) {
-  const bool symbol = ReadSymbolImpl(cdf) != 0;
+  assert(cdf[1] == 0);
+  const bool symbol = ReadSymbolImpl(cdf[0]) != 0;
   if (allow_update_cdf_) {
     const uint16_t count = cdf[2];
     // rate is computed in the spec as:
@@ -608,15 +681,15 @@
   return symbol;
 }
 
-bool DaalaBitReader::ReadSymbolWithoutCdfUpdate(uint16_t* cdf) {
+bool DaalaBitReader::ReadSymbolWithoutCdfUpdate(uint16_t cdf) {
   return ReadSymbolImpl(cdf) != 0;
 }
 
 template <int symbol_count>
 int DaalaBitReader::ReadSymbol(uint16_t* const cdf) {
   static_assert(symbol_count >= 3 && symbol_count <= 16, "");
-  if (symbol_count == 4) {
-    return ReadSymbol4(cdf);
+  if (symbol_count == 3 || symbol_count == 4) {
+    return ReadSymbol3Or4(cdf, symbol_count);
   }
   int symbol;
   if (symbol_count == 8) {
@@ -633,6 +706,8 @@
       UpdateCdf7(cdf, symbol);
     } else if (symbol_count == 8) {
       UpdateCdf8(cdf, symbol);
+    } else if (symbol_count == 9) {
+      UpdateCdf9(cdf, symbol);
     } else if (symbol_count == 11) {
       UpdateCdf11(cdf, symbol);
     } else if (symbol_count == 13) {
@@ -653,8 +728,7 @@
   uint32_t curr = values_in_range_;
   int symbol = -1;
   uint32_t prev;
-  const auto symbol_value =
-      static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
   uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count;
   // Search through the |cdf| array to determine where the scaled cdf value and
   // |symbol_value| cross over.
@@ -665,7 +739,7 @@
     delta -= kMinimumProbabilityPerSymbol;
   } while (symbol_value < curr);
   values_in_range_ = prev - curr;
-  window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
   NormalizeRange();
   return symbol;
 }
@@ -675,8 +749,7 @@
   assert(cdf[symbol_count - 1] == 0);
   assert(symbol_count > 1 && symbol_count <= 16);
   --symbol_count;
-  const auto symbol_value =
-      static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
   // Search through the |cdf| array to determine where the scaled cdf value and
   // |symbol_value| cross over. Since the CDFs are sorted, we can use binary
   // search to do this. Let |symbol| be the index of the first |cdf| array
@@ -709,36 +782,36 @@
   assert(low == high + 1);
   // At this point, |low| is the symbol that has been decoded.
   values_in_range_ = prev - curr;
-  window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
   NormalizeRange();
   return low;
 }
 
-int DaalaBitReader::ReadSymbolImpl(const uint16_t* const cdf) {
-  assert(cdf[1] == 0);
-  const auto symbol_value =
-      static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
-  const uint32_t curr = ScaleCdf(values_in_range_ >> 8, cdf, 0, 1);
+int DaalaBitReader::ReadSymbolImpl(uint16_t cdf) {
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  const uint32_t curr =
+      (((values_in_range_ >> 8) * (cdf >> kCdfPrecision)) >> 1) +
+      kMinimumProbabilityPerSymbol;
   const int symbol = static_cast<int>(symbol_value < curr);
   if (symbol == 1) {
     values_in_range_ = curr;
   } else {
     values_in_range_ -= curr;
-    window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
+    window_diff_ -= static_cast<WindowSize>(curr) << bits_;
   }
   NormalizeRange();
   return symbol;
 }
 
-// Equivalent to ReadSymbol(cdf, 4), with the ReadSymbolImpl and UpdateCdf
+// Equivalent to ReadSymbol(cdf, [3,4]), with the ReadSymbolImpl and UpdateCdf
 // calls inlined.
-int DaalaBitReader::ReadSymbol4(uint16_t* const cdf) {
-  assert(cdf[3] == 0);
+int DaalaBitReader::ReadSymbol3Or4(uint16_t* const cdf,
+                                   const int symbol_count) {
+  assert(cdf[symbol_count - 1] == 0);
   uint32_t curr = values_in_range_;
   uint32_t prev;
-  const auto symbol_value =
-      static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
-  uint32_t delta = kMinimumProbabilityPerSymbol * 3;
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+  uint32_t delta = kMinimumProbabilityPerSymbol * (symbol_count - 1);
   const uint32_t values_in_range_shifted = values_in_range_ >> 8;
 
   // Search through the |cdf| array to determine where the scaled cdf value and
@@ -756,11 +829,11 @@
   //    delta -= kMinimumProbabilityPerSymbol;
   //  } while (symbol_value < curr);
   //  if (allow_update_cdf_) {
-  //    UpdateCdf(cdf, 4, symbol);
+  //    UpdateCdf(cdf, [3,4], symbol);
   //  }
   //
-  // The do-while loop is unrolled with four iterations, and the UpdateCdf call
-  // is inlined and merged into the four iterations.
+  // The do-while loop is unrolled with three or four iterations, and the
+  // UpdateCdf call is inlined and merged into the iterations.
   int symbol = 0;
   // Iteration 0.
   prev = curr;
@@ -769,31 +842,36 @@
   if (symbol_value >= curr) {
     // symbol == 0.
     if (allow_update_cdf_) {
-      // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/0).
-      const uint16_t count = cdf[4];
-      cdf[4] += static_cast<uint16_t>(count < 32);
-      const int rate = (4 | (count >> 4)) + 1;
+      // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/0).
+      const uint16_t count = cdf[symbol_count];
+      cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+      const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+      if (symbol_count == 4) {
 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
-      // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
-      // NEON code is slower. Consider using the C version if __arm__ is
-      // defined.
-      // 2. The ARM NEON code (compiled for arm64) is slightly slower on
-      // Samsung Galaxy S8+ (SM-G955FD).
-      uint16x4_t cdf_vec = vld1_u16(cdf);
-      const int16x4_t negative_rate = vdup_n_s16(-rate);
-      const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
-      cdf_vec = vsub_u16(cdf_vec, delta);
-      vst1_u16(cdf, cdf_vec);
-#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
-      __m128i cdf_vec = LoadLo8(cdf);
-      const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
-      cdf_vec = _mm_sub_epi16(cdf_vec, delta);
-      StoreLo8(cdf, cdf_vec);
-#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
-      cdf[0] -= cdf[0] >> rate;
-      cdf[1] -= cdf[1] >> rate;
-      cdf[2] -= cdf[2] >> rate;
+        // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
+        // NEON code is slower. Consider using the C version if __arm__ is
+        // defined.
+        // 2. The ARM NEON code (compiled for arm64) is slightly slower on
+        // Samsung Galaxy S8+ (SM-G955FD).
+        uint16x4_t cdf_vec = vld1_u16(cdf);
+        const int16x4_t negative_rate = vdup_n_s16(-rate);
+        const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
+        cdf_vec = vsub_u16(cdf_vec, delta);
+        vst1_u16(cdf, cdf_vec);
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+        __m128i cdf_vec = LoadLo8(cdf);
+        const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
+        cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+        StoreLo8(cdf, cdf_vec);
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+        cdf[0] -= cdf[0] >> rate;
+        cdf[1] -= cdf[1] >> rate;
+        cdf[2] -= cdf[2] >> rate;
 #endif
+      } else {  // symbol_count == 3.
+        cdf[0] -= cdf[0] >> rate;
+        cdf[1] -= cdf[1] >> rate;
+      }
     }
     goto found;
   }
@@ -806,81 +884,88 @@
   if (symbol_value >= curr) {
     // symbol == 1.
     if (allow_update_cdf_) {
-      // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/1).
-      const uint16_t count = cdf[4];
-      cdf[4] += static_cast<uint16_t>(count < 32);
-      const int rate = (4 | (count >> 4)) + 1;
+      // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/1).
+      const uint16_t count = cdf[symbol_count];
+      cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+      const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
       cdf[1] -= cdf[1] >> rate;
-      cdf[2] -= cdf[2] >> rate;
+      if (symbol_count == 4) cdf[2] -= cdf[2] >> rate;
     }
     goto found;
   }
   ++symbol;
-  delta -= kMinimumProbabilityPerSymbol;
-  // Iteration 2.
+  if (symbol_count == 4) {
+    delta -= kMinimumProbabilityPerSymbol;
+    // Iteration 2.
+    prev = curr;
+    curr = ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) +
+           delta;
+    if (symbol_value >= curr) {
+      // symbol == 2.
+      if (allow_update_cdf_) {
+        // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
+        const uint16_t count = cdf[4];
+        cdf[4] += static_cast<uint16_t>(count < 32);
+        const int rate = (count >> 4) + 5;
+        cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+        cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+        cdf[2] -= cdf[2] >> rate;
+      }
+      goto found;
+    }
+    ++symbol;
+  }
+  // |delta| is 0 for the last iteration.
+  // Iteration 2 (symbol_count == 3) or 3 (symbol_count == 4).
   prev = curr;
-  curr =
-      ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
-  if (symbol_value >= curr) {
-    // symbol == 2.
-    if (allow_update_cdf_) {
-      // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
-      const uint16_t count = cdf[4];
-      cdf[4] += static_cast<uint16_t>(count < 32);
-      const int rate = (4 | (count >> 4)) + 1;
+  // Since cdf[symbol_count - 1] is 0 and |delta| is 0, |curr| is also 0.
+  curr = 0;
+  // symbol == [2,3].
+  if (allow_update_cdf_) {
+    // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/[2,3]).
+    const uint16_t count = cdf[symbol_count];
+    cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+    const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count == 4);
+    if (symbol_count == 4) {
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+      // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
+      // code is a tiny bit slower. Consider using the C version if __arm__ is
+      // defined.
+      uint16x4_t cdf_vec = vld1_u16(cdf);
+      const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
+      const int16x4_t diff =
+          vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
+      const int16x4_t negative_rate = vdup_n_s16(-rate);
+      const uint16x4_t delta =
+          vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
+      cdf_vec = vadd_u16(cdf_vec, delta);
+      vst1_u16(cdf, cdf_vec);
+      cdf[3] = 0;
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+      __m128i cdf_vec = LoadLo8(cdf);
+      const __m128i cdf_max_probability =
+          _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+      const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
+      const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+      cdf_vec = _mm_add_epi16(cdf_vec, delta);
+      StoreLo8(cdf, cdf_vec);
+      cdf[3] = 0;
+#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
       cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
-      cdf[2] -= cdf[2] >> rate;
-    }
-    goto found;
-  }
-  ++symbol;
-  // |delta| is 0 for the last iteration.
-  // Iteration 3.
-  prev = curr;
-  // Since cdf[3] is 0 and |delta| is 0, |curr| is also 0.
-  curr = 0;
-  // symbol == 3.
-  if (allow_update_cdf_) {
-    // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/3).
-    const uint16_t count = cdf[4];
-    cdf[4] += static_cast<uint16_t>(count < 32);
-    const int rate = (4 | (count >> 4)) + 1;
-#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
-    // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
-    // code is a tiny bit slower. Consider using the C version if __arm__ is
-    // defined.
-    uint16x4_t cdf_vec = vld1_u16(cdf);
-    const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
-    const int16x4_t diff =
-        vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
-    const int16x4_t negative_rate = vdup_n_s16(-rate);
-    const uint16x4_t delta =
-        vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
-    cdf_vec = vadd_u16(cdf_vec, delta);
-    vst1_u16(cdf, cdf_vec);
-    cdf[3] = 0;
-#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
-    __m128i cdf_vec = LoadLo8(cdf);
-    const __m128i cdf_max_probability =
-        _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
-    const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
-    const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
-    cdf_vec = _mm_add_epi16(cdf_vec, delta);
-    StoreLo8(cdf, cdf_vec);
-    cdf[3] = 0;
-#else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
-    cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
-    cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
-    cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
+      cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
 #endif
+    } else {  // symbol_count == 3.
+      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+      cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+    }
   }
 found:
   // End of unrolled do-while loop.
 
   values_in_range_ = prev - curr;
-  window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
   NormalizeRange();
   return symbol;
 }
@@ -889,8 +974,7 @@
   assert(cdf[7] == 0);
   uint32_t curr = values_in_range_;
   uint32_t prev;
-  const auto symbol_value =
-      static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
+  const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
   uint32_t delta = kMinimumProbabilityPerSymbol * 7;
   // Search through the |cdf| array to determine where the scaled cdf value and
   // |symbol_value| cross over.
@@ -944,12 +1028,13 @@
   // End of unrolled do-while loop.
 
   values_in_range_ = prev - curr;
-  window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
+  window_diff_ -= static_cast<WindowSize>(curr) << bits_;
   NormalizeRange();
   return symbol;
 }
 
 void DaalaBitReader::PopulateBits() {
+  constexpr int kMaxCachedBits = kWindowSize - 16;
 #if defined(__aarch64__)
   // Fast path: read eight bytes and add the first six bytes to window_diff_.
   // This fast path makes the following assumptions.
@@ -962,27 +1047,25 @@
   // performance (measured on Lenovo ThinkStation P920 running Linux). (The
   // reason is still unknown.) Therefore this fast path is only used on arm64.
   static_assert(kWindowSize == 64, "");
-  if (size_ - data_index_ >= 8) {
+  if (data_ < data_memcpy_end_) {
     uint64_t value;
     // arm64 supports unaligned loads, so this memcpy call is compiled to a
     // single ldr instruction.
-    memcpy(&value, &data_[data_index_], 8);
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-    value = __builtin_bswap64(value);
-#endif
-    value &= 0xffffffffffff0000;
-    window_diff_ ^= static_cast<WindowSize>(value) >> (bits_ + 16);
-    data_index_ += 6;
-    bits_ += 6 * 8;
+    memcpy(&value, data_, sizeof(value));
+    data_ += kMaxCachedBits >> 3;
+    value = HostToBigEndian(value) ^ -1;
+    value >>= kWindowSize - kMaxCachedBits;
+    window_diff_ = value | (window_diff_ << kMaxCachedBits);
+    bits_ += kMaxCachedBits;
     return;
   }
 #endif
 
-  size_t data_index = data_index_;
+  const uint8_t* data = data_;
   int bits = bits_;
   WindowSize window_diff = window_diff_;
 
-  int shift = kWindowSize - 9 - (bits + 15);
+  int count = kWindowSize - 9 - (bits + 15);
   // The fast path above, if compiled, would cause clang 8.0.7 to vectorize
   // this loop. Since -15 <= bits_ <= -1, this loop has at most 6 or 7
   // iterations when WindowSize is 64 bits. So it is not profitable to
@@ -992,23 +1075,26 @@
 #ifdef __clang__
 #pragma clang loop vectorize(disable) interleave(disable)
 #endif
-  for (; shift >= 0 && data_index < size_; shift -= 8) {
-    window_diff ^= static_cast<WindowSize>(data_[data_index++]) << shift;
+  for (; count >= 0 && data < data_end_; count -= 8) {
+    const uint8_t value = *data++ ^ -1;
+    window_diff = static_cast<WindowSize>(value) | (window_diff << 8);
     bits += 8;
   }
-  if (data_index >= size_) {
-    bits = kLargeBitCount;
+  assert(bits <= kMaxCachedBits);
+  if (data == data_end_) {
+    // Shift in some 1s. This is equivalent to providing fake 0 data bits.
+    window_diff = ((window_diff + 1) << (kMaxCachedBits - bits)) - 1;
+    bits = kMaxCachedBits;
   }
 
-  data_index_ = data_index;
+  data_ = data;
   bits_ = bits;
   window_diff_ = window_diff;
 }
 
 void DaalaBitReader::NormalizeRange() {
-  const int bits_used = 15 - FloorLog2(values_in_range_);
+  const int bits_used = 15 ^ FloorLog2(values_in_range_);
   bits_ -= bits_used;
-  window_diff_ = ((window_diff_ + 1) << bits_used) - 1;
   values_in_range_ <<= bits_used;
   if (bits_ < 0) PopulateBits();
 }
@@ -1017,10 +1103,13 @@
 template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf);
 template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf);
 template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<6>(uint16_t* cdf);
 template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf);
 template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<9>(uint16_t* cdf);
 template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf);
 template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<12>(uint16_t* cdf);
 template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf);
 template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf);
 template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf);
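
For readers following the ReadSymbol3Or4() change above, the sketch below spells out the CDF adaptation rule that the inlined UpdateCdf blocks implement for the 3- and 4-symbol alphabets. It is illustrative only and not part of the patch; UpdateCdfSketch is a made-up name, and kCdfMaxProbability is assumed here to be 1 << 15 (32768), as in the AV1 spec.

#include <cstdint>

// Assumed constant: CDF probabilities use 15-bit precision.
constexpr uint16_t kCdfMaxProbability = 1 << 15;

// |cdf| holds symbol_count probability entries followed by an adaptation
// counter at cdf[symbol_count]; cdf[symbol_count - 1] is always 0. |symbol|
// is the value just decoded. The rate matches the inlined updates above:
// (count >> 4) + 4, plus one more when symbol_count == 4.
void UpdateCdfSketch(uint16_t* cdf, int symbol_count, int symbol) {
  const uint16_t count = cdf[symbol_count];
  cdf[symbol_count] += static_cast<uint16_t>(count < 32);
  const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
  for (int i = 0; i < symbol_count - 1; ++i) {
    if (i < symbol) {
      // Entries below the decoded symbol move toward kCdfMaxProbability.
      cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
    } else {
      // Entries at or above it decay toward zero.
      cdf[i] -= cdf[i] >> rate;
    }
  }
}

The NEON/SSE2 paths in the patch are just vectorized forms of this scalar loop, specialized per decoded symbol so the branch on i < symbol disappears.
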
diff --git a/libgav1/src/utils/entropy_decoder.h b/libgav1/src/utils/entropy_decoder.h
index 75c633b..c066b98 100644
--- a/libgav1/src/utils/entropy_decoder.h
+++ b/libgav1/src/utils/entropy_decoder.h
@@ -27,6 +27,10 @@
 
 class DaalaBitReader : public BitReader {
  public:
+  // WindowSize must be an unsigned integer type with at least 32 bits. Use the
+  // largest type with fast arithmetic. size_t should meet these requirements.
+  using WindowSize = size_t;
+
   DaalaBitReader(const uint8_t* data, size_t size, bool allow_update_cdf);
   ~DaalaBitReader() override = default;
 
@@ -42,7 +46,7 @@
   // ReadSymbol() calls for which the |symbol_count| is equal to 2 (boolean
   // symbols) will use this variant.
   bool ReadSymbol(uint16_t* cdf);
-  bool ReadSymbolWithoutCdfUpdate(uint16_t* cdf);
+  bool ReadSymbolWithoutCdfUpdate(uint16_t cdf);
   // Use either linear search or binary search for decoding the symbol depending
   // on |symbol_count|. ReadSymbol calls for which the |symbol_count| is known
   // at compile time will use this variant.
@@ -50,10 +54,6 @@
   int ReadSymbol(uint16_t* cdf);
 
  private:
-  // WindowSize must be an unsigned integer type with at least 32 bits. Use the
-  // largest type with fast arithmetic. size_t should meet these requirements.
-  static_assert(sizeof(size_t) == sizeof(void*), "");
-  using WindowSize = size_t;
   static constexpr int kWindowSize = static_cast<int>(sizeof(WindowSize)) * 8;
   static_assert(kWindowSize >= 32, "");
 
@@ -72,9 +72,9 @@
   inline int ReadSymbolImplBinarySearch(const uint16_t* cdf, int symbol_count);
   // Specialized implementation of ReadSymbolImpl based on the fact that
   // symbol_count == 2.
-  inline int ReadSymbolImpl(const uint16_t* cdf);
+  inline int ReadSymbolImpl(uint16_t cdf);
   // ReadSymbolN is a specialization of ReadSymbol for symbol_count == N.
-  LIBGAV1_ALWAYS_INLINE int ReadSymbol4(uint16_t* cdf);
+  LIBGAV1_ALWAYS_INLINE int ReadSymbol3Or4(uint16_t* cdf, int symbol_count);
   // ReadSymbolImplN is a specialization of ReadSymbolImpl for
   // symbol_count == N.
   LIBGAV1_ALWAYS_INLINE int ReadSymbolImpl8(const uint16_t* cdf);
@@ -83,28 +83,37 @@
   // calls PopulateBits() if necessary.
   inline void NormalizeRange();
 
-  const uint8_t* const data_;
-  const size_t size_;
-  size_t data_index_;
+  const uint8_t* data_;
+  const uint8_t* const data_end_;
+  // If |data_| < |data_memcpy_end_|, then we can read sizeof(WindowSize) bytes
+  // from |data_|. Note with sizeof(WindowSize) == 4 this is only used in the
+  // constructor, not PopulateBits().
+  const uint8_t* const data_memcpy_end_;
   const bool allow_update_cdf_;
-  // Number of bits of data in the current value.
+  // Number of cached bits of data in the current value.
   int bits_;
   // Number of values in the current range. Declared as uint32_t for better
   // performance but only the lower 16 bits are used.
   uint32_t values_in_range_;
   // The difference between the high end of the current range and the coded
-  // value minus 1. The 16 most significant bits of this variable is used to
+  // value minus 1. The 16 bits above |bits_| of this variable are used to
   // decode the next symbol. It is filled in whenever |bits_| is less than 0.
+  // Note this implementation differs from the spec as it trades the need to
+  // shift in 1s in NormalizeRange() with an extra shift in PopulateBits(),
+  // which occurs less frequently.
   WindowSize window_diff_;
 };
 
 extern template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf);
 extern template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf);
 extern template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<6>(uint16_t* cdf);
 extern template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf);
 extern template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<9>(uint16_t* cdf);
 extern template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf);
 extern template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<12>(uint16_t* cdf);
 extern template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf);
 extern template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf);
 extern template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf);
diff --git a/libgav1/src/utils/libgav1_utils.cmake b/libgav1/src/utils/libgav1_utils.cmake
index 8b6ec4b..587ca5d 100644
--- a/libgav1/src/utils/libgav1_utils.cmake
+++ b/libgav1/src/utils/libgav1_utils.cmake
@@ -39,8 +39,6 @@
             "${libgav1_source}/utils/logging.cc"
             "${libgav1_source}/utils/logging.h"
             "${libgav1_source}/utils/memory.h"
-            "${libgav1_source}/utils/parameter_tree.cc"
-            "${libgav1_source}/utils/parameter_tree.h"
             "${libgav1_source}/utils/queue.h"
             "${libgav1_source}/utils/raw_bit_reader.cc"
             "${libgav1_source}/utils/raw_bit_reader.h"
diff --git a/libgav1/src/utils/logging.cc b/libgav1/src/utils/logging.cc
index 9a43c22..26e3e15 100644
--- a/libgav1/src/utils/logging.cc
+++ b/libgav1/src/utils/logging.cc
@@ -56,7 +56,7 @@
   va_end(ap);
   fprintf(stderr, "\n");
 }
-#else  // !LIBGAV1_ENABLE_LOGGING
+#else   // !LIBGAV1_ENABLE_LOGGING
 void Log(LogSeverity /*severity*/, const char* /*file*/, int /*line*/,
          const char* /*format*/, ...) {}
 #endif  // LIBGAV1_ENABLE_LOGGING
diff --git a/libgav1/src/utils/logging.h b/libgav1/src/utils/logging.h
index 48928db..473aebd 100644
--- a/libgav1/src/utils/logging.h
+++ b/libgav1/src/utils/logging.h
@@ -35,13 +35,13 @@
 // setting LIBGAV1_ENABLE_LOGGING.
 // Severity is given as an all-caps version of enum LogSeverity with the
 // leading 'k' removed: LIBGAV1_DLOG(INFO, "...");
-#define LIBGAV1_DLOG(severity, ...)                                       \
-  do {                                                                    \
-    constexpr const char* libgav1_logging_internal_basename =             \
-        ::libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
-    ::libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity,         \
-                             libgav1_logging_internal_basename, __LINE__, \
-                             __VA_ARGS__);                                \
+#define LIBGAV1_DLOG(severity, ...)                                     \
+  do {                                                                  \
+    constexpr const char* libgav1_logging_internal_basename =           \
+        libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1);    \
+    libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity,         \
+                           libgav1_logging_internal_basename, __LINE__, \
+                           __VA_ARGS__);                                \
   } while (0)
 #else
 #define LIBGAV1_DLOG(severity, ...) \
@@ -49,10 +49,10 @@
   } while (0)
 #endif  // LIBGAV1_ENABLE_LOGGING
 
-#define LIBGAV1_LOGGING_INTERNAL_ERROR ::libgav1::internal::LogSeverity::kError
+#define LIBGAV1_LOGGING_INTERNAL_ERROR libgav1::internal::LogSeverity::kError
 #define LIBGAV1_LOGGING_INTERNAL_WARNING \
-  ::libgav1::internal::LogSeverity::kWarning
-#define LIBGAV1_LOGGING_INTERNAL_INFO ::libgav1::internal::LogSeverity::kInfo
+  libgav1::internal::LogSeverity::kWarning
+#define LIBGAV1_LOGGING_INTERNAL_INFO libgav1::internal::LogSeverity::kInfo
 
 namespace libgav1 {
 namespace internal {
diff --git a/libgav1/src/utils/memory.h b/libgav1/src/utils/memory.h
index 80c1d8c..a8da53b 100644
--- a/libgav1/src/utils/memory.h
+++ b/libgav1/src/utils/memory.h
@@ -34,8 +34,9 @@
 enum {
 // The byte alignment required for buffers used with SIMD code to be read or
 // written with aligned operations.
-#if defined(__i386__) || defined(_M_IX86)
-  kMaxAlignment = 16,  // extended alignment is safe on x86.
+#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
+    defined(_M_X64)
+  kMaxAlignment = 32,  // extended alignment is safe on x86.
 #else
   kMaxAlignment = alignof(max_align_t),
 #endif
@@ -70,7 +71,7 @@
   // more convenient to use memalign(). Unlike glibc, Android does not consider
   // memalign() an obsolete function.
   return memalign(alignment, size);
-#else  // !defined(__ANDROID__)
+#else   // !defined(__ANDROID__)
   void* ptr = nullptr;
   // posix_memalign requires that the requested alignment be at least
   // sizeof(void*). In this case, fall back on malloc which should return
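
Outside the patch, a hedged sketch of the non-Android branch the comment above describes: posix_memalign() rejects alignments smaller than sizeof(void*), so small requests fall back to malloc(). AlignedAllocSketch is an illustrative name, not the library's actual AlignedAlloc.

#include <cstddef>
#include <cstdlib>

// POSIX-only sketch. |alignment| is assumed to be a power of two; malloc()
// already returns storage suitably aligned for any fundamental type, so it
// covers alignments below sizeof(void*), which posix_memalign() would reject.
void* AlignedAllocSketch(size_t alignment, size_t size) {
  if (alignment < sizeof(void*)) return malloc(size);
  void* ptr = nullptr;
  if (posix_memalign(&ptr, alignment, size) != 0) return nullptr;
  return ptr;
}

A caller would release the result with free(), the same as for malloc().
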
diff --git a/libgav1/src/utils/parameter_tree.cc b/libgav1/src/utils/parameter_tree.cc
deleted file mode 100644
index 9426ce6..0000000
--- a/libgav1/src/utils/parameter_tree.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-// Copyright 2019 The libgav1 Authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "src/utils/parameter_tree.h"
-
-#include <cassert>
-#include <memory>
-#include <new>
-
-#include "src/utils/common.h"
-#include "src/utils/constants.h"
-#include "src/utils/logging.h"
-#include "src/utils/types.h"
-
-namespace libgav1 {
-
-// static
-std::unique_ptr<ParameterTree> ParameterTree::Create(int row4x4, int column4x4,
-                                                     BlockSize block_size,
-                                                     bool is_leaf) {
-  std::unique_ptr<ParameterTree> tree(
-      new (std::nothrow) ParameterTree(row4x4, column4x4, block_size));
-  if (tree != nullptr && is_leaf && !tree->SetPartitionType(kPartitionNone)) {
-    tree = nullptr;
-  }
-  return tree;
-}
-
-bool ParameterTree::SetPartitionType(Partition partition) {
-  assert(!partition_type_set_);
-  partition_ = partition;
-  partition_type_set_ = true;
-  const int block_width4x4 = kNum4x4BlocksWide[block_size_];
-  const int half_block4x4 = block_width4x4 >> 1;
-  const int quarter_block4x4 = half_block4x4 >> 1;
-  const BlockSize sub_size = kSubSize[partition][block_size_];
-  const BlockSize split_size = kSubSize[kPartitionSplit][block_size_];
-  assert(partition == kPartitionNone || sub_size != kBlockInvalid);
-  switch (partition) {
-    case kPartitionNone:
-      parameters_.reset(new (std::nothrow) BlockParameters());
-      return parameters_ != nullptr;
-    case kPartitionHorizontal:
-      children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
-      children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
-                                           sub_size, true);
-      return children_[0] != nullptr && children_[1] != nullptr;
-    case kPartitionVertical:
-      children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
-      children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
-                                           sub_size, true);
-      return children_[0] != nullptr && children_[1] != nullptr;
-    case kPartitionSplit:
-      children_[0] =
-          ParameterTree::Create(row4x4_, column4x4_, sub_size, false);
-      children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
-                                           sub_size, false);
-      children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
-                                           sub_size, false);
-      children_[3] = ParameterTree::Create(
-          row4x4_ + half_block4x4, column4x4_ + half_block4x4, sub_size, false);
-      return children_[0] != nullptr && children_[1] != nullptr &&
-             children_[2] != nullptr && children_[3] != nullptr;
-    case kPartitionHorizontalWithTopSplit:
-      assert(split_size != kBlockInvalid);
-      children_[0] =
-          ParameterTree::Create(row4x4_, column4x4_, split_size, true);
-      children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
-                                           split_size, true);
-      children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
-                                           sub_size, true);
-      return children_[0] != nullptr && children_[1] != nullptr &&
-             children_[2] != nullptr;
-    case kPartitionHorizontalWithBottomSplit:
-      assert(split_size != kBlockInvalid);
-      children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
-      children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
-                                           split_size, true);
-      children_[2] =
-          ParameterTree::Create(row4x4_ + half_block4x4,
-                                column4x4_ + half_block4x4, split_size, true);
-      return children_[0] != nullptr && children_[1] != nullptr &&
-             children_[2] != nullptr;
-    case kPartitionVerticalWithLeftSplit:
-      assert(split_size != kBlockInvalid);
-      children_[0] =
-          ParameterTree::Create(row4x4_, column4x4_, split_size, true);
-      children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
-                                           split_size, true);
-      children_[2] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
-                                           sub_size, true);
-      return children_[0] != nullptr && children_[1] != nullptr &&
-             children_[2] != nullptr;
-    case kPartitionVerticalWithRightSplit:
-      assert(split_size != kBlockInvalid);
-      children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
-      children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
-                                           split_size, true);
-      children_[2] =
-          ParameterTree::Create(row4x4_ + half_block4x4,
-                                column4x4_ + half_block4x4, split_size, true);
-      return children_[0] != nullptr && children_[1] != nullptr &&
-             children_[2] != nullptr;
-    case kPartitionHorizontal4:
-      for (int i = 0; i < 4; ++i) {
-        children_[i] = ParameterTree::Create(row4x4_ + i * quarter_block4x4,
-                                             column4x4_, sub_size, true);
-        if (children_[i] == nullptr) return false;
-      }
-      return true;
-    default:
-      assert(partition == kPartitionVertical4);
-      for (int i = 0; i < 4; ++i) {
-        children_[i] = ParameterTree::Create(
-            row4x4_, column4x4_ + i * quarter_block4x4, sub_size, true);
-        if (children_[i] == nullptr) return false;
-      }
-      return true;
-  }
-}
-
-}  // namespace libgav1
diff --git a/libgav1/src/utils/parameter_tree.h b/libgav1/src/utils/parameter_tree.h
deleted file mode 100644
index 935f3eb..0000000
--- a/libgav1/src/utils/parameter_tree.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright 2019 The libgav1 Authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
-#define LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
-
-#include <cassert>
-#include <memory>
-
-#include "src/utils/common.h"
-#include "src/utils/compiler_attributes.h"
-#include "src/utils/constants.h"
-#include "src/utils/memory.h"
-#include "src/utils/types.h"
-
-namespace libgav1 {
-
-class ParameterTree : public Allocable {
- public:
-  // Creates a parameter tree to store the parameters of a block of size
-  // |block_size| starting at coordinates |row4x4| and |column4x4|. If |is_leaf|
-  // is set to true, the memory will be allocated for the BlockParameters for
-  // this node. Otherwise, no memory will be allocated. If |is_leaf| is set to
-  // false, |block_size| must be a square block, i.e.,
-  // kBlockWidthPixels[block_size] must be equal to
-  // kBlockHeightPixels[block_size].
-  static std::unique_ptr<ParameterTree> Create(int row4x4, int column4x4,
-                                               BlockSize block_size,
-                                               bool is_leaf = false);
-
-  // Move only (not Copyable).
-  ParameterTree(ParameterTree&& other) = default;
-  ParameterTree& operator=(ParameterTree&& other) = default;
-  ParameterTree(const ParameterTree&) = delete;
-  ParameterTree& operator=(const ParameterTree&) = delete;
-
-  // Set the partition type of the current node to |partition|.
-  // if (partition == kPartitionNone) {
-  //   Memory will be allocated for the BlockParameters for this node.
-  // } else if (partition != kPartitionSplit) {
-  //   The appropriate child nodes will be populated and memory will be
-  //   allocated for the BlockParameters of the children.
-  // } else {
-  //   The appropriate child nodes will be populated but they are considered to
-  //   be hanging, i.e., future calls to SetPartitionType() on the child nodes
-  //   will have to set them or their descendants to a terminal type.
-  // }
-  // This function must be called only once per node.
-  LIBGAV1_MUST_USE_RESULT bool SetPartitionType(Partition partition);
-
-  // Basic getters.
-  int row4x4() const { return row4x4_; }
-  int column4x4() const { return column4x4_; }
-  BlockSize block_size() const { return block_size_; }
-  Partition partition() const { return partition_; }
-  ParameterTree* children(int index) const {
-    assert(index < 4);
-    return children_[index].get();
-  }
-  // Returns the BlockParameters object of the current node if one exists.
-  // Otherwise returns nullptr. This function will return a valid
-  // BlockParameters object only for leaf nodes.
-  BlockParameters* parameters() const { return parameters_.get(); }
-
- private:
-  ParameterTree(int row4x4, int column4x4, BlockSize block_size)
-      : row4x4_(row4x4), column4x4_(column4x4), block_size_(block_size) {}
-
-  Partition partition_ = kPartitionNone;
-  std::unique_ptr<BlockParameters> parameters_ = nullptr;
-  int row4x4_ = -1;
-  int column4x4_ = -1;
-  BlockSize block_size_ = kBlockInvalid;
-  bool partition_type_set_ = false;
-
-  // Child values are defined as follows for various partition types:
-  //  * Horizontal: 0 top partition; 1 bottom partition; 2 nullptr; 3 nullptr;
-  //  * Vertical: 0 left partition; 1 right partition; 2 nullptr; 3 nullptr;
-  //  * Split: 0 top-left partition; 1 top-right partition; 2; bottom-left
-  //    partition; 3 bottom-right partition;
-  //  * HorizontalWithTopSplit: 0 top-left partition; 1 top-right partition; 2
-  //    bottom partition; 3 nullptr;
-  //  * HorizontalWithBottomSplit: 0 top partition; 1 bottom-left partition; 2
-  //    bottom-right partition; 3 nullptr;
-  //  * VerticalWithLeftSplit: 0 top-left partition; 1 bottom-left partition; 2
-  //    right partition; 3 nullptr;
-  //  * VerticalWithRightSplit: 0 left-partition; 1 top-right partition; 2
-  //    bottom-right partition; 3 nullptr;
-  //  * Horizontal4: 0 top partition; 1 second top partition; 2 third top
-  //    partition; 3 bottom partition;
-  //  * Vertical4: 0 left partition; 1 second left partition; 2 third left
-  //    partition; 3 right partition;
-  std::unique_ptr<ParameterTree> children_[4] = {};
-
-  friend class ParameterTreeTest;
-};
-
-}  // namespace libgav1
-
-#endif  // LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
diff --git a/libgav1/src/utils/raw_bit_reader.h b/libgav1/src/utils/raw_bit_reader.h
index 76e7bfa..7d8ce8f 100644
--- a/libgav1/src/utils/raw_bit_reader.h
+++ b/libgav1/src/utils/raw_bit_reader.h
@@ -38,7 +38,7 @@
                         size_t* value);    // le(n) in the spec.
   bool ReadUnsignedLeb128(size_t* value);  // leb128() in the spec.
   // Reads a variable length unsigned number and stores it in |*value|. On a
-  // successful return, |*value| is in the range of 0 to UINT32_MAX − 1,
+  // successful return, |*value| is in the range of 0 to UINT32_MAX - 1,
   // inclusive.
   bool ReadUvlc(uint32_t* value);  // uvlc() in the spec.
   bool Finished() const;
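
Since the header above only declares ReadUnsignedLeb128(), here is a hedged, standalone sketch of the leb128() parsing process it refers to: little-endian base-128, at most 8 bytes, 7 value bits per byte, with a set high bit meaning another byte follows. The buffer-based interface and function name are illustrative, not the class's actual API, and range/trailing-byte checks are simplified.

#include <cstddef>
#include <cstdint>

// Decodes a leb128-encoded unsigned value from |data| of length |size|.
// Returns false when the data runs out or more than 8 bytes are used.
bool ReadUnsignedLeb128Sketch(const uint8_t* data, size_t size, size_t* value,
                              size_t* bytes_read) {
  uint64_t result = 0;
  for (size_t i = 0; i < 8 && i < size; ++i) {
    const uint8_t byte = data[i];
    result |= static_cast<uint64_t>(byte & 0x7f) << (i * 7);
    if ((byte & 0x80) == 0) {
      *value = static_cast<size_t>(result);
      *bytes_read = i + 1;
      return true;
    }
  }
  return false;
}
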
diff --git a/libgav1/src/utils/threadpool.cc b/libgav1/src/utils/threadpool.cc
index 8c8f4fe..a3099e1 100644
--- a/libgav1/src/utils/threadpool.cc
+++ b/libgav1/src/utils/threadpool.cc
@@ -37,17 +37,21 @@
 #include <chrono>  // NOLINT (unapproved c++11 header)
 #endif
 
+// Define the GetTid() function, a wrapper for the gettid() system call in
+// Linux.
+#if defined(__ANDROID__)
+static pid_t GetTid() { return gettid(); }
+#elif defined(__GLIBC__)
 // The glibc wrapper for the gettid() system call was added in glibc 2.30.
 // Emulate it for older versions of glibc.
-#if defined(__GLIBC_PREREQ)
-#if !__GLIBC_PREREQ(2, 30)
-
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 30)
+static pid_t GetTid() { return gettid(); }
+#else  // Older than glibc 2.30
 #include <sys/syscall.h>
 
-static pid_t gettid() { return static_cast<pid_t>(syscall(SYS_gettid)); }
-
-#endif
-#endif  // defined(__GLIBC_PREREQ)
+static pid_t GetTid() { return static_cast<pid_t>(syscall(SYS_gettid)); }
+#endif  // glibc 2.30 or later.
+#endif  // defined(__GLIBC__)
 
 namespace libgav1 {
 
@@ -216,7 +220,7 @@
     // If the |name| buffer is longer than 16 bytes, pthread_setname_np fails
     // with error 34 (ERANGE) on Android.
     char name[16];
-    pid_t id = gettid();
+    pid_t id = GetTid();
     int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
                       static_cast<int64_t>(id));
     assert(rv >= 0);
diff --git a/libgav1/src/utils/types.h b/libgav1/src/utils/types.h
index c0ac76c..eba13b7 100644
--- a/libgav1/src/utils/types.h
+++ b/libgav1/src/utils/types.h
@@ -18,6 +18,7 @@
 #define LIBGAV1_SRC_UTILS_TYPES_H_
 
 #include <array>
+#include <cstddef>
 #include <cstdint>
 #include <memory>
 
@@ -320,7 +321,7 @@
 
 struct LoopRestoration {
   LoopRestorationType type[kMaxPlanes];
-  int unit_size[kMaxPlanes];
+  int unit_size_log2[kMaxPlanes];
 };
 
 // Stores the quantization parameters of Section 5.9.12.
@@ -512,6 +513,10 @@
   Delta delta_lf;
   // A valid value of reference_frame_index[i] is in the range [0, 7]. -1
   // indicates an invalid value.
+  //
+  // NOTE: When the frame is an intra frame (frame_type is kFrameKey or
+  // kFrameIntraOnly), reference_frame_index is not used and may be
+  // uninitialized.
   int8_t reference_frame_index[kNumInterReferenceFrameTypes];
   // The ref_order_hint[ i ] syntax element in the uncompressed header.
   // Specifies the expected output order hint for each reference frame.
@@ -521,5 +526,24 @@
   FilmGrainParams film_grain_params;
 };
 
+// Structure used for traversing the partition tree.
+struct PartitionTreeNode {
+  PartitionTreeNode() = default;
+  PartitionTreeNode(int row4x4, int column4x4, BlockSize block_size)
+      : row4x4(row4x4), column4x4(column4x4), block_size(block_size) {}
+  int row4x4 = -1;
+  int column4x4 = -1;
+  BlockSize block_size = kBlockInvalid;
+};
+
+// Structure used for storing the transform parameters in a superblock.
+struct TransformParameters {
+  TransformParameters() = default;
+  TransformParameters(TransformType type, int non_zero_coeff_count)
+      : type(type), non_zero_coeff_count(non_zero_coeff_count) {}
+  TransformType type;
+  int non_zero_coeff_count;
+};
+
 }  // namespace libgav1
 #endif  // LIBGAV1_SRC_UTILS_TYPES_H_
diff --git a/libgav1/tests/fuzzer/decoder_fuzzer_frame_parallel.cc b/libgav1/tests/fuzzer/decoder_fuzzer_frame_parallel.cc
index 6e8b6a0..d1b1c54 100644
--- a/libgav1/tests/fuzzer/decoder_fuzzer_frame_parallel.cc
+++ b/libgav1/tests/fuzzer/decoder_fuzzer_frame_parallel.cc
@@ -121,14 +121,12 @@
 
     const libgav1::DecoderBuffer* buffer;
     libgav1::StatusCode status = decoder.DequeueFrame(&buffer);
-    if (status != libgav1::kStatusOk &&
-        status != libgav1::kStatusNothingToDequeue) {
-      break;
-    }
-    if (buffer == nullptr) {
-      dequeue_finished = status == libgav1::kStatusNothingToDequeue;
-    } else {
+    if (status == libgav1::kStatusNothingToDequeue) {
+      dequeue_finished = true;
+    } else if (status == libgav1::kStatusOk) {
       dequeue_finished = false;
+    } else {
+      break;
     }
   } while (input_buffer != nullptr || !file_reader->IsEndOfFile() ||
            !dequeue_finished);