Merge rvc-qpr-dev-plus-aosp-without-vendor@6881855

Bug: 172690556
Merged-In: Ib7af1dc437feb7676e2c9ebe81075bc9e93f0eb9
Change-Id: I1325e88711b3e467279ebcc168ddb20525b0388a
diff --git a/Android.bp b/Android.bp
index 3d5b91a..e6a47be 100644
--- a/Android.bp
+++ b/Android.bp
@@ -18,6 +18,7 @@
 
     export_include_dirs: [
         ".",
+        "libgav1/src",
     ],
 
     cflags: [
@@ -40,10 +41,12 @@
         "libgav1/src/buffer_pool.cc",
         "libgav1/src/decoder.cc",
         "libgav1/src/decoder_impl.cc",
-        "libgav1/src/decoder_scratch_buffer.cc",
+        "libgav1/src/decoder_settings.cc",
         "libgav1/src/dsp/arm/average_blend_neon.cc",
+        "libgav1/src/dsp/arm/cdef_neon.cc",
         "libgav1/src/dsp/arm/convolve_neon.cc",
         "libgav1/src/dsp/arm/distance_weighted_blend_neon.cc",
+        "libgav1/src/dsp/arm/film_grain_neon.cc",
         "libgav1/src/dsp/arm/intra_edge_neon.cc",
         "libgav1/src/dsp/arm/intrapred_cfl_neon.cc",
         "libgav1/src/dsp/arm/intrapred_directional_neon.cc",
@@ -54,13 +57,16 @@
         "libgav1/src/dsp/arm/loop_filter_neon.cc",
         "libgav1/src/dsp/arm/loop_restoration_neon.cc",
         "libgav1/src/dsp/arm/mask_blend_neon.cc",
+        "libgav1/src/dsp/arm/motion_field_projection_neon.cc",
+        "libgav1/src/dsp/arm/motion_vector_search_neon.cc",
         "libgav1/src/dsp/arm/obmc_neon.cc",
+        "libgav1/src/dsp/arm/super_res_neon.cc",
         "libgav1/src/dsp/arm/warp_neon.cc",
+        "libgav1/src/dsp/arm/weight_mask_neon.cc",
         "libgav1/src/dsp/average_blend.cc",
         "libgav1/src/dsp/cdef.cc",
         "libgav1/src/dsp/constants.cc",
         "libgav1/src/dsp/convolve.cc",
-        "libgav1/src/dsp/cpu.cc",
         "libgav1/src/dsp/distance_weighted_blend.cc",
         "libgav1/src/dsp/dsp.cc",
         "libgav1/src/dsp/film_grain.cc",
@@ -70,9 +76,14 @@
         "libgav1/src/dsp/loop_filter.cc",
         "libgav1/src/dsp/loop_restoration.cc",
         "libgav1/src/dsp/mask_blend.cc",
+        "libgav1/src/dsp/motion_field_projection.cc",
+        "libgav1/src/dsp/motion_vector_search.cc",
         "libgav1/src/dsp/obmc.cc",
+        "libgav1/src/dsp/super_res.cc",
         "libgav1/src/dsp/warp.cc",
+        "libgav1/src/dsp/weight_mask.cc",
         "libgav1/src/dsp/x86/average_blend_sse4.cc",
+        "libgav1/src/dsp/x86/cdef_sse4.cc",
         "libgav1/src/dsp/x86/convolve_sse4.cc",
         "libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc",
         "libgav1/src/dsp/x86/intra_edge_sse4.cc",
@@ -82,17 +93,29 @@
         "libgav1/src/dsp/x86/inverse_transform_sse4.cc",
         "libgav1/src/dsp/x86/loop_filter_sse4.cc",
         "libgav1/src/dsp/x86/loop_restoration_sse4.cc",
+        "libgav1/src/dsp/x86/mask_blend_sse4.cc",
+        "libgav1/src/dsp/x86/motion_field_projection_sse4.cc",
+        "libgav1/src/dsp/x86/motion_vector_search_sse4.cc",
         "libgav1/src/dsp/x86/obmc_sse4.cc",
+        "libgav1/src/dsp/x86/super_res_sse4.cc",
+        "libgav1/src/dsp/x86/warp_sse4.cc",
+        "libgav1/src/dsp/x86/weight_mask_sse4.cc",
+        "libgav1/src/film_grain.cc",
+        "libgav1/src/frame_buffer.cc",
         "libgav1/src/internal_frame_buffer_list.cc",
-        "libgav1/src/loop_filter_mask.cc",
         "libgav1/src/loop_restoration_info.cc",
         "libgav1/src/motion_vector.cc",
         "libgav1/src/obu_parser.cc",
-        "libgav1/src/post_filter.cc",
+        "libgav1/src/post_filter/cdef.cc",
+        "libgav1/src/post_filter/deblock.cc",
+        "libgav1/src/post_filter/loop_restoration.cc",
+        "libgav1/src/post_filter/post_filter.cc",
+        "libgav1/src/post_filter/super_res.cc",
         "libgav1/src/prediction_mask.cc",
         "libgav1/src/quantizer.cc",
         "libgav1/src/reconstruction.cc",
         "libgav1/src/residual_buffer_pool.cc",
+        "libgav1/src/status_code.cc",
         "libgav1/src/symbol_decoder_context.cc",
         "libgav1/src/threading_strategy.cc",
         "libgav1/src/tile/bitstream/mode_info.cc",
@@ -100,10 +123,12 @@
         "libgav1/src/tile/bitstream/partition.cc",
         "libgav1/src/tile/bitstream/transform_size.cc",
         "libgav1/src/tile/prediction.cc",
+        "libgav1/src/tile_scratch_buffer.cc",
         "libgav1/src/tile/tile.cc",
         "libgav1/src/utils/bit_reader.cc",
         "libgav1/src/utils/block_parameters_holder.cc",
         "libgav1/src/utils/constants.cc",
+        "libgav1/src/utils/cpu.cc",
         "libgav1/src/utils/entropy_decoder.cc",
         "libgav1/src/utils/executor.cc",
         "libgav1/src/utils/logging.cc",
@@ -112,6 +137,7 @@
         "libgav1/src/utils/segmentation.cc",
         "libgav1/src/utils/segmentation_map.cc",
         "libgav1/src/utils/threadpool.cc",
+        "libgav1/src/version.cc",
         "libgav1/src/warp_prediction.cc",
         "libgav1/src/yuv_buffer.cc",
     ],
diff --git a/README.version b/README.version
index 5d15f7f..b65b65a 100644
--- a/README.version
+++ b/README.version
@@ -1,11 +1,5 @@
 URL: https://chromium.googlesource.com/codecs/libgav1
-Version: cl/267700628
+Version: v0.16.0
 BugComponent: 324837
 Local Modifications:
-- ab3390a external/libgav1,cosmetics: add license headers
-- backport cl/281117442: Fully use the frame border for reference block.
-- backport cl/289984918: convolve: Use the correct subsampling for ref frames
-- backport cl/289966078: Move initial_display_delay out of OperatingParamet
-- backport cl/290784565: Handle a change of sequence header parameters.
-- backport cl/291222461: Disallow change of sequence header during a frame.
-- backport cl/289910031: obu: Check for size validity in SetTileDataOffset
+None
diff --git a/libgav1/.gitignore b/libgav1/.gitignore
new file mode 100644
index 0000000..87ccf24
--- /dev/null
+++ b/libgav1/.gitignore
@@ -0,0 +1,2 @@
+/build
+/third_party
diff --git a/libgav1/AUTHORS b/libgav1/AUTHORS
new file mode 100644
index 0000000..d92ea0a
--- /dev/null
+++ b/libgav1/AUTHORS
@@ -0,0 +1,6 @@
+# This is the list of libgav1 authors for copyright purposes.
+#
+# This does not necessarily list everyone who has contributed code, since in
+# some cases, their employer may be the copyright holder.  To see the full list
+# of contributors, see the revision history in source control.
+Google LLC
diff --git a/libgav1/CMakeLists.txt b/libgav1/CMakeLists.txt
new file mode 100644
index 0000000..f033bae
--- /dev/null
+++ b/libgav1/CMakeLists.txt
@@ -0,0 +1,124 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# libgav1 requires modern CMake.
+cmake_minimum_required(VERSION 3.7.1 FATAL_ERROR)
+
+# libgav1 requires C++11.
+set(CMAKE_CXX_STANDARD 11)
+set(ABSL_CXX_STANDARD 11)
+
+project(libgav1 CXX)
+
+set(libgav1_root "${CMAKE_CURRENT_SOURCE_DIR}")
+set(libgav1_build "${CMAKE_BINARY_DIR}")
+
+if("${libgav1_root}" STREQUAL "${libgav1_build}")
+  message(
+    FATAL_ERROR
+      "Building from within the libgav1 source tree is not supported.\n"
+      "Hint: Run these commands\n" "$ rm -rf CMakeCache.txt CMakeFiles\n"
+      "$ mkdir -p ../libgav1_build\n" "$ cd ../libgav1_build\n"
+      "And re-run CMake from the libgav1_build directory.")
+endif()
+
+set(libgav1_examples "${libgav1_root}/examples")
+set(libgav1_source "${libgav1_root}/src")
+
+include(FindThreads)
+
+include("${libgav1_examples}/libgav1_examples.cmake")
+include("${libgav1_root}/cmake/libgav1_build_definitions.cmake")
+include("${libgav1_root}/cmake/libgav1_cpu_detection.cmake")
+include("${libgav1_root}/cmake/libgav1_flags.cmake")
+include("${libgav1_root}/cmake/libgav1_helpers.cmake")
+include("${libgav1_root}/cmake/libgav1_install.cmake")
+include("${libgav1_root}/cmake/libgav1_intrinsics.cmake")
+include("${libgav1_root}/cmake/libgav1_options.cmake")
+include("${libgav1_root}/cmake/libgav1_sanitizer.cmake")
+include("${libgav1_root}/cmake/libgav1_targets.cmake")
+include("${libgav1_root}/cmake/libgav1_variables.cmake")
+include("${libgav1_source}/dsp/libgav1_dsp.cmake")
+include("${libgav1_source}/libgav1_decoder.cmake")
+include("${libgav1_source}/utils/libgav1_utils.cmake")
+
+libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING
+               "Enables optimized code." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations."
+               VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING
+               "Enables sse4.1 optimizations." VALUE ON)
+libgav1_option(
+  NAME LIBGAV1_VERBOSE HELPSTRING
+  "Enables verbose build system output. Higher numbers are more verbose." VALUE
+  OFF)
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release)
+endif()
+
+libgav1_optimization_detect()
+libgav1_set_build_definitions()
+libgav1_set_cxx_flags()
+libgav1_configure_sanitizer()
+
+# Supported bit depth.
+libgav1_track_configuration_variable(LIBGAV1_MAX_BITDEPTH)
+
+# C++ and linker flags.
+libgav1_track_configuration_variable(LIBGAV1_CXX_FLAGS)
+libgav1_track_configuration_variable(LIBGAV1_EXE_LINKER_FLAGS)
+
+# Sanitizer integration.
+libgav1_track_configuration_variable(LIBGAV1_SANITIZE)
+
+# Generated source file directory.
+libgav1_track_configuration_variable(LIBGAV1_GENERATED_SOURCES_DIRECTORY)
+
+# Controls use of std::mutex and absl::Mutex in ThreadPool.
+libgav1_track_configuration_variable(LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+
+if(LIBGAV1_VERBOSE)
+  libgav1_dump_cmake_flag_variables()
+  libgav1_dump_tracked_configuration_variables()
+  libgav1_dump_options()
+endif()
+
+set(libgav1_abseil_build "${libgav1_build}/abseil")
+set(libgav1_gtest_build "${libgav1_build}/gtest")
+
+# Compiler/linker flags must be lists, but come in from the environment as
+# strings. Break them up:
+if(NOT "${LIBGAV1_CXX_FLAGS}" STREQUAL "")
+  separate_arguments(LIBGAV1_CXX_FLAGS)
+endif()
+if(NOT "${LIBGAV1_EXE_LINKER_FLAGS}" STREQUAL "")
+  separate_arguments(LIBGAV1_EXE_LINKER_FLAGS)
+endif()
+
+add_subdirectory("${libgav1_root}/third_party/abseil-cpp"
+                 "${libgav1_abseil_build}" EXCLUDE_FROM_ALL)
+
+libgav1_reset_target_lists()
+libgav1_add_dsp_targets()
+libgav1_add_decoder_targets()
+libgav1_add_examples_targets()
+libgav1_add_utils_targets()
+libgav1_setup_install_target()
+
+if(LIBGAV1_VERBOSE)
+  libgav1_dump_cmake_flag_variables()
+  libgav1_dump_tracked_configuration_variables()
+  libgav1_dump_options()
+endif()
diff --git a/libgav1/CONTRIBUTING.md b/libgav1/CONTRIBUTING.md
new file mode 100644
index 0000000..69140ff
--- /dev/null
+++ b/libgav1/CONTRIBUTING.md
@@ -0,0 +1,27 @@
+# How to Contribute
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use a [Gerrit](https://www.gerritcodereview.com) instance hosted at
+https://chromium-review.googlesource.com for this purpose.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
diff --git a/libgav1/README.md b/libgav1/README.md
new file mode 100644
index 0000000..b935679
--- /dev/null
+++ b/libgav1/README.md
@@ -0,0 +1,165 @@
+# libgav1 -- an AV1 decoder
+
+libgav1 is a Main profile (0) & High profile (1) compliant AV1 decoder. More
+information on the AV1 video format can be found at
+[aomedia.org](https://aomedia.org).
+
+[TOC]
+
+## Building
+
+### Prerequisites
+
+1.  A C++11 compiler. gcc 6+, clang 7+ or Microsoft Visual Studio 2017+ are
+    recommended.
+
+2.  [CMake >= 3.7.1](https://cmake.org/download/)
+
+3.  [Abseil](https://abseil.io)
+
+    From within the libgav1 directory:
+
+    ```shell
+      $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp
+    ```
+
+### Compile
+
+```shell
+  $ mkdir build && cd build
+  $ cmake -G "Unix Makefiles" ..
+  $ make
+```
+
+Configuration options:
+
+*   `LIBGAV1_MAX_BITDEPTH`: defines the maximum supported bitdepth (8, 10;
+    default: 10).
+*   `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS`: define to a non-zero value to disable
+    [symbol reduction](#symbol-reduction) in an optimized build to keep all
+    versions of dsp functions available. Automatically defined in
+    `src/dsp/dsp.h` if unset.
+*   `LIBGAV1_ENABLE_NEON`: define to a non-zero value to enable NEON
+    optimizations. Automatically defined in `src/dsp/dsp.h` if unset.
+*   `LIBGAV1_ENABLE_SSE4_1`: define to a non-zero value to enable sse4.1
+    optimizations. Automatically defined in `src/dsp/dsp.h` if unset.
+*   `LIBGAV1_ENABLE_LOGGING`: define to 0/1 to control debug logging.
+    Automatically defined in `src/utils/logging.h` if unset.
+*   `LIBGAV1_EXAMPLES_ENABLE_LOGGING`: define to 0/1 to control error logging in
+    the examples. Automatically defined in `examples/logging.h` if unset.
+*   `LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK`: define to 1 to enable transform
+    coefficient range checks.
+*   `LIBGAV1_LOG_LEVEL`: controls the maximum allowed log level, see `enum
+    LogSeverity` in `src/utils/logging.h`. Automatically defined in
+    `src/utils/logging.cc` if unset.
+*   `LIBGAV1_THREADPOOL_USE_STD_MUTEX`: controls use of std::mutex and
+    absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil
+    dependency from the core library. Automatically defined in
+    `src/utils/threadpool.h` if unset.
+*   `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is
+    allowed to create. Has to be an integer > 0. Otherwise this is ignored.
+    The default value is 128.
+*   `LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER`: the threshold multiplier that
+    is used to determine when to use frame parallel decoding. Frame parallel
+    decoding will be used if |threads| > |tile_count| * this multiplier. Has to
+    be an integer > 0. The default value is 4. This is an advanced setting
+    intended for testing purposes.
+
+For additional options see:
+
+```shell
+  $ cmake .. -LH
+```
+
+## Testing
+
+*   `gav1_decode` can be used to decode IVF files, see `gav1_decode --help` for
+    options. Note: tools like [FFmpeg](https://ffmpeg.org) can be used to
+    convert other container formats to IVF.
+
+## Development
+
+### Contributing
+
+See [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to submit patches.
+
+### Style
+
+libgav1 follows the
+[Google C++ style guide](https://google.github.io/styleguide/cppguide.html) with
+formatting enforced by `clang-format`.
+
+### Comments
+
+Comments of the form '`// X.Y(.Z).`', '`Section X.Y(.Z).`' or '`... in the
+spec`' reference the relevant section(s) in the
+[AV1 specification](http://aomediacodec.github.io/av1-spec/av1-spec.pdf).
+
+### DSP structure
+
+*   `src/dsp/dsp.cc` defines the main entry point: `libgav1::dsp::DspInit()`.
+    This handles cpu-detection and initializing each logical unit which populate
+    `libgav1::dsp::Dsp` function tables.
+*   `src/dsp/dsp.h` contains function and type definitions for all logical units
+    (e.g., intra-predictors)
+*   `src/utils/cpu.h` contains definitions for cpu-detection
+*   base implementations are located in `src/dsp/*.{h,cc}` with platform
+    specific optimizations in sub-folders
+*   unit tests define `DISABLED_Speed` test(s) to allow timing of individual
+    functions
+
+#### Symbol reduction
+
+Based on the build configuration unneeded lesser optimizations are removed using
+a hierarchical include and define system. Each logical unit in `src/dsp` should
+include all platform specific headers in descending order to allow higher level
+optimizations to disable lower level ones. See `src/dsp/loop_filter.h` for an
+example.
+
+Each function receives a new define which can be checked in platform specific
+headers. The format is: `LIBGAV1_<Dsp-table>_FunctionName` or
+`LIBGAV1_<Dsp-table>_[sub-table-index1][...-indexN]`, e.g.,
+`LIBGAV1_Dsp8bpp_AverageBlend`,
+`LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc`. The Dsp-table name is of
+the form `Dsp<bitdepth>bpp` e.g. `Dsp10bpp` for bitdepth == 10 (bpp stands for
+bits per pixel). The indices correspond to enum values used as lookups with
+leading 'k' removed. Platform specific headers then should first check if the
+symbol is defined and if not set the value to the corresponding
+`LIBGAV1_CPU_<arch>` value from `src/utils/cpu.h`.
+
+```
+  #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+  #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+  #endif
+```
+
+Within each module the code should check if the symbol is defined to its
+specific architecture or forced via `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS` before
+defining the function. The `DSP_ENABLED_(8|10)BPP_*` macros are available to
+simplify this check for optimized code.
+
+```
+  #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+  ...
+
+  // In unoptimized code use the following structure; there's no equivalent
+  // define for LIBGAV1_CPU_C as it would require duplicating the function
+  // defines used in optimized code for only a small benefit to this
+  // boilerplate.
+  #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  ...
+  #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
+  ...
+```
+
+## Bugs
+
+Please report all bugs to the issue tracker:
+https://issuetracker.google.com/issues/new?component=750480&template=1355007
+
+## Discussion
+
+Email: gav1-devel@googlegroups.com
+
+Web: https://groups.google.com/forum/#!forum/gav1-devel
diff --git a/libgav1/cmake/libgav1-config.cmake.template b/libgav1/cmake/libgav1-config.cmake.template
new file mode 100644
index 0000000..dc253d3
--- /dev/null
+++ b/libgav1/cmake/libgav1-config.cmake.template
@@ -0,0 +1,2 @@
+set(LIBGAV1_INCLUDE_DIRS "@LIBGAV1_INCLUDE_DIRS@")
+set(LIBGAV1_LIBRARIES "gav1")
diff --git a/libgav1/cmake/libgav1.pc.template b/libgav1/cmake/libgav1.pc.template
new file mode 100644
index 0000000..c571a43
--- /dev/null
+++ b/libgav1/cmake/libgav1.pc.template
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: @PROJECT_NAME@
+Description: AV1 decoder library (@LIBGAV1_MAX_BITDEPTH@-bit).
+Version: @LIBGAV1_VERSION@
+Cflags: -I${includedir}
+Libs: -L${libdir} -lgav1
+Libs.private: @CMAKE_THREAD_LIBS_INIT@
diff --git a/libgav1/cmake/libgav1_build_definitions.cmake b/libgav1/cmake/libgav1_build_definitions.cmake
new file mode 100644
index 0000000..930d8f5
--- /dev/null
+++ b/libgav1/cmake/libgav1_build_definitions.cmake
@@ -0,0 +1,149 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_ 1)
+
+macro(libgav1_set_build_definitions)
+  string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase)
+
+  libgav1_load_version_info()
+  set(LIBGAV1_SOVERSION 0)
+
+  list(APPEND libgav1_include_paths "${libgav1_root}" "${libgav1_root}/src"
+              "${libgav1_build}" "${libgav1_root}/third_party/abseil-cpp")
+  list(APPEND libgav1_gtest_include_paths
+              "third_party/googletest/googlemock/include"
+              "third_party/googletest/googletest/include"
+              "third_party/googletest/googletest")
+  list(APPEND libgav1_test_include_paths ${libgav1_include_paths}
+              ${libgav1_gtest_include_paths})
+  list(APPEND libgav1_defines "LIBGAV1_CMAKE=1"
+              "LIBGAV1_FLAGS_SRCDIR=\"${libgav1_root}\""
+              "LIBGAV1_FLAGS_TMPDIR=\"/tmp\"")
+
+  if(MSVC OR WIN32)
+    list(APPEND libgav1_defines "_CRT_SECURE_NO_DEPRECATE=1" "NOMINMAX=1")
+  endif()
+
+  if(ANDROID)
+    if(CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a")
+      set(CMAKE_ANDROID_ARM_MODE ON)
+    endif()
+
+    if(build_type_lowercase MATCHES "rel")
+      list(APPEND libgav1_base_cxx_flags "-fno-stack-protector")
+    endif()
+  endif()
+
+  list(APPEND libgav1_base_cxx_flags "-Wall" "-Wextra" "-Wmissing-declarations"
+              "-Wno-sign-compare" "-fvisibility=hidden"
+              "-fvisibility-inlines-hidden")
+
+  if(BUILD_SHARED_LIBS)
+    set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+    set(libgav1_dependency libgav1_shared)
+  else()
+    set(libgav1_dependency libgav1_static)
+  endif()
+
+  list(APPEND libgav1_clang_cxx_flags "-Wextra-semi" "-Wmissing-prototypes"
+              "-Wshorten-64-to-32")
+
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6")
+      # Quiet warnings in copy-list-initialization where {} elision has always
+      # been allowed.
+      list(APPEND libgav1_clang_cxx_flags "-Wno-missing-braces")
+    endif()
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8)
+      list(APPEND libgav1_clang_cxx_flags "-Wextra-semi-stmt")
+    endif()
+  endif()
+
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "7")
+      # Quiet warnings due to potential snprintf() truncation in threadpool.cc.
+      list(APPEND libgav1_base_cxx_flags "-Wno-format-truncation")
+
+      if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7")
+        # Quiet gcc 6 vs 7 abi warnings:
+        # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
+        list(APPEND libgav1_base_cxx_flags "-Wno-psabi")
+        list(APPEND ABSL_GCC_FLAGS "-Wno-psabi")
+      endif()
+    endif()
+  endif()
+
+  if(build_type_lowercase MATCHES "rel")
+    # TODO(tomfinegan): this value is only a concern for the core library and
+    # can be made smaller if the test targets are avoided.
+    list(APPEND libgav1_base_cxx_flags "-Wstack-usage=196608")
+  endif()
+
+  list(APPEND libgav1_msvc_cxx_flags
+              # Warning level 3.
+              "/W3"
+              # Disable warning C4018:
+              # '<comparison operator>' signed/unsigned mismatch
+              "/wd4018"
+              # Disable warning C4244:
+              # 'argument': conversion from '<double/int>' to
+              # '<float/smaller int type>', possible loss of data
+              "/wd4244"
+              # Disable warning C4267:
+              # '=': conversion from '<double/int>' to
+              # '<float/smaller int type>', possible loss of data
+              "/wd4267"
+              # Disable warning C4309:
+              # 'argument': truncation of constant value
+              "/wd4309"
+              # Disable warning C4551:
+              # function call missing argument list
+              "/wd4551")
+
+  if(BUILD_SHARED_LIBS)
+    list(APPEND libgav1_msvc_cxx_flags
+                # Disable warning C4251:
+                # 'libgav1::DecoderImpl class member' needs to have
+                # dll-interface to be used by clients of class
+                # 'libgav1::Decoder'.
+                "/wd4251")
+  endif()
+
+  if(NOT LIBGAV1_MAX_BITDEPTH)
+    set(LIBGAV1_MAX_BITDEPTH 10)
+  elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8 AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10)
+    libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8 or 10.")
+  endif()
+
+  list(APPEND libgav1_defines "LIBGAV1_MAX_BITDEPTH=${LIBGAV1_MAX_BITDEPTH}")
+
+  if(DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+    if(NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 0
+       AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 1)
+      libgav1_die("LIBGAV1_THREADPOOL_USE_STD_MUTEX must be 0 or 1.")
+    endif()
+
+    list(APPEND libgav1_defines
+         "LIBGAV1_THREADPOOL_USE_STD_MUTEX=${LIBGAV1_THREADPOOL_USE_STD_MUTEX}")
+  endif()
+
+  # Source file names ending in these suffixes will have the appropriate
+  # compiler flags added to their compile commands to enable intrinsics.
+  set(libgav1_neon_source_file_suffix "neon.cc")
+  set(libgav1_sse4_source_file_suffix "sse4.cc")
+endmacro()
diff --git a/libgav1/cmake/libgav1_cpu_detection.cmake b/libgav1/cmake/libgav1_cpu_detection.cmake
new file mode 100644
index 0000000..6972d34
--- /dev/null
+++ b/libgav1/cmake/libgav1_cpu_detection.cmake
@@ -0,0 +1,42 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_ 1)
+
+# Detect optimizations available for the current target CPU.
+macro(libgav1_optimization_detect)
+  if(LIBGAV1_ENABLE_OPTIMIZATIONS)
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase)
+    if(cpu_lowercase MATCHES "^arm|^aarch64")
+      set(libgav1_have_neon ON)
+    elseif(cpu_lowercase MATCHES "^x86|amd64")
+      set(libgav1_have_sse4 ON)
+    endif()
+  endif()
+
+  if(libgav1_have_neon AND LIBGAV1_ENABLE_NEON)
+    list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=1")
+  else()
+    list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=0")
+  endif()
+
+  if(libgav1_have_sse4 AND LIBGAV1_ENABLE_SSE4_1)
+    list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=1")
+  else()
+    list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=0")
+  endif()
+endmacro()
diff --git a/libgav1/cmake/libgav1_flags.cmake b/libgav1/cmake/libgav1_flags.cmake
new file mode 100644
index 0000000..0b8df60
--- /dev/null
+++ b/libgav1/cmake/libgav1_flags.cmake
@@ -0,0 +1,245 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_ 1)
+
+include(CheckCXXCompilerFlag)
+include(CheckCXXSourceCompiles)
+
+# Adds compiler flags specified by FLAGS to the sources specified by SOURCES:
+#
+# libgav1_set_compiler_flags_for_sources(SOURCES <sources> FLAGS <flags>)
+macro(libgav1_set_compiler_flags_for_sources)
+  unset(compiler_SOURCES)
+  unset(compiler_FLAGS)
+  unset(optional_args)
+  unset(single_value_args)
+  set(multi_value_args SOURCES FLAGS)
+  cmake_parse_arguments(compiler "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT (compiler_SOURCES AND compiler_FLAGS))
+    libgav1_die("libgav1_set_compiler_flags_for_sources: SOURCES and "
+                "FLAGS required.")
+  endif()
+
+  set_source_files_properties(${compiler_SOURCES} PROPERTIES COMPILE_FLAGS
+                              ${compiler_FLAGS})
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    foreach(source ${compiler_SOURCES})
+      foreach(flag ${compiler_FLAGS})
+        message("libgav1_set_compiler_flags_for_sources: source:${source} "
+                "flag:${flag}")
+      endforeach()
+    endforeach()
+  endif()
+endmacro()
+
+# Tests compiler flags stored in list(s) specified by FLAG_LIST_VAR_NAMES, adds
+# flags to $LIBGAV1_CXX_FLAGS when tests pass. Terminates configuration if
+# FLAG_REQUIRED is specified and any flag check fails.
+#
+# ~~~
+# libgav1_test_cxx_flag(<FLAG_LIST_VAR_NAMES <flag list variable(s)>>
+#                       [FLAG_REQUIRED])
+# ~~~
+macro(libgav1_test_cxx_flag)
+  unset(cxx_test_FLAG_LIST_VAR_NAMES)
+  unset(cxx_test_FLAG_REQUIRED)
+  unset(single_value_args)
+  set(optional_args FLAG_REQUIRED)
+  set(multi_value_args FLAG_LIST_VAR_NAMES)
+  cmake_parse_arguments(cxx_test "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT cxx_test_FLAG_LIST_VAR_NAMES)
+    libgav1_die("libgav1_test_cxx_flag: FLAG_LIST_VAR_NAMES required")
+  endif()
+
+  unset(cxx_flags)
+  foreach(list_var ${cxx_test_FLAG_LIST_VAR_NAMES})
+    if(LIBGAV1_VERBOSE)
+      message("libgav1_test_cxx_flag: adding ${list_var} to cxx_flags")
+    endif()
+    list(APPEND cxx_flags ${${list_var}})
+  endforeach()
+
+  if(LIBGAV1_VERBOSE)
+    message("CXX test: all flags: ${cxx_flags}")
+  endif()
+
+  unset(all_cxx_flags)
+  list(APPEND all_cxx_flags ${LIBGAV1_CXX_FLAGS} ${cxx_flags})
+
+  # Turn off output from check_cxx_source_compiles. Print status directly
+  # instead since the logging messages from check_cxx_source_compiles can be
+  # quite confusing.
+  set(CMAKE_REQUIRED_QUIET TRUE)
+
+  # Run the actual compile test.
+  unset(libgav1_all_cxx_flags_pass CACHE)
+  message("--- Running combined CXX flags test, flags: ${all_cxx_flags}")
+  check_cxx_compiler_flag("${all_cxx_flags}" libgav1_all_cxx_flags_pass)
+
+  if(cxx_test_FLAG_REQUIRED AND NOT libgav1_all_cxx_flags_pass)
+    libgav1_die("Flag test failed for required flag(s): "
+                "${all_cxx_flags} and FLAG_REQUIRED specified.")
+  endif()
+
+  if(libgav1_all_cxx_flags_pass)
+    # Test passed: update the global flag list used by the libgav1 target
+    # creation wrappers.
+    set(LIBGAV1_CXX_FLAGS ${cxx_flags})
+    list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS)
+
+    if(LIBGAV1_VERBOSE)
+      message("LIBGAV1_CXX_FLAGS=${LIBGAV1_CXX_FLAGS}")
+    endif()
+
+    message("--- Passed combined CXX flags test")
+  else()
+    message("--- Failed combined CXX flags test, testing flags individually.")
+
+    if(cxx_flags)
+      message("--- Testing flags from $cxx_flags: " "${cxx_flags}")
+      foreach(cxx_flag ${cxx_flags})
+        unset(cxx_flag_test_passed CACHE)
+        message("--- Testing flag: ${cxx_flag}")
+        check_cxx_compiler_flag("${cxx_flag}" cxx_flag_test_passed)
+
+        if(cxx_flag_test_passed)
+          message("--- Passed test for ${cxx_flag}")
+        else()
+          list(REMOVE_ITEM cxx_flags ${cxx_flag})
+          message("--- Failed test for ${cxx_flag}, flag removed.")
+        endif()
+      endforeach()
+
+      set(LIBGAV1_CXX_FLAGS ${cxx_flags})
+    endif()
+  endif()
+
+  if(LIBGAV1_CXX_FLAGS)
+    list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS)
+  endif()
+endmacro()
+
+# Tests executable linker flags stored in list specified by FLAG_LIST_VAR_NAME,
+# adds flags to $LIBGAV1_EXE_LINKER_FLAGS when test passes. Terminates
+# configuration when flag check fails. libgav1_set_cxx_flags() must be called
+# before calling this macro because it assumes $LIBGAV1_CXX_FLAGS contains only
+# valid CXX flags.
+#
+# libgav1_test_exe_linker_flag(<FLAG_LIST_VAR_NAME <flag list variable>>)
+macro(libgav1_test_exe_linker_flag)
+  unset(link_FLAG_LIST_VAR_NAME)
+  unset(optional_args)
+  unset(multi_value_args)
+  set(single_value_args FLAG_LIST_VAR_NAME)
+  cmake_parse_arguments(link "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT link_FLAG_LIST_VAR_NAME)
+    libgav1_die("libgav1_test_exe_linker_flag: FLAG_LIST_VAR_NAME required")
+  endif()
+
+  libgav1_set_and_stringify(DEST linker_flags SOURCE_VARS
+                            ${link_FLAG_LIST_VAR_NAME})
+
+  if(LIBGAV1_VERBOSE)
+    message("EXE LINKER test: all flags: ${linker_flags}")
+  endif()
+
+  # Tests of $LIBGAV1_CXX_FLAGS have already passed. Include them with the
+  # linker test.
+  libgav1_set_and_stringify(DEST CMAKE_REQUIRED_FLAGS SOURCE_VARS
+                            LIBGAV1_CXX_FLAGS)
+
+  # Cache the global exe linker flags.
+  if(CMAKE_EXE_LINKER_FLAGS)
+    set(cached_CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS})
+    libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE
+                              ${linker_flags})
+  endif()
+
+  libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE ${linker_flags}
+                            ${CMAKE_EXE_LINKER_FLAGS})
+
+  # Turn off output from check_cxx_source_compiles. Print status directly
+  # instead since the logging messages from check_cxx_source_compiles can be
+  # quite confusing.
+  set(CMAKE_REQUIRED_QUIET TRUE)
+
+  message("--- Running EXE LINKER test for flags: ${linker_flags}")
+
+  unset(linker_flag_test_passed CACHE)
+  set(libgav1_cxx_main "\nint main() { return 0; }")
+  check_cxx_source_compiles("${libgav1_cxx_main}" linker_flag_test_passed)
+
+  if(NOT linker_flag_test_passed)
+    libgav1_die("EXE LINKER test failed.")
+  endif()
+
+  message("--- Passed EXE LINKER flag test.")
+
+  # Restore cached global exe linker flags (fixed: ${} dereference was missing).
+  if(cached_CMAKE_EXE_LINKER_FLAGS)
+    set(CMAKE_EXE_LINKER_FLAGS ${cached_CMAKE_EXE_LINKER_FLAGS})
+  else()
+    unset(CMAKE_EXE_LINKER_FLAGS)
+  endif()
+endmacro()
+
+# Runs the libgav1 compiler flag tests. This macro builds up the list of list
+# var(s) that is passed to libgav1_test_cxx_flag(), which validates the flags
+# and stores the surviving set in $LIBGAV1_CXX_FLAGS.
+# Note: libgav1_set_build_definitions() must be called before this macro.
+macro(libgav1_set_cxx_flags)
+  unset(cxx_flag_lists)
+
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+    list(APPEND cxx_flag_lists libgav1_base_cxx_flags)
+  endif()
+
+  # Append clang flags after the base set to allow -Wno* overrides to take
+  # effect. Some of the base flags may enable a large set of warnings, e.g.,
+  # -Wall.
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    list(APPEND cxx_flag_lists libgav1_clang_cxx_flags)
+  endif()
+
+  if(MSVC)
+    list(APPEND cxx_flag_lists libgav1_msvc_cxx_flags)
+  endif()
+
+  if(LIBGAV1_VERBOSE)
+    if(cxx_flag_lists)
+      libgav1_set_and_stringify(DEST cxx_flags SOURCE_VARS ${cxx_flag_lists})
+      message("libgav1_set_cxx_flags: internal CXX flags: ${cxx_flags}")
+    endif()
+  endif()
+
+  if(LIBGAV1_CXX_FLAGS) # user flags are appended after the internal flag lists
+    list(APPEND cxx_flag_lists LIBGAV1_CXX_FLAGS)
+    if(LIBGAV1_VERBOSE)
+      message("libgav1_set_cxx_flags: user CXX flags: ${LIBGAV1_CXX_FLAGS}")
+    endif()
+  endif()
+
+  libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES ${cxx_flag_lists})
+endmacro()
diff --git a/libgav1/cmake/libgav1_helpers.cmake b/libgav1/cmake/libgav1_helpers.cmake
new file mode 100644
index 0000000..76d8d67
--- /dev/null
+++ b/libgav1/cmake/libgav1_helpers.cmake
@@ -0,0 +1,134 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_ 1)
+
+# Terminates CMake generation via message(FATAL_ERROR), forwarding all
+# arguments ($ARGN) to the console as the error text.
+macro(libgav1_die)
+  message(FATAL_ERROR ${ARGN})
+endmacro()
+
+# Converts semi-colon delimited list variable(s) to a space delimited string.
+# Output is written to the variable named by DEST. Input comes from an expanded
+# variable passed via SOURCE and/or variable name(s) passed via SOURCE_VARS.
+macro(libgav1_set_and_stringify)
+  set(optional_args)
+  set(single_value_args DEST SOURCE_VAR) # NOTE(review): SOURCE_VAR is parsed but never used below -- confirm intended
+  set(multi_value_args SOURCE SOURCE_VARS)
+  cmake_parse_arguments(sas "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT sas_DEST OR NOT (sas_SOURCE OR sas_SOURCE_VARS))
+    libgav1_die("libgav1_set_and_stringify: DEST and at least one of SOURCE "
+                "SOURCE_VARS required.")
+  endif()
+
+  unset(${sas_DEST})
+
+  if(sas_SOURCE)
+    # $sas_SOURCE is one or more expanded variables, just copy the values to
+    # $sas_DEST.
+    set(${sas_DEST} "${sas_SOURCE}")
+  endif()
+
+  if(sas_SOURCE_VARS)
+    # $sas_SOURCE_VARS is one or more variable names. Each iteration expands a
+    # variable and appends it to $sas_DEST.
+    foreach(source_var ${sas_SOURCE_VARS})
+      set(${sas_DEST} "${${sas_DEST}} ${${source_var}}")
+    endforeach()
+
+    # Because $sas_DEST can be empty when entering this scope leading whitespace
+    # can be introduced to $sas_DEST on the first iteration of the above loop.
+    # Remove it:
+    string(STRIP "${${sas_DEST}}" ${sas_DEST})
+  endif()
+
+  # Lists in CMake are simply semicolon delimited strings, so stringification is
+  # just a find and replace of the semicolon.
+  string(REPLACE ";" " " ${sas_DEST} "${${sas_DEST}}")
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    message("libgav1_set_and_stringify: ${sas_DEST}=${${sas_DEST}}")
+  endif()
+endmacro()
+
+# Creates a dummy source file in $LIBGAV1_GENERATED_SOURCES_DIRECTORY and adds
+# it to the specified target. Optionally adds its path to a list variable.
+#
+# libgav1_create_dummy_source_file(<TARGET <target> BASENAME <basename of file>>
+# [LISTVAR <list variable>])
+macro(libgav1_create_dummy_source_file)
+  set(optional_args)
+  set(single_value_args TARGET BASENAME LISTVAR)
+  set(multi_value_args)
+  cmake_parse_arguments(cdsf "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT cdsf_TARGET OR NOT cdsf_BASENAME)
+    libgav1_die(
+      "libgav1_create_dummy_source_file: TARGET and BASENAME required.")
+  endif()
+
+  if(NOT LIBGAV1_GENERATED_SOURCES_DIRECTORY)
+    set(LIBGAV1_GENERATED_SOURCES_DIRECTORY "${libgav1_build}/gen_src") # default location
+  endif()
+
+  set(dummy_source_dir "${LIBGAV1_GENERATED_SOURCES_DIRECTORY}")
+  set(dummy_source_file
+      "${dummy_source_dir}/libgav1_${cdsf_TARGET}_${cdsf_BASENAME}.cc")
+  set(dummy_source_code
+      "// Generated file. DO NOT EDIT!\n"
+      "// C++ source file created for target ${cdsf_TARGET}. \n"
+      "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void);\n"
+      "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void) {}\n")
+  file(WRITE "${dummy_source_file}" "${dummy_source_code}") # rewritten on every configure
+
+  target_sources(${cdsf_TARGET} PRIVATE ${dummy_source_file})
+
+  if(cdsf_LISTVAR)
+    list(APPEND ${cdsf_LISTVAR} "${dummy_source_file}")
+  endif()
+endmacro()
+
+# Loads the version components from $libgav1_source/gav1/version.h and sets the
+# corresponding CMake variables:
+# - LIBGAV1_MAJOR_VERSION
+# - LIBGAV1_MINOR_VERSION
+# - LIBGAV1_PATCH_VERSION
+# - LIBGAV1_VERSION, which is:
+#   - $LIBGAV1_MAJOR_VERSION.$LIBGAV1_MINOR_VERSION.$LIBGAV1_PATCH_VERSION
+macro(libgav1_load_version_info)
+  file(STRINGS "${libgav1_source}/gav1/version.h" version_file_strings) # one list entry per line
+  foreach(str ${version_file_strings})
+    if(str MATCHES "#define LIBGAV1_")
+      if(str MATCHES "#define LIBGAV1_MAJOR_VERSION ")
+        string(REPLACE "#define LIBGAV1_MAJOR_VERSION " "" LIBGAV1_MAJOR_VERSION
+                       "${str}") # the value is whatever follows the prefix
+      elseif(str MATCHES "#define LIBGAV1_MINOR_VERSION ")
+        string(REPLACE "#define LIBGAV1_MINOR_VERSION " "" LIBGAV1_MINOR_VERSION
+                       "${str}")
+      elseif(str MATCHES "#define LIBGAV1_PATCH_VERSION ")
+        string(REPLACE "#define LIBGAV1_PATCH_VERSION " "" LIBGAV1_PATCH_VERSION
+                       "${str}")
+      endif()
+    endif()
+  endforeach()
+  set(LIBGAV1_VERSION "${LIBGAV1_MAJOR_VERSION}.${LIBGAV1_MINOR_VERSION}")
+  set(LIBGAV1_VERSION "${LIBGAV1_VERSION}.${LIBGAV1_PATCH_VERSION}")
+endmacro()
diff --git a/libgav1/cmake/libgav1_install.cmake b/libgav1/cmake/libgav1_install.cmake
new file mode 100644
index 0000000..b7f6006
--- /dev/null
+++ b/libgav1/cmake/libgav1_install.cmake
@@ -0,0 +1,60 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_ 1)
+
+# Sets up the Libgav1 install targets (pkg-config file, CMake config file,
+# public headers, executable, and libraries). Must be called after the static
+# library target is created. No-op for MSVC and Xcode generators.
+macro(libgav1_setup_install_target)
+  if(NOT (MSVC OR XCODE))
+    include(GNUInstallDirs)
+
+    # pkg-config: libgav1.pc
+    set(prefix "${CMAKE_INSTALL_PREFIX}")
+    set(exec_prefix "\${prefix}") # escaped so the .pc file contains a literal ${prefix}
+    set(libdir "\${prefix}/${CMAKE_INSTALL_LIBDIR}")
+    set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+    set(libgav1_lib_name "libgav1")
+
+    configure_file("${libgav1_root}/cmake/libgav1.pc.template"
+                   "${libgav1_build}/libgav1.pc" @ONLY NEWLINE_STYLE UNIX)
+    install(FILES "${libgav1_build}/libgav1.pc"
+            DESTINATION "${prefix}/${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+
+    # CMake config: libgav1-config.cmake
+    set(LIBGAV1_INCLUDE_DIRS "${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+    configure_file("${libgav1_root}/cmake/libgav1-config.cmake.template"
+                   "${libgav1_build}/libgav1-config.cmake" @ONLY
+                   NEWLINE_STYLE UNIX)
+    install(
+      FILES "${libgav1_build}/libgav1-config.cmake"
+      DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_DATAROOTDIR}/cmake")
+
+    install(
+      FILES ${libgav1_api_includes}
+      DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gav1")
+
+    install(TARGETS gav1_decode DESTINATION
+                    "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
+    install(TARGETS libgav1_static DESTINATION
+                    "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+    if(BUILD_SHARED_LIBS)
+      install(TARGETS libgav1_shared DESTINATION
+                      "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+    endif()
+  endif()
+endmacro()
diff --git a/libgav1/cmake/libgav1_intrinsics.cmake b/libgav1/cmake/libgav1_intrinsics.cmake
new file mode 100644
index 0000000..039ef35
--- /dev/null
+++ b/libgav1/cmake/libgav1_intrinsics.cmake
@@ -0,0 +1,110 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_ 1)
+
+# Returns the compiler flag for the SIMD intrinsics suffix specified by the
+# SUFFIX argument via the variable specified by the VARIABLE argument:
+# libgav1_get_intrinsics_flag_for_suffix(SUFFIX <suffix> VARIABLE <var name>)
+macro(libgav1_get_intrinsics_flag_for_suffix)
+  unset(intrinsics_SUFFIX)
+  unset(intrinsics_VARIABLE)
+  unset(optional_args)
+  unset(multi_value_args)
+  set(single_value_args SUFFIX VARIABLE)
+  cmake_parse_arguments(intrinsics "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT (intrinsics_SUFFIX AND intrinsics_VARIABLE))
+    message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: SUFFIX and "
+                        "VARIABLE required.")
+  endif()
+
+  if(intrinsics_SUFFIX MATCHES "neon")
+    if(NOT MSVC) # no flag is set for MSVC builds
+      set(${intrinsics_VARIABLE} "${LIBGAV1_NEON_INTRINSICS_FLAG}")
+    endif()
+  elseif(intrinsics_SUFFIX MATCHES "sse4")
+    if(NOT MSVC) # no flag is set for MSVC builds
+      set(${intrinsics_VARIABLE} "-msse4.1")
+    endif()
+  else()
+    message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: Unknown "
+                        "intrinsics suffix: ${intrinsics_SUFFIX}")
+  endif()
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    message("libgav1_get_intrinsics_flag_for_suffix: "
+            "suffix:${intrinsics_SUFFIX} flag:${${intrinsics_VARIABLE}}")
+  endif()
+endmacro()
+
+# Processes source files specified by SOURCES and adds intrinsics flags as
+# necessary: libgav1_process_intrinsics_sources(SOURCES <sources>)
+#
+# Detects the requirement for intrinsics flags using the source file name
+# suffix; handles both the SSE4.1 and NEON suffixes.
+macro(libgav1_process_intrinsics_sources)
+  unset(arg_TARGET)
+  unset(arg_SOURCES)
+  unset(optional_args)
+  set(single_value_args TARGET)
+  set(multi_value_args SOURCES)
+  cmake_parse_arguments(arg "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+  if(NOT (arg_TARGET AND arg_SOURCES))
+    message(FATAL_ERROR "libgav1_process_intrinsics_sources: TARGET and "
+                        "SOURCES required.")
+  endif()
+
+  if(LIBGAV1_ENABLE_SSE4_1 AND libgav1_have_sse4)
+    unset(sse4_sources)
+    list(APPEND sse4_sources ${arg_SOURCES})
+
+    list(FILTER sse4_sources INCLUDE REGEX
+         "${libgav1_sse4_source_file_suffix}$")
+
+    if(sse4_sources)
+      unset(sse4_flags)
+      libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+                                             ${libgav1_sse4_source_file_suffix}
+                                             VARIABLE sse4_flags)
+      if(sse4_flags)
+        libgav1_set_compiler_flags_for_sources(SOURCES ${sse4_sources} FLAGS
+                                               ${sse4_flags})
+      endif()
+    endif()
+  endif()
+
+  if(LIBGAV1_ENABLE_NEON AND libgav1_have_neon)
+    unset(neon_sources)
+    list(APPEND neon_sources ${arg_SOURCES})
+    list(FILTER neon_sources INCLUDE REGEX
+         "${libgav1_neon_source_file_suffix}$")
+
+    if(neon_sources AND LIBGAV1_NEON_INTRINSICS_FLAG)
+      unset(neon_flags)
+      libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+                                             ${libgav1_neon_source_file_suffix}
+                                             VARIABLE neon_flags)
+      if(neon_flags)
+        libgav1_set_compiler_flags_for_sources(SOURCES ${neon_sources} FLAGS
+                                               ${neon_flags})
+      endif()
+    endif()
+  endif()
+endmacro()
diff --git a/libgav1/cmake/libgav1_options.cmake b/libgav1/cmake/libgav1_options.cmake
new file mode 100644
index 0000000..6327bee
--- /dev/null
+++ b/libgav1/cmake/libgav1_options.cmake
@@ -0,0 +1,55 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_ 1) # guard value was missing; empty would never trigger the guard
+
+# Simple wrapper for CMake's builtin option() command that also records each
+# option name in the list variable $libgav1_options (deduplicated).
+macro(libgav1_option)
+  unset(option_NAME)
+  unset(option_HELPSTRING)
+  unset(option_VALUE)
+  unset(optional_args)
+  unset(multi_value_args)
+  set(single_value_args NAME HELPSTRING VALUE)
+  cmake_parse_arguments(option "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(NOT (option_NAME AND option_HELPSTRING AND DEFINED option_VALUE))
+    message(FATAL_ERROR "libgav1_option: NAME HELPSTRING and VALUE required.")
+  endif()
+
+  option(${option_NAME} ${option_HELPSTRING} ${option_VALUE})
+
+  if(LIBGAV1_VERBOSE GREATER 2)
+    message("--------- libgav1_option ---------\n"
+            "option_NAME=${option_NAME}\n"
+            "option_HELPSTRING=${option_HELPSTRING}\n"
+            "option_VALUE=${option_VALUE}\n"
+            "------------------------------------------\n")
+  endif()
+
+  list(APPEND libgav1_options ${option_NAME})
+  list(REMOVE_DUPLICATES libgav1_options)
+endmacro()
+
+# Prints the name and current value of every option tracked in
+macro(libgav1_dump_options) # $libgav1_options via CMake's message command.
+  foreach(option_name ${libgav1_options})
+    message("${option_name}: ${${option_name}}")
+  endforeach()
+endmacro()
diff --git a/libgav1/cmake/libgav1_sanitizer.cmake b/libgav1/cmake/libgav1_sanitizer.cmake
new file mode 100644
index 0000000..4bb2263
--- /dev/null
+++ b/libgav1/cmake/libgav1_sanitizer.cmake
@@ -0,0 +1,45 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_ 1)
+
+macro(libgav1_configure_sanitizer) # applies $LIBGAV1_SANITIZE CXX/linker flags, then validates them
+  if(LIBGAV1_SANITIZE AND NOT MSVC)
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      if(LIBGAV1_SANITIZE MATCHES "cfi") # CFI builds add LTO and the gold linker
+        list(APPEND LIBGAV1_CXX_FLAGS "-flto" "-fno-sanitize-trap=cfi")
+        list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-flto" "-fno-sanitize-trap=cfi"
+                    "-fuse-ld=gold")
+      endif()
+
+      if(${CMAKE_SIZEOF_VOID_P} EQUAL 4
+         AND LIBGAV1_SANITIZE MATCHES "integer|undefined")
+        list(APPEND LIBGAV1_EXE_LINKER_FLAGS "--rtlib=compiler-rt" "-lgcc_s") # 32-bit only
+      endif()
+    endif()
+
+    list(APPEND LIBGAV1_CXX_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}")
+    list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}")
+
+    # Make sanitizer callstacks accurate.
+    list(APPEND LIBGAV1_CXX_FLAGS "-fno-omit-frame-pointer"
+                "-fno-optimize-sibling-calls")
+
+    libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED)
+    libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME LIBGAV1_EXE_LINKER_FLAGS)
+  endif()
+endmacro()
diff --git a/libgav1/cmake/libgav1_targets.cmake b/libgav1/cmake/libgav1_targets.cmake
new file mode 100644
index 0000000..78b4865
--- /dev/null
+++ b/libgav1/cmake/libgav1_targets.cmake
@@ -0,0 +1,347 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_
+set(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_ 1)
+
+# Resets list variables used to track libgav1 targets. NOTE(review): libgav1_dylib_targets and libgav1_test_sources are appended to by sibling macros but not reset here -- confirm intended.
+macro(libgav1_reset_target_lists)
+  unset(libgav1_targets)
+  unset(libgav1_exe_targets)
+  unset(libgav1_lib_targets)
+  unset(libgav1_objlib_targets)
+  unset(libgav1_sources)
+  unset(libgav1_test_targets)
+endmacro()
+
+# Creates an executable target. The target name is passed as a parameter to the
+# NAME argument, and the sources passed as a parameter to the SOURCES argument:
+# libgav1_add_test(NAME <name> SOURCES <sources> [optional args])
+#
+# Optional args:
+# cmake-format: off
+#   - OUTPUT_NAME: Override output file basename. Target basename defaults to
+#     NAME.
+#   - TEST: Flag. Presence means treat executable as a test.
+#   - DEFINES: List of preprocessor macro definitions.
+#   - INCLUDES: list of include directories for the target.
+#   - COMPILE_FLAGS: list of compiler flags for the target.
+#   - LINK_FLAGS: List of linker flags for the target.
+#   - OBJLIB_DEPS: List of CMake object library target dependencies.
+#   - LIB_DEPS: List of CMake library dependencies.
+# cmake-format: on
+#
+# Sources passed to this macro are added to $libgav1_test_sources when TEST is
+# specified. Otherwise sources are added to $libgav1_sources.
+#
+# Targets passed to this macro are always added $libgav1_targets. When TEST is
+# specified targets are also added to list $libgav1_test_targets. Otherwise
+# targets are added to $libgav1_exe_targets.
+macro(libgav1_add_executable)
+  unset(exe_TEST)
+  unset(exe_TEST_DEFINES_MAIN)
+  unset(exe_NAME)
+  unset(exe_OUTPUT_NAME)
+  unset(exe_SOURCES)
+  unset(exe_DEFINES)
+  unset(exe_INCLUDES)
+  unset(exe_COMPILE_FLAGS)
+  unset(exe_LINK_FLAGS)
+  unset(exe_OBJLIB_DEPS)
+  unset(exe_LIB_DEPS)
+  set(optional_args TEST)
+  set(single_value_args NAME OUTPUT_NAME)
+  set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+                       OBJLIB_DEPS LIB_DEPS)
+
+  cmake_parse_arguments(exe "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    message("--------- libgav1_add_executable ---------\n"
+            "exe_TEST=${exe_TEST}\n"
+            "exe_TEST_DEFINES_MAIN=${exe_TEST_DEFINES_MAIN}\n"
+            "exe_NAME=${exe_NAME}\n"
+            "exe_OUTPUT_NAME=${exe_OUTPUT_NAME}\n"
+            "exe_SOURCES=${exe_SOURCES}\n"
+            "exe_DEFINES=${exe_DEFINES}\n"
+            "exe_INCLUDES=${exe_INCLUDES}\n"
+            "exe_COMPILE_FLAGS=${exe_COMPILE_FLAGS}\n"
+            "exe_LINK_FLAGS=${exe_LINK_FLAGS}\n"
+            "exe_OBJLIB_DEPS=${exe_OBJLIB_DEPS}\n"
+            "exe_LIB_DEPS=${exe_LIB_DEPS}\n"
+            "------------------------------------------\n")
+  endif()
+
+  if(NOT (exe_NAME AND exe_SOURCES))
+    message(FATAL_ERROR "libgav1_add_executable: NAME and SOURCES required.")
+  endif()
+
+  list(APPEND libgav1_targets ${exe_NAME})
+  if(exe_TEST)
+    list(APPEND libgav1_test_targets ${exe_NAME})
+    list(APPEND libgav1_test_sources ${exe_SOURCES})
+  else()
+    list(APPEND libgav1_exe_targets ${exe_NAME})
+    list(APPEND libgav1_sources ${exe_SOURCES})
+  endif()
+
+  add_executable(${exe_NAME} ${exe_SOURCES})
+
+  if(exe_OUTPUT_NAME)
+    set_target_properties(${exe_NAME} PROPERTIES OUTPUT_NAME ${exe_OUTPUT_NAME})
+  endif()
+
+  libgav1_process_intrinsics_sources(TARGET ${exe_NAME} SOURCES ${exe_SOURCES})
+
+  if(exe_DEFINES)
+    target_compile_definitions(${exe_NAME} PRIVATE ${exe_DEFINES})
+  endif()
+
+  if(exe_INCLUDES)
+    target_include_directories(${exe_NAME} PRIVATE ${exe_INCLUDES})
+  endif()
+
+  if(exe_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+    target_compile_options(${exe_NAME}
+                           PRIVATE ${exe_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+  endif()
+
+  if(exe_LINK_FLAGS OR LIBGAV1_EXE_LINKER_FLAGS)
+    set_target_properties(${exe_NAME}
+                          PROPERTIES LINK_FLAGS ${exe_LINK_FLAGS}
+                                     ${LIBGAV1_EXE_LINKER_FLAGS})
+  endif()
+
+  if(exe_OBJLIB_DEPS)
+    foreach(objlib_dep ${exe_OBJLIB_DEPS})
+      target_sources(${exe_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+    endforeach()
+  endif()
+
+  if(CMAKE_THREAD_LIBS_INIT)
+    list(APPEND exe_LIB_DEPS ${CMAKE_THREAD_LIBS_INIT})
+  endif()
+
+  if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
+    target_compile_definitions(${exe_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0") # fixed: was lib_NAME, which this macro never sets
+  endif()
+
+  if(exe_LIB_DEPS)
+    unset(exe_static)
+    if("${CMAKE_EXE_LINKER_FLAGS} ${LIBGAV1_EXE_LINKER_FLAGS}" MATCHES "static")
+      set(exe_static ON)
+    endif()
+
+    if(exe_static AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+      # Third party dependencies can introduce dependencies on system and test
+      # libraries. Since the target created here is an executable, and CMake
+      # does not provide a method of controlling order of link dependencies,
+      # wrap all of the dependencies of this target in start/end group flags to
+      # ensure that dependencies of third party targets can be resolved when
+      # those dependencies happen to be resolved by dependencies of the current
+      # target.
+      list(INSERT exe_LIB_DEPS 0 -Wl,--start-group)
+      list(APPEND exe_LIB_DEPS -Wl,--end-group)
+    endif()
+    target_link_libraries(${exe_NAME} PRIVATE ${exe_LIB_DEPS})
+  endif()
+endmacro()
+
+# Creates a library target of the specified type. The target name is passed as a
+# parameter to the NAME argument, the type as a parameter to the TYPE argument,
+# and the sources passed as a parameter to the SOURCES argument:
+# libgav1_add_library(NAME <name> TYPE <type> SOURCES <sources> [optional args])
+#
+# Optional args:
+# cmake-format: off
+#   - OUTPUT_NAME: Override output file basename. Target basename defaults to
+#     NAME. OUTPUT_NAME is ignored when BUILD_SHARED_LIBS is enabled and CMake
+#     is generating a build for which MSVC or WIN32 are true. This is to avoid
+#     output basename collisions with DLL import libraries.
+#   - TEST: Flag. Presence means treat library as a test.
+#   - DEFINES: List of preprocessor macro definitions.
+#   - INCLUDES: list of include directories for the target.
+#   - COMPILE_FLAGS: list of compiler flags for the target.
+#   - LINK_FLAGS: List of linker flags for the target.
+#   - OBJLIB_DEPS: List of CMake object library target dependencies.
+#   - LIB_DEPS: List of CMake library dependencies.
+#   - PUBLIC_INCLUDES: List of include paths to export to dependents.
+# cmake-format: on
+#
+# Sources passed to the macro are added to the lists tracking libgav1 sources:
+# cmake-format: off
+#   - When TEST is specified sources are added to $libgav1_test_sources.
+#   - Otherwise sources are added to $libgav1_sources.
+# cmake-format: on
+#
+# Targets passed to this macro are added to the lists tracking libgav1 targets:
+# cmake-format: off
+#   - Targets are always added to $libgav1_targets.
+#   - When the TEST flag is specified, targets are added to
+#     $libgav1_test_targets.
+#   - When TEST is not specified:
+#     - Libraries of type SHARED are added to $libgav1_dylib_targets.
+#     - Libraries of type OBJECT are added to $libgav1_objlib_targets.
+#     - Libraries of type STATIC are added to $libgav1_lib_targets.
+# cmake-format: on
+macro(libgav1_add_library) # see the usage comment block above this macro
+  unset(lib_TEST)
+  unset(lib_NAME)
+  unset(lib_OUTPUT_NAME)
+  unset(lib_TYPE)
+  unset(lib_SOURCES)
+  unset(lib_DEFINES)
+  unset(lib_INCLUDES)
+  unset(lib_COMPILE_FLAGS)
+  unset(lib_LINK_FLAGS)
+  unset(lib_OBJLIB_DEPS)
+  unset(lib_LIB_DEPS)
+  unset(lib_PUBLIC_INCLUDES)
+  set(optional_args TEST)
+  set(single_value_args NAME OUTPUT_NAME TYPE)
+  set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+                       OBJLIB_DEPS LIB_DEPS PUBLIC_INCLUDES)
+
+  cmake_parse_arguments(lib "${optional_args}" "${single_value_args}"
+                        "${multi_value_args}" ${ARGN})
+
+  if(LIBGAV1_VERBOSE GREATER 1)
+    message("--------- libgav1_add_library ---------\n"
+            "lib_TEST=${lib_TEST}\n"
+            "lib_NAME=${lib_NAME}\n"
+            "lib_OUTPUT_NAME=${lib_OUTPUT_NAME}\n"
+            "lib_TYPE=${lib_TYPE}\n"
+            "lib_SOURCES=${lib_SOURCES}\n"
+            "lib_DEFINES=${lib_DEFINES}\n"
+            "lib_INCLUDES=${lib_INCLUDES}\n"
+            "lib_COMPILE_FLAGS=${lib_COMPILE_FLAGS}\n"
+            "lib_LINK_FLAGS=${lib_LINK_FLAGS}\n"
+            "lib_OBJLIB_DEPS=${lib_OBJLIB_DEPS}\n"
+            "lib_LIB_DEPS=${lib_LIB_DEPS}\n"
+            "lib_PUBLIC_INCLUDES=${lib_PUBLIC_INCLUDES}\n"
+            "---------------------------------------\n")
+  endif()
+
+  if(NOT (lib_NAME AND lib_TYPE AND lib_SOURCES))
+    message(FATAL_ERROR "libgav1_add_library: NAME, TYPE and SOURCES required.")
+  endif()
+
+  list(APPEND libgav1_targets ${lib_NAME})
+  if(lib_TEST)
+    list(APPEND libgav1_test_targets ${lib_NAME})
+    list(APPEND libgav1_test_sources ${lib_SOURCES})
+  else()
+    list(APPEND libgav1_sources ${lib_SOURCES})
+    if(lib_TYPE STREQUAL OBJECT)
+      list(APPEND libgav1_objlib_targets ${lib_NAME})
+    elseif(lib_TYPE STREQUAL SHARED)
+      list(APPEND libgav1_dylib_targets ${lib_NAME})
+    elseif(lib_TYPE STREQUAL STATIC)
+      list(APPEND libgav1_lib_targets ${lib_NAME})
+    else()
+      message(WARNING "libgav1_add_library: Unhandled type: ${lib_TYPE}")
+    endif()
+  endif()
+
+  add_library(${lib_NAME} ${lib_TYPE} ${lib_SOURCES})
+  libgav1_process_intrinsics_sources(TARGET ${lib_NAME} SOURCES ${lib_SOURCES})
+
+  if(lib_OUTPUT_NAME)
+    if(NOT (BUILD_SHARED_LIBS AND (MSVC OR WIN32))) # see OUTPUT_NAME note above
+      set_target_properties(${lib_NAME}
+                            PROPERTIES OUTPUT_NAME ${lib_OUTPUT_NAME})
+    endif()
+  endif()
+
+  if(lib_DEFINES)
+    target_compile_definitions(${lib_NAME} PRIVATE ${lib_DEFINES})
+  endif()
+
+  if(lib_INCLUDES)
+    target_include_directories(${lib_NAME} PRIVATE ${lib_INCLUDES})
+  endif()
+
+  if(lib_PUBLIC_INCLUDES)
+    target_include_directories(${lib_NAME} PUBLIC ${lib_PUBLIC_INCLUDES})
+  endif()
+
+  if(lib_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+    target_compile_options(${lib_NAME}
+                           PRIVATE ${lib_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+  endif()
+
+  if(lib_LINK_FLAGS)
+    set_target_properties(${lib_NAME} PROPERTIES LINK_FLAGS ${lib_LINK_FLAGS})
+  endif()
+
+  if(lib_OBJLIB_DEPS)
+    foreach(objlib_dep ${lib_OBJLIB_DEPS})
+      target_sources(${lib_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+    endforeach()
+  endif()
+
+  if(lib_LIB_DEPS)
+    if(lib_TYPE STREQUAL STATIC)
+      set(link_type PUBLIC) # deps of a static lib propagate to its consumers
+    else()
+      set(link_type PRIVATE)
+      if(lib_TYPE STREQUAL SHARED AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+        # The libgav1 shared object uses the static libgav1 as input to turn it
+        # into a shared object. Include everything from the static library in
+        # the shared object.
+        if(APPLE)
+          list(INSERT lib_LIB_DEPS 0 -Wl,-force_load)
+        else()
+          list(INSERT lib_LIB_DEPS 0 -Wl,--whole-archive)
+          list(APPEND lib_LIB_DEPS -Wl,--no-whole-archive)
+        endif()
+      endif()
+    endif()
+    target_link_libraries(${lib_NAME} ${link_type} ${lib_LIB_DEPS})
+  endif()
+
+  if(NOT MSVC AND lib_NAME MATCHES "^lib")
+    # Non-MSVC generators prepend lib to static lib target file names. Libgav1
+    # already includes lib in its name. Avoid naming output files liblib*.
+    set_target_properties(${lib_NAME} PROPERTIES PREFIX "")
+  endif()
+
+  if(lib_TYPE STREQUAL SHARED AND NOT MSVC)
+    set_target_properties(${lib_NAME} PROPERTIES SOVERSION ${LIBGAV1_SOVERSION})
+  endif()
+
+  if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
+    if(lib_TYPE STREQUAL SHARED)
+      target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=1")
+    else()
+      target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+    endif()
+  endif()
+
+  # Determine if $lib_NAME is a header only target.
+  set(sources_list ${lib_SOURCES})
+  list(FILTER sources_list INCLUDE REGEX cc$) # keep only files ending in "cc"
+  if(NOT sources_list)
+    if(NOT XCODE)
+      # This is a header only target. Tell CMake the link language.
+      set_target_properties(${lib_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+    else()
+      # The Xcode generator ignores LINKER_LANGUAGE. Add a dummy cc file.
+      libgav1_create_dummy_source_file(TARGET ${lib_NAME} BASENAME ${lib_NAME})
+    endif()
+  endif()
+endmacro()
diff --git a/libgav1/cmake/libgav1_variables.cmake b/libgav1/cmake/libgav1_variables.cmake
new file mode 100644
index 0000000..0dd0f37
--- /dev/null
+++ b/libgav1/cmake/libgav1_variables.cmake
@@ -0,0 +1,78 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_ 1)
+
+# Halts generation when $variable_name does not refer to a directory that
+# exists.
+macro(libgav1_variable_must_be_directory variable_name)
+  if("${variable_name}" STREQUAL "")
+    message(
+      FATAL_ERROR
+        "Empty variable_name passed to libgav1_variable_must_be_directory.")
+  endif()
+
+  if("${${variable_name}}" STREQUAL "")
+    message(
+      FATAL_ERROR
+        "Empty variable ${variable_name} is required to build libgav1.")
+  endif()
+
+  if(NOT IS_DIRECTORY "${${variable_name}}")
+    message(
+      FATAL_ERROR
+        "${variable_name}, which is ${${variable_name}}, does not refer to a\n"
+        "directory.")
+  endif()
+endmacro()
+
+# Adds $var_name to the tracked variables list.
+macro(libgav1_track_configuration_variable var_name)
+  if(LIBGAV1_VERBOSE GREATER 2)
+    message("---- libgav1_track_configuration_variable ----\n"
+            "var_name=${var_name}\n"
+            "----------------------------------------------\n")
+  endif()
+
+  list(APPEND libgav1_configuration_variables ${var_name})
+  list(REMOVE_DUPLICATES libgav1_configuration_variables)
+endmacro()
+
+# Logs current C++ and executable linker flags via CMake's message command.
+macro(libgav1_dump_cmake_flag_variables)
+  unset(flag_variables)
+  list(APPEND flag_variables "CMAKE_CXX_FLAGS_INIT" "CMAKE_CXX_FLAGS"
+              "CMAKE_EXE_LINKER_FLAGS_INIT" "CMAKE_EXE_LINKER_FLAGS")
+  if(CMAKE_BUILD_TYPE)
+    list(APPEND flag_variables "CMAKE_BUILD_TYPE"
+                "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}_INIT"
+                "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}"
+                "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}_INIT"
+                "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}")
+  endif()
+  foreach(flag_variable ${flag_variables})
+    message("${flag_variable}:${${flag_variable}}")
+  endforeach()
+endmacro()
+
+# Dumps the variables tracked in $libgav1_configuration_variables via CMake's
+# message command.
+macro(libgav1_dump_tracked_configuration_variables)
+  foreach(config_variable ${libgav1_configuration_variables})
+    message("${config_variable}:${${config_variable}}")
+  endforeach()
+endmacro()
diff --git a/libgav1/cmake/toolchains/aarch64-linux-gnu.cmake b/libgav1/cmake/toolchains/aarch64-linux-gnu.cmake
new file mode 100644
index 0000000..7ffe397
--- /dev/null
+++ b/libgav1/cmake/toolchains/aarch64-linux-gnu.cmake
@@ -0,0 +1,28 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_
+set(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+  set(CROSS aarch64-linux-gnu-)
+endif()
+
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_CXX_FLAGS_INIT "-march=armv8-a")
+set(CMAKE_SYSTEM_PROCESSOR "aarch64")
diff --git a/libgav1/cmake/toolchains/android.cmake b/libgav1/cmake/toolchains/android.cmake
new file mode 100644
index 0000000..492957b
--- /dev/null
+++ b/libgav1/cmake/toolchains/android.cmake
@@ -0,0 +1,53 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_
+
+# Additional ANDROID_* settings are available, see:
+# https://developer.android.com/ndk/guides/cmake#variables
+
+if(NOT ANDROID_PLATFORM)
+  set(ANDROID_PLATFORM android-21)
+endif()
+
+# Choose target architecture with:
+#
+# -DANDROID_ABI={armeabi-v7a,armeabi-v7a with NEON,arm64-v8a,x86,x86_64}
+if(NOT ANDROID_ABI)
+  set(ANDROID_ABI arm64-v8a)
+endif()
+
+# Force arm mode for 32-bit targets (instead of the default thumb) to improve
+# performance.
+if(NOT ANDROID_ARM_MODE)
+  set(ANDROID_ARM_MODE arm)
+endif()
+
+# Toolchain files don't have access to cached variables:
+# https://gitlab.kitware.com/cmake/cmake/issues/16170. Set an intermediate
+# environment variable when loaded the first time.
+if(LIBGAV1_ANDROID_NDK_PATH)
+  set(ENV{LIBGAV1_ANDROID_NDK_PATH} "${LIBGAV1_ANDROID_NDK_PATH}")
+else()
+  set(LIBGAV1_ANDROID_NDK_PATH "$ENV{LIBGAV1_ANDROID_NDK_PATH}")
+endif()
+
+if(NOT LIBGAV1_ANDROID_NDK_PATH)
+  message(FATAL_ERROR "LIBGAV1_ANDROID_NDK_PATH not set.")
+  return()
+endif()
+
+include("${LIBGAV1_ANDROID_NDK_PATH}/build/cmake/android.toolchain.cmake")
diff --git a/libgav1/cmake/toolchains/arm-linux-gnueabihf.cmake b/libgav1/cmake/toolchains/arm-linux-gnueabihf.cmake
new file mode 100644
index 0000000..8051f0d
--- /dev/null
+++ b/libgav1/cmake/toolchains/arm-linux-gnueabihf.cmake
@@ -0,0 +1,29 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_)
+  return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_
+set(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+  set(CROSS arm-linux-gnueabihf-)
+endif()
+
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm")
+set(CMAKE_SYSTEM_PROCESSOR "armv7")
+set(LIBGAV1_NEON_INTRINSICS_FLAG "-mfpu=neon")
diff --git a/libgav1/codereview.settings b/libgav1/codereview.settings
new file mode 100644
index 0000000..ccba2ee
--- /dev/null
+++ b/libgav1/codereview.settings
@@ -0,0 +1,4 @@
+# This file is used by git cl to get repository specific information.
+GERRIT_HOST: True
+CODE_REVIEW_SERVER: chromium-review.googlesource.com
+GERRIT_SQUASH_UPLOADS: False
diff --git a/libgav1/examples/file_reader.cc b/libgav1/examples/file_reader.cc
new file mode 100644
index 0000000..b096722
--- /dev/null
+++ b/libgav1/examples/file_reader.cc
@@ -0,0 +1,186 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <new>
+#include <string>
+#include <vector>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/ivf_parser.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+  _setmode(_fileno(stream), _O_BINARY);
+#endif
+  return stream;
+}
+
+}  // namespace
+
+bool FileReader::registered_in_factory_ =
+    FileReaderFactory::RegisterReader(FileReader::Open);
+
+FileReader::~FileReader() {
+  if (owns_file_) fclose(file_);
+}
+
+std::unique_ptr<FileReaderInterface> FileReader::Open(
+    const std::string& file_name, const bool error_tolerant) {
+  if (file_name.empty()) return nullptr;
+
+  FILE* raw_file_ptr;
+
+  bool owns_file = true;
+  if (file_name == "-") {
+    raw_file_ptr = SetBinaryMode(stdin);
+    owns_file = false;  // stdin is owned by the Standard C Library.
+  } else {
+    raw_file_ptr = fopen(file_name.c_str(), "rb");
+  }
+
+  if (raw_file_ptr == nullptr) {
+    return nullptr;
+  }
+
+  std::unique_ptr<FileReader> file(
+      new (std::nothrow) FileReader(raw_file_ptr, owns_file, error_tolerant));
+  if (file == nullptr) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+    if (owns_file) fclose(raw_file_ptr);
+    return nullptr;
+  }
+
+  if (!file->ReadIvfFileHeader()) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported file type");
+    return nullptr;
+  }
+
+  return file;
+}
+
+// IVF Frame Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3    size of frame in bytes (not including the 12-byte header)
+// bytes 4-11   64-bit presentation timestamp
+// bytes 12..   frame data
+bool FileReader::ReadTemporalUnit(std::vector<uint8_t>* const tu_data,
+                                  int64_t* const timestamp) {
+  if (tu_data == nullptr) return false;
+  tu_data->clear();
+
+  uint8_t header_buffer[kIvfFrameHeaderSize];
+  const size_t num_read = fread(header_buffer, 1, kIvfFrameHeaderSize, file_);
+
+  if (IsEndOfFile()) {
+    if (num_read != 0) {
+      LIBGAV1_EXAMPLES_LOG_ERROR(
+          "Cannot read IVF frame header: Not enough data available");
+      return false;
+    }
+
+    return true;
+  }
+
+  IvfFrameHeader ivf_frame_header;
+  if (!ParseIvfFrameHeader(header_buffer, &ivf_frame_header)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF frame header");
+    if (error_tolerant_) {
+      ivf_frame_header.frame_size =
+          std::min(ivf_frame_header.frame_size, size_t{kMaxTemporalUnitSize});
+    } else {
+      return false;
+    }
+  }
+
+  if (timestamp != nullptr) *timestamp = ivf_frame_header.timestamp;
+
+  tu_data->resize(ivf_frame_header.frame_size);
+  const size_t size_read =
+      fread(tu_data->data(), 1, ivf_frame_header.frame_size, file_);
+  if (size_read != ivf_frame_header.frame_size) {
+    LIBGAV1_EXAMPLES_LOG_ERROR(
+        "Unexpected EOF or I/O error reading frame data");
+    if (error_tolerant_) {
+      tu_data->resize(size_read);
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Attempt to read an IVF file header. Returns true for success, and false for
+// failure.
+//
+// IVF File Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3    signature: 'DKIF'
+// bytes 4-5    version (should be 0)
+// bytes 6-7    length of header in bytes
+// bytes 8-11   codec FourCC (e.g., 'VP80')
+// bytes 12-13  width in pixels
+// bytes 14-15  height in pixels
+// bytes 16-19  frame rate
+// bytes 20-23  time scale
+// bytes 24-27  number of frames in file
+// bytes 28-31  unused
+//
+// Note: The rate and scale fields correspond to the numerator and denominator
+// of frame rate (fps) or time base (the reciprocal of frame rate) as follows:
+//
+// bytes 16-19  frame rate  timebase.den  framerate.numerator
+// bytes 20-23  time scale  timebase.num  framerate.denominator
+bool FileReader::ReadIvfFileHeader() {
+  uint8_t header_buffer[kIvfFileHeaderSize];
+  const size_t num_read = fread(header_buffer, 1, kIvfFileHeaderSize, file_);
+  if (num_read != kIvfFileHeaderSize) {
+    LIBGAV1_EXAMPLES_LOG_ERROR(
+        "Cannot read IVF header: Not enough data available");
+    return false;
+  }
+
+  IvfFileHeader ivf_file_header;
+  if (!ParseIvfFileHeader(header_buffer, &ivf_file_header)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF file header");
+    if (error_tolerant_) {
+      ivf_file_header = {};
+    } else {
+      return false;
+    }
+  }
+
+  width_ = ivf_file_header.width;
+  height_ = ivf_file_header.height;
+  frame_rate_ = ivf_file_header.frame_rate_numerator;
+  time_scale_ = ivf_file_header.frame_rate_denominator;
+  type_ = kFileTypeIvf;
+
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/libgav1/examples/file_reader.h b/libgav1/examples/file_reader.h
new file mode 100644
index 0000000..c342a20
--- /dev/null
+++ b/libgav1/examples/file_reader.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+// Temporal Unit based file reader class. Currently supports only IVF files.
+class FileReader : public FileReaderInterface {
+ public:
+  enum FileType {
+    kFileTypeUnknown,
+    kFileTypeIvf,
+  };
+
+  // Creates and returns a FileReader that reads from |file_name|.
+  // If |error_tolerant| is true format and read errors are ignored,
+  // ReadTemporalUnit() may return truncated data.
+  // Returns nullptr when the file does not exist, cannot be read, or is not an
+  // IVF file.
+  static std::unique_ptr<FileReaderInterface> Open(const std::string& file_name,
+                                                   bool error_tolerant = false);
+
+  FileReader() = delete;
+  FileReader(const FileReader&) = delete;
+  FileReader& operator=(const FileReader&) = delete;
+
+  // Closes |file_|.
+  ~FileReader() override;
+
+  // Reads a temporal unit from |file_| and writes the data to |tu_data|.
+  // Returns true when:
+  // - A temporal unit is read successfully, or
+  // - At end of file.
+  // When ReadTemporalUnit() is called at the end of the file, it will return
+  // true without writing any data to |tu_data|.
+  //
+  // The |timestamp| pointer is optional: callers not interested in timestamps
+  // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+  // the presentation timestamp from the IVF frame header.
+  /*LIBGAV1_MUST_USE_RESULT*/ bool ReadTemporalUnit(
+      std::vector<uint8_t>* tu_data, int64_t* timestamp) override;
+
+  /*LIBGAV1_MUST_USE_RESULT*/ bool IsEndOfFile() const override {
+    return feof(file_) != 0;
+  }
+
+  // The values returned by these accessors are strictly informative. No
+  // validation is performed when they are read from the IVF file header.
+  size_t width() const override { return width_; }
+  size_t height() const override { return height_; }
+  size_t frame_rate() const override { return frame_rate_; }
+  size_t time_scale() const override { return time_scale_; }
+
+ private:
+  FileReader(FILE* file, bool owns_file, bool error_tolerant)
+      : file_(file), owns_file_(owns_file), error_tolerant_(error_tolerant) {}
+
+  bool ReadIvfFileHeader();
+
+  FILE* file_ = nullptr;
+  size_t width_ = 0;
+  size_t height_ = 0;
+  size_t frame_rate_ = 0;
+  size_t time_scale_ = 0;
+  FileType type_ = kFileTypeUnknown;
+  // True if this object owns file_ and is responsible for closing it when
+  // done.
+  const bool owns_file_;
+  const bool error_tolerant_;
+
+  static bool registered_in_factory_;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_H_
diff --git a/libgav1/src/decoder_scratch_buffer.cc b/libgav1/examples/file_reader_constants.cc
similarity index 75%
copy from libgav1/src/decoder_scratch_buffer.cc
copy to libgav1/examples/file_reader_constants.cc
index bb9b5f2..8439071 100644
--- a/libgav1/src/decoder_scratch_buffer.cc
+++ b/libgav1/examples/file_reader_constants.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "src/decoder_scratch_buffer.h"
+#include "examples/file_reader_constants.h"
 
 namespace libgav1 {
 
-// static
-constexpr int DecoderScratchBuffer::kBlockDecodedStride;
-constexpr int DecoderScratchBuffer::kPixelSize;
+const char kIvfSignature[4] = {'D', 'K', 'I', 'F'};
+const char kAv1FourCcUpper[4] = {'A', 'V', '0', '1'};
+const char kAv1FourCcLower[4] = {'a', 'v', '0', '1'};
 
 }  // namespace libgav1
diff --git a/libgav1/examples/file_reader_constants.h b/libgav1/examples/file_reader_constants.h
new file mode 100644
index 0000000..00922b4
--- /dev/null
+++ b/libgav1/examples/file_reader_constants.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+
+namespace libgav1 {
+
+enum {
+  kIvfHeaderVersion = 0,
+  kIvfFrameHeaderSize = 12,
+  kIvfFileHeaderSize = 32,
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  kMaxTemporalUnitSize = 512 * 1024,
+#else
+  kMaxTemporalUnitSize = 256 * 1024 * 1024,
+#endif
+};
+
+extern const char kIvfSignature[4];
+extern const char kAv1FourCcUpper[4];
+extern const char kAv1FourCcLower[4];
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
diff --git a/libgav1/examples/file_reader_factory.cc b/libgav1/examples/file_reader_factory.cc
new file mode 100644
index 0000000..d5260eb
--- /dev/null
+++ b/libgav1/examples/file_reader_factory.cc
@@ -0,0 +1,51 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_factory.h"
+
+#include <new>
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+std::vector<FileReaderFactory::OpenFunction>* GetFileReaderOpenFunctions() {
+  static auto* open_functions =
+      new (std::nothrow) std::vector<FileReaderFactory::OpenFunction>();
+  return open_functions;
+}
+
+}  // namespace
+
+bool FileReaderFactory::RegisterReader(OpenFunction open_function) {
+  if (open_function == nullptr) return false;
+  auto* open_functions = GetFileReaderOpenFunctions();
+  const size_t num_readers = open_functions->size();
+  open_functions->push_back(open_function);
+  return open_functions->size() == num_readers + 1;
+}
+
+std::unique_ptr<FileReaderInterface> FileReaderFactory::OpenReader(
+    const std::string& file_name, const bool error_tolerant /*= false*/) {
+  for (auto* open_function : *GetFileReaderOpenFunctions()) {
+    auto reader = open_function(file_name, error_tolerant);
+    if (reader == nullptr) continue;
+    return reader;
+  }
+  LIBGAV1_EXAMPLES_LOG_ERROR("No file reader able to open input");
+  return nullptr;
+}
+
+}  // namespace libgav1
diff --git a/libgav1/examples/file_reader_factory.h b/libgav1/examples/file_reader_factory.h
new file mode 100644
index 0000000..0f53484
--- /dev/null
+++ b/libgav1/examples/file_reader_factory.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+
+#include <memory>
+#include <string>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+class FileReaderFactory {
+ public:
+  using OpenFunction = std::unique_ptr<FileReaderInterface> (*)(
+      const std::string& file_name, bool error_tolerant);
+
+  FileReaderFactory() = delete;
+  FileReaderFactory(const FileReaderFactory&) = delete;
+  FileReaderFactory& operator=(const FileReaderFactory&) = delete;
+  ~FileReaderFactory() = default;
+
+  // Registers the OpenFunction for a FileReaderInterface and returns true when
+  // registration succeeds.
+  static bool RegisterReader(OpenFunction open_function);
+
+  // Passes |file_name| to each OpenFunction until one succeeds. Returns nullptr
+  // when no reader is found for |file_name|. Otherwise a FileReaderInterface is
+  // returned. If |error_tolerant| is true and the reader supports it, some
+  // format and read errors may be ignored and partial data returned.
+  static std::unique_ptr<FileReaderInterface> OpenReader(
+      const std::string& file_name, bool error_tolerant = false);
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
diff --git a/libgav1/examples/file_reader_interface.h b/libgav1/examples/file_reader_interface.h
new file mode 100644
index 0000000..d8f7030
--- /dev/null
+++ b/libgav1/examples/file_reader_interface.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace libgav1 {
+
+class FileReaderInterface {
+ public:
+  FileReaderInterface() = default;
+  FileReaderInterface(const FileReaderInterface&) = delete;
+  FileReaderInterface& operator=(const FileReaderInterface&) = delete;
+
+  FileReaderInterface(FileReaderInterface&&) = default;
+  FileReaderInterface& operator=(FileReaderInterface&&) = default;
+
+  // Closes the file.
+  virtual ~FileReaderInterface() = default;
+
+  // Reads a temporal unit from the file and writes the data to |tu_data|.
+  // Returns true when:
+  // - A temporal unit is read successfully, or
+  // - At end of file.
+  // When ReadTemporalUnit() is called at the end of the file, it will return
+  // true without writing any data to |tu_data|.
+  //
+  // The |timestamp| pointer is optional: callers not interested in timestamps
+  // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+  // the presentation timestamp of the temporal unit.
+  /*LIBGAV1_MUST_USE_RESULT*/ virtual bool ReadTemporalUnit(
+      std::vector<uint8_t>* tu_data, int64_t* timestamp) = 0;
+
+  /*LIBGAV1_MUST_USE_RESULT*/ virtual bool IsEndOfFile() const = 0;
+
+  // The values returned by these accessors are strictly informative. No
+  // validation is performed when they are read from file.
+  virtual size_t width() const = 0;
+  virtual size_t height() const = 0;
+  virtual size_t frame_rate() const = 0;
+  virtual size_t time_scale() const = 0;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
diff --git a/libgav1/examples/file_writer.cc b/libgav1/examples/file_writer.cc
new file mode 100644
index 0000000..54afe14
--- /dev/null
+++ b/libgav1/examples/file_writer.cc
@@ -0,0 +1,183 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_writer.h"
+
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <new>
+#include <string>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+  _setmode(_fileno(stream), _O_BINARY);
+#endif
+  return stream;
+}
+
+std::string GetY4mColorSpaceString(
+    const FileWriter::Y4mParameters& y4m_parameters) {
+  std::string color_space_string;
+  switch (y4m_parameters.image_format) {
+    case kImageFormatMonochrome400:
+      color_space_string = "mono";
+      break;
+    case kImageFormatYuv420:
+      if (y4m_parameters.bitdepth == 8) {
+        if (y4m_parameters.chroma_sample_position ==
+            kChromaSamplePositionVertical) {
+          color_space_string = "420mpeg2";
+        } else if (y4m_parameters.chroma_sample_position ==
+                   kChromaSamplePositionColocated) {
+          color_space_string = "420";
+        } else {
+          color_space_string = "420jpeg";
+        }
+      } else {
+        color_space_string = "420";
+      }
+      break;
+    case kImageFormatYuv422:
+      color_space_string = "422";
+      break;
+    case kImageFormatYuv444:
+      color_space_string = "444";
+      break;
+  }
+
+  if (y4m_parameters.bitdepth > 8) {
+    const bool monochrome =
+        y4m_parameters.image_format == kImageFormatMonochrome400;
+    if (!monochrome) color_space_string += "p";
+    color_space_string += std::to_string(y4m_parameters.bitdepth);
+  }
+
+  return color_space_string;
+}
+
+}  // namespace
+
+FileWriter::~FileWriter() { fclose(file_); }
+
+std::unique_ptr<FileWriter> FileWriter::Open(
+    const std::string& file_name, FileType file_type,
+    const Y4mParameters* const y4m_parameters) {
+  if (file_name.empty() ||
+      (file_type == kFileTypeY4m && y4m_parameters == nullptr) ||
+      (file_type != kFileTypeRaw && file_type != kFileTypeY4m)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Invalid parameters");
+    return nullptr;
+  }
+
+  FILE* raw_file_ptr;
+
+  if (file_name == "-") {
+    raw_file_ptr = SetBinaryMode(stdout);
+  } else {
+    raw_file_ptr = fopen(file_name.c_str(), "wb");
+  }
+
+  if (raw_file_ptr == nullptr) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unable to open output file");
+    return nullptr;
+  }
+
+  std::unique_ptr<FileWriter> file(new (std::nothrow) FileWriter(raw_file_ptr));
+  if (file == nullptr) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+    fclose(raw_file_ptr);
+    return nullptr;
+  }
+
+  if (file_type == kFileTypeY4m && !file->WriteY4mFileHeader(*y4m_parameters)) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M file header");
+    return nullptr;
+  }
+
+  file->file_type_ = file_type;
+  return file;
+}
+
+bool FileWriter::WriteFrame(const DecoderBuffer& frame_buffer) {
+  if (file_type_ == kFileTypeY4m) {
+    const char kY4mFrameHeader[] = "FRAME\n";
+    if (fwrite(kY4mFrameHeader, 1, strlen(kY4mFrameHeader), file_) !=
+        strlen(kY4mFrameHeader)) {
+      LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M frame header");
+      return false;
+    }
+  }
+
+  const size_t pixel_size =
+      (frame_buffer.bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+  for (int plane_index = 0; plane_index < frame_buffer.NumPlanes();
+       ++plane_index) {
+    const int height = frame_buffer.displayed_height[plane_index];
+    const int width = frame_buffer.displayed_width[plane_index];
+    const int stride = frame_buffer.stride[plane_index];
+    const uint8_t* const plane_pointer = frame_buffer.plane[plane_index];
+    for (int row = 0; row < height; ++row) {
+      const uint8_t* const row_pointer = &plane_pointer[row * stride];
+      if (fwrite(row_pointer, pixel_size, width, file_) !=
+          static_cast<size_t>(width)) {
+        char error_string[256];
+        snprintf(error_string, sizeof(error_string),
+                 "File write failed: %s (errno=%d)", strerror(errno), errno);
+        LIBGAV1_EXAMPLES_LOG_ERROR(error_string);
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Writes Y4M file header to |file_| and returns true when successful.
+//
+// A Y4M file begins with a plaintext file signature of 'YUV4MPEG2 '.
+//
+// Following the signature is any number of optional parameters preceded by a
+// space. We always write:
+//
+// Width: 'W' followed by image width in pixels.
+// Height: 'H' followed by image height in pixels.
+// Frame Rate: 'F' followed frames/second in the form numerator:denominator.
+// Interlacing: 'I' followed by 'p' for progressive.
+// Color space: 'C' followed by a string representation of the color space.
+//
+// More info here: https://wiki.multimedia.cx/index.php/YUV4MPEG2
+bool FileWriter::WriteY4mFileHeader(const Y4mParameters& y4m_parameters) {
+  std::string y4m_header = "YUV4MPEG2";
+  y4m_header += " W" + std::to_string(y4m_parameters.width);
+  y4m_header += " H" + std::to_string(y4m_parameters.height);
+  y4m_header += " F" + std::to_string(y4m_parameters.frame_rate_numerator) +
+                ":" + std::to_string(y4m_parameters.frame_rate_denominator);
+  y4m_header += " Ip C" + GetY4mColorSpaceString(y4m_parameters);
+  y4m_header += "\n";
+  return fwrite(y4m_header.c_str(), 1, y4m_header.length(), file_) ==
+         y4m_header.length();
+}
+
+}  // namespace libgav1
diff --git a/libgav1/examples/file_writer.h b/libgav1/examples/file_writer.h
new file mode 100644
index 0000000..00f6cc3
--- /dev/null
+++ b/libgav1/examples/file_writer.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_WRITER_H_
+#define LIBGAV1_EXAMPLES_FILE_WRITER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "gav1/decoder_buffer.h"
+
+namespace libgav1 {
+
+// Frame based file writer class. Supports only Y4M (YUV4MPEG2) and RAW output.
+class FileWriter {
+ public:
+  enum FileType : uint8_t {
+    kFileTypeRaw,
+    kFileTypeY4m,
+  };
+
+  struct Y4mParameters {
+    Y4mParameters() = default;
+    Y4mParameters(size_t width, size_t height, size_t frame_rate_numerator,
+                  size_t frame_rate_denominator,
+                  ChromaSamplePosition chroma_sample_position,
+                  ImageFormat image_format, size_t bitdepth)
+        : width(width),
+          height(height),
+          frame_rate_numerator(frame_rate_numerator),
+          frame_rate_denominator(frame_rate_denominator),
+          chroma_sample_position(chroma_sample_position),
+          image_format(image_format),
+          bitdepth(bitdepth) {}
+
+    Y4mParameters(const Y4mParameters& rhs) = default;
+    Y4mParameters& operator=(const Y4mParameters& rhs) = default;
+    Y4mParameters(Y4mParameters&& rhs) = default;
+    Y4mParameters& operator=(Y4mParameters&& rhs) = default;
+
+    size_t width = 0;
+    size_t height = 0;
+    size_t frame_rate_numerator = 30;
+    size_t frame_rate_denominator = 1;
+    ChromaSamplePosition chroma_sample_position = kChromaSamplePositionUnknown;
+    ImageFormat image_format = kImageFormatYuv420;
+    size_t bitdepth = 8;
+  };
+
+  // Opens |file_name|. When |type| is kFileTypeY4m the Y4M file header is
+  // written out to |file_| before this method returns.
+  //
+  // Returns a FileWriter instance after the file is opened successfully for
+  // kFileTypeRaw files, and after the Y4M file header bytes are written for
+  // kFileTypeY4m files. Returns nullptr upon failure.
+  static std::unique_ptr<FileWriter> Open(const std::string& file_name,
+                                          FileType type,
+                                          const Y4mParameters* y4m_parameters);
+
+  FileWriter() = delete;
+  FileWriter(const FileWriter&) = delete;
+  FileWriter& operator=(const FileWriter&) = delete;
+
+  FileWriter(FileWriter&&) = default;
+  FileWriter& operator=(FileWriter&&) = default;
+
+  // Closes |file_|.
+  ~FileWriter();
+
+  // Writes the frame data in |frame_buffer| to |file_|. Returns true after
+  // successful write of |frame_buffer| data.
+  /*LIBGAV1_MUST_USE_RESULT*/ bool WriteFrame(
+      const DecoderBuffer& frame_buffer);
+
+ private:
+  explicit FileWriter(FILE* file) : file_(file) {}
+
+  bool WriteY4mFileHeader(const Y4mParameters& y4m_parameters);
+
+  FILE* file_ = nullptr;
+  FileType file_type_ = kFileTypeRaw;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_FILE_WRITER_H_
diff --git a/libgav1/examples/gav1_decode.cc b/libgav1/examples/gav1_decode.cc
new file mode 100644
index 0000000..e7d3246
--- /dev/null
+++ b/libgav1/examples/gav1_decode.cc
@@ -0,0 +1,453 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <memory>
+#include <new>
+#include <vector>
+
+#include "absl/strings/numbers.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/file_writer.h"
+#include "gav1/decoder.h"
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+#endif
+
+namespace {
+
+struct Options {
+  const char* input_file_name = nullptr;
+  const char* output_file_name = nullptr;
+  const char* frame_timing_file_name = nullptr;
+  libgav1::FileWriter::FileType output_file_type =
+      libgav1::FileWriter::kFileTypeRaw;
+  uint8_t post_filter_mask = 0x1f;
+  int threads = 1;
+  bool frame_parallel = false;
+  bool output_all_layers = false;
+  int operating_point = 0;
+  int limit = 0;
+  int skip = 0;
+  int verbose = 0;
+};
+
+struct Timing {
+  absl::Duration input;
+  absl::Duration dequeue;
+};
+
+struct FrameTiming {
+  absl::Time enqueue;
+  absl::Time dequeue;
+};
+
+void PrintHelp(FILE* const fout) {
+  fprintf(fout,
+          "Usage: gav1_decode [options] <input file>"
+          " [-o <output file>]\n");
+  fprintf(fout, "\n");
+  fprintf(fout, "Options:\n");
+  fprintf(fout, "  -h, --help This help message.\n");
+  fprintf(fout, "  --threads <positive integer> (Default 1).\n");
+  fprintf(fout, "  --frame_parallel.\n");
+  fprintf(fout,
+          "  --limit <integer> Stop decoding after N frames (0 = all).\n");
+  fprintf(fout, "  --skip <integer> Skip initial N frames (Default 0).\n");
+  fprintf(fout, "  --version.\n");
+  fprintf(fout, "  --y4m (Default false).\n");
+  fprintf(fout, "  --raw (Default true).\n");
+  fprintf(fout, "  -v logging verbosity, can be used multiple times.\n");
+  fprintf(fout, "  --all_layers.\n");
+  fprintf(fout,
+          "  --operating_point <integer between 0 and 31> (Default 0).\n");
+  fprintf(fout,
+          "  --frame_timing <file> Output per-frame timing to <file> in tsv"
+          " format.\n   Yields meaningful results only when frame parallel is"
+          " off.\n");
+  fprintf(fout, "\nAdvanced settings:\n");
+  fprintf(fout, "  --post_filter_mask <integer> (Default 0x1f).\n");
+  fprintf(fout,
+          "   Mask indicating which post filters should be applied to the"
+          " reconstructed\n   frame. This may be given as octal, decimal or"
+          " hexadecimal. From LSB:\n");
+  fprintf(fout, "     Bit 0: Loop filter (deblocking filter)\n");
+  fprintf(fout, "     Bit 1: Cdef\n");
+  fprintf(fout, "     Bit 2: SuperRes\n");
+  fprintf(fout, "     Bit 3: Loop Restoration\n");
+  fprintf(fout, "     Bit 4: Film Grain Synthesis\n");
+}
+
+void ParseOptions(int argc, char* argv[], Options* const options) {
+  for (int i = 1; i < argc; ++i) {
+    int32_t value;
+    if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
+      PrintHelp(stdout);
+      exit(EXIT_SUCCESS);
+    } else if (strcmp(argv[i], "-o") == 0) {
+      if (++i >= argc) {
+        fprintf(stderr, "Missing argument for '-o'\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->output_file_name = argv[i];
+    } else if (strcmp(argv[i], "--frame_timing") == 0) {
+      if (++i >= argc) {
+        fprintf(stderr, "Missing argument for '--frame_timing'\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->frame_timing_file_name = argv[i];
+    } else if (strcmp(argv[i], "--version") == 0) {
+      printf("gav1_decode, a libgav1 based AV1 decoder\n");
+      printf("libgav1 %s\n", libgav1::GetVersionString());
+      printf("max bitdepth: %d\n", libgav1::Decoder::GetMaxBitdepth());
+      printf("build configuration: %s\n", libgav1::GetBuildConfiguration());
+      exit(EXIT_SUCCESS);
+    } else if (strcmp(argv[i], "-v") == 0) {
+      ++options->verbose;
+    } else if (strcmp(argv[i], "--raw") == 0) {
+      options->output_file_type = libgav1::FileWriter::kFileTypeRaw;
+    } else if (strcmp(argv[i], "--y4m") == 0) {
+      options->output_file_type = libgav1::FileWriter::kFileTypeY4m;
+    } else if (strcmp(argv[i], "--threads") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value)) {
+        fprintf(stderr, "Missing/Invalid value for --threads.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->threads = value;
+    } else if (strcmp(argv[i], "--frame_parallel") == 0) {
+      options->frame_parallel = true;
+    } else if (strcmp(argv[i], "--all_layers") == 0) {
+      options->output_all_layers = true;
+    } else if (strcmp(argv[i], "--operating_point") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0 ||
+          value >= 32) {
+        fprintf(stderr, "Missing/Invalid value for --operating_point.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->operating_point = value;
+    } else if (strcmp(argv[i], "--limit") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+        fprintf(stderr, "Missing/Invalid value for --limit.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->limit = value;
+    } else if (strcmp(argv[i], "--skip") == 0) {
+      if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+        fprintf(stderr, "Missing/Invalid value for --skip.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->skip = value;
+    } else if (strcmp(argv[i], "--post_filter_mask") == 0) {
+      errno = 0;
+      char* endptr = nullptr;
+      value = (++i >= argc) ? -1
+                            // NOLINTNEXTLINE(runtime/deprecated_fn)
+                            : static_cast<int32_t>(strtol(argv[i], &endptr, 0));
+      // Only the last 5 bits of the mask can be set.
+      if ((value & ~31) != 0 || errno != 0 || endptr == argv[i]) {
+        fprintf(stderr, "Invalid value for --post_filter_mask.\n");
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+      options->post_filter_mask = value;
+    } else if (strlen(argv[i]) > 1 && argv[i][0] == '-') {
+      fprintf(stderr, "Unknown option '%s'!\n", argv[i]);
+      exit(EXIT_FAILURE);
+    } else {
+      if (options->input_file_name == nullptr) {
+        options->input_file_name = argv[i];
+      } else {
+        fprintf(stderr, "Found invalid parameter: \"%s\".\n", argv[i]);
+        PrintHelp(stderr);
+        exit(EXIT_FAILURE);
+      }
+    }
+  }
+
+  if (argc < 2 || options->input_file_name == nullptr) {
+    fprintf(stderr, "Input file is required!\n");
+    PrintHelp(stderr);
+    exit(EXIT_FAILURE);
+  }
+}
+
+using InputBuffer = std::vector<uint8_t>;
+
+class InputBuffers {
+ public:
+  ~InputBuffers() {
+    for (auto buffer : free_buffers_) {
+      delete buffer;
+    }
+  }
+  InputBuffer* GetFreeBuffer() {
+    if (free_buffers_.empty()) {
+      auto* const buffer = new (std::nothrow) InputBuffer();
+      if (buffer == nullptr) {
+        fprintf(stderr, "Failed to create input buffer.\n");
+        return nullptr;
+      }
+      free_buffers_.push_back(buffer);
+    }
+    InputBuffer* const buffer = free_buffers_.front();
+    free_buffers_.pop_front();
+    return buffer;
+  }
+
+  void ReleaseInputBuffer(InputBuffer* buffer) {
+    free_buffers_.push_back(buffer);
+  }
+
+ private:
+  std::deque<InputBuffer*> free_buffers_;
+};
+
+void ReleaseInputBuffer(void* callback_private_data,
+                        void* buffer_private_data) {
+  auto* const input_buffers = static_cast<InputBuffers*>(callback_private_data);
+  input_buffers->ReleaseInputBuffer(
+      static_cast<InputBuffer*>(buffer_private_data));
+}
+
+int CloseFile(FILE* stream) { return (stream == nullptr) ? 0 : fclose(stream); }
+
+}  // namespace
+
+int main(int argc, char* argv[]) {
+  Options options;
+  ParseOptions(argc, argv, &options);
+
+  auto file_reader =
+      libgav1::FileReaderFactory::OpenReader(options.input_file_name);
+  if (file_reader == nullptr) {
+    fprintf(stderr, "Cannot open input file!\n");
+    return EXIT_FAILURE;
+  }
+
+  std::unique_ptr<FILE, decltype(&CloseFile)> frame_timing_file(nullptr,
+                                                                &CloseFile);
+  if (options.frame_timing_file_name != nullptr) {
+    frame_timing_file.reset(fopen(options.frame_timing_file_name, "wb"));
+    if (frame_timing_file == nullptr) {
+      fprintf(stderr, "Cannot open frame timing file '%s'!\n",
+              options.frame_timing_file_name);
+      return EXIT_FAILURE;
+    }
+  }
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+  // Reference frames + 1 scratch frame (for either the current frame or the
+  // film grain frame).
+  constexpr int kNumBuffers = 8 + 1;
+  std::unique_ptr<Gav1DecodeCVPixelBufferPool> cv_pixel_buffers =
+      Gav1DecodeCVPixelBufferPool::Create(kNumBuffers);
+  if (cv_pixel_buffers == nullptr) {
+    fprintf(stderr, "Cannot create Gav1DecodeCVPixelBufferPool!\n");
+    return EXIT_FAILURE;
+  }
+#endif
+
+  InputBuffers input_buffers;
+  libgav1::Decoder decoder;
+  libgav1::DecoderSettings settings;
+  settings.post_filter_mask = options.post_filter_mask;
+  settings.threads = options.threads;
+  settings.frame_parallel = options.frame_parallel;
+  settings.output_all_layers = options.output_all_layers;
+  settings.operating_point = options.operating_point;
+  settings.blocking_dequeue = true;
+  settings.callback_private_data = &input_buffers;
+  settings.release_input_buffer = ReleaseInputBuffer;
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+  settings.on_frame_buffer_size_changed = Gav1DecodeOnCVPixelBufferSizeChanged;
+  settings.get_frame_buffer = Gav1DecodeGetCVPixelBuffer;
+  settings.release_frame_buffer = Gav1DecodeReleaseCVPixelBuffer;
+  settings.callback_private_data = cv_pixel_buffers.get();
+  settings.release_input_buffer = nullptr;
+  // TODO(vigneshv): Support frame parallel mode to be used with
+  // CVPixelBufferPool.
+  settings.frame_parallel = false;
+#endif
+  libgav1::StatusCode status = decoder.Init(&settings);
+  if (status != libgav1::kStatusOk) {
+    fprintf(stderr, "Error initializing decoder: %s\n",
+            libgav1::GetErrorString(status));
+    return EXIT_FAILURE;
+  }
+
+  fprintf(stderr, "decoding '%s'\n", options.input_file_name);
+  if (options.verbose > 0 && options.skip > 0) {
+    fprintf(stderr, "skipping %d frame(s).\n", options.skip);
+  }
+
+  int input_frames = 0;
+  int decoded_frames = 0;
+  Timing timing = {};
+  std::vector<FrameTiming> frame_timing;
+  const bool record_frame_timing = frame_timing_file != nullptr;
+  std::unique_ptr<libgav1::FileWriter> file_writer;
+  InputBuffer* input_buffer = nullptr;
+  bool limit_reached = false;
+  bool dequeue_finished = false;
+  const absl::Time decode_loop_start = absl::Now();
+  do {
+    if (input_buffer == nullptr && !file_reader->IsEndOfFile() &&
+        !limit_reached) {
+      input_buffer = input_buffers.GetFreeBuffer();
+      if (input_buffer == nullptr) return EXIT_FAILURE;
+      const absl::Time read_start = absl::Now();
+      if (!file_reader->ReadTemporalUnit(input_buffer,
+                                         /*timestamp=*/nullptr)) {
+        fprintf(stderr, "Error reading input file.\n");
+        return EXIT_FAILURE;
+      }
+      timing.input += absl::Now() - read_start;
+    }
+
+    if (++input_frames <= options.skip) {
+      input_buffers.ReleaseInputBuffer(input_buffer);
+      input_buffer = nullptr;
+      continue;
+    }
+
+    if (input_buffer != nullptr) {
+      if (input_buffer->empty()) {
+        input_buffers.ReleaseInputBuffer(input_buffer);
+        input_buffer = nullptr;
+        continue;
+      }
+
+      const absl::Time enqueue_start = absl::Now();
+      status = decoder.EnqueueFrame(input_buffer->data(), input_buffer->size(),
+                                    static_cast<int64_t>(frame_timing.size()),
+                                    /*buffer_private_data=*/input_buffer);
+      if (status == libgav1::kStatusOk) {
+        if (options.verbose > 1) {
+          fprintf(stderr, "enqueue frame (length %zu)\n", input_buffer->size());
+        }
+        if (record_frame_timing) {
+          FrameTiming enqueue_time = {enqueue_start, absl::UnixEpoch()};
+          frame_timing.emplace_back(enqueue_time);
+        }
+
+        input_buffer = nullptr;
+        // Continue to enqueue frames until we get a kStatusTryAgain status.
+        continue;
+      }
+      if (status != libgav1::kStatusTryAgain) {
+        fprintf(stderr, "Unable to enqueue frame: %s\n",
+                libgav1::GetErrorString(status));
+        return EXIT_FAILURE;
+      }
+    }
+
+    const libgav1::DecoderBuffer* buffer;
+    status = decoder.DequeueFrame(&buffer);
+    if (status != libgav1::kStatusOk &&
+        status != libgav1::kStatusNothingToDequeue) {
+      fprintf(stderr, "Unable to dequeue frame: %s\n",
+              libgav1::GetErrorString(status));
+      return EXIT_FAILURE;
+    }
+    if (status == libgav1::kStatusNothingToDequeue) {
+      dequeue_finished = true;
+      continue;
+    }
+    dequeue_finished = false;
+    if (buffer == nullptr) continue;
+    ++decoded_frames;
+    if (options.verbose > 1) {
+      fprintf(stderr, "buffer dequeued\n");
+    }
+
+    if (record_frame_timing) {
+      frame_timing[static_cast<int>(buffer->user_private_data)].dequeue =
+          absl::Now();
+    }
+
+    if (options.output_file_name != nullptr && file_writer == nullptr) {
+      libgav1::FileWriter::Y4mParameters y4m_parameters;
+      y4m_parameters.width = buffer->displayed_width[0];
+      y4m_parameters.height = buffer->displayed_height[0];
+      y4m_parameters.frame_rate_numerator = file_reader->frame_rate();
+      y4m_parameters.frame_rate_denominator = file_reader->time_scale();
+      y4m_parameters.chroma_sample_position = buffer->chroma_sample_position;
+      y4m_parameters.image_format = buffer->image_format;
+      y4m_parameters.bitdepth = static_cast<size_t>(buffer->bitdepth);
+      file_writer = libgav1::FileWriter::Open(
+          options.output_file_name, options.output_file_type, &y4m_parameters);
+      if (file_writer == nullptr) {
+        fprintf(stderr, "Cannot open output file!\n");
+        return EXIT_FAILURE;
+      }
+    }
+
+    if (!limit_reached && file_writer != nullptr &&
+        !file_writer->WriteFrame(*buffer)) {
+      fprintf(stderr, "Error writing output file.\n");
+      return EXIT_FAILURE;
+    }
+    if (options.limit > 0 && options.limit == decoded_frames) {
+      limit_reached = true;
+      if (input_buffer != nullptr) {
+        input_buffers.ReleaseInputBuffer(input_buffer);
+      }
+      input_buffer = nullptr;
+    }
+  } while (input_buffer != nullptr ||
+           (!file_reader->IsEndOfFile() && !limit_reached) ||
+           !dequeue_finished);
+  timing.dequeue = absl::Now() - decode_loop_start - timing.input;
+
+  if (record_frame_timing) {
+    // Note timing for frame parallel will be skewed by the time spent queueing
+    // additional frames and in the output queue waiting for previous frames,
+    // the values reported won't be that meaningful.
+    fprintf(frame_timing_file.get(), "frame number\tdecode time us\n");
+    for (size_t i = 0; i < frame_timing.size(); ++i) {
+      const int decode_time_us = static_cast<int>(absl::ToInt64Microseconds(
+          frame_timing[i].dequeue - frame_timing[i].enqueue));
+      fprintf(frame_timing_file.get(), "%zu\t%d\n", i, decode_time_us);
+    }
+  }
+
+  if (options.verbose > 0) {
+    fprintf(stderr, "time to read input: %d us\n",
+            static_cast<int>(absl::ToInt64Microseconds(timing.input)));
+    const int decode_time_us =
+        static_cast<int>(absl::ToInt64Microseconds(timing.dequeue));
+    const double decode_fps =
+        (decode_time_us == 0) ? 0.0 : 1.0e6 * decoded_frames / decode_time_us;
+    fprintf(stderr, "time to decode input: %d us (%d frames, %.2f fps)\n",
+            decode_time_us, decoded_frames, decode_fps);
+  }
+
+  return EXIT_SUCCESS;
+}
diff --git a/libgav1/examples/gav1_decode_cv_pixel_buffer_pool.cc b/libgav1/examples/gav1_decode_cv_pixel_buffer_pool.cc
new file mode 100644
index 0000000..6aa4e61
--- /dev/null
+++ b/libgav1/examples/gav1_decode_cv_pixel_buffer_pool.cc
@@ -0,0 +1,278 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+namespace {
+
+struct CFTypeDeleter {
+  void operator()(CFTypeRef cf) const { CFRelease(cf); }
+};
+
+using UniqueCFNumberRef =
+    std::unique_ptr<std::remove_pointer<CFNumberRef>::type, CFTypeDeleter>;
+
+using UniqueCFDictionaryRef =
+    std::unique_ptr<std::remove_pointer<CFDictionaryRef>::type, CFTypeDeleter>;
+
+}  // namespace
+
+extern "C" {
+
+libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border, int stride_alignment) {
+  auto* buffer_pool =
+      static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+  return buffer_pool->OnCVPixelBufferSizeChanged(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment);
+}
+
+libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border, int stride_alignment,
+    libgav1::FrameBuffer* frame_buffer) {
+  auto* buffer_pool =
+      static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+  return buffer_pool->GetCVPixelBuffer(
+      bitdepth, image_format, width, height, left_border, right_border,
+      top_border, bottom_border, stride_alignment, frame_buffer);
+}
+
+void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+                                    void* buffer_private_data) {
+  auto* buffer_pool =
+      static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+  buffer_pool->ReleaseCVPixelBuffer(buffer_private_data);
+}
+
+}  // extern "C"
+
+// static
+std::unique_ptr<Gav1DecodeCVPixelBufferPool>
+Gav1DecodeCVPixelBufferPool::Create(size_t num_buffers) {
+  std::unique_ptr<Gav1DecodeCVPixelBufferPool> buffer_pool(
+      new (std::nothrow) Gav1DecodeCVPixelBufferPool(num_buffers));
+  return buffer_pool;
+}
+
+Gav1DecodeCVPixelBufferPool::Gav1DecodeCVPixelBufferPool(size_t num_buffers)
+    : num_buffers_(static_cast<int>(num_buffers)) {}
+
+Gav1DecodeCVPixelBufferPool::~Gav1DecodeCVPixelBufferPool() {
+  CVPixelBufferPoolRelease(pool_);
+}
+
+libgav1::StatusCode Gav1DecodeCVPixelBufferPool::OnCVPixelBufferSizeChanged(
+    int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+    int left_border, int right_border, int top_border, int bottom_border,
+    int stride_alignment) {
+  if (bitdepth != 8 || (image_format != libgav1::kImageFormatYuv420 &&
+                        image_format != libgav1::kImageFormatMonochrome400)) {
+    fprintf(stderr,
+            "Only bitdepth 8, 4:2:0 videos are supported: bitdepth %d, "
+            "image_format: %d.\n",
+            bitdepth, image_format);
+    return libgav1::kStatusUnimplemented;
+  }
+
+  // stride_alignment must be a power of 2.
+  assert((stride_alignment & (stride_alignment - 1)) == 0);
+
+  // The possible keys for CVPixelBufferPool are:
+  //   kCVPixelBufferPoolMinimumBufferCountKey
+  //   kCVPixelBufferPoolMaximumBufferAgeKey
+  //   kCVPixelBufferPoolAllocationThresholdKey
+  const void* pool_keys[] = {kCVPixelBufferPoolMinimumBufferCountKey};
+  const int min_buffer_count = 10;
+  UniqueCFNumberRef cf_min_buffer_count(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &min_buffer_count));
+  if (cf_min_buffer_count == nullptr) {
+    fprintf(stderr, "CFNumberCreate failed.\n");
+    return libgav1::kStatusUnknownError;
+  }
+  const void* pool_values[] = {cf_min_buffer_count.get()};
+  UniqueCFDictionaryRef pool_attributes(CFDictionaryCreate(
+      nullptr, pool_keys, pool_values, 1, &kCFTypeDictionaryKeyCallBacks,
+      &kCFTypeDictionaryValueCallBacks));
+  if (pool_attributes == nullptr) {
+    fprintf(stderr, "CFDictionaryCreate failed.\n");
+    return libgav1::kStatusUnknownError;
+  }
+
+  // The pixelBufferAttributes argument to CVPixelBufferPoolCreate() cannot be
+  // null and must contain the pixel format, width, and height, otherwise
+  // CVPixelBufferPoolCreate() fails with kCVReturnInvalidPixelBufferAttributes
+  // (-6682).
+
+  // I420: kCVPixelFormatType_420YpCbCr8PlanarFullRange (full range).
+  const int pixel_format = (image_format == libgav1::kImageFormatYuv420)
+                               ? kCVPixelFormatType_420YpCbCr8PlanarFullRange
+                               : kCVPixelFormatType_OneComponent8;
+  UniqueCFNumberRef cf_pixel_format(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &pixel_format));
+  UniqueCFNumberRef cf_width(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &width));
+  UniqueCFNumberRef cf_height(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &height));
+  UniqueCFNumberRef cf_left_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &left_border));
+  UniqueCFNumberRef cf_right_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &right_border));
+  UniqueCFNumberRef cf_top_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &top_border));
+  UniqueCFNumberRef cf_bottom_border(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &bottom_border));
+  UniqueCFNumberRef cf_stride_alignment(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &stride_alignment));
+
+  const void* buffer_keys[] = {
+      kCVPixelBufferPixelFormatTypeKey,
+      kCVPixelBufferWidthKey,
+      kCVPixelBufferHeightKey,
+      kCVPixelBufferExtendedPixelsLeftKey,
+      kCVPixelBufferExtendedPixelsRightKey,
+      kCVPixelBufferExtendedPixelsTopKey,
+      kCVPixelBufferExtendedPixelsBottomKey,
+      kCVPixelBufferBytesPerRowAlignmentKey,
+  };
+  const void* buffer_values[] = {
+      cf_pixel_format.get(),  cf_width.get(),
+      cf_height.get(),        cf_left_border.get(),
+      cf_right_border.get(),  cf_top_border.get(),
+      cf_bottom_border.get(), cf_stride_alignment.get(),
+  };
+  UniqueCFDictionaryRef buffer_attributes(CFDictionaryCreate(
+      kCFAllocatorDefault, buffer_keys, buffer_values, 8,
+      &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks));
+  if (buffer_attributes == nullptr) {
+    fprintf(stderr, "CFDictionaryCreate of buffer_attributes failed.\n");
+    return libgav1::kStatusUnknownError;
+  }
+  CVPixelBufferPoolRef cv_pool;
+  CVReturn ret = CVPixelBufferPoolCreate(
+      /*allocator=*/nullptr, pool_attributes.get(), buffer_attributes.get(),
+      &cv_pool);
+  if (ret != kCVReturnSuccess) {
+    fprintf(stderr, "CVPixelBufferPoolCreate failed: %d.\n",
+            static_cast<int>(ret));
+    return libgav1::kStatusOutOfMemory;
+  }
+  CVPixelBufferPoolRelease(pool_);
+  pool_ = cv_pool;
+  return libgav1::kStatusOk;
+}
+
+libgav1::StatusCode Gav1DecodeCVPixelBufferPool::GetCVPixelBuffer(
+    int bitdepth, libgav1::ImageFormat image_format, int /*width*/,
+    int /*height*/, int /*left_border*/, int /*right_border*/,
+    int /*top_border*/, int /*bottom_border*/, int /*stride_alignment*/,
+    libgav1::FrameBuffer* frame_buffer) {
+  static_cast<void>(bitdepth);
+  assert(bitdepth == 8 && (image_format == libgav1::kImageFormatYuv420 ||
+                           image_format == libgav1::kImageFormatMonochrome400));
+  const bool is_monochrome =
+      (image_format == libgav1::kImageFormatMonochrome400);
+
+  // The dictionary must have kCVPixelBufferPoolAllocationThresholdKey,
+  // otherwise CVPixelBufferPoolCreatePixelBufferWithAuxAttributes() fails with
+  // kCVReturnWouldExceedAllocationThreshold (-6689).
+  UniqueCFNumberRef cf_num_buffers(
+      CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &num_buffers_));
+
+  const void* buffer_keys[] = {
+      kCVPixelBufferPoolAllocationThresholdKey,
+  };
+  const void* buffer_values[] = {
+      cf_num_buffers.get(),
+  };
+  UniqueCFDictionaryRef aux_attributes(CFDictionaryCreate(
+      kCFAllocatorDefault, buffer_keys, buffer_values, 1,
+      &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks));
+  if (aux_attributes == nullptr) {
+    fprintf(stderr, "CFDictionaryCreate of aux_attributes failed.\n");
+    return libgav1::kStatusUnknownError;
+  }
+
+  CVPixelBufferRef pixel_buffer;
+  CVReturn ret = CVPixelBufferPoolCreatePixelBufferWithAuxAttributes(
+      /*allocator=*/nullptr, pool_, aux_attributes.get(), &pixel_buffer);
+  if (ret != kCVReturnSuccess) {
+    fprintf(stderr,
+            "CVPixelBufferPoolCreatePixelBufferWithAuxAttributes failed: %d.\n",
+            static_cast<int>(ret));
+    return libgav1::kStatusOutOfMemory;
+  }
+
+  ret = CVPixelBufferLockBaseAddress(pixel_buffer, /*lockFlags=*/0);
+  if (ret != kCVReturnSuccess) {
+    fprintf(stderr, "CVPixelBufferLockBaseAddress failed: %d.\n",
+            static_cast<int>(ret));
+    CFRelease(pixel_buffer);
+    return libgav1::kStatusUnknownError;
+  }
+
+  // If the pixel format type is kCVPixelFormatType_OneComponent8, the pixel
+  // buffer is nonplanar (CVPixelBufferIsPlanar returns false and
+  // CVPixelBufferGetPlaneCount returns 0), but
+  // CVPixelBufferGetBytesPerRowOfPlane and CVPixelBufferGetBaseAddressOfPlane
+  // still work for plane index 0, even though the documentation says they
+  // return NULL for nonplanar pixel buffers.
+  frame_buffer->stride[0] =
+      static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 0));
+  frame_buffer->plane[0] = static_cast<uint8_t*>(
+      CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 0));
+  if (is_monochrome) {
+    frame_buffer->stride[1] = 0;
+    frame_buffer->stride[2] = 0;
+    frame_buffer->plane[1] = nullptr;
+    frame_buffer->plane[2] = nullptr;
+  } else {
+    frame_buffer->stride[1] =
+        static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 1));
+    frame_buffer->stride[2] =
+        static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 2));
+    frame_buffer->plane[1] = static_cast<uint8_t*>(
+        CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 1));
+    frame_buffer->plane[2] = static_cast<uint8_t*>(
+        CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 2));
+  }
+  frame_buffer->private_data = pixel_buffer;
+
+  return libgav1::kStatusOk;
+}
+
+void Gav1DecodeCVPixelBufferPool::ReleaseCVPixelBuffer(
+    void* buffer_private_data) {
+  auto const pixel_buffer = static_cast<CVPixelBufferRef>(buffer_private_data);
+  CVReturn ret =
+      CVPixelBufferUnlockBaseAddress(pixel_buffer, /*unlockFlags=*/0);
+  if (ret != kCVReturnSuccess) {
+    fprintf(stderr, "%s:%d: CVPixelBufferUnlockBaseAddress failed: %d.\n",
+            __FILE__, __LINE__, static_cast<int>(ret));
+    abort();
+  }
+  CFRelease(pixel_buffer);
+}
diff --git a/libgav1/examples/gav1_decode_cv_pixel_buffer_pool.h b/libgav1/examples/gav1_decode_cv_pixel_buffer_pool.h
new file mode 100644
index 0000000..7aee324
--- /dev/null
+++ b/libgav1/examples/gav1_decode_cv_pixel_buffer_pool.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+#define LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+
+#include <CoreVideo/CoreVideo.h>
+
+#include <cstddef>
+#include <memory>
+
+#include "gav1/frame_buffer.h"
+
+extern "C" libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border, int stride_alignment);
+
+extern "C" libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+    void* callback_private_data, int bitdepth,
+    libgav1::ImageFormat image_format, int width, int height, int left_border,
+    int right_border, int top_border, int bottom_border, int stride_alignment,
+    libgav1::FrameBuffer* frame_buffer);
+
+extern "C" void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+                                               void* buffer_private_data);
+
+class Gav1DecodeCVPixelBufferPool {
+ public:
+  static std::unique_ptr<Gav1DecodeCVPixelBufferPool> Create(
+      size_t num_buffers);
+
+  // Not copyable or movable.
+  Gav1DecodeCVPixelBufferPool(const Gav1DecodeCVPixelBufferPool&) = delete;
+  Gav1DecodeCVPixelBufferPool& operator=(const Gav1DecodeCVPixelBufferPool&) =
+      delete;
+
+  ~Gav1DecodeCVPixelBufferPool();
+
+  libgav1::StatusCode OnCVPixelBufferSizeChanged(
+      int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+      int left_border, int right_border, int top_border, int bottom_border,
+      int stride_alignment);
+
+  libgav1::StatusCode GetCVPixelBuffer(int bitdepth,
+                                       libgav1::ImageFormat image_format,
+                                       int width, int height, int left_border,
+                                       int right_border, int top_border,
+                                       int bottom_border, int stride_alignment,
+                                       libgav1::FrameBuffer* frame_buffer);
+  void ReleaseCVPixelBuffer(void* buffer_private_data);
+
+ private:
+  Gav1DecodeCVPixelBufferPool(size_t num_buffers);
+
+  CVPixelBufferPoolRef pool_ = nullptr;
+  const int num_buffers_;
+};
+
+#endif  // LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
diff --git a/libgav1/examples/ivf_parser.cc b/libgav1/examples/ivf_parser.cc
new file mode 100644
index 0000000..f8adb14
--- /dev/null
+++ b/libgav1/examples/ivf_parser.cc
@@ -0,0 +1,96 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/ivf_parser.h"
+
+#include <cstdio>
+#include <cstring>
+
+#include "examples/file_reader_constants.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+size_t ReadLittleEndian16(const uint8_t* const buffer) {
+  size_t value = buffer[1] << 8;
+  value |= buffer[0];
+  return value;
+}
+
+size_t ReadLittleEndian32(const uint8_t* const buffer) {
+  size_t value = buffer[3] << 24;
+  value |= buffer[2] << 16;
+  value |= buffer[1] << 8;
+  value |= buffer[0];
+  return value;
+}
+
+}  // namespace
+
+bool ParseIvfFileHeader(const uint8_t* const header_buffer,
+                        IvfFileHeader* const ivf_file_header) {
+  if (header_buffer == nullptr || ivf_file_header == nullptr) return false;
+
+  if (memcmp(kIvfSignature, header_buffer, 4) != 0) {
+    return false;
+  }
+
+  // Verify header version and length.
+  const size_t ivf_header_version = ReadLittleEndian16(&header_buffer[4]);
+  if (ivf_header_version != kIvfHeaderVersion) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unexpected IVF version");
+  }
+
+  const size_t ivf_header_size = ReadLittleEndian16(&header_buffer[6]);
+  if (ivf_header_size != kIvfFileHeaderSize) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Invalid IVF file header size");
+    return false;
+  }
+
+  if (memcmp(kAv1FourCcLower, &header_buffer[8], 4) != 0 &&
+      memcmp(kAv1FourCcUpper, &header_buffer[8], 4) != 0) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported codec 4CC");
+    return false;
+  }
+
+  ivf_file_header->width = ReadLittleEndian16(&header_buffer[12]);
+  ivf_file_header->height = ReadLittleEndian16(&header_buffer[14]);
+  ivf_file_header->frame_rate_numerator =
+      ReadLittleEndian32(&header_buffer[16]);
+  ivf_file_header->frame_rate_denominator =
+      ReadLittleEndian32(&header_buffer[20]);
+
+  return true;
+}
+
+bool ParseIvfFrameHeader(const uint8_t* const header_buffer,
+                         IvfFrameHeader* const ivf_frame_header) {
+  if (header_buffer == nullptr || ivf_frame_header == nullptr) return false;
+
+  ivf_frame_header->frame_size = ReadLittleEndian32(header_buffer);
+  if (ivf_frame_header->frame_size > kMaxTemporalUnitSize) {
+    LIBGAV1_EXAMPLES_LOG_ERROR("Temporal Unit size exceeds maximum");
+    return false;
+  }
+
+  ivf_frame_header->timestamp = ReadLittleEndian32(&header_buffer[4]);
+  const uint64_t timestamp_hi =
+      static_cast<uint64_t>(ReadLittleEndian32(&header_buffer[8])) << 32;
+  ivf_frame_header->timestamp |= timestamp_hi;
+
+  return true;
+}
+
+}  // namespace libgav1
diff --git a/libgav1/examples/ivf_parser.h b/libgav1/examples/ivf_parser.h
new file mode 100644
index 0000000..b6bbc59
--- /dev/null
+++ b/libgav1/examples/ivf_parser.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_IVF_PARSER_H_
+#define LIBGAV1_EXAMPLES_IVF_PARSER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+namespace libgav1 {
+
+struct IvfFileHeader {
+  IvfFileHeader() = default;
+  IvfFileHeader(const IvfFileHeader& rhs) = default;
+  IvfFileHeader& operator=(const IvfFileHeader& rhs) = default;
+  IvfFileHeader(IvfFileHeader&& rhs) = default;
+  IvfFileHeader& operator=(IvfFileHeader&& rhs) = default;
+
+  size_t width = 0;
+  size_t height = 0;
+  size_t frame_rate_numerator = 0;
+  size_t frame_rate_denominator = 0;
+};
+
+struct IvfFrameHeader {
+  IvfFrameHeader() = default;
+  IvfFrameHeader(const IvfFrameHeader& rhs) = default;
+  IvfFrameHeader& operator=(const IvfFrameHeader& rhs) = default;
+  IvfFrameHeader(IvfFrameHeader&& rhs) = default;
+  IvfFrameHeader& operator=(IvfFrameHeader&& rhs) = default;
+
+  size_t frame_size = 0;
+  int64_t timestamp = 0;
+};
+
+bool ParseIvfFileHeader(const uint8_t* header_buffer,
+                        IvfFileHeader* ivf_file_header);
+
+bool ParseIvfFrameHeader(const uint8_t* header_buffer,
+                         IvfFrameHeader* ivf_frame_header);
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_IVF_PARSER_H_
diff --git a/libgav1/examples/libgav1_examples.cmake b/libgav1/examples/libgav1_examples.cmake
new file mode 100644
index 0000000..1f949f3
--- /dev/null
+++ b/libgav1/examples/libgav1_examples.cmake
@@ -0,0 +1,63 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_)
+  return()
+endif() # LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_
+set(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_ 1)
+
+set(libgav1_file_reader_sources "${libgav1_examples}/file_reader.cc"
+                                "${libgav1_examples}/file_reader.h"
+                                "${libgav1_examples}/file_reader_constants.cc"
+                                "${libgav1_examples}/file_reader_constants.h"
+                                "${libgav1_examples}/file_reader_factory.cc"
+                                "${libgav1_examples}/file_reader_factory.h"
+                                "${libgav1_examples}/file_reader_interface.h"
+                                "${libgav1_examples}/ivf_parser.cc"
+                                "${libgav1_examples}/ivf_parser.h"
+                                "${libgav1_examples}/logging.h")
+
+set(libgav1_file_writer_sources "${libgav1_examples}/file_writer.cc"
+                                "${libgav1_examples}/file_writer.h"
+                                "${libgav1_examples}/logging.h")
+
+set(libgav1_decode_sources "${libgav1_examples}/gav1_decode.cc")
+
+macro(libgav1_add_examples_targets)
+  libgav1_add_library(NAME libgav1_file_reader TYPE OBJECT SOURCES
+                      ${libgav1_file_reader_sources} DEFINES ${libgav1_defines}
+                      INCLUDES ${libgav1_include_paths})
+
+  libgav1_add_library(NAME libgav1_file_writer TYPE OBJECT SOURCES
+                      ${libgav1_file_writer_sources} DEFINES ${libgav1_defines}
+                      INCLUDES ${libgav1_include_paths})
+
+  libgav1_add_executable(NAME
+                         gav1_decode
+                         SOURCES
+                         ${libgav1_decode_sources}
+                         DEFINES
+                         ${libgav1_defines}
+                         INCLUDES
+                         ${libgav1_include_paths}
+                         ${libgav1_gtest_include_paths}
+                         OBJLIB_DEPS
+                         libgav1_file_reader
+                         libgav1_file_writer
+                         LIB_DEPS
+                         absl::strings
+                         absl::str_format_internal
+                         absl::time
+                         ${libgav1_dependency})
+endmacro()
diff --git a/libgav1/examples/logging.h b/libgav1/examples/logging.h
new file mode 100644
index 0000000..c0bcad7
--- /dev/null
+++ b/libgav1/examples/logging.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_LOGGING_H_
+#define LIBGAV1_EXAMPLES_LOGGING_H_
+
+#include <cstddef>
+#include <cstdio>
+
+namespace libgav1 {
+namespace examples {
+
+#if !defined(LIBGAV1_EXAMPLES_ENABLE_LOGGING)
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
+constexpr const char* Basename(const char* file_name, size_t offset) {
+  return (offset == 0 || file_name[offset - 1] == '/' ||
+          file_name[offset - 1] == '\\')
+             ? file_name + offset
+             : Basename(file_name, offset - 1);
+}
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string)                              \
+  do {                                                                        \
+    constexpr const char* libgav1_examples_basename =                         \
+        ::libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1);        \
+    fprintf(stderr, "%s:%d (%s): %s.\n", libgav1_examples_basename, __LINE__, \
+            __func__, error_string);                                          \
+  } while (false)
+
+#else  // !LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \
+  do {                                           \
+  } while (false)
+
+#endif  // LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+}  // namespace examples
+}  // namespace libgav1
+
+#endif  // LIBGAV1_EXAMPLES_LOGGING_H_
diff --git a/libgav1/src/buffer_pool.cc b/libgav1/src/buffer_pool.cc
index 63312ef..c1a5606 100644
--- a/libgav1/src/buffer_pool.cc
+++ b/libgav1/src/buffer_pool.cc
@@ -18,6 +18,7 @@
 #include <cstring>
 
 #include "src/utils/common.h"
+#include "src/utils/constants.h"
 #include "src/utils/logging.h"
 
 namespace libgav1 {
@@ -36,19 +37,28 @@
 
 }  // namespace
 
-RefCountedBuffer::RefCountedBuffer() {
-  memset(&raw_frame_buffer_, 0, sizeof(raw_frame_buffer_));
-}
+RefCountedBuffer::RefCountedBuffer() = default;
 
 RefCountedBuffer::~RefCountedBuffer() = default;
 
 bool RefCountedBuffer::Realloc(int bitdepth, bool is_monochrome, int width,
                                int height, int subsampling_x, int subsampling_y,
-                               int border, int byte_alignment) {
-  return yuv_buffer_.Realloc(bitdepth, is_monochrome, width, height,
-                             subsampling_x, subsampling_y, border,
-                             byte_alignment, pool_->get_frame_buffer_,
-                             pool_->callback_private_data_, &raw_frame_buffer_);
+                               int left_border, int right_border,
+                               int top_border, int bottom_border) {
+  // The YuvBuffer::Realloc() could call the get frame buffer callback which
+  // will need to be thread safe. So we ensure that we only call Realloc() once
+  // at any given time.
+  std::lock_guard<std::mutex> lock(pool_->mutex_);
+  assert(!buffer_private_data_valid_);
+  if (!yuv_buffer_.Realloc(
+          bitdepth, is_monochrome, width, height, subsampling_x, subsampling_y,
+          left_border, right_border, top_border, bottom_border,
+          pool_->get_frame_buffer_, pool_->callback_private_data_,
+          &buffer_private_data_)) {
+    return false;
+  }
+  buffer_private_data_valid_ = true;
+  return true;
 }
 
 bool RefCountedBuffer::SetFrameDimensions(const ObuFrameHeader& frame_header) {
@@ -59,13 +69,13 @@
   render_height_ = frame_header.render_height;
   rows4x4_ = frame_header.rows4x4;
   columns4x4_ = frame_header.columns4x4;
-  const int rows4x4_half = DivideBy2(rows4x4_);
-  const int columns4x4_half = DivideBy2(columns4x4_);
-  if (!motion_field_reference_frame_.Reset(rows4x4_half, columns4x4_half,
-                                           /*zero_initialize=*/false) ||
-      !motion_field_mv_.Reset(rows4x4_half, columns4x4_half,
-                              /*zero_initialize=*/false)) {
-    return false;
+  if (frame_header.refresh_frame_flags != 0 &&
+      !IsIntraFrame(frame_header.frame_type)) {
+    const int rows4x4_half = DivideBy2(rows4x4_);
+    const int columns4x4_half = DivideBy2(columns4x4_);
+    if (!reference_info_.Reset(rows4x4_half, columns4x4_half)) {
+      return false;
+    }
   }
   return segmentation_map_.Allocate(rows4x4_, columns4x4_);
 }
@@ -103,55 +113,105 @@
   ptr->pool_->ReturnUnusedBuffer(ptr);
 }
 
-// static
-constexpr int BufferPool::kNumBuffers;
-
-BufferPool::BufferPool(const DecoderSettings& settings) {
-  if (settings.get != nullptr && settings.release != nullptr) {
-    get_frame_buffer_ = settings.get;
-    release_frame_buffer_ = settings.release;
-    callback_private_data_ = settings.callback_private_data;
+BufferPool::BufferPool(
+    FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+    GetFrameBufferCallback get_frame_buffer,
+    ReleaseFrameBufferCallback release_frame_buffer,
+    void* callback_private_data) {
+  if (get_frame_buffer != nullptr) {
+    // on_frame_buffer_size_changed may be null.
+    assert(release_frame_buffer != nullptr);
+    on_frame_buffer_size_changed_ = on_frame_buffer_size_changed;
+    get_frame_buffer_ = get_frame_buffer;
+    release_frame_buffer_ = release_frame_buffer;
+    callback_private_data_ = callback_private_data;
   } else {
-    internal_frame_buffers_ = InternalFrameBufferList::Create(kNumBuffers);
-    // GetInternalFrameBuffer checks whether its private_data argument is null,
-    // so we don't need to check whether internal_frame_buffers_ is null here.
+    on_frame_buffer_size_changed_ = OnInternalFrameBufferSizeChanged;
     get_frame_buffer_ = GetInternalFrameBuffer;
     release_frame_buffer_ = ReleaseInternalFrameBuffer;
-    callback_private_data_ = internal_frame_buffers_.get();
-  }
-  for (RefCountedBuffer& buffer : buffers_) {
-    buffer.SetBufferPool(this);
+    callback_private_data_ = &internal_frame_buffers_;
   }
 }
 
 BufferPool::~BufferPool() {
-  for (const RefCountedBuffer& buffer : buffers_) {
-    if (buffer.in_use_) {
-      assert(0 && "RefCountedBuffer still in use at destruction time.");
+  for (const auto* buffer : buffers_) {
+    if (buffer->in_use_) {
+      assert(false && "RefCountedBuffer still in use at destruction time.");
       LIBGAV1_DLOG(ERROR, "RefCountedBuffer still in use at destruction time.");
     }
+    delete buffer;
   }
 }
 
+bool BufferPool::OnFrameBufferSizeChanged(int bitdepth,
+                                          Libgav1ImageFormat image_format,
+                                          int width, int height,
+                                          int left_border, int right_border,
+                                          int top_border, int bottom_border) {
+  if (on_frame_buffer_size_changed_ == nullptr) return true;
+  return on_frame_buffer_size_changed_(callback_private_data_, bitdepth,
+                                       image_format, width, height, left_border,
+                                       right_border, top_border, bottom_border,
+                                       /*stride_alignment=*/16) == kStatusOk;
+}
+
 RefCountedBufferPtr BufferPool::GetFreeBuffer() {
-  for (RefCountedBuffer& buffer : buffers_) {
-    if (!buffer.in_use_) {
-      buffer.in_use_ = true;
-      return RefCountedBufferPtr(&buffer, RefCountedBuffer::ReturnToBufferPool);
+  // In frame parallel mode, the GetFreeBuffer() calls from ObuParser all happen
+  // from the same thread serially, but the GetFreeBuffer() call in
+  // DecoderImpl::ApplyFilmGrain can happen from multiple threads at the same
+  // time. So this function has to be thread safe.
+  // TODO(b/142583029): Investigate if the GetFreeBuffer() call in
+  // DecoderImpl::ApplyFilmGrain() call can be serialized so that this function
+  // need not be thread safe.
+  std::unique_lock<std::mutex> lock(mutex_);
+  for (auto buffer : buffers_) {
+    if (!buffer->in_use_) {
+      buffer->in_use_ = true;
+      buffer->progress_row_ = -1;
+      buffer->frame_state_ = kFrameStateUnknown;
+      lock.unlock();
+      return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
     }
   }
+  lock.unlock();
+  auto* const buffer = new (std::nothrow) RefCountedBuffer();
+  if (buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate a new reference counted buffer.");
+    return RefCountedBufferPtr();
+  }
+  buffer->SetBufferPool(this);
+  buffer->in_use_ = true;
+  buffer->progress_row_ = -1;
+  buffer->frame_state_ = kFrameStateUnknown;
+  lock.lock();
+  const bool ok = buffers_.push_back(buffer);
+  lock.unlock();
+  if (!ok) {
+    LIBGAV1_DLOG(
+        ERROR,
+        "Failed to push the new reference counted buffer into the vector.");
+    delete buffer;
+    return RefCountedBufferPtr();
+  }
+  return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
+}
 
-  // We should never run out of free buffers. If we reach here, there is a
-  // reference leak.
-  return RefCountedBufferPtr();
+void BufferPool::Abort() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  for (auto buffer : buffers_) {
+    if (buffer->in_use_) {
+      buffer->Abort();
+    }
+  }
 }
 
 void BufferPool::ReturnUnusedBuffer(RefCountedBuffer* buffer) {
+  std::lock_guard<std::mutex> lock(mutex_);
   assert(buffer->in_use_);
   buffer->in_use_ = false;
-  if (buffer->raw_frame_buffer_.data[0] != nullptr) {
-    release_frame_buffer_(callback_private_data_, &buffer->raw_frame_buffer_);
-    memset(&buffer->raw_frame_buffer_, 0, sizeof(buffer->raw_frame_buffer_));
+  if (buffer->buffer_private_data_valid_) {
+    release_frame_buffer_(callback_private_data_, buffer->buffer_private_data_);
+    buffer->buffer_private_data_valid_ = false;
   }
 }
 
diff --git a/libgav1/src/buffer_pool.h b/libgav1/src/buffer_pool.h
index 4a34e23..f35a633 100644
--- a/libgav1/src/buffer_pool.h
+++ b/libgav1/src/buffer_pool.h
@@ -18,27 +18,38 @@
 #define LIBGAV1_SRC_BUFFER_POOL_H_
 
 #include <array>
+#include <cassert>
+#include <climits>
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
 #include <cstdint>
-#include <memory>
+#include <cstring>
+#include <mutex>  // NOLINT (unapproved c++11 header)
 
-#include "src/decoder_buffer.h"
-#include "src/decoder_settings.h"
 #include "src/dsp/common.h"
-#include "src/frame_buffer.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/frame_buffer.h"
 #include "src/internal_frame_buffer_list.h"
-#include "src/obu_parser.h"
 #include "src/symbol_decoder_context.h"
-#include "src/utils/array_2d.h"
+#include "src/utils/compiler_attributes.h"
 #include "src/utils/constants.h"
+#include "src/utils/reference_info.h"
 #include "src/utils/segmentation.h"
 #include "src/utils/segmentation_map.h"
 #include "src/utils/types.h"
+#include "src/utils/vector.h"
 #include "src/yuv_buffer.h"
 
 namespace libgav1 {
 
 class BufferPool;
 
+enum FrameState : uint8_t {
+  kFrameStateUnknown,
+  kFrameStateStarted,
+  kFrameStateParsed,
+  kFrameStateDecoded
+};
+
 // A reference-counted frame buffer. Clients should access it via
 // RefCountedBufferPtr, which manages reference counting transparently.
 class RefCountedBuffer {
@@ -48,34 +59,39 @@
   RefCountedBuffer& operator=(const RefCountedBuffer&) = delete;
 
   // Allocates the YUV buffer. Returns true on success. Returns false on
-  // failure.
+  // failure. This function ensures the thread safety of the |get_frame_buffer_|
+  // call (i.e.) only one |get_frame_buffer_| call will happen at a given time.
+  // TODO(b/142583029): In frame parallel mode, we can require the callbacks to
+  // be thread safe so that we can remove the thread safety of this function and
+  // applications can have fine grained locks.
   //
   // * |width| and |height| are the image dimensions in pixels.
   // * |subsampling_x| and |subsampling_y| (either 0 or 1) specify the
   //   subsampling of the width and height of the chroma planes, respectively.
-  // * |border| is the size of the borders (on all four sides) in pixels.
-  // * |byte_alignment| specifies the additional alignment requirement of the
-  //   data buffers of the Y, U, and V planes. If |byte_alignment| is 0, there
-  //   is no additional alignment requirement. Otherwise, |byte_alignment|
-  //   must be a power of 2 and greater than or equal to 16.
-  //   NOTE: The strides are a multiple of 16. Therefore only the first row in
-  //   each plane is aligned to |byte_alignment|. Subsequent rows are only
-  //   16-byte aligned.
+  // * |left_border|, |right_border|, |top_border|, and |bottom_border| are
+  //   the sizes (in pixels) of the borders on the left, right, top, and
+  //   bottom sides, respectively.
+  //
+  // NOTE: The strides are a multiple of 16. Since the first row in each plane
+  // is 16-byte aligned, subsequent rows are also 16-byte aligned.
   bool Realloc(int bitdepth, bool is_monochrome, int width, int height,
-               int subsampling_x, int subsampling_y, int border,
-               int byte_alignment);
+               int subsampling_x, int subsampling_y, int left_border,
+               int right_border, int top_border, int bottom_border);
 
   YuvBuffer* buffer() { return &yuv_buffer_; }
 
   // Returns the buffer private data set by the get frame buffer callback when
   // it allocated the YUV buffer.
-  void* buffer_private_data() const { return raw_frame_buffer_.private_data; }
+  void* buffer_private_data() const {
+    assert(buffer_private_data_valid_);
+    return buffer_private_data_;
+  }
 
   // NOTE: In the current frame, this is the frame_type syntax element in the
   // frame header. In a reference frame, this implements the RefFrameType array
   // in the spec.
   FrameType frame_type() const { return frame_type_; }
-  void set_frame_type(enum FrameType frame_type) { frame_type_ = frame_type; }
+  void set_frame_type(FrameType frame_type) { frame_type_ = frame_type; }
 
   // The sample position for subsampled streams. This is the
   // chroma_sample_position syntax element in the sequence header.
@@ -85,8 +101,7 @@
   ChromaSamplePosition chroma_sample_position() const {
     return chroma_sample_position_;
   }
-  void set_chroma_sample_position(
-      enum ChromaSamplePosition chroma_sample_position) {
+  void set_chroma_sample_position(ChromaSamplePosition chroma_sample_position) {
     chroma_sample_position_ = chroma_sample_position;
   }
 
@@ -94,19 +109,11 @@
   bool showable_frame() const { return showable_frame_; }
   void set_showable_frame(bool value) { showable_frame_ = value; }
 
-  uint8_t order_hint(ReferenceFrameType reference_frame) const {
-    return order_hint_[reference_frame];
-  }
-  void set_order_hint(ReferenceFrameType reference_frame, uint8_t order_hint) {
-    order_hint_[reference_frame] = order_hint;
-  }
-  void ClearOrderHints() { order_hint_.fill(0); }
-
   // Sets upscaled_width_, frame_width_, frame_height_, render_width_,
   // render_height_, rows4x4_ and columns4x4_ from the corresponding fields
-  // in frame_header. Allocates motion_field_reference_frame_,
-  // motion_field_mv_, and segmentation_map_. Returns true on success, false
-  // on failure.
+  // in frame_header. Allocates reference_info_.motion_field_reference_frame,
+  // reference_info_.motion_field_mv_, and segmentation_map_. Returns true on
+  // success, false on failure.
   bool SetFrameDimensions(const ObuFrameHeader& frame_header);
 
   int32_t upscaled_width() const { return upscaled_width_; }
@@ -119,17 +126,10 @@
   int32_t rows4x4() const { return rows4x4_; }
   int32_t columns4x4() const { return columns4x4_; }
 
-  // Entry at |row|, |column| corresponds to
-  // MfRefFrames[row * 2 + 1][column * 2 + 1] in the spec.
-  ReferenceFrameType* motion_field_reference_frame(int row, int column) {
-    return &motion_field_reference_frame_[row][column];
-  }
-
-  // Entry at |row|, |column| corresponds to
-  // MfMvs[row * 2 + 1][column * 2 + 1] in the spec.
-  MotionVector* motion_field_mv(int row, int column) {
-    return &motion_field_mv_[row][column];
-  }
+  int spatial_id() const { return spatial_id_; }
+  void set_spatial_id(int value) { spatial_id_ = value; }
+  int temporal_id() const { return temporal_id_; }
+  void set_temporal_id(int value) { temporal_id_ = value; }
 
   SegmentationMap* segmentation_map() { return &segmentation_map_; }
   const SegmentationMap* segmentation_map() const { return &segmentation_map_; }
@@ -180,6 +180,99 @@
     film_grain_params_ = params;
   }
 
+  const ReferenceInfo* reference_info() const { return &reference_info_; }
+  ReferenceInfo* reference_info() { return &reference_info_; }
+
+  // This will wake up the WaitUntil*() functions and make them return false.
+  void Abort() {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      abort_ = true;
+    }
+    parsed_condvar_.notify_all();
+    decoded_condvar_.notify_all();
+    progress_row_condvar_.notify_all();
+  }
+
+  void SetFrameState(FrameState frame_state) {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      frame_state_ = frame_state;
+    }
+    if (frame_state == kFrameStateParsed) {
+      parsed_condvar_.notify_all();
+    } else if (frame_state == kFrameStateDecoded) {
+      decoded_condvar_.notify_all();
+      progress_row_condvar_.notify_all();
+    }
+  }
+
+  // Sets the progress of this frame to |progress_row| and notifies any threads
+  // that may be waiting on rows <= |progress_row|.
+  void SetProgress(int progress_row) {
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (progress_row_ >= progress_row) return;
+      progress_row_ = progress_row;
+    }
+    progress_row_condvar_.notify_all();
+  }
+
+  void MarkFrameAsStarted() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (frame_state_ != kFrameStateUnknown) return;
+    frame_state_ = kFrameStateStarted;
+  }
+
+  // All the WaitUntil* functions will return true if the desired wait state was
+  // reached successfully. If the return value is false, then the caller must
+  // assume that the wait was not successful and try to stop whatever they are
+  // doing as early as possible.
+
+  // Waits until the frame has been parsed.
+  bool WaitUntilParsed() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (frame_state_ < kFrameStateParsed && !abort_) {
+      parsed_condvar_.wait(lock);
+    }
+    return !abort_;
+  }
+
+  // Waits until the |progress_row| has been decoded (as indicated either by
+  // |progress_row_| or |frame_state_|). |progress_row_cache| must not be
+  // nullptr and will be populated with the value of |progress_row_| after the
+  // wait.
+  //
+  // Typical usage of |progress_row_cache| is as follows:
+  //  * Initialize |*progress_row_cache| to INT_MIN.
+  //  * Call WaitUntil only if |*progress_row_cache| < |progress_row|.
+  bool WaitUntil(int progress_row, int* progress_row_cache) {
+    // If |progress_row| is negative, it means that the wait is on the top
+    // border to be available. The top border will be available when row 0 has
+    // been decoded. So we can simply wait on row 0 instead.
+    progress_row = std::max(progress_row, 0);
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (progress_row_ < progress_row && frame_state_ != kFrameStateDecoded &&
+           !abort_) {
+      progress_row_condvar_.wait(lock);
+    }
+    // Once |frame_state_| reaches kFrameStateDecoded, |progress_row_| may no
+    // longer be updated. So we set |*progress_row_cache| to INT_MAX in that
+    // case.
+    *progress_row_cache =
+        (frame_state_ != kFrameStateDecoded) ? progress_row_ : INT_MAX;
+    return !abort_;
+  }
+
+  // Waits until the entire frame has been decoded.
+  bool WaitUntilDecoded() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (frame_state_ != kFrameStateDecoded && !abort_) {
+      decoded_condvar_.wait(lock);
+    }
+    return !abort_;
+  }
+
  private:
   friend class BufferPool;
 
@@ -190,17 +283,26 @@
   static void ReturnToBufferPool(RefCountedBuffer* ptr);
 
   BufferPool* pool_ = nullptr;
-  FrameBuffer raw_frame_buffer_;
+  bool buffer_private_data_valid_ = false;
+  void* buffer_private_data_ = nullptr;
   YuvBuffer yuv_buffer_;
   bool in_use_ = false;  // Only used by BufferPool.
 
-  enum FrameType frame_type_ = kFrameKey;
-  enum ChromaSamplePosition chroma_sample_position_ =
-      kChromaSamplePositionUnknown;
-  bool showable_frame_ = false;
+  std::mutex mutex_;
+  FrameState frame_state_ = kFrameStateUnknown LIBGAV1_GUARDED_BY(mutex_);
+  int progress_row_ = -1 LIBGAV1_GUARDED_BY(mutex_);
+  // Signaled when progress_row_ is updated or when frame_state_ is set to
+  // kFrameStateDecoded.
+  std::condition_variable progress_row_condvar_;
+  // Signaled when the frame state is set to kFrameStateParsed.
+  std::condition_variable parsed_condvar_;
+  // Signaled when the frame state is set to kFrameStateDecoded.
+  std::condition_variable decoded_condvar_;
+  bool abort_ = false LIBGAV1_GUARDED_BY(mutex_);
 
-  // Note: order_hint_[0] (for kReferenceFrameIntra) is not used.
-  std::array<uint8_t, kNumReferenceFrameTypes> order_hint_ = {};
+  FrameType frame_type_ = kFrameKey;
+  ChromaSamplePosition chroma_sample_position_ = kChromaSamplePositionUnknown;
+  bool showable_frame_ = false;
 
   int32_t upscaled_width_ = 0;
   int32_t frame_width_ = 0;
@@ -209,13 +311,9 @@
   int32_t render_height_ = 0;
   int32_t columns4x4_ = 0;
   int32_t rows4x4_ = 0;
+  int spatial_id_ = 0;
+  int temporal_id_ = 0;
 
-  // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
-  // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec.
-  Array2D<ReferenceFrameType> motion_field_reference_frame_;
-  // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
-  // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec.
-  Array2D<MotionVector> motion_field_mv_;
   // segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array.
   SegmentationMap segmentation_map_;
 
@@ -233,6 +331,7 @@
   // on feature_enabled only, we also save their values as an optimization.
   Segmentation segmentation_ = {};
   FilmGrainParams film_grain_params_ = {};
+  ReferenceInfo reference_info_;
 };
 
 // RefCountedBufferPtr contains a reference to a RefCountedBuffer.
@@ -247,7 +346,10 @@
 // BufferPool maintains a pool of RefCountedBuffers.
 class BufferPool {
  public:
-  explicit BufferPool(const DecoderSettings& settings);
+  BufferPool(FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+             GetFrameBufferCallback get_frame_buffer,
+             ReleaseFrameBufferCallback release_frame_buffer,
+             void* callback_private_data);
 
   // Not copyable or movable.
   BufferPool(const BufferPool&) = delete;
@@ -255,26 +357,37 @@
 
   ~BufferPool();
 
-  // Finds a free buffer in the buffer pool and returns a reference to the
-  // free buffer. If there is no free buffer, returns a null pointer.
+  LIBGAV1_MUST_USE_RESULT bool OnFrameBufferSizeChanged(
+      int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+      int left_border, int right_border, int top_border, int bottom_border);
+
+  // Finds a free buffer in the buffer pool and returns a reference to the free
+  // buffer. If there is no free buffer, returns a null pointer. This function
+  // is thread safe.
   RefCountedBufferPtr GetFreeBuffer();
 
+  // Aborts all the buffers that are in use.
+  void Abort();
+
  private:
   friend class RefCountedBuffer;
 
-  // Reference frames + 1 scratch frame (for either the current frame or the
-  // film grain frame).
-  static constexpr int kNumBuffers = kNumReferenceFrameTypes + 1;
-
   // Returns an unused buffer to the buffer pool. Called by RefCountedBuffer
-  // only.
+  // only. This function is thread safe.
   void ReturnUnusedBuffer(RefCountedBuffer* buffer);
 
-  RefCountedBuffer buffers_[kNumBuffers];
+  // Used to make the following functions thread safe: GetFreeBuffer(),
+  // ReturnUnusedBuffer(), RefCountedBuffer::Realloc().
+  std::mutex mutex_;
 
-  std::unique_ptr<InternalFrameBufferList> internal_frame_buffers_;
+  // Storing a RefCountedBuffer object in a Vector is complicated because of the
+  // copy/move semantics. So the simplest way around that is to store a list of
+  // pointers in the vector.
+  Vector<RefCountedBuffer*> buffers_ LIBGAV1_GUARDED_BY(mutex_);
+  InternalFrameBufferList internal_frame_buffers_;
 
   // Frame buffer callbacks.
+  FrameBufferSizeChangedCallback on_frame_buffer_size_changed_;
   GetFrameBufferCallback get_frame_buffer_;
   ReleaseFrameBufferCallback release_frame_buffer_;
   // Private data associated with the frame buffer callbacks.
diff --git a/libgav1/src/decoder.cc b/libgav1/src/decoder.cc
index 9a38dd1..b9e43e0 100644
--- a/libgav1/src/decoder.cc
+++ b/libgav1/src/decoder.cc
@@ -12,10 +12,73 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "src/decoder.h"
+#include "src/gav1/decoder.h"
+
+#include <memory>
+#include <new>
 
 #include "src/decoder_impl.h"
 
+extern "C" {
+
+Libgav1StatusCode Libgav1DecoderCreate(const Libgav1DecoderSettings* settings,
+                                       Libgav1Decoder** decoder_out) {
+  std::unique_ptr<libgav1::Decoder> cxx_decoder(new (std::nothrow)
+                                                    libgav1::Decoder());
+  if (cxx_decoder == nullptr) return kLibgav1StatusOutOfMemory;
+
+  libgav1::DecoderSettings cxx_settings;
+  cxx_settings.threads = settings->threads;
+  cxx_settings.frame_parallel = settings->frame_parallel != 0;
+  cxx_settings.blocking_dequeue = settings->blocking_dequeue != 0;
+  cxx_settings.on_frame_buffer_size_changed =
+      settings->on_frame_buffer_size_changed;
+  cxx_settings.get_frame_buffer = settings->get_frame_buffer;
+  cxx_settings.release_frame_buffer = settings->release_frame_buffer;
+  cxx_settings.release_input_buffer = settings->release_input_buffer;
+  cxx_settings.callback_private_data = settings->callback_private_data;
+  cxx_settings.output_all_layers = settings->output_all_layers != 0;
+  cxx_settings.operating_point = settings->operating_point;
+  cxx_settings.post_filter_mask = settings->post_filter_mask;
+
+  const Libgav1StatusCode status = cxx_decoder->Init(&cxx_settings);
+  if (status == kLibgav1StatusOk) {
+    *decoder_out = reinterpret_cast<Libgav1Decoder*>(cxx_decoder.release());
+  }
+  return status;
+}
+
+void Libgav1DecoderDestroy(Libgav1Decoder* decoder) {
+  auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+  delete cxx_decoder;
+}
+
+Libgav1StatusCode Libgav1DecoderEnqueueFrame(Libgav1Decoder* decoder,
+                                             const uint8_t* data, size_t size,
+                                             int64_t user_private_data,
+                                             void* buffer_private_data) {
+  auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+  return cxx_decoder->EnqueueFrame(data, size, user_private_data,
+                                   buffer_private_data);
+}
+
+Libgav1StatusCode Libgav1DecoderDequeueFrame(
+    Libgav1Decoder* decoder, const Libgav1DecoderBuffer** out_ptr) {
+  auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+  return cxx_decoder->DequeueFrame(out_ptr);
+}
+
+Libgav1StatusCode Libgav1DecoderSignalEOS(Libgav1Decoder* decoder) {
+  auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+  return cxx_decoder->SignalEOS();
+}
+
+int Libgav1DecoderGetMaxBitdepth() {
+  return libgav1::Decoder::GetMaxBitdepth();
+}
+
+}  // extern "C"
+
 namespace libgav1 {
 
 Decoder::Decoder() = default;
@@ -23,27 +86,31 @@
 Decoder::~Decoder() = default;
 
 StatusCode Decoder::Init(const DecoderSettings* const settings) {
-  if (initialized_) return kLibgav1StatusAlready;
+  if (impl_ != nullptr) return kStatusAlready;
   if (settings != nullptr) settings_ = *settings;
-  const StatusCode status = DecoderImpl::Create(&settings_, &impl_);
-  if (status != kLibgav1StatusOk) return status;
-  initialized_ = true;
-  return kLibgav1StatusOk;
+  return DecoderImpl::Create(&settings_, &impl_);
 }
 
 StatusCode Decoder::EnqueueFrame(const uint8_t* data, const size_t size,
-                                 int64_t user_private_data) {
-  if (!initialized_) return kLibgav1StatusNotInitialized;
-  return impl_->EnqueueFrame(data, size, user_private_data);
+                                 int64_t user_private_data,
+                                 void* buffer_private_data) {
+  if (impl_ == nullptr) return kStatusNotInitialized;
+  return impl_->EnqueueFrame(data, size, user_private_data,
+                             buffer_private_data);
 }
 
 StatusCode Decoder::DequeueFrame(const DecoderBuffer** out_ptr) {
-  if (!initialized_) return kLibgav1StatusNotInitialized;
+  if (impl_ == nullptr) return kStatusNotInitialized;
   return impl_->DequeueFrame(out_ptr);
 }
 
-int Decoder::GetMaxAllowedFrames() const {
-  return settings_.frame_parallel ? settings_.threads : 1;
+StatusCode Decoder::SignalEOS() {
+  if (impl_ == nullptr) return kStatusNotInitialized;
+  // In non-frame-parallel mode, we have to release all the references. This
+  // simply means replacing the |impl_| with a new instance so that all the
+  // existing references are released and the state is cleared.
+  impl_ = nullptr;
+  return DecoderImpl::Create(&settings_, &impl_);
 }
 
 // static.
diff --git a/libgav1/src/decoder.h b/libgav1/src/decoder.h
deleted file mode 100644
index 1e3ac1a..0000000
--- a/libgav1/src/decoder.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright 2019 The libgav1 Authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBGAV1_SRC_DECODER_H_
-#define LIBGAV1_SRC_DECODER_H_
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-
-#include "src/decoder_buffer.h"
-#include "src/decoder_settings.h"
-#include "src/status_code.h"
-#include "src/symbol_visibility.h"
-
-namespace libgav1 {
-
-// Forward declaration.
-class DecoderImpl;
-
-class LIBGAV1_PUBLIC Decoder {
- public:
-  Decoder();
-  ~Decoder();
-
-  // Init must be called exactly once per instance. Subsequent calls will do
-  // nothing. If |settings| is nullptr, the decoder will be initialized with
-  // default settings. Returns kLibgav1StatusOk on success, an error status
-  // otherwise.
-  StatusCode Init(const DecoderSettings* settings);
-
-  // Enqueues a compressed frame to be decoded. Applications can continue
-  // enqueue'ing up to |GetMaxAllowedFrames()|. The decoder can be thought of as
-  // a queue of size |GetMaxAllowedFrames()|. Returns kLibgav1StatusOk on
-  // success and an error status otherwise. Returning an error status here isn't
-  // a fatal error and the decoder can continue decoding further frames. To
-  // signal EOF, call this function with |data| as nullptr and |size| as 0. That
-  // will release all the frames held by the decoder.
-  //
-  // |user_private_data| may be used to asssociate application specific private
-  // data with the compressed frame. It will be copied to the user_private_data
-  // field of the DecoderBuffer returned by the corresponding |DequeueFrame()|
-  // call.
-  //
-  // NOTE: |EnqueueFrame()| does not copy the data. Therefore, after a
-  // successful |EnqueueFrame()| call, the caller must keep the |data| buffer
-  // alive until the corresponding |DequeueFrame()| call returns.
-  StatusCode EnqueueFrame(const uint8_t* data, size_t size,
-                          int64_t user_private_data);
-
-  // Dequeues a decompressed frame. If there are enqueued compressed frames,
-  // decodes one and sets |*out_ptr| to the last displayable frame in the
-  // compressed frame. If there are no displayable frames available, sets
-  // |*out_ptr| to nullptr. Returns an error status if there is an error.
-  StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
-
-  // Returns the maximum number of frames allowed to be enqueued at a time. The
-  // decoder will reject frames beyond this count. If |settings_.frame_parallel|
-  // is false, then this function will always return 1.
-  int GetMaxAllowedFrames() const;
-
-  // Returns the maximum bitdepth that is supported by this decoder.
-  static int GetMaxBitdepth();
-
- private:
-  bool initialized_ = false;
-  DecoderSettings settings_;
-  std::unique_ptr<DecoderImpl> impl_;
-};
-
-}  // namespace libgav1
-
-#endif  // LIBGAV1_SRC_DECODER_H_
diff --git a/libgav1/src/decoder_buffer.h b/libgav1/src/decoder_buffer.h
deleted file mode 100644
index ecd133d..0000000
--- a/libgav1/src/decoder_buffer.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright 2019 The libgav1 Authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBGAV1_SRC_DECODER_BUFFER_H_
-#define LIBGAV1_SRC_DECODER_BUFFER_H_
-
-#include <cstdint>
-
-#include "src/frame_buffer.h"
-#include "src/symbol_visibility.h"
-
-// All the declarations in this file are part of the public ABI.
-
-namespace libgav1 {
-
-enum ChromaSamplePosition : uint8_t {
-  kChromaSamplePositionUnknown,
-  kChromaSamplePositionVertical,
-  kChromaSamplePositionColocated,
-  kChromaSamplePositionReserved
-};
-
-enum ImageFormat : uint8_t {
-  kImageFormatYuv420,
-  kImageFormatYuv422,
-  kImageFormatYuv444,
-  kImageFormatMonochrome400
-};
-
-struct LIBGAV1_PUBLIC DecoderBuffer {
-  int NumPlanes() const {
-    return (image_format == kImageFormatMonochrome400) ? 1 : 3;
-  }
-
-  ChromaSamplePosition chroma_sample_position;
-  ImageFormat image_format;
-
-  // TODO(wtc): Add the following members:
-  // - color range
-  //   * studio range: Y [16..235], UV [16..240]
-  //   * full range: (YUV/RGB [0..255]
-  // - CICP Color Primaries (cp)
-  // - CICP Transfer Characteristics (tc)
-  // - CICP Matrix Coefficients (mc)
-
-  // Image storage dimensions.
-  // NOTE: These fields are named w and h in vpx_image_t and aom_image_t.
-  // uint32_t width;  // Stored image width.
-  // uint32_t height;  // Stored image height.
-  int bitdepth;  // Stored image bitdepth.
-
-  // Image display dimensions.
-  // NOTES:
-  // 1. These fields are named d_w and d_h in vpx_image_t and aom_image_t.
-  // 2. libvpx and libaom clients use d_w and d_h much more often than w and h.
-  // 3. These fields can just be stored for the Y plane and the clients can
-  //    calculate the values for the U and V planes if the image format or
-  //    subsampling is exposed.
-  int displayed_width[3];   // Displayed image width.
-  int displayed_height[3];  // Displayed image height.
-
-  int stride[3];
-  uint8_t* plane[3];
-
-  // The |user_private_data| argument passed to Decoder::EnqueueFrame().
-  int64_t user_private_data;
-  // The |private_data| field of FrameBuffer. Set by the get frame buffer
-  // callback when it allocates a frame buffer.
-  void* buffer_private_data;
-};
-
-}  // namespace libgav1
-
-#endif  // LIBGAV1_SRC_DECODER_BUFFER_H_
diff --git a/libgav1/src/decoder_impl.cc b/libgav1/src/decoder_impl.cc
index 5c61993..e40c692 100644
--- a/libgav1/src/decoder_impl.cc
+++ b/libgav1/src/decoder_impl.cc
@@ -24,13 +24,18 @@
 #include "src/dsp/common.h"
 #include "src/dsp/constants.h"
 #include "src/dsp/dsp.h"
-#include "src/loop_filter_mask.h"
+#include "src/film_grain.h"
+#include "src/frame_buffer_utils.h"
+#include "src/frame_scratch_buffer.h"
 #include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
 #include "src/post_filter.h"
 #include "src/prediction_mask.h"
 #include "src/quantizer.h"
+#include "src/threading_strategy.h"
 #include "src/utils/blocking_counter.h"
 #include "src/utils/common.h"
+#include "src/utils/constants.h"
 #include "src/utils/logging.h"
 #include "src/utils/parameter_tree.h"
 #include "src/utils/raw_bit_reader.h"
@@ -44,275 +49,1066 @@
 constexpr int kMaxBlockWidth4x4 = 32;
 constexpr int kMaxBlockHeight4x4 = 32;
 
-// A cleanup helper class that releases the frame buffer reference held in
-// |frame| in the destructor.
-class RefCountedBufferPtrCleanup {
+// Computes the bottom border size in pixels. If CDEF, loop restoration or
+// SuperRes is enabled, adds extra border pixels to facilitate those steps to
+// happen nearly in-place (a few extra rows instead of an entire frame buffer).
+// The logic in this function should match the corresponding logic for
+// |vertical_shift| in the PostFilter constructor.
+int GetBottomBorderPixels(const bool do_cdef, const bool do_restoration,
+                          const bool do_superres, const int subsampling_y) {
+  int extra_border = 0;
+  if (do_cdef) {
+    extra_border += kCdefBorder;
+  } else if (do_restoration) {
+    // If CDEF is enabled, loop restoration is safe without extra border.
+    extra_border += kRestorationVerticalBorder;
+  }
+  if (do_superres) extra_border += kSuperResVerticalBorder;
+  // Double the number of extra bottom border pixels if the bottom border will
+  // be subsampled.
+  extra_border <<= subsampling_y;
+  return Align(kBorderPixels + extra_border, 2);  // Must be a multiple of 2.
+}
+
+// Sets |frame_scratch_buffer->tile_decoding_failed| to true (while holding on
+// to |frame_scratch_buffer->superblock_row_mutex|) and notifies the first
+// |count| condition variables in
+// |frame_scratch_buffer->superblock_row_progress_condvar|.
+void SetFailureAndNotifyAll(FrameScratchBuffer* const frame_scratch_buffer,
+                            int count) {
+  {
+    std::lock_guard<std::mutex> lock(
+        frame_scratch_buffer->superblock_row_mutex);
+    frame_scratch_buffer->tile_decoding_failed = true;
+  }
+  std::condition_variable* const condvars =
+      frame_scratch_buffer->superblock_row_progress_condvar.get();
+  for (int i = 0; i < count; ++i) {
+    condvars[i].notify_one();
+  }
+}
+
+// Helper class that releases the frame scratch buffer in the destructor.
+class FrameScratchBufferReleaser {
  public:
-  explicit RefCountedBufferPtrCleanup(RefCountedBufferPtr* frame)
-      : frame_(*frame) {}
-
-  // Not copyable or movable.
-  RefCountedBufferPtrCleanup(const RefCountedBufferPtrCleanup&) = delete;
-  RefCountedBufferPtrCleanup& operator=(const RefCountedBufferPtrCleanup&) =
-      delete;
-
-  ~RefCountedBufferPtrCleanup() { frame_ = nullptr; }
+  FrameScratchBufferReleaser(
+      FrameScratchBufferPool* frame_scratch_buffer_pool,
+      std::unique_ptr<FrameScratchBuffer>* frame_scratch_buffer)
+      : frame_scratch_buffer_pool_(frame_scratch_buffer_pool),
+        frame_scratch_buffer_(frame_scratch_buffer) {}
+  ~FrameScratchBufferReleaser() {
+    frame_scratch_buffer_pool_->Release(std::move(*frame_scratch_buffer_));
+  }
 
  private:
-  RefCountedBufferPtr& frame_;
+  FrameScratchBufferPool* const frame_scratch_buffer_pool_;
+  std::unique_ptr<FrameScratchBuffer>* const frame_scratch_buffer_;
 };
 
-}  // namespace
-
-void DecoderState::UpdateReferenceFrames(int refresh_frame_flags) {
-  for (int ref_index = 0, mask = refresh_frame_flags; mask != 0;
-       ++ref_index, mask >>= 1) {
-    if ((mask & 1) != 0) {
-      reference_valid[ref_index] = true;
-      reference_frame_id[ref_index] = current_frame_id;
-      reference_frame[ref_index] = current_frame;
-      reference_order_hint[ref_index] = order_hint;
+// Sets the |frame|'s segmentation map for two cases. The third case is handled
+// in Tile::DecodeBlock().
+void SetSegmentationMap(const ObuFrameHeader& frame_header,
+                        const SegmentationMap* prev_segment_ids,
+                        RefCountedBuffer* const frame) {
+  if (!frame_header.segmentation.enabled) {
+    // All segment_id's are 0.
+    frame->segmentation_map()->Clear();
+  } else if (!frame_header.segmentation.update_map) {
+    // Copy from prev_segment_ids.
+    if (prev_segment_ids == nullptr) {
+      // Treat a null prev_segment_ids pointer as if it pointed to a
+      // segmentation map containing all 0s.
+      frame->segmentation_map()->Clear();
+    } else {
+      frame->segmentation_map()->CopyFrom(*prev_segment_ids);
     }
   }
 }
 
-void DecoderState::ClearReferenceFrames() {
-  reference_valid = {};
-  reference_frame_id = {};
-  reference_order_hint = {};
-  for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) {
-    reference_frame[ref_index] = nullptr;
+StatusCode DecodeTilesNonFrameParallel(
+    const ObuSequenceHeader& sequence_header,
+    const ObuFrameHeader& frame_header,
+    const Vector<std::unique_ptr<Tile>>& tiles,
+    FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter) {
+  // Decode in superblock row order.
+  const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+  std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
+      frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+  if (tile_scratch_buffer == nullptr) return kLibgav1StatusOutOfMemory;
+  for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
+       row4x4 += block_width4x4) {
+    for (const auto& tile_ptr : tiles) {
+      if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+              row4x4, tile_scratch_buffer.get())) {
+        return kLibgav1StatusUnknownError;
+      }
+    }
+    post_filter->ApplyFilteringForOneSuperBlockRow(
+        row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+        /*do_deblock=*/true);
+  }
+  frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+      std::move(tile_scratch_buffer));
+  return kStatusOk;
+}
+
+StatusCode DecodeTilesThreadedNonFrameParallel(
+    const Vector<std::unique_ptr<Tile>>& tiles,
+    FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter,
+    BlockingCounterWithStatus* const pending_tiles) {
+  ThreadingStrategy& threading_strategy =
+      frame_scratch_buffer->threading_strategy;
+  const int num_workers = threading_strategy.tile_thread_count();
+  BlockingCounterWithStatus pending_workers(num_workers);
+  std::atomic<int> tile_counter(0);
+  const int tile_count = static_cast<int>(tiles.size());
+  bool tile_decoding_failed = false;
+  // Submit tile decoding jobs to the thread pool.
+  for (int i = 0; i < num_workers; ++i) {
+    threading_strategy.tile_thread_pool()->Schedule([&tiles, tile_count,
+                                                     &tile_counter,
+                                                     &pending_workers,
+                                                     &pending_tiles]() {
+      bool failed = false;
+      int index;
+      while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+             tile_count) {
+        if (!failed) {
+          const auto& tile_ptr = tiles[index];
+          if (!tile_ptr->ParseAndDecode()) {
+            LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+            failed = true;
+          }
+        } else {
+          pending_tiles->Decrement(false);
+        }
+      }
+      pending_workers.Decrement(!failed);
+    });
+  }
+  // Have the current thread partake in tile decoding.
+  int index;
+  while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+         tile_count) {
+    if (!tile_decoding_failed) {
+      const auto& tile_ptr = tiles[index];
+      if (!tile_ptr->ParseAndDecode()) {
+        LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+        tile_decoding_failed = true;
+      }
+    } else {
+      pending_tiles->Decrement(false);
+    }
+  }
+  // Wait until all the workers are done. This ensures that all the tiles have
+  // been parsed.
+  tile_decoding_failed |= !pending_workers.Wait();
+  // Wait until all the tiles have been decoded.
+  tile_decoding_failed |= !pending_tiles->Wait();
+  if (tile_decoding_failed) return kStatusUnknownError;
+  assert(threading_strategy.post_filter_thread_pool() != nullptr);
+  post_filter->ApplyFilteringThreaded();
+  return kStatusOk;
+}
+
+StatusCode DecodeTilesFrameParallel(
+    const ObuSequenceHeader& sequence_header,
+    const ObuFrameHeader& frame_header,
+    const Vector<std::unique_ptr<Tile>>& tiles,
+    const SymbolDecoderContext& saved_symbol_decoder_context,
+    const SegmentationMap* const prev_segment_ids,
+    FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+  // Parse the frame.
+  for (const auto& tile : tiles) {
+    if (!tile->Parse()) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse tile number: %d\n", tile->number());
+      return kStatusUnknownError;
+    }
+  }
+  if (frame_header.enable_frame_end_update_cdf) {
+    frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+  }
+  current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+  SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+  // Mark frame as parsed.
+  current_frame->SetFrameState(kFrameStateParsed);
+  std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
+      frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+  if (tile_scratch_buffer == nullptr) {
+    return kStatusOutOfMemory;
+  }
+  const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+  // Decode in superblock row order (inter prediction in the Tile class will
+  // block until the required superblocks in the reference frame are decoded).
+  for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
+       row4x4 += block_width4x4) {
+    for (const auto& tile_ptr : tiles) {
+      if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+              row4x4, tile_scratch_buffer.get())) {
+        LIBGAV1_DLOG(ERROR, "Failed to decode tile number: %d\n",
+                     tile_ptr->number());
+        return kStatusUnknownError;
+      }
+    }
+    const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+        row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+        /*do_deblock=*/true);
+    if (progress_row >= 0) {
+      current_frame->SetProgress(progress_row);
+    }
+  }
+  // Mark frame as decoded (we no longer care about row-level progress since the
+  // entire frame has been decoded).
+  current_frame->SetFrameState(kFrameStateDecoded);
+  frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+      std::move(tile_scratch_buffer));
+  return kStatusOk;
+}
+
+// Helper function used by DecodeTilesThreadedFrameParallel. Applies the
+// deblocking filter for tile boundaries for the superblock row at |row4x4|.
+void ApplyDeblockingFilterForTileBoundaries(
+    PostFilter* const post_filter, const std::unique_ptr<Tile>* tile_row_base,
+    const ObuFrameHeader& frame_header, int row4x4, int block_width4x4,
+    int tile_columns, bool decode_entire_tiles_in_worker_threads) {
+  // Apply vertical deblock filtering for the first 64 columns of each tile.
+  for (int tile_column = 0; tile_column < tile_columns; ++tile_column) {
+    const Tile& tile = *tile_row_base[tile_column];
+    post_filter->ApplyDeblockFilter(
+        kLoopFilterTypeVertical, row4x4, tile.column4x4_start(),
+        tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+  }
+  if (decode_entire_tiles_in_worker_threads &&
+      row4x4 == tile_row_base[0]->row4x4_start()) {
+    // This is the first superblock row of a tile row. In this case, apply
+    // horizontal deblock filtering for the entire superblock row.
+    post_filter->ApplyDeblockFilter(kLoopFilterTypeHorizontal, row4x4, 0,
+                                    frame_header.columns4x4, block_width4x4);
+  } else {
+    // Apply horizontal deblock filtering for the first 64 columns of the
+    // first tile.
+    const Tile& first_tile = *tile_row_base[0];
+    post_filter->ApplyDeblockFilter(
+        kLoopFilterTypeHorizontal, row4x4, first_tile.column4x4_start(),
+        first_tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+    // Apply horizontal deblock filtering for the last 64 columns of the
+    // previous tile and the first 64 columns of the current tile.
+    for (int tile_column = 1; tile_column < tile_columns; ++tile_column) {
+      const Tile& tile = *tile_row_base[tile_column];
+      // If the previous tile has more than 64 columns, then include those
+      // for the horizontal deblock.
+      const Tile& previous_tile = *tile_row_base[tile_column - 1];
+      const int column4x4_start =
+          tile.column4x4_start() -
+          ((tile.column4x4_start() - kNum4x4InLoopFilterUnit !=
+            previous_tile.column4x4_start())
+               ? kNum4x4InLoopFilterUnit
+               : 0);
+      post_filter->ApplyDeblockFilter(
+          kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+          tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+    }
+    // Apply horizontal deblock filtering for the last 64 columns of the
+    // last tile.
+    const Tile& last_tile = *tile_row_base[tile_columns - 1];
+    // Identify the last column4x4 value and do horizontal filtering for
+    // that column4x4. The value of last column4x4 is the nearest multiple
+    // of 16 that is before tile.column4x4_end().
+    const int column4x4_start = (last_tile.column4x4_end() - 1) & ~15;
+    // If column4x4_start is the same as tile.column4x4_start() then it
+    // means that the last tile has <= 64 columns. So there is nothing left
+    // to deblock (since it was already deblocked in the loop above).
+    if (column4x4_start != last_tile.column4x4_start()) {
+      post_filter->ApplyDeblockFilter(
+          kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+          last_tile.column4x4_end(), block_width4x4);
+    }
   }
 }
 
+// Helper function used by DecodeTilesThreadedFrameParallel. Decodes the
+// superblock row starting at |row4x4| for tile at index |tile_index| in the
+// list of tiles |tiles|. If the decoding is successful, then it does the
+// following:
+//   * Schedule the next superblock row in the current tile column for decoding
+//     (the next superblock row may be in a different tile than the current
+//     one).
+//   * If an entire superblock row of the frame has been decoded, it notifies
+//     the waiters (if there are any).
+void DecodeSuperBlockRowInTile(
+    const Vector<std::unique_ptr<Tile>>& tiles, size_t tile_index, int row4x4,
+    const int superblock_size4x4, const int tile_columns,
+    const int superblock_rows, FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter, BlockingCounter* const pending_jobs) {
+  std::unique_ptr<TileScratchBuffer> scratch_buffer =
+      frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+  if (scratch_buffer == nullptr) {
+    SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+    return;
+  }
+  Tile& tile = *tiles[tile_index];
+  const bool ok = tile.ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+      row4x4, scratch_buffer.get());
+  // Return the scratch buffer to the pool before checking |ok| so that it is
+  // available to other jobs even on the failure path.
+  frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+      std::move(scratch_buffer));
+  if (!ok) {
+    SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+    return;
+  }
+  if (post_filter->DoDeblock()) {
+    // Apply vertical deblock filtering for all the columns in this tile except
+    // for the first 64 columns.
+    post_filter->ApplyDeblockFilter(
+        kLoopFilterTypeVertical, row4x4,
+        tile.column4x4_start() + kNum4x4InLoopFilterUnit, tile.column4x4_end(),
+        superblock_size4x4);
+    // Apply horizontal deblock filtering for all the columns in this tile
+    // except for the first and the last 64 columns.
+    // Note about the last tile of each row: For the last tile, column4x4_end
+    // may not be a multiple of 16. In that case it is still okay to simply
+    // subtract 16 since ApplyDeblockFilter() will only do the filters in
+    // increments of 64 columns (or 32 columns for chroma with subsampling).
+    post_filter->ApplyDeblockFilter(
+        kLoopFilterTypeHorizontal, row4x4,
+        tile.column4x4_start() + kNum4x4InLoopFilterUnit,
+        tile.column4x4_end() - kNum4x4InLoopFilterUnit, superblock_size4x4);
+  }
+  // Record progress for this superblock row of the frame. |notify| becomes
+  // true when every tile column has finished this row.
+  const int superblock_size4x4_log2 = FloorLog2(superblock_size4x4);
+  const int index = row4x4 >> superblock_size4x4_log2;
+  int* const superblock_row_progress =
+      frame_scratch_buffer->superblock_row_progress.get();
+  std::condition_variable* const superblock_row_progress_condvar =
+      frame_scratch_buffer->superblock_row_progress_condvar.get();
+  bool notify;
+  {
+    std::lock_guard<std::mutex> lock(
+        frame_scratch_buffer->superblock_row_mutex);
+    notify = ++superblock_row_progress[index] == tile_columns;
+  }
+  if (notify) {
+    // We are done decoding this superblock row. Notify the post filtering
+    // thread.
+    superblock_row_progress_condvar[index].notify_one();
+  }
+  // Schedule the next superblock row (if one exists).
+  ThreadPool& thread_pool =
+      *frame_scratch_buffer->threading_strategy.thread_pool();
+  const int next_row4x4 = row4x4 + superblock_size4x4;
+  // If |next_row4x4| falls outside the current tile, continue with the tile
+  // directly below it (same tile column, next tile row).
+  if (!tile.IsRow4x4Inside(next_row4x4)) {
+    tile_index += tile_columns;
+  }
+  if (tile_index >= tiles.size()) return;
+  pending_jobs->IncrementBy(1);
+  thread_pool.Schedule([&tiles, tile_index, next_row4x4, superblock_size4x4,
+                        tile_columns, superblock_rows, frame_scratch_buffer,
+                        post_filter, pending_jobs]() {
+    DecodeSuperBlockRowInTile(tiles, tile_index, next_row4x4,
+                              superblock_size4x4, tile_columns, superblock_rows,
+                              frame_scratch_buffer, post_filter, pending_jobs);
+    pending_jobs->Decrement();
+  });
+}
+
+// Decodes all the tiles of a frame in frame parallel mode using the thread
+// pool in |frame_scratch_buffer|. The tiles are first parsed by all the
+// workers; then they are decoded either as entire tiles or superblock row by
+// superblock row (depending on the number of workers), while the current
+// thread applies the post filters one superblock row at a time.
+StatusCode DecodeTilesThreadedFrameParallel(
+    const ObuSequenceHeader& sequence_header,
+    const ObuFrameHeader& frame_header,
+    const Vector<std::unique_ptr<Tile>>& tiles,
+    const SymbolDecoderContext& saved_symbol_decoder_context,
+    const SegmentationMap* const prev_segment_ids,
+    FrameScratchBuffer* const frame_scratch_buffer,
+    PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+  // Parse the frame.
+  ThreadPool& thread_pool =
+      *frame_scratch_buffer->threading_strategy.thread_pool();
+  std::atomic<int> tile_counter(0);
+  const int tile_count = static_cast<int>(tiles.size());
+  const int num_workers = thread_pool.num_threads();
+  BlockingCounterWithStatus parse_workers(num_workers);
+  // Submit tile parsing jobs to the thread pool.
+  for (int i = 0; i < num_workers; ++i) {
+    thread_pool.Schedule([&tiles, tile_count, &tile_counter, &parse_workers]() {
+      bool failed = false;
+      int index;
+      while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+             tile_count) {
+        if (!failed) {
+          const auto& tile_ptr = tiles[index];
+          if (!tile_ptr->Parse()) {
+            LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+            failed = true;
+          }
+        }
+      }
+      parse_workers.Decrement(!failed);
+    });
+  }
+
+  // Have the current thread participate in parsing.
+  bool failed = false;
+  int index;
+  while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+         tile_count) {
+    if (!failed) {
+      const auto& tile_ptr = tiles[index];
+      if (!tile_ptr->Parse()) {
+        LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+        failed = true;
+      }
+    }
+  }
+
+  // Wait until all the parse workers are done. This ensures that all the tiles
+  // have been parsed.
+  if (!parse_workers.Wait() || failed) {
+    return kLibgav1StatusUnknownError;
+  }
+  if (frame_header.enable_frame_end_update_cdf) {
+    frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+  }
+  current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+  SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+  current_frame->SetFrameState(kFrameStateParsed);
+
+  // Decode the frame.
+  const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+  const int block_width4x4_log2 =
+      sequence_header.use_128x128_superblock ? 5 : 4;
+  const int superblock_rows =
+      (frame_header.rows4x4 + block_width4x4 - 1) >> block_width4x4_log2;
+  if (!frame_scratch_buffer->superblock_row_progress.Resize(superblock_rows) ||
+      !frame_scratch_buffer->superblock_row_progress_condvar.Resize(
+          superblock_rows)) {
+    return kLibgav1StatusOutOfMemory;
+  }
+  int* const superblock_row_progress =
+      frame_scratch_buffer->superblock_row_progress.get();
+  memset(superblock_row_progress, 0,
+         superblock_rows * sizeof(superblock_row_progress[0]));
+  frame_scratch_buffer->tile_decoding_failed = false;
+  const int tile_columns = frame_header.tile_info.tile_columns;
+  const bool decode_entire_tiles_in_worker_threads =
+      num_workers >= tile_columns;
+  BlockingCounter pending_jobs(
+      decode_entire_tiles_in_worker_threads ? num_workers : tile_columns);
+  if (decode_entire_tiles_in_worker_threads) {
+    // Submit tile decoding jobs to the thread pool.
+    // Reuse |tile_counter| to hand out tile indices to the decode workers.
+    tile_counter = 0;
+    for (int i = 0; i < num_workers; ++i) {
+      thread_pool.Schedule([&tiles, tile_count, &tile_counter, &pending_jobs,
+                            frame_scratch_buffer, superblock_rows]() {
+        bool failed = false;
+        int index;
+        while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+               tile_count) {
+          if (failed) continue;
+          const auto& tile_ptr = tiles[index];
+          if (!tile_ptr->Decode(
+                  &frame_scratch_buffer->superblock_row_mutex,
+                  frame_scratch_buffer->superblock_row_progress.get(),
+                  frame_scratch_buffer->superblock_row_progress_condvar
+                      .get())) {
+            LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+            failed = true;
+            SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+          }
+        }
+        pending_jobs.Decrement();
+      });
+    }
+  } else {
+    // Schedule the jobs for first tile row. Each job decodes one superblock
+    // row and then re-schedules itself for the next row in its tile column
+    // (see DecodeSuperBlockRowInTile()).
+    for (int tile_index = 0; tile_index < tile_columns; ++tile_index) {
+      thread_pool.Schedule([&tiles, tile_index, block_width4x4, tile_columns,
+                            superblock_rows, frame_scratch_buffer, post_filter,
+                            &pending_jobs]() {
+        DecodeSuperBlockRowInTile(
+            tiles, tile_index, 0, block_width4x4, tile_columns, superblock_rows,
+            frame_scratch_buffer, post_filter, &pending_jobs);
+        pending_jobs.Decrement();
+      });
+    }
+  }
+
+  // Current thread will do the post filters.
+  std::condition_variable* const superblock_row_progress_condvar =
+      frame_scratch_buffer->superblock_row_progress_condvar.get();
+  const std::unique_ptr<Tile>* tile_row_base = &tiles[0];
+  for (int row4x4 = 0, index = 0; row4x4 < frame_header.rows4x4;
+       row4x4 += block_width4x4, ++index) {
+    // Move |tile_row_base| to the next row of tiles when |row4x4| leaves the
+    // current tile row.
+    if (!tile_row_base[0]->IsRow4x4Inside(row4x4)) {
+      tile_row_base += tile_columns;
+    }
+    // Wait until all the tile columns have decoded this superblock row (or a
+    // decoding failure has been signaled).
+    {
+      std::unique_lock<std::mutex> lock(
+          frame_scratch_buffer->superblock_row_mutex);
+      while (superblock_row_progress[index] != tile_columns &&
+             !frame_scratch_buffer->tile_decoding_failed) {
+        superblock_row_progress_condvar[index].wait(lock);
+      }
+      if (frame_scratch_buffer->tile_decoding_failed) break;
+    }
+    if (post_filter->DoDeblock()) {
+      // Apply deblocking filter for the tile boundaries of this superblock row.
+      // The deblocking filter for the internal blocks will be applied in the
+      // tile worker threads. In this thread, we will only have to apply
+      // deblocking filter for the tile boundaries.
+      ApplyDeblockingFilterForTileBoundaries(
+          post_filter, tile_row_base, frame_header, row4x4, block_width4x4,
+          tile_columns, decode_entire_tiles_in_worker_threads);
+    }
+    // Apply all the post filters other than deblocking.
+    const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+        row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+        /*do_deblock=*/false);
+    if (progress_row >= 0) {
+      current_frame->SetProgress(progress_row);
+    }
+  }
+  // Wait until all the pending jobs are done. This ensures that all the tiles
+  // have been decoded and wrapped up.
+  pending_jobs.Wait();
+  {
+    std::lock_guard<std::mutex> lock(
+        frame_scratch_buffer->superblock_row_mutex);
+    if (frame_scratch_buffer->tile_decoding_failed) {
+      return kLibgav1StatusUnknownError;
+    }
+  }
+
+  current_frame->SetFrameState(kFrameStateDecoded);
+  return kStatusOk;
+}
+
+}  // namespace
+
 // static
 StatusCode DecoderImpl::Create(const DecoderSettings* settings,
                                std::unique_ptr<DecoderImpl>* output) {
   if (settings->threads <= 0) {
     LIBGAV1_DLOG(ERROR, "Invalid settings->threads: %d.", settings->threads);
-    return kLibgav1StatusInvalidArgument;
+    return kStatusInvalidArgument;
+  }
+  if (settings->frame_parallel) {
+    // In frame parallel mode the decoder holds on to the input buffers until
+    // decoding completes, so the caller must provide a callback through which
+    // they can be returned (see SignalFailure()/DequeueFrame()).
+    if (settings->release_input_buffer == nullptr) {
+      LIBGAV1_DLOG(ERROR,
+                   "release_input_buffer callback must not be null when "
+                   "frame_parallel is true.");
+      return kStatusInvalidArgument;
+    }
   }
   std::unique_ptr<DecoderImpl> impl(new (std::nothrow) DecoderImpl(settings));
   if (impl == nullptr) {
     LIBGAV1_DLOG(ERROR, "Failed to allocate DecoderImpl.");
-    return kLibgav1StatusOutOfMemory;
+    return kStatusOutOfMemory;
   }
   const StatusCode status = impl->Init();
-  if (status != kLibgav1StatusOk) return status;
+  if (status != kStatusOk) return status;
   *output = std::move(impl);
-  return kLibgav1StatusOk;
+  return kStatusOk;
 }
 
+// Wires the caller-supplied frame buffer callbacks into |buffer_pool_| and
+// keeps a copy of |settings|.
 DecoderImpl::DecoderImpl(const DecoderSettings* settings)
-    : buffer_pool_(*settings), settings_(*settings) {
+    : buffer_pool_(settings->on_frame_buffer_size_changed,
+                   settings->get_frame_buffer, settings->release_frame_buffer,
+                   settings->callback_private_data),
+      settings_(*settings) {
   dsp::DspInit();
-  GenerateWedgeMask(state_.wedge_master_mask.data(), state_.wedge_masks.data());
 }
 
 DecoderImpl::~DecoderImpl() {
-  // The frame buffer references need to be released before |buffer_pool_| is
-  // destroyed.
+  // Clean up and wait until all the threads have stopped. We just have to pass
+  // in a dummy status that is not kStatusOk or kStatusTryAgain to trigger the
+  // path that clears all the threads and structs.
+  SignalFailure(kStatusUnknownError);
+  // Release any other frame buffer references that we may be holding on to.
   ReleaseOutputFrame();
-  assert(state_.current_frame == nullptr);
+  output_frame_queue_.Clear();
+  // Drop the reference frame buffers as well, so that no frame buffer
+  // references outlive |buffer_pool_|.
   for (auto& reference_frame : state_.reference_frame) {
     reference_frame = nullptr;
   }
 }
 
 StatusCode DecoderImpl::Init() {
-  const int max_allowed_frames =
-      settings_.frame_parallel ? settings_.threads : 1;
-  assert(max_allowed_frames > 0);
-  if (!encoded_frames_.Init(max_allowed_frames)) {
-    LIBGAV1_DLOG(ERROR, "encoded_frames_.Init() failed.");
-    return kLibgav1StatusOutOfMemory;
+  // Precompute the wedge masks.
+  if (!GenerateWedgeMask(&wedge_masks_)) {
+    LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed.");
+    return kStatusOutOfMemory;
   }
-  return kLibgav1StatusOk;
+  // Allow queueing up to kMaxLayers output frames.
+  if (!output_frame_queue_.Init(kMaxLayers)) {
+    LIBGAV1_DLOG(ERROR, "output_frame_queue_.Init() failed.");
+    return kStatusOutOfMemory;
+  }
+  return kStatusOk;
 }
+
+// Determines whether frame parallel decoding can be used and sets up
+// |frame_thread_pool_|, |frame_scratch_buffer_pool_| and |temporal_units_|
+// accordingly. Called once, for the first enqueued frame (see EnqueueFrame()).
+StatusCode DecoderImpl::InitializeFrameThreadPoolAndTemporalUnitQueue(
+    const uint8_t* data, size_t size) {
+  is_frame_parallel_ = false;
+  if (settings_.frame_parallel) {
+    // Parse (without decoding) the first frame to discover the tile layout,
+    // which determines how the frame parallel thread pools are sized.
+    DecoderState state;
+    std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+        data, size, settings_.operating_point, &buffer_pool_, &state));
+    if (obu == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+      return kStatusOutOfMemory;
+    }
+    RefCountedBufferPtr current_frame;
+    const StatusCode status = obu->ParseOneFrame(&current_frame);
+    if (status != kStatusOk) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+      return status;
+    }
+    // The parsed frame itself is not needed; release its buffer right away.
+    current_frame = nullptr;
+    // We assume that the first frame that was parsed will contain the frame
+    // header. This assumption is usually true in practice. So we will simply
+    // not use frame parallel mode if this is not the case.
+    if (settings_.threads > 1 &&
+        !InitializeThreadPoolsForFrameParallel(
+            settings_.threads, obu->frame_header().tile_info.tile_count,
+            obu->frame_header().tile_info.tile_columns, &frame_thread_pool_,
+            &frame_scratch_buffer_pool_)) {
+      return kStatusOutOfMemory;
+    }
+  }
+  // One temporal unit may be in flight per frame thread (just one when not
+  // frame parallel).
+  const int max_allowed_frames =
+      (frame_thread_pool_ != nullptr) ? frame_thread_pool_->num_threads() : 1;
+  assert(max_allowed_frames > 0);
+  if (!temporal_units_.Init(max_allowed_frames)) {
+    LIBGAV1_DLOG(ERROR, "temporal_units_.Init() failed.");
+    return kStatusOutOfMemory;
+  }
+  is_frame_parallel_ = frame_thread_pool_ != nullptr;
+  return kStatusOk;
+}
 
 StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size,
-                                     int64_t user_private_data) {
-  if (data == nullptr) {
-    // This has to actually flush the decoder.
-    return kLibgav1StatusOk;
+                                     int64_t user_private_data,
+                                     void* buffer_private_data) {
+  if (data == nullptr || size == 0) return kStatusInvalidArgument;
+  if (HasFailure()) return kStatusUnknownError;
+  if (!seen_first_frame_) {
+    seen_first_frame_ = true;
+    // Threading is set up lazily on the first frame since the configuration
+    // depends on the stream's tile layout.
+    const StatusCode status =
+        InitializeFrameThreadPoolAndTemporalUnitQueue(data, size);
+    if (status != kStatusOk) {
+      return SignalFailure(status);
+    }
   }
-  if (encoded_frames_.Full()) {
-    return kLibgav1StatusResourceExhausted;
+  if (temporal_units_.Full()) {
+    return kStatusTryAgain;
   }
-  encoded_frames_.Push(EncodedFrame(data, size, user_private_data));
-  return kLibgav1StatusOk;
+  if (is_frame_parallel_) {
+    // Frame parallel: parse now and decode asynchronously on worker threads.
+    return ParseAndSchedule(data, size, user_private_data, buffer_private_data);
+  }
+  // Non frame parallel: just queue the temporal unit; it will be decoded in
+  // DequeueFrame().
+  TemporalUnit temporal_unit(data, size, user_private_data,
+                             buffer_private_data);
+  temporal_units_.Push(std::move(temporal_unit));
+  return kStatusOk;
+}
+
+// Records |status| as the sticky failure status (for anything other than
+// kStatusOk/kStatusTryAgain), tears down the frame thread pool and returns
+// all queued input buffers to the caller. Always returns |status|.
+StatusCode DecoderImpl::SignalFailure(StatusCode status) {
+  if (status == kStatusOk || status == kStatusTryAgain) return status;
+  // Set the |failure_status_| first so that any pending jobs in
+  // |frame_thread_pool_| will exit right away when the thread pool is being
+  // released below.
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    failure_status_ = status;
+  }
+  // Make sure all waiting threads exit.
+  buffer_pool_.Abort();
+  frame_thread_pool_ = nullptr;
+  // Return any input buffers that were queued but never consumed.
+  while (!temporal_units_.Empty()) {
+    if (settings_.release_input_buffer != nullptr) {
+      settings_.release_input_buffer(
+          settings_.callback_private_data,
+          temporal_units_.Front().buffer_private_data);
+    }
+    temporal_units_.Pop();
+  }
+  return status;
+}
 
 // DequeueFrame() follows the following policy to avoid holding unnecessary
-// frame buffer references in state_.current_frame and output_frame_.
-//
-// 1. state_.current_frame must be null when DequeueFrame() returns (success
-// or failure).
-//
-// 2. output_frame_ must be null when DequeueFrame() returns false.
+// frame buffer references in output_frame_: output_frame_ must be null when
+// DequeueFrame() returns false.
 StatusCode DecoderImpl::DequeueFrame(const DecoderBuffer** out_ptr) {
   if (out_ptr == nullptr) {
     LIBGAV1_DLOG(ERROR, "Invalid argument: out_ptr == nullptr.");
-    return kLibgav1StatusInvalidArgument;
+    return kStatusInvalidArgument;
   }
-  assert(state_.current_frame == nullptr);
   // We assume a call to DequeueFrame() indicates that the caller is no longer
   // using the previous output frame, so we can release it.
   ReleaseOutputFrame();
-  if (encoded_frames_.Empty()) {
-    // No encoded frame to decode. Not an error.
+  if (temporal_units_.Empty()) {
+    // No input frames to decode.
     *out_ptr = nullptr;
-    return kLibgav1StatusOk;
+    return kStatusNothingToDequeue;
   }
-  const EncodedFrame encoded_frame = encoded_frames_.Pop();
-  std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
-      encoded_frame.data, encoded_frame.size, &state_));
-  if (obu == nullptr) {
-    LIBGAV1_DLOG(ERROR, "Failed to initialize OBU parser.");
-    return kLibgav1StatusOutOfMemory;
-  }
-  if (state_.has_sequence_header) {
-    obu->set_sequence_header(state_.sequence_header);
-  }
-  RefCountedBufferPtrCleanup current_frame_cleanup(&state_.current_frame);
-  RefCountedBufferPtr displayable_frame;
-  StatusCode status;
-  while (obu->HasData()) {
-    state_.current_frame = buffer_pool_.GetFreeBuffer();
-    if (state_.current_frame == nullptr) {
-      LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
-      return kLibgav1StatusResourceExhausted;
+  TemporalUnit& temporal_unit = temporal_units_.Front();
+  if (!is_frame_parallel_) {
+    // If |output_frame_queue_| is not empty, then return the first frame from
+    // that queue.
+    if (!output_frame_queue_.Empty()) {
+      RefCountedBufferPtr frame = std::move(output_frame_queue_.Front());
+      output_frame_queue_.Pop();
+      buffer_.user_private_data = temporal_unit.user_private_data;
+      if (output_frame_queue_.Empty()) {
+        temporal_units_.Pop();
+      }
+      const StatusCode status = CopyFrameToOutputBuffer(frame);
+      if (status != kStatusOk) {
+        return status;
+      }
+      *out_ptr = &buffer_;
+      return kStatusOk;
     }
+    // Decode the next available temporal unit and return.
+    const StatusCode status = DecodeTemporalUnit(temporal_unit, out_ptr);
+    if (status != kStatusOk) {
+      // In case of failure, discard all the output frames that we may be
+      // holding on references to.
+      output_frame_queue_.Clear();
+    }
+    if (settings_.release_input_buffer != nullptr) {
+      settings_.release_input_buffer(settings_.callback_private_data,
+                                     temporal_unit.buffer_private_data);
+    }
+    if (output_frame_queue_.Empty()) {
+      temporal_units_.Pop();
+    }
+    return status;
+  }
+  // Frame parallel mode: block (or poll, depending on |blocking_dequeue|)
+  // until the front temporal unit has finished decoding or a failure occurs.
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (settings_.blocking_dequeue) {
+      while (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+        decoded_condvar_.wait(lock);
+      }
+    } else {
+      if (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+        return kStatusTryAgain;
+      }
+    }
+    if (failure_status_ != kStatusOk) {
+      const StatusCode failure_status = failure_status_;
+      lock.unlock();
+      return SignalFailure(failure_status);
+    }
+  }
+  if (settings_.release_input_buffer != nullptr &&
+      !temporal_unit.released_input_buffer) {
+    temporal_unit.released_input_buffer = true;
+    settings_.release_input_buffer(settings_.callback_private_data,
+                                   temporal_unit.buffer_private_data);
+  }
+  if (temporal_unit.status != kStatusOk) {
+    temporal_units_.Pop();
+    return SignalFailure(temporal_unit.status);
+  }
+  if (!temporal_unit.has_displayable_frame) {
+    *out_ptr = nullptr;
+    temporal_units_.Pop();
+    return kStatusOk;
+  }
+  assert(temporal_unit.output_layer_count > 0);
+  // Output layers are returned from the back of the |output_layers| array.
+  StatusCode status = CopyFrameToOutputBuffer(
+      temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame);
+  temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame =
+      nullptr;
+  if (status != kStatusOk) {
+    temporal_units_.Pop();
+    return SignalFailure(status);
+  }
+  buffer_.user_private_data = temporal_unit.user_private_data;
+  *out_ptr = &buffer_;
+  if (--temporal_unit.output_layer_count == 0) {
+    temporal_units_.Pop();
+  }
+  return kStatusOk;
+}
 
-    if (!obu->ParseOneFrame()) {
+// Called only in frame parallel mode (see EnqueueFrame()). Parses every frame
+// in the temporal unit |data| and schedules one decode job per frame on
+// |frame_thread_pool_|.
+StatusCode DecoderImpl::ParseAndSchedule(const uint8_t* data, size_t size,
+                                         int64_t user_private_data,
+                                         void* buffer_private_data) {
+  TemporalUnit temporal_unit(data, size, user_private_data,
+                             buffer_private_data);
+  std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+      temporal_unit.data, temporal_unit.size, settings_.operating_point,
+      &buffer_pool_, &state_));
+  if (obu == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+    return kStatusOutOfMemory;
+  }
+  if (has_sequence_header_) {
+    obu->set_sequence_header(sequence_header_);
+  }
+  StatusCode status;
+  int position_in_temporal_unit = 0;
+  while (obu->HasData()) {
+    RefCountedBufferPtr current_frame;
+    status = obu->ParseOneFrame(&current_frame);
+    if (status != kStatusOk) {
       LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
-      return kLibgav1StatusUnknownError;
+      return status;
     }
-    if (std::find_if(obu->obu_headers().begin(), obu->obu_headers().end(),
-                     [](const ObuHeader& obu_header) {
-                       return obu_header.type == kObuSequenceHeader;
-                     }) != obu->obu_headers().end()) {
-      state_.sequence_header = obu->sequence_header();
-      state_.has_sequence_header = true;
+    if (IsNewSequenceHeader(*obu)) {
+      const ObuSequenceHeader& sequence_header = obu->sequence_header();
+      const Libgav1ImageFormat image_format =
+          ComposeImageFormat(sequence_header.color_config.is_monochrome,
+                             sequence_header.color_config.subsampling_x,
+                             sequence_header.color_config.subsampling_y);
+      const int max_bottom_border = GetBottomBorderPixels(
+          /*do_cdef=*/true, /*do_restoration=*/true,
+          /*do_superres=*/true, sequence_header.color_config.subsampling_y);
+      // TODO(vigneshv): This may not be the right place to call this callback
+      // for the frame parallel case. Investigate and fix it.
+      if (!buffer_pool_.OnFrameBufferSizeChanged(
+              sequence_header.color_config.bitdepth, image_format,
+              sequence_header.max_frame_width, sequence_header.max_frame_height,
+              kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
+        LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
+        return kStatusUnknownError;
+      }
+    }
+    // This can happen when there are multiple spatial/temporal layers and if
+    // all the layers are outside the current operating point.
+    if (current_frame == nullptr) {
+      continue;
+    }
+    // Note that we cannot set EncodedFrame.temporal_unit here. It will be set
+    // in the code below after |temporal_unit| is std::move'd into the
+    // |temporal_units_| queue.
+    if (!temporal_unit.frames.emplace_back(obu.get(), state_, current_frame,
+                                           position_in_temporal_unit++)) {
+      LIBGAV1_DLOG(ERROR, "temporal_unit.frames.emplace_back failed.");
+      return kStatusOutOfMemory;
+    }
+    state_.UpdateReferenceFrames(current_frame,
+                                 obu->frame_header().refresh_frame_flags);
+  }
+  // This function cannot fail after this point. So it is okay to move the
+  // |temporal_unit| into |temporal_units_| queue.
+  temporal_units_.Push(std::move(temporal_unit));
+  if (temporal_units_.Back().frames.empty()) {
+    // Nothing to decode in this temporal unit; mark it decoded right away.
+    std::lock_guard<std::mutex> lock(mutex_);
+    temporal_units_.Back().has_displayable_frame = false;
+    temporal_units_.Back().decoded = true;
+    return kStatusOk;
+  }
+  for (auto& frame : temporal_units_.Back().frames) {
+    EncodedFrame* const encoded_frame = &frame;
+    encoded_frame->temporal_unit = &temporal_units_.Back();
+    frame_thread_pool_->Schedule([this, encoded_frame]() {
+      if (HasFailure()) return;
+      const StatusCode status = DecodeFrame(encoded_frame);
+      // Release the references held by this job regardless of |status|.
+      encoded_frame->state = {};
+      encoded_frame->frame = nullptr;
+      TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (failure_status_ != kStatusOk) return;
+      // temporal_unit's status defaults to kStatusOk. So we need to set it only
+      // on error. If |failure_status_| is not kStatusOk at this point, it means
+      // that there has already been a failure. So we don't care about this
+      // subsequent failure.  We will simply return the error code of the first
+      // failure.
+      if (status != kStatusOk) {
+        temporal_unit.status = status;
+        if (failure_status_ == kStatusOk) {
+          failure_status_ = status;
+        }
+      }
+      temporal_unit.decoded =
+          ++temporal_unit.decoded_count == temporal_unit.frames.size();
+      // Sort the output layers; DequeueFrame() returns them from the back of
+      // the |output_layers| array.
+      if (temporal_unit.decoded && settings_.output_all_layers &&
+          temporal_unit.output_layer_count > 1) {
+        std::sort(
+            temporal_unit.output_layers,
+            temporal_unit.output_layers + temporal_unit.output_layer_count);
+      }
+      if (temporal_unit.decoded || failure_status_ != kStatusOk) {
+        decoded_condvar_.notify_one();
+      }
+    });
+  }
+  return kStatusOk;
+}
+
+// Decodes a single |encoded_frame| and, if the frame is displayable, records
+// it in its temporal unit's output layers under |mutex_|. Runs on a worker
+// thread of |frame_thread_pool_| (scheduled by ParseAndSchedule()).
+StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) {
+  const ObuSequenceHeader& sequence_header = encoded_frame->sequence_header;
+  const ObuFrameHeader& frame_header = encoded_frame->frame_header;
+  RefCountedBufferPtr current_frame = std::move(encoded_frame->frame);
+
+  std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+      frame_scratch_buffer_pool_.Get();
+  if (frame_scratch_buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+    return kStatusOutOfMemory;
+  }
+  // |frame_scratch_buffer| will be released when this local variable goes out
+  // of scope (i.e.) on any return path in this function.
+  FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+      &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
+  StatusCode status;
+  if (!frame_header.show_existing_frame) {
+    if (encoded_frame->tile_buffers.empty()) {
+      // This means that the last call to ParseOneFrame() did not actually
+      // have any tile groups. This could happen in rare cases (for example,
+      // if there is a Metadata OBU after the TileGroup OBU). We currently do
+      // not have a reason to handle those cases, so we simply continue.
+      return kStatusOk;
+    }
+    status = DecodeTiles(sequence_header, frame_header,
+                         encoded_frame->tile_buffers, encoded_frame->state,
+                         frame_scratch_buffer.get(), current_frame.get());
+    if (status != kStatusOk) {
+      return status;
+    }
+  } else {
+    // show_existing_frame: wait until the referenced frame is fully decoded.
+    if (!current_frame->WaitUntilDecoded()) {
+      return kStatusUnknownError;
+    }
+  }
+  if (!frame_header.show_frame && !frame_header.show_existing_frame) {
+    // This frame is not displayable. Not an error.
+    return kStatusOk;
+  }
+  RefCountedBufferPtr film_grain_frame;
+  status = ApplyFilmGrain(
+      sequence_header, frame_header, current_frame, &film_grain_frame,
+      frame_scratch_buffer->threading_strategy.thread_pool());
+  if (status != kStatusOk) {
+    return status;
+  }
+
+  // Publish the displayable frame into the temporal unit under |mutex_|.
+  TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
+  std::lock_guard<std::mutex> lock(mutex_);
+  if (temporal_unit.has_displayable_frame && !settings_.output_all_layers) {
+    assert(temporal_unit.output_frame_position >= 0);
+    // A displayable frame was already found in this temporal unit. This can
+    // happen if there are multiple spatial/temporal layers. Since
+    // |settings_.output_all_layers| is false, we will output only the last
+    // displayable frame.
+    if (temporal_unit.output_frame_position >
+        encoded_frame->position_in_temporal_unit) {
+      return kStatusOk;
+    }
+    // Replace any output frame that we may have seen before with the current
+    // frame.
+    assert(temporal_unit.output_layer_count == 1);
+    --temporal_unit.output_layer_count;
+  }
+  temporal_unit.has_displayable_frame = true;
+  temporal_unit.output_layers[temporal_unit.output_layer_count].frame =
+      std::move(film_grain_frame);
+  temporal_unit.output_layers[temporal_unit.output_layer_count]
+      .position_in_temporal_unit = encoded_frame->position_in_temporal_unit;
+  ++temporal_unit.output_layer_count;
+  temporal_unit.output_frame_position =
+      encoded_frame->position_in_temporal_unit;
+  return kStatusOk;
+}
+
+StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit,
+                                           const DecoderBuffer** out_ptr) {
+  std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+      temporal_unit.data, temporal_unit.size, settings_.operating_point,
+      &buffer_pool_, &state_));
+  if (obu == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+    return kStatusOutOfMemory;
+  }
+  if (has_sequence_header_) {
+    obu->set_sequence_header(sequence_header_);
+  }
+  StatusCode status;
+  std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+      frame_scratch_buffer_pool_.Get();
+  if (frame_scratch_buffer == nullptr) {
+    LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+    return kStatusOutOfMemory;
+  }
+  // |frame_scratch_buffer| will be released when this local variable goes out
+  // of scope (i.e.) on any return path in this function.
+  FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+      &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
+  while (obu->HasData()) {
+    RefCountedBufferPtr current_frame;
+    status = obu->ParseOneFrame(&current_frame);
+    if (status != kStatusOk) {
+      LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+      return status;
+    }
+    if (IsNewSequenceHeader(*obu)) {
+      const ObuSequenceHeader& sequence_header = obu->sequence_header();
+      const Libgav1ImageFormat image_format =
+          ComposeImageFormat(sequence_header.color_config.is_monochrome,
+                             sequence_header.color_config.subsampling_x,
+                             sequence_header.color_config.subsampling_y);
+      const int max_bottom_border = GetBottomBorderPixels(
+          /*do_cdef=*/true, /*do_restoration=*/true,
+          /*do_superres=*/true, sequence_header.color_config.subsampling_y);
+      if (!buffer_pool_.OnFrameBufferSizeChanged(
+              sequence_header.color_config.bitdepth, image_format,
+              sequence_header.max_frame_width, sequence_header.max_frame_height,
+              kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
+        LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
+        return kStatusUnknownError;
+      }
     }
     if (!obu->frame_header().show_existing_frame) {
-      if (obu->tile_groups().empty()) {
+      if (obu->tile_buffers().empty()) {
         // This means that the last call to ParseOneFrame() did not actually
         // have any tile groups. This could happen in rare cases (for example,
         // if there is a Metadata OBU after the TileGroup OBU). We currently do
         // not have a reason to handle those cases, so we simply continue.
         continue;
       }
-      status = DecodeTiles(obu.get());
-      if (status != kLibgav1StatusOk) {
+      status = DecodeTiles(obu->sequence_header(), obu->frame_header(),
+                           obu->tile_buffers(), state_,
+                           frame_scratch_buffer.get(), current_frame.get());
+      if (status != kStatusOk) {
         return status;
       }
     }
-    state_.UpdateReferenceFrames(obu->frame_header().refresh_frame_flags);
+    state_.UpdateReferenceFrames(current_frame,
+                                 obu->frame_header().refresh_frame_flags);
     if (obu->frame_header().show_frame ||
         obu->frame_header().show_existing_frame) {
-      if (displayable_frame != nullptr) {
-        // This can happen if there are multiple spatial/temporal layers. We
-        // don't care about it for now, so simply return the last displayable
-        // frame.
-        // TODO(b/129153372): Add support for outputting multiple
-        // spatial/temporal layers.
-        LIBGAV1_DLOG(
-            WARNING,
-            "More than one displayable frame found. Using the last one.");
+      if (!output_frame_queue_.Empty() && !settings_.output_all_layers) {
+        // There is more than one displayable frame in the current operating
+        // point and |settings_.output_all_layers| is false. In this case, we
+        // simply return the last displayable frame as the output frame and
+        // ignore the rest.
+        assert(output_frame_queue_.Size() == 1);
+        output_frame_queue_.Pop();
       }
-      displayable_frame = std::move(state_.current_frame);
-      if (obu->sequence_header().film_grain_params_present &&
-          displayable_frame->film_grain_params().apply_grain &&
-          (settings_.post_filter_mask & 0x10) != 0) {
-        RefCountedBufferPtr film_grain_frame;
-        if (!obu->frame_header().show_existing_frame &&
-            obu->frame_header().refresh_frame_flags == 0) {
-          // If show_existing_frame is true, then the current frame is a
-          // previously saved reference frame. If refresh_frame_flags is
-          // nonzero, then the state_.UpdateReferenceFrames() call above has
-          // saved the current frame as a reference frame. Therefore, if both
-          // of these conditions are false, then the current frame is not
-          // saved as a reference frame. displayable_frame should hold the
-          // only reference to the current frame.
-          assert(displayable_frame.use_count() == 1);
-          // Add film grain noise in place.
-          film_grain_frame = displayable_frame;
-        } else {
-          film_grain_frame = buffer_pool_.GetFreeBuffer();
-          if (film_grain_frame == nullptr) {
-            LIBGAV1_DLOG(
-                ERROR, "Could not get film_grain_frame from the buffer pool.");
-            return kLibgav1StatusResourceExhausted;
-          }
-          if (!film_grain_frame->Realloc(
-                  displayable_frame->buffer()->bitdepth(),
-                  displayable_frame->buffer()->is_monochrome(),
-                  displayable_frame->upscaled_width(),
-                  displayable_frame->frame_height(),
-                  displayable_frame->buffer()->subsampling_x(),
-                  displayable_frame->buffer()->subsampling_y(),
-                  /*border=*/0,
-                  /*byte_alignment=*/0)) {
-            LIBGAV1_DLOG(ERROR, "film_grain_frame->Realloc() failed.");
-            return kLibgav1StatusOutOfMemory;
-          }
-          film_grain_frame->set_chroma_sample_position(
-              displayable_frame->chroma_sample_position());
-        }
-        const dsp::Dsp* const dsp =
-            dsp::GetDspTable(displayable_frame->buffer()->bitdepth());
-        if (!dsp->film_grain_synthesis(
-                displayable_frame->buffer()->data(kPlaneY),
-                displayable_frame->buffer()->stride(kPlaneY),
-                displayable_frame->buffer()->data(kPlaneU),
-                displayable_frame->buffer()->stride(kPlaneU),
-                displayable_frame->buffer()->data(kPlaneV),
-                displayable_frame->buffer()->stride(kPlaneV),
-                displayable_frame->film_grain_params(),
-                displayable_frame->buffer()->is_monochrome(),
-                obu->sequence_header().color_config.matrix_coefficients ==
-                    kMatrixCoefficientIdentity,
-                displayable_frame->upscaled_width(),
-                displayable_frame->frame_height(),
-                displayable_frame->buffer()->subsampling_x(),
-                displayable_frame->buffer()->subsampling_y(),
-                film_grain_frame->buffer()->data(kPlaneY),
-                film_grain_frame->buffer()->stride(kPlaneY),
-                film_grain_frame->buffer()->data(kPlaneU),
-                film_grain_frame->buffer()->stride(kPlaneU),
-                film_grain_frame->buffer()->data(kPlaneV),
-                film_grain_frame->buffer()->stride(kPlaneV))) {
-          LIBGAV1_DLOG(ERROR, "dsp->film_grain_synthesis() failed.");
-          return kLibgav1StatusOutOfMemory;
-        }
-        displayable_frame = std::move(film_grain_frame);
-      }
+      RefCountedBufferPtr film_grain_frame;
+      status = ApplyFilmGrain(
+          obu->sequence_header(), obu->frame_header(), current_frame,
+          &film_grain_frame,
+          frame_scratch_buffer->threading_strategy.film_grain_thread_pool());
+      if (status != kStatusOk) return status;
+      output_frame_queue_.Push(std::move(film_grain_frame));
     }
   }
-  if (displayable_frame == nullptr) {
-    // No displayable frame in the encoded frame. Not an error.
+  if (output_frame_queue_.Empty()) {
+    // No displayable frame in the temporal unit. Not an error.
     *out_ptr = nullptr;
-    return kLibgav1StatusOk;
+    return kStatusOk;
   }
-  status = CopyFrameToOutputBuffer(displayable_frame);
-  if (status != kLibgav1StatusOk) {
+  status = CopyFrameToOutputBuffer(output_frame_queue_.Front());
+  output_frame_queue_.Pop();
+  if (status != kStatusOk) {
     return status;
   }
-  buffer_.user_private_data = encoded_frame.user_private_data;
+  buffer_.user_private_data = temporal_unit.user_private_data;
   *out_ptr = &buffer_;
-  return kLibgav1StatusOk;
-}
-
-bool DecoderImpl::AllocateCurrentFrame(const ObuFrameHeader& frame_header) {
-  const ColorConfig& color_config = state_.sequence_header.color_config;
-  state_.current_frame->set_chroma_sample_position(
-      color_config.chroma_sample_position);
-  return state_.current_frame->Realloc(
-      color_config.bitdepth, color_config.is_monochrome,
-      frame_header.upscaled_width, frame_header.height,
-      color_config.subsampling_x, color_config.subsampling_y, kBorderPixels,
-      /*byte_alignment=*/0);
+  return kStatusOk;
 }
 
 StatusCode DecoderImpl::CopyFrameToOutputBuffer(
@@ -336,9 +1132,15 @@
       LIBGAV1_DLOG(ERROR,
                    "Invalid chroma subsampling values: cannot determine buffer "
                    "image format.");
-      return kLibgav1StatusInvalidArgument;
+      return kStatusInvalidArgument;
     }
   }
+  buffer_.color_range = sequence_header_.color_config.color_range;
+  buffer_.color_primary = sequence_header_.color_config.color_primary;
+  buffer_.transfer_characteristics =
+      sequence_header_.color_config.transfer_characteristics;
+  buffer_.matrix_coefficients =
+      sequence_header_.color_config.matrix_coefficients;
 
   buffer_.bitdepth = yuv_buffer->bitdepth();
   const int num_planes =
@@ -347,8 +1149,8 @@
   for (; plane < num_planes; ++plane) {
     buffer_.stride[plane] = yuv_buffer->stride(plane);
     buffer_.plane[plane] = yuv_buffer->data(plane);
-    buffer_.displayed_width[plane] = yuv_buffer->displayed_width(plane);
-    buffer_.displayed_height[plane] = yuv_buffer->displayed_height(plane);
+    buffer_.displayed_width[plane] = yuv_buffer->width(plane);
+    buffer_.displayed_height[plane] = yuv_buffer->height(plane);
   }
   for (; plane < kMaxPlanes; ++plane) {
     buffer_.stride[plane] = 0;
@@ -356,9 +1158,11 @@
     buffer_.displayed_width[plane] = 0;
     buffer_.displayed_height[plane] = 0;
   }
+  buffer_.spatial_id = frame->spatial_id();
+  buffer_.temporal_id = frame->temporal_id();
   buffer_.buffer_private_data = frame->buffer_private_data();
   output_frame_ = frame;
-  return kLibgav1StatusOk;
+  return kStatusOk;
 }
 
 void DecoderImpl::ReleaseOutputFrame() {
@@ -368,336 +1172,458 @@
   output_frame_ = nullptr;
 }
 
-StatusCode DecoderImpl::DecodeTiles(const ObuParser* obu) {
-  if (PostFilter::DoDeblock(obu->frame_header(), settings_.post_filter_mask) &&
-      !loop_filter_mask_.Reset(obu->frame_header().width,
-                               obu->frame_header().height)) {
-    LIBGAV1_DLOG(ERROR, "Failed to allocate memory for loop filter masks.");
-    return kLibgav1StatusOutOfMemory;
-  }
-  LoopRestorationInfo loop_restoration_info(
-      obu->frame_header().loop_restoration, obu->frame_header().upscaled_width,
-      obu->frame_header().height,
-      obu->sequence_header().color_config.subsampling_x,
-      obu->sequence_header().color_config.subsampling_y,
-      obu->sequence_header().color_config.is_monochrome);
-  if (!loop_restoration_info.Allocate()) {
+StatusCode DecoderImpl::DecodeTiles(
+    const ObuSequenceHeader& sequence_header,
+    const ObuFrameHeader& frame_header, const Vector<TileBuffer>& tile_buffers,
+    const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer,
+    RefCountedBuffer* const current_frame) {
+  frame_scratch_buffer->tile_scratch_buffer_pool.Reset(
+      sequence_header.color_config.bitdepth);
+  if (!frame_scratch_buffer->loop_restoration_info.Reset(
+          &frame_header.loop_restoration, frame_header.upscaled_width,
+          frame_header.height, sequence_header.color_config.subsampling_x,
+          sequence_header.color_config.subsampling_y,
+          sequence_header.color_config.is_monochrome)) {
     LIBGAV1_DLOG(ERROR,
                  "Failed to allocate memory for loop restoration info units.");
-    return kLibgav1StatusOutOfMemory;
+    return kStatusOutOfMemory;
   }
-  if (!AllocateCurrentFrame(obu->frame_header())) {
+  const bool do_cdef =
+      PostFilter::DoCdef(frame_header, settings_.post_filter_mask);
+  const int num_planes = sequence_header.color_config.is_monochrome
+                             ? kMaxPlanesMonochrome
+                             : kMaxPlanes;
+  const bool do_restoration = PostFilter::DoRestoration(
+      frame_header.loop_restoration, settings_.post_filter_mask, num_planes);
+  const bool do_superres =
+      PostFilter::DoSuperRes(frame_header, settings_.post_filter_mask);
+  // Use kBorderPixels for the left, right, and top borders. Only the bottom
+  // border may need to be bigger. SuperRes border is needed only if we are
+  // applying SuperRes in-place which is being done only in single threaded
+  // mode.
+  const int bottom_border = GetBottomBorderPixels(
+      do_cdef, do_restoration,
+      do_superres &&
+          frame_scratch_buffer->threading_strategy.post_filter_thread_pool() ==
+              nullptr,
+      sequence_header.color_config.subsampling_y);
+  current_frame->set_chroma_sample_position(
+      sequence_header.color_config.chroma_sample_position);
+  if (!current_frame->Realloc(sequence_header.color_config.bitdepth,
+                              sequence_header.color_config.is_monochrome,
+                              frame_header.upscaled_width, frame_header.height,
+                              sequence_header.color_config.subsampling_x,
+                              sequence_header.color_config.subsampling_y,
+                              /*left_border=*/kBorderPixels,
+                              /*right_border=*/kBorderPixels,
+                              /*top_border=*/kBorderPixels, bottom_border)) {
     LIBGAV1_DLOG(ERROR, "Failed to allocate memory for the decoder buffer.");
-    return kLibgav1StatusOutOfMemory;
+    return kStatusOutOfMemory;
   }
-  Array2D<int16_t> cdef_index;
-  if (obu->sequence_header().enable_cdef) {
-    if (!cdef_index.Reset(
-            DivideBy16(obu->frame_header().rows4x4 + kMaxBlockHeight4x4),
-            DivideBy16(obu->frame_header().columns4x4 + kMaxBlockWidth4x4))) {
+  if (sequence_header.enable_cdef) {
+    if (!frame_scratch_buffer->cdef_index.Reset(
+            DivideBy16(frame_header.rows4x4 + kMaxBlockHeight4x4),
+            DivideBy16(frame_header.columns4x4 + kMaxBlockWidth4x4),
+            /*zero_initialize=*/false)) {
       LIBGAV1_DLOG(ERROR, "Failed to allocate memory for cdef index.");
-      return kLibgav1StatusOutOfMemory;
+      return kStatusOutOfMemory;
     }
   }
-  if (!inter_transform_sizes_.Reset(
-          obu->frame_header().rows4x4 + kMaxBlockHeight4x4,
-          obu->frame_header().columns4x4 + kMaxBlockWidth4x4,
+  if (!frame_scratch_buffer->inter_transform_sizes.Reset(
+          frame_header.rows4x4 + kMaxBlockHeight4x4,
+          frame_header.columns4x4 + kMaxBlockWidth4x4,
           /*zero_initialize=*/false)) {
     LIBGAV1_DLOG(ERROR, "Failed to allocate memory for inter_transform_sizes.");
-    return kLibgav1StatusOutOfMemory;
+    return kStatusOutOfMemory;
   }
-  if (obu->frame_header().use_ref_frame_mvs &&
-      !state_.motion_field_mv.Reset(DivideBy2(obu->frame_header().rows4x4),
-                                    DivideBy2(obu->frame_header().columns4x4),
-                                    /*zero_initialize=*/false)) {
-    LIBGAV1_DLOG(ERROR,
-                 "Failed to allocate memory for temporal motion vectors.");
-    return kLibgav1StatusOutOfMemory;
+  if (frame_header.use_ref_frame_mvs) {
+    if (!frame_scratch_buffer->motion_field.mv.Reset(
+            DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
+            /*zero_initialize=*/false) ||
+        !frame_scratch_buffer->motion_field.reference_offset.Reset(
+            DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
+            /*zero_initialize=*/false)) {
+      LIBGAV1_DLOG(ERROR,
+                   "Failed to allocate memory for temporal motion vectors.");
+      return kStatusOutOfMemory;
+    }
+
+    // For each motion vector, only mv[0] needs to be initialized to
+    // kInvalidMvValue, mv[1] is not necessary to be initialized and can be
+    // set to an arbitrary value. For simplicity, mv[1] is set to 0.
+    // The following memory initialization of contiguous memory is very fast. It
+    // is not recommended to make the initialization multi-threaded, unless the
+    // memory which needs to be initialized in each thread is still contiguous.
+    MotionVector invalid_mv;
+    invalid_mv.mv[0] = kInvalidMvValue;
+    invalid_mv.mv[1] = 0;
+    MotionVector* const motion_field_mv =
+        &frame_scratch_buffer->motion_field.mv[0][0];
+    std::fill(motion_field_mv,
+              motion_field_mv + frame_scratch_buffer->motion_field.mv.size(),
+              invalid_mv);
   }
 
   // The addition of kMaxBlockHeight4x4 and kMaxBlockWidth4x4 is necessary so
   // that the block parameters cache can be filled in for the last row/column
   // without having to check for boundary conditions.
-  BlockParametersHolder block_parameters_holder(
-      obu->frame_header().rows4x4 + kMaxBlockHeight4x4,
-      obu->frame_header().columns4x4 + kMaxBlockWidth4x4,
-      obu->sequence_header().use_128x128_superblock);
-  if (!block_parameters_holder.Init()) {
-    return kLibgav1StatusOutOfMemory;
+  if (!frame_scratch_buffer->block_parameters_holder.Reset(
+          frame_header.rows4x4 + kMaxBlockHeight4x4,
+          frame_header.columns4x4 + kMaxBlockWidth4x4,
+          sequence_header.use_128x128_superblock)) {
+    return kStatusOutOfMemory;
   }
   const dsp::Dsp* const dsp =
-      dsp::GetDspTable(obu->sequence_header().color_config.bitdepth);
+      dsp::GetDspTable(sequence_header.color_config.bitdepth);
   if (dsp == nullptr) {
     LIBGAV1_DLOG(ERROR, "Failed to get the dsp table for bitdepth %d.",
-                 obu->sequence_header().color_config.bitdepth);
-    return kLibgav1StatusInternalError;
-  }
-  // If prev_segment_ids is a null pointer, it is treated as if it pointed to
-  // a segmentation map containing all 0s.
-  const SegmentationMap* prev_segment_ids = nullptr;
-  if (obu->frame_header().primary_reference_frame == kPrimaryReferenceNone) {
-    symbol_decoder_context_.Initialize(
-        obu->frame_header().quantizer.base_index);
-  } else {
-    const int index =
-        obu->frame_header()
-            .reference_frame_index[obu->frame_header().primary_reference_frame];
-    const RefCountedBuffer* prev_frame = state_.reference_frame[index].get();
-    symbol_decoder_context_ = prev_frame->FrameContext();
-    if (obu->frame_header().segmentation.enabled &&
-        prev_frame->columns4x4() == obu->frame_header().columns4x4 &&
-        prev_frame->rows4x4() == obu->frame_header().rows4x4) {
-      prev_segment_ids = prev_frame->segmentation_map();
-    }
+                 sequence_header.color_config.bitdepth);
+    return kStatusInternalError;
   }
 
-  const uint8_t tile_size_bytes = obu->frame_header().tile_info.tile_size_bytes;
-  const int tile_count = obu->tile_groups().back().end + 1;
+  const int tile_count = frame_header.tile_info.tile_count;
   assert(tile_count >= 1);
   Vector<std::unique_ptr<Tile>> tiles;
   if (!tiles.reserve(tile_count)) {
     LIBGAV1_DLOG(ERROR, "tiles.reserve(%d) failed.\n", tile_count);
-    return kLibgav1StatusOutOfMemory;
+    return kStatusOutOfMemory;
   }
-  if (!threading_strategy_.Reset(obu->frame_header(), settings_.threads)) {
-    return kLibgav1StatusOutOfMemory;
+  ThreadingStrategy& threading_strategy =
+      frame_scratch_buffer->threading_strategy;
+  if (!is_frame_parallel_ &&
+      !threading_strategy.Reset(frame_header, settings_.threads)) {
+    return kStatusOutOfMemory;
   }
 
-  if (threading_strategy_.row_thread_pool(0) != nullptr) {
-    if (residual_buffer_pool_ == nullptr) {
-      residual_buffer_pool_.reset(new (std::nothrow) ResidualBufferPool(
-          obu->sequence_header().use_128x128_superblock,
-          obu->sequence_header().color_config.subsampling_x,
-          obu->sequence_header().color_config.subsampling_y,
-          obu->sequence_header().color_config.bitdepth == 8 ? sizeof(int16_t)
-                                                            : sizeof(int32_t)));
-      if (residual_buffer_pool_ == nullptr) {
+  if (threading_strategy.row_thread_pool(0) != nullptr || is_frame_parallel_) {
+    if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
+      frame_scratch_buffer->residual_buffer_pool.reset(
+          new (std::nothrow) ResidualBufferPool(
+              sequence_header.use_128x128_superblock,
+              sequence_header.color_config.subsampling_x,
+              sequence_header.color_config.subsampling_y,
+              sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
+                                                         : sizeof(int32_t)));
+      if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
         LIBGAV1_DLOG(ERROR, "Failed to allocate residual buffer.\n");
-        return kLibgav1StatusOutOfMemory;
+        return kStatusOutOfMemory;
       }
     } else {
-      residual_buffer_pool_->Reset(
-          obu->sequence_header().use_128x128_superblock,
-          obu->sequence_header().color_config.subsampling_x,
-          obu->sequence_header().color_config.subsampling_y,
-          obu->sequence_header().color_config.bitdepth == 8 ? sizeof(int16_t)
-                                                            : sizeof(int32_t));
+      frame_scratch_buffer->residual_buffer_pool->Reset(
+          sequence_header.use_128x128_superblock,
+          sequence_header.color_config.subsampling_x,
+          sequence_header.color_config.subsampling_y,
+          sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
+                                                     : sizeof(int32_t));
     }
   }
 
-  const bool do_cdef =
-      PostFilter::DoCdef(obu->frame_header(), settings_.post_filter_mask);
-  const int num_planes = obu->sequence_header().color_config.is_monochrome
-                             ? kMaxPlanesMonochrome
-                             : kMaxPlanes;
-  const bool do_restoration =
-      PostFilter::DoRestoration(obu->frame_header().loop_restoration,
-                                settings_.post_filter_mask, num_planes);
-  if (threading_strategy_.post_filter_thread_pool() != nullptr &&
+  if (threading_strategy.post_filter_thread_pool() != nullptr &&
       (do_cdef || do_restoration)) {
     const int window_buffer_width = PostFilter::GetWindowBufferWidth(
-        threading_strategy_.post_filter_thread_pool(), obu->frame_header());
+        threading_strategy.post_filter_thread_pool(), frame_header);
     size_t threaded_window_buffer_size =
         window_buffer_width *
         PostFilter::GetWindowBufferHeight(
-            threading_strategy_.post_filter_thread_pool(),
-            obu->frame_header()) *
-        (obu->sequence_header().color_config.bitdepth == 8 ? sizeof(uint8_t)
-                                                           : sizeof(uint16_t));
-    if (do_cdef && !do_restoration) {
+            threading_strategy.post_filter_thread_pool(), frame_header) *
+        (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
+                                                    : sizeof(uint16_t));
+    if (do_cdef) {
       // TODO(chengchen): for cdef U, V planes, if there's subsampling, we can
       // use smaller buffer.
       threaded_window_buffer_size *= num_planes;
     }
-    if (threaded_window_buffer_size_ < threaded_window_buffer_size) {
-      // threaded_window_buffer_ will be subdivided by PostFilter into windows
-      // of width 512 pixels. Each row in the window is filtered by a worker
-      // thread. To avoid false sharing, each 512-pixel row processed by one
-      // thread should not share a cache line with a row processed by another
-      // thread. So we align threaded_window_buffer_ to the cache line size.
-      // In addition, it is faster to memcpy from an aligned buffer.
-      //
-      // On Linux, the cache line size can be looked up with the command:
-      //   getconf LEVEL1_DCACHE_LINESIZE
-      //
-      // The cache line size should ideally be queried at run time. 64 is a
-      // common cache line size of x86 CPUs. Web searches showed the cache line
-      // size of ARM CPUs is 32 or 64 bytes. So aligning to 64-byte boundary
-      // will work for all CPUs that we care about, even though it is excessive
-      // for some ARM CPUs.
-      constexpr size_t kCacheLineSize = 64;
-      // To avoid false sharing, PostFilter's window width in bytes should also
-      // be a multiple of the cache line size. For simplicity, we check the
-      // window width in pixels.
-      assert(window_buffer_width % kCacheLineSize == 0);
-      threaded_window_buffer_ = MakeAlignedUniquePtr<uint8_t>(
-          kCacheLineSize, threaded_window_buffer_size);
-      if (threaded_window_buffer_ == nullptr) {
-        LIBGAV1_DLOG(ERROR,
-                     "Failed to allocate threaded loop restoration buffer.\n");
-        threaded_window_buffer_size_ = 0;
-        return kLibgav1StatusOutOfMemory;
-      }
-      threaded_window_buffer_size_ = threaded_window_buffer_size;
+    // To avoid false sharing, PostFilter's window width in bytes should be a
+    // multiple of the cache line size. For simplicity, we check the window
+    // width in pixels.
+    assert(window_buffer_width % kCacheLineSize == 0);
+    if (!frame_scratch_buffer->threaded_window_buffer.Resize(
+            threaded_window_buffer_size)) {
+      LIBGAV1_DLOG(ERROR,
+                   "Failed to resize threaded loop restoration buffer.\n");
+      return kStatusOutOfMemory;
     }
   }
 
-  PostFilter post_filter(
-      obu->frame_header(), obu->sequence_header(), &loop_filter_mask_,
-      cdef_index, &loop_restoration_info, &block_parameters_holder,
-      state_.current_frame->buffer(), dsp,
-      threading_strategy_.post_filter_thread_pool(),
-      threaded_window_buffer_.get(), settings_.post_filter_mask);
-  SymbolDecoderContext saved_symbol_decoder_context;
-  int tile_index = 0;
-  BlockingCounterWithStatus pending_tiles(tile_count);
-  for (const auto& tile_group : obu->tile_groups()) {
-    size_t bytes_left = tile_group.data_size;
-    size_t byte_offset = 0;
-    // The for loop in 5.11.1.
-    for (int tile_number = tile_group.start; tile_number <= tile_group.end;
-         ++tile_number) {
-      size_t tile_size = 0;
-      if (tile_number != tile_group.end) {
-        RawBitReader bit_reader(tile_group.data + byte_offset, bytes_left);
-        if (!bit_reader.ReadLittleEndian(tile_size_bytes, &tile_size)) {
-          LIBGAV1_DLOG(ERROR, "Could not read tile size for tile #%d",
-                       tile_number);
-          return kLibgav1StatusBitstreamError;
-        }
-        ++tile_size;
-        byte_offset += tile_size_bytes;
-        bytes_left -= tile_size_bytes;
-        if (tile_size > bytes_left) {
-          LIBGAV1_DLOG(ERROR, "Invalid tile size %zu for tile #%d", tile_size,
-                       tile_number);
-          return kLibgav1StatusBitstreamError;
-        }
-      } else {
-        tile_size = bytes_left;
-      }
-
-      std::unique_ptr<Tile> tile(new (std::nothrow) Tile(
-          tile_number, tile_group.data + byte_offset, tile_size,
-          obu->sequence_header(), obu->frame_header(),
-          state_.current_frame.get(), state_.reference_frame_sign_bias,
-          state_.reference_frame, &state_.motion_field_mv,
-          state_.reference_order_hint, state_.wedge_masks,
-          symbol_decoder_context_, &saved_symbol_decoder_context,
-          prev_segment_ids, &post_filter, &block_parameters_holder, &cdef_index,
-          &inter_transform_sizes_, dsp,
-          threading_strategy_.row_thread_pool(tile_index++),
-          residual_buffer_pool_.get(), &decoder_scratch_buffer_pool_,
-          &pending_tiles));
-      if (tile == nullptr) {
-        LIBGAV1_DLOG(ERROR, "Failed to allocate tile.");
-        return kLibgav1StatusOutOfMemory;
-      }
-      tiles.push_back_unchecked(std::move(tile));
-
-      byte_offset += tile_size;
-      bytes_left -= tile_size;
+  if (do_cdef && do_restoration) {
+    // We need to store 4 rows per 64x64 unit.
+    const int num_deblock_units = MultiplyBy4(Ceil(frame_header.rows4x4, 16));
+    // subsampling_y is set to zero irrespective of the actual frame's
+    // subsampling since we need to store exactly |num_deblock_units| rows of
+    // the deblocked pixels.
+    if (!frame_scratch_buffer->deblock_buffer.Realloc(
+            sequence_header.color_config.bitdepth,
+            sequence_header.color_config.is_monochrome,
+            frame_header.upscaled_width, num_deblock_units,
+            sequence_header.color_config.subsampling_x,
+            /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+            kBorderPixels, nullptr, nullptr, nullptr)) {
+      return kStatusOutOfMemory;
     }
   }
+
+  if (do_superres) {
+    const int num_threads =
+        1 + ((threading_strategy.post_filter_thread_pool() == nullptr)
+                 ? 0
+                 : threading_strategy.post_filter_thread_pool()->num_threads());
+    const size_t superres_line_buffer_size =
+        num_threads *
+        (MultiplyBy4(frame_header.columns4x4) +
+         MultiplyBy2(kSuperResHorizontalBorder) + kSuperResHorizontalPadding) *
+        (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
+                                                    : sizeof(uint16_t));
+    if (!frame_scratch_buffer->superres_line_buffer.Resize(
+            superres_line_buffer_size)) {
+      LIBGAV1_DLOG(ERROR, "Failed to resize superres line buffer.\n");
+      return kStatusOutOfMemory;
+    }
+  }
+
+  PostFilter post_filter(frame_header, sequence_header, frame_scratch_buffer,
+                         current_frame->buffer(), dsp,
+                         settings_.post_filter_mask);
+
+  if (is_frame_parallel_) {
+    // We can parse the current frame if all the reference frames have been
+    // parsed.
+    for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+      if (!state.reference_valid[i] || state.reference_frame[i] == nullptr) {
+        continue;
+      }
+      if (!state.reference_frame[i]->WaitUntilParsed()) {
+        return kStatusUnknownError;
+      }
+    }
+  }
+
+  // If prev_segment_ids is a null pointer, it is treated as if it pointed to
+  // a segmentation map containing all 0s.
+  const SegmentationMap* prev_segment_ids = nullptr;
+  if (frame_header.primary_reference_frame == kPrimaryReferenceNone) {
+    frame_scratch_buffer->symbol_decoder_context.Initialize(
+        frame_header.quantizer.base_index);
+  } else {
+    const int index =
+        frame_header
+            .reference_frame_index[frame_header.primary_reference_frame];
+    assert(index != -1);
+    const RefCountedBuffer* prev_frame = state.reference_frame[index].get();
+    frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext();
+    if (frame_header.segmentation.enabled &&
+        prev_frame->columns4x4() == frame_header.columns4x4 &&
+        prev_frame->rows4x4() == frame_header.rows4x4) {
+      prev_segment_ids = prev_frame->segmentation_map();
+    }
+  }
+
+  // The Tile class must make use of a separate buffer to store the unfiltered
+  // pixels for the intra prediction of the next superblock row. This is done
+  // only when one of the following conditions are true:
+  //   * is_frame_parallel_ is true.
+  //   * settings_.threads == 1.
+  // In the non-frame-parallel multi-threaded case, we do not run the post
+  // filters in the decode loop. So this buffer need not be used.
+  const bool use_intra_prediction_buffer =
+      is_frame_parallel_ || settings_.threads == 1;
+  if (use_intra_prediction_buffer) {
+    if (!frame_scratch_buffer->intra_prediction_buffers.Resize(
+            frame_header.tile_info.tile_rows)) {
+      LIBGAV1_DLOG(ERROR, "Failed to Resize intra_prediction_buffers.");
+      return kStatusOutOfMemory;
+    }
+    IntraPredictionBuffer* const intra_prediction_buffers =
+        frame_scratch_buffer->intra_prediction_buffers.get();
+    for (int plane = 0; plane < num_planes; ++plane) {
+      const int subsampling =
+          (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x;
+      const size_t intra_prediction_buffer_size =
+          ((MultiplyBy4(frame_header.columns4x4) >> subsampling) *
+           (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
+                                                       : sizeof(uint16_t)));
+      for (int tile_row = 0; tile_row < frame_header.tile_info.tile_rows;
+           ++tile_row) {
+        if (!intra_prediction_buffers[tile_row][plane].Resize(
+                intra_prediction_buffer_size)) {
+          LIBGAV1_DLOG(ERROR,
+                       "Failed to allocate intra prediction buffer for tile "
+                       "row %d plane %d.\n",
+                       tile_row, plane);
+          return kStatusOutOfMemory;
+        }
+      }
+    }
+  }
+
+  SymbolDecoderContext saved_symbol_decoder_context;
+  BlockingCounterWithStatus pending_tiles(tile_count);
+  for (int tile_number = 0; tile_number < tile_count; ++tile_number) {
+    std::unique_ptr<Tile> tile = Tile::Create(
+        tile_number, tile_buffers[tile_number].data,
+        tile_buffers[tile_number].size, sequence_header, frame_header,
+        current_frame, state, frame_scratch_buffer, wedge_masks_,
+        &saved_symbol_decoder_context, prev_segment_ids, &post_filter, dsp,
+        threading_strategy.row_thread_pool(tile_number), &pending_tiles,
+        is_frame_parallel_, use_intra_prediction_buffer);
+    if (tile == nullptr) {
+      LIBGAV1_DLOG(ERROR, "Failed to create tile.");
+      return kStatusOutOfMemory;
+    }
+    tiles.push_back_unchecked(std::move(tile));
+  }
   assert(tiles.size() == static_cast<size_t>(tile_count));
-  bool tile_decoding_failed = false;
-  if (threading_strategy_.tile_thread_pool() == nullptr) {
-    for (const auto& tile_ptr : tiles) {
-      if (!tile_decoding_failed) {
-        if (!tile_ptr->Decode(/*is_main_thread=*/true)) {
-          LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
-          tile_decoding_failed = true;
-        }
-      } else {
-        pending_tiles.Decrement(false);
-      }
+  if (is_frame_parallel_) {
+    if (frame_scratch_buffer->threading_strategy.thread_pool() == nullptr) {
+      return DecodeTilesFrameParallel(
+          sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+          prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
     }
+    return DecodeTilesThreadedFrameParallel(
+        sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+        prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
+  }
+  StatusCode status;
+  if (settings_.threads == 1) {
+    status = DecodeTilesNonFrameParallel(sequence_header, frame_header, tiles,
+                                         frame_scratch_buffer, &post_filter);
   } else {
-    const int num_workers = threading_strategy_.tile_thread_count();
-    BlockingCounterWithStatus pending_workers(num_workers);
-    std::atomic<int> tile_counter(0);
-    // Submit tile decoding jobs to the thread pool.
-    for (int i = 0; i < num_workers; ++i) {
-      threading_strategy_.tile_thread_pool()->Schedule(
-          [&tiles, tile_count, &tile_counter, &pending_workers,
-           &pending_tiles]() {
-            bool failed = false;
-            int index;
-            while ((index = tile_counter.fetch_add(
-                        1, std::memory_order_relaxed)) < tile_count) {
-              if (!failed) {
-                const auto& tile_ptr = tiles[index];
-                if (!tile_ptr->Decode(/*is_main_thread=*/false)) {
-                  LIBGAV1_DLOG(ERROR, "Error decoding tile #%d",
-                               tile_ptr->number());
-                  failed = true;
-                }
-              } else {
-                pending_tiles.Decrement(false);
-              }
-            }
-            pending_workers.Decrement(!failed);
-          });
-    }
-    // Have the current thread partake in tile decoding.
-    int index;
-    while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
-           tile_count) {
-      if (!tile_decoding_failed) {
-        const auto& tile_ptr = tiles[index];
-        if (!tile_ptr->Decode(/*is_main_thread=*/true)) {
-          LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
-          tile_decoding_failed = true;
-        }
-      } else {
-        pending_tiles.Decrement(false);
-      }
-    }
-    // Wait until all the workers are done. This ensures that all the tiles have
-    // been parsed.
-    tile_decoding_failed |= !pending_workers.Wait();
+    status = DecodeTilesThreadedNonFrameParallel(tiles, frame_scratch_buffer,
+                                                 &post_filter, &pending_tiles);
   }
-  // Wait until all the tiles have been decoded.
-  tile_decoding_failed |= !pending_tiles.Wait();
-
-  // At this point, all the tiles have been parsed and decoded and the
-  // threadpool will be empty.
-  if (tile_decoding_failed) return kLibgav1StatusUnknownError;
-
-  if (obu->frame_header().enable_frame_end_update_cdf) {
-    symbol_decoder_context_ = saved_symbol_decoder_context;
+  if (status != kStatusOk) return status;
+  if (frame_header.enable_frame_end_update_cdf) {
+    frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
   }
-  state_.current_frame->SetFrameContext(symbol_decoder_context_);
-  if (post_filter.DoDeblock()) {
-    loop_filter_mask_.Build(obu->sequence_header(), obu->frame_header(),
-                            obu->tile_groups().front().start,
-                            obu->tile_groups().back().end,
-                            block_parameters_holder, inter_transform_sizes_);
-  }
-  if (!post_filter.ApplyFiltering()) {
-    LIBGAV1_DLOG(ERROR, "Error applying in-loop filtering.");
-    return kLibgav1StatusUnknownError;
-  }
-  SetCurrentFrameSegmentationMap(obu->frame_header(), prev_segment_ids);
-  return kLibgav1StatusOk;
+  current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+  SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+  return kStatusOk;
 }
 
-void DecoderImpl::SetCurrentFrameSegmentationMap(
+StatusCode DecoderImpl::ApplyFilmGrain(
+    const ObuSequenceHeader& sequence_header,
     const ObuFrameHeader& frame_header,
-    const SegmentationMap* prev_segment_ids) {
-  if (!frame_header.segmentation.enabled) {
-    // All segment_id's are 0.
-    state_.current_frame->segmentation_map()->Clear();
-  } else if (!frame_header.segmentation.update_map) {
-    // Copy from prev_segment_ids.
-    if (prev_segment_ids == nullptr) {
-      // Treat a null prev_segment_ids pointer as if it pointed to a
-      // segmentation map containing all 0s.
-      state_.current_frame->segmentation_map()->Clear();
-    } else {
-      state_.current_frame->segmentation_map()->CopyFrom(*prev_segment_ids);
-    }
+    const RefCountedBufferPtr& displayable_frame,
+    RefCountedBufferPtr* film_grain_frame, ThreadPool* thread_pool) {
+  if (!sequence_header.film_grain_params_present ||
+      !displayable_frame->film_grain_params().apply_grain ||
+      (settings_.post_filter_mask & 0x10) == 0) {
+    *film_grain_frame = displayable_frame;
+    return kStatusOk;
   }
+  if (!frame_header.show_existing_frame &&
+      frame_header.refresh_frame_flags == 0) {
+    // If show_existing_frame is true, then the current frame is a previously
+    // saved reference frame. If refresh_frame_flags is nonzero, then the
+    // state_.UpdateReferenceFrames() call above has saved the current frame as
+    // a reference frame. Therefore, if both of these conditions are false, then
+    // the current frame is not saved as a reference frame. displayable_frame
+    // should hold the only reference to the current frame.
+    assert(displayable_frame.use_count() == 1);
+    // Add film grain noise in place.
+    *film_grain_frame = displayable_frame;
+  } else {
+    *film_grain_frame = buffer_pool_.GetFreeBuffer();
+    if (*film_grain_frame == nullptr) {
+      LIBGAV1_DLOG(ERROR,
+                   "Could not get film_grain_frame from the buffer pool.");
+      return kStatusResourceExhausted;
+    }
+    if (!(*film_grain_frame)
+             ->Realloc(displayable_frame->buffer()->bitdepth(),
+                       displayable_frame->buffer()->is_monochrome(),
+                       displayable_frame->upscaled_width(),
+                       displayable_frame->frame_height(),
+                       displayable_frame->buffer()->subsampling_x(),
+                       displayable_frame->buffer()->subsampling_y(),
+                       kBorderPixelsFilmGrain, kBorderPixelsFilmGrain,
+                       kBorderPixelsFilmGrain, kBorderPixelsFilmGrain)) {
+      LIBGAV1_DLOG(ERROR, "film_grain_frame->Realloc() failed.");
+      return kStatusOutOfMemory;
+    }
+    (*film_grain_frame)
+        ->set_chroma_sample_position(
+            displayable_frame->chroma_sample_position());
+    (*film_grain_frame)->set_spatial_id(displayable_frame->spatial_id());
+    (*film_grain_frame)->set_temporal_id(displayable_frame->temporal_id());
+  }
+  const bool color_matrix_is_identity =
+      sequence_header.color_config.matrix_coefficients ==
+      kMatrixCoefficientsIdentity;
+  assert(displayable_frame->buffer()->stride(kPlaneU) ==
+         displayable_frame->buffer()->stride(kPlaneV));
+  const int input_stride_uv = displayable_frame->buffer()->stride(kPlaneU);
+  assert((*film_grain_frame)->buffer()->stride(kPlaneU) ==
+         (*film_grain_frame)->buffer()->stride(kPlaneV));
+  const int output_stride_uv = (*film_grain_frame)->buffer()->stride(kPlaneU);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+  if (displayable_frame->buffer()->bitdepth() > 8) {
+    FilmGrain<10> film_grain(displayable_frame->film_grain_params(),
+                             displayable_frame->buffer()->is_monochrome(),
+                             color_matrix_is_identity,
+                             displayable_frame->buffer()->subsampling_x(),
+                             displayable_frame->buffer()->subsampling_y(),
+                             displayable_frame->upscaled_width(),
+                             displayable_frame->frame_height(), thread_pool);
+    if (!film_grain.AddNoise(
+            displayable_frame->buffer()->data(kPlaneY),
+            displayable_frame->buffer()->stride(kPlaneY),
+            displayable_frame->buffer()->data(kPlaneU),
+            displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+            (*film_grain_frame)->buffer()->data(kPlaneY),
+            (*film_grain_frame)->buffer()->stride(kPlaneY),
+            (*film_grain_frame)->buffer()->data(kPlaneU),
+            (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+      LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+      return kStatusOutOfMemory;
+    }
+    return kStatusOk;
+  }
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+  FilmGrain<8> film_grain(displayable_frame->film_grain_params(),
+                          displayable_frame->buffer()->is_monochrome(),
+                          color_matrix_is_identity,
+                          displayable_frame->buffer()->subsampling_x(),
+                          displayable_frame->buffer()->subsampling_y(),
+                          displayable_frame->upscaled_width(),
+                          displayable_frame->frame_height(), thread_pool);
+  if (!film_grain.AddNoise(
+          displayable_frame->buffer()->data(kPlaneY),
+          displayable_frame->buffer()->stride(kPlaneY),
+          displayable_frame->buffer()->data(kPlaneU),
+          displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+          (*film_grain_frame)->buffer()->data(kPlaneY),
+          (*film_grain_frame)->buffer()->stride(kPlaneY),
+          (*film_grain_frame)->buffer()->data(kPlaneU),
+          (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+    LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+    return kStatusOutOfMemory;
+  }
+  return kStatusOk;
+}
+
+bool DecoderImpl::IsNewSequenceHeader(const ObuParser& obu) {
+  if (std::find_if(obu.obu_headers().begin(), obu.obu_headers().end(),
+                   [](const ObuHeader& obu_header) {
+                     return obu_header.type == kObuSequenceHeader;
+                   }) == obu.obu_headers().end()) {
+    return false;
+  }
+  const ObuSequenceHeader sequence_header = obu.sequence_header();
+  const bool sequence_header_changed =
+      !has_sequence_header_ ||
+      sequence_header_.color_config.bitdepth !=
+          sequence_header.color_config.bitdepth ||
+      sequence_header_.color_config.is_monochrome !=
+          sequence_header.color_config.is_monochrome ||
+      sequence_header_.color_config.subsampling_x !=
+          sequence_header.color_config.subsampling_x ||
+      sequence_header_.color_config.subsampling_y !=
+          sequence_header.color_config.subsampling_y ||
+      sequence_header_.max_frame_width != sequence_header.max_frame_width ||
+      sequence_header_.max_frame_height != sequence_header.max_frame_height;
+  sequence_header_ = sequence_header;
+  has_sequence_header_ = true;
+  return sequence_header_changed;
 }
 
 }  // namespace libgav1
diff --git a/libgav1/src/decoder_impl.h b/libgav1/src/decoder_impl.h
index 18026f7..df1b091 100644
--- a/libgav1/src/decoder_impl.h
+++ b/libgav1/src/decoder_impl.h
@@ -18,23 +18,26 @@
 #define LIBGAV1_SRC_DECODER_IMPL_H_
 
 #include <array>
+#include <condition_variable>  // NOLINT (unapproved c++11 header)
 #include <cstddef>
 #include <cstdint>
 #include <memory>
+#include <mutex>  // NOLINT (unapproved c++11 header)
 
 #include "src/buffer_pool.h"
-#include "src/decoder_buffer.h"
-#include "src/decoder_settings.h"
+#include "src/decoder_state.h"
 #include "src/dsp/constants.h"
-#include "src/loop_filter_mask.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/decoder_settings.h"
+#include "src/gav1/status_code.h"
 #include "src/obu_parser.h"
 #include "src/residual_buffer_pool.h"
-#include "src/status_code.h"
 #include "src/symbol_decoder_context.h"
-#include "src/threading_strategy.h"
 #include "src/tile.h"
 #include "src/utils/array_2d.h"
 #include "src/utils/block_parameters_holder.h"
+#include "src/utils/compiler_attributes.h"
 #include "src/utils/constants.h"
 #include "src/utils/memory.h"
 #include "src/utils/queue.h"
@@ -43,69 +46,85 @@
 
 namespace libgav1 {
 
-struct EncodedFrame : public Allocable {
-  // The default constructor is invoked by the Queue<EncodedFrame>::Init()
+struct TemporalUnit;
+
+struct EncodedFrame {
+  EncodedFrame(ObuParser* const obu, const DecoderState& state,
+               const RefCountedBufferPtr& frame, int position_in_temporal_unit)
+      : sequence_header(obu->sequence_header()),
+        frame_header(obu->frame_header()),
+        state(state),
+        temporal_unit(nullptr),
+        frame(frame),
+        position_in_temporal_unit(position_in_temporal_unit) {
+    obu->MoveTileBuffer(&tile_buffers);
+    frame->MarkFrameAsStarted();
+  }
+
+  const ObuSequenceHeader sequence_header;
+  const ObuFrameHeader frame_header;
+  Vector<TileBuffer> tile_buffers;
+  DecoderState state;
+  TemporalUnit* temporal_unit;
+  RefCountedBufferPtr frame;
+  const int position_in_temporal_unit;
+};
+
+struct TemporalUnit : public Allocable {
+  // The default constructor is invoked by the Queue<TemporalUnit>::Init()
   // method. Queue<> does not use the default-constructed elements, so it is
   // safe for the default constructor to not initialize the members.
-  EncodedFrame() = default;
-  EncodedFrame(const uint8_t* data, size_t size, int64_t user_private_data)
-      : data(data), size(size), user_private_data(user_private_data) {}
+  TemporalUnit() = default;
+  TemporalUnit(const uint8_t* data, size_t size, int64_t user_private_data,
+               void* buffer_private_data)
+      : data(data),
+        size(size),
+        user_private_data(user_private_data),
+        buffer_private_data(buffer_private_data),
+        decoded(false),
+        status(kStatusOk),
+        has_displayable_frame(false),
+        output_frame_position(-1),
+        decoded_count(0),
+        output_layer_count(0),
+        released_input_buffer(false) {}
 
   const uint8_t* data;
   size_t size;
   int64_t user_private_data;
-};
+  void* buffer_private_data;
 
-struct DecoderState {
-  // Section 7.20. Updates frames in the reference_frame array with
-  // current_frame, based on the refresh_frame_flags bitmask.
-  void UpdateReferenceFrames(int refresh_frame_flags);
+  // The following members are used only in frame parallel mode.
+  bool decoded;
+  StatusCode status;
+  bool has_displayable_frame;
+  int output_frame_position;
 
-  // Clears all the reference frames.
-  void ClearReferenceFrames();
+  Vector<EncodedFrame> frames;
+  size_t decoded_count;
 
-  ObuSequenceHeader sequence_header = {};
-  // If true, sequence_header is valid.
-  bool has_sequence_header = false;
-  // reference_valid and reference_frame_id are used only if
-  // sequence_header_.frame_id_numbers_present is true.
-  // The reference_valid array is indexed by a reference picture slot number.
-  // A value (boolean) in the array signifies whether the corresponding
-  // reference picture slot is valid for use as a reference picture.
-  std::array<bool, kNumReferenceFrameTypes> reference_valid = {};
-  std::array<uint16_t, kNumReferenceFrameTypes> reference_frame_id = {};
-  // A valid value of current_frame_id is an unsigned integer of at most 16
-  // bits. -1 indicates current_frame_id is not initialized.
-  int current_frame_id = -1;
-  // The RefOrderHint array variable in the spec.
-  std::array<uint8_t, kNumReferenceFrameTypes> reference_order_hint = {};
-  // The OrderHint variable in the spec. Its value comes from either the
-  // order_hint syntax element in the uncompressed header (if
-  // show_existing_frame is false) or RefOrderHint[ frame_to_show_map_idx ]
-  // (if show_existing_frame is true and frame_type is KEY_FRAME). See Section
-  // 5.9.2 and Section 7.4.
-  //
-  // NOTE: When show_existing_frame is false, it is often more convenient to
-  // just use the order_hint field of the frame header as OrderHint. So this
-  // field is mainly used to update the reference_order_hint array in
-  // UpdateReferenceFrames().
-  uint8_t order_hint = 0;
-  // reference_frame_sign_bias[i] (a boolean) specifies the intended direction
-  // of the motion vector in time for each reference frame.
-  // * |false| indicates that the reference frame is a forwards reference (i.e.
-  //   the reference frame is expected to be output before the current frame);
-  // * |true| indicates that the reference frame is a backwards reference.
-  // Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not used.
-  std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {};
-  std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame;
-  RefCountedBufferPtr current_frame;
-  // wedge_master_mask has to be initialized to zero.
-  std::array<uint8_t, 6 * kWedgeMaskMasterSize* kWedgeMaskMasterSize>
-      wedge_master_mask = {};
-  // TODO(chengchen): It is possible to reduce the buffer size. Because wedge
-  // mask sizes are 8x8, 8x16, ..., 32x32. This buffer size can fit 32x32.
-  std::array<uint8_t, kWedgeMaskSize> wedge_masks = {};
-  Array2D<TemporalMotionVector> motion_field_mv;
+  // The struct (and the counter) is used to support output of multiple layers
+  // within a single temporal unit. The decoding process will store the output
+  // frames in |output_layers| in the order they are finished decoding. At the
+  // end of the decoding process, this array will be sorted in reverse order of
+  // |position_in_temporal_unit|. DequeueFrame() will then return the frames in
+  // reverse order (so that the entire process can run with a single counter
+  // variable).
+  struct OutputLayer {
+    // Used by std::sort to sort |output_layers| in reverse order of
+    // |position_in_temporal_unit|.
+    bool operator<(const OutputLayer& rhs) const {
+      return position_in_temporal_unit > rhs.position_in_temporal_unit;
+    }
+
+    RefCountedBufferPtr frame;
+    int position_in_temporal_unit = 0;
+  } output_layers[kMaxLayers];
+  // Number of entries in |output_layers|.
+  int output_layer_count;
+  // Flag to ensure that we release the input buffer only once if there are
+  // multiple output layers.
+  bool released_input_buffer;
 };
 
 class DecoderImpl : public Allocable {
@@ -118,51 +137,121 @@
                            std::unique_ptr<DecoderImpl>* output);
   ~DecoderImpl();
   StatusCode EnqueueFrame(const uint8_t* data, size_t size,
-                          int64_t user_private_data);
+                          int64_t user_private_data, void* buffer_private_data);
   StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
   static constexpr int GetMaxBitdepth() {
-#if LIBGAV1_MAX_BITDEPTH >= 10
-    return 10;
-#else
-    return 8;
-#endif
+    static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10,
+                  "LIBGAV1_MAX_BITDEPTH must be 8 or 10.");
+    return LIBGAV1_MAX_BITDEPTH;
   }
 
  private:
   explicit DecoderImpl(const DecoderSettings* settings);
   StatusCode Init();
-  bool AllocateCurrentFrame(const ObuFrameHeader& frame_header);
+  // Called when the first frame is enqueued. It does the OBU parsing for one
+  // temporal unit to retrieve the tile configuration and sets up the frame
+  // threading if frame parallel mode is allowed. It also initializes the
+  // |temporal_units_| queue based on the number of frame threads.
+  //
+  // The following are the limitations of the current implementation:
+  //  * It assumes that all frames in the video have the same tile
+  //    configuration. The frame parallel threading model will not be updated
+  //    based on tile configuration changes mid-stream.
+  //  * The above assumption holds true even when there is a new coded video
+  //    sequence (i.e.) a new sequence header.
+  StatusCode InitializeFrameThreadPoolAndTemporalUnitQueue(const uint8_t* data,
+                                                           size_t size);
+  // Used only in frame parallel mode. Signals failure and waits until the
+  // worker threads are aborted if |status| is a failure status. If |status| is
+  // equal to kStatusOk or kStatusTryAgain, this function does not do anything.
+  // Always returns the input parameter |status| as the return value.
+  //
+  // This function is called only from the application thread (from
+  // EnqueueFrame() and DequeueFrame()).
+  StatusCode SignalFailure(StatusCode status);
+
   void ReleaseOutputFrame();
-  // Populates buffer_ with values from |frame|. Adds a reference to |frame|
-  // in output_frame_.
+
+  // Decodes all the frames contained in the given temporal unit. Used only in
+  // non frame parallel mode.
+  StatusCode DecodeTemporalUnit(const TemporalUnit& temporal_unit,
+                                const DecoderBuffer** out_ptr);
+  // Used only in frame parallel mode. Does the OBU parsing for |data| and
+  // schedules the individual frames for decoding in the |frame_thread_pool_|.
+  StatusCode ParseAndSchedule(const uint8_t* data, size_t size,
+                              int64_t user_private_data,
+                              void* buffer_private_data);
+  // Decodes the |encoded_frame| and updates the
+  // |encoded_frame->temporal_unit|'s parameters if the decoded frame is a
+  // displayable frame. Used only in frame parallel mode.
+  StatusCode DecodeFrame(EncodedFrame* encoded_frame);
+
+  // Populates |buffer_| with values from |frame|. Adds a reference to |frame|
+  // in |output_frame_|.
   StatusCode CopyFrameToOutputBuffer(const RefCountedBufferPtr& frame);
-  StatusCode DecodeTiles(const ObuParser* obu);
-  // Sets the current frame's segmentation map for two cases. The third case
-  // is handled in Tile::DecodeBlock().
-  void SetCurrentFrameSegmentationMap(const ObuFrameHeader& frame_header,
-                                      const SegmentationMap* prev_segment_ids);
+  StatusCode DecodeTiles(const ObuSequenceHeader& sequence_header,
+                         const ObuFrameHeader& frame_header,
+                         const Vector<TileBuffer>& tile_buffers,
+                         const DecoderState& state,
+                         FrameScratchBuffer* frame_scratch_buffer,
+                         RefCountedBuffer* current_frame);
+  // Applies film grain synthesis to the |displayable_frame| and stores the film
+  // grain applied frame into |film_grain_frame|. Returns kStatusOk on success.
+  StatusCode ApplyFilmGrain(const ObuSequenceHeader& sequence_header,
+                            const ObuFrameHeader& frame_header,
+                            const RefCountedBufferPtr& displayable_frame,
+                            RefCountedBufferPtr* film_grain_frame,
+                            ThreadPool* thread_pool);
 
-  Queue<EncodedFrame> encoded_frames_;
+  bool IsNewSequenceHeader(const ObuParser& obu);
+
+  bool HasFailure() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return failure_status_ != kStatusOk;
+  }
+
+  // Elements in this queue cannot be moved with std::move since the
+  // |EncodedFrame.temporal_unit| stores a pointer to elements in this queue.
+  Queue<TemporalUnit> temporal_units_;
   DecoderState state_;
-  ThreadingStrategy threading_strategy_;
-  SymbolDecoderContext symbol_decoder_context_;
 
-  // TODO(vigneshv): Only support one buffer for now. Eventually this has to be
-  // a vector or an array.
   DecoderBuffer buffer_ = {};
-  // output_frame_ holds a reference to the output frame on behalf of buffer_.
+  // |output_frame_| holds a reference to the output frame on behalf of
+  // |buffer_|.
   RefCountedBufferPtr output_frame_;
 
-  BufferPool buffer_pool_;
-  std::unique_ptr<ResidualBufferPool> residual_buffer_pool_;
-  AlignedUniquePtr<uint8_t> threaded_window_buffer_;
-  size_t threaded_window_buffer_size_ = 0;
-  Array2D<TransformSize> inter_transform_sizes_;
-  DecoderScratchBufferPool decoder_scratch_buffer_pool_;
+  // Queue of output frames that are to be returned in the DequeueFrame() calls.
+  // If |settings_.output_all_layers| is false, this queue will never contain
+  // more than 1 element. This queue is used only when |is_frame_parallel_| is
+  // false.
+  Queue<RefCountedBufferPtr> output_frame_queue_;
 
-  LoopFilterMask loop_filter_mask_;
+  BufferPool buffer_pool_;
+  WedgeMaskArray wedge_masks_;
+  FrameScratchBufferPool frame_scratch_buffer_pool_;
+
+  // Used to synchronize the accesses into |temporal_units_| in order to update
+  // the "decoded" state of an temporal unit.
+  std::mutex mutex_;
+  std::condition_variable decoded_condvar_;
+  bool is_frame_parallel_;
+  std::unique_ptr<ThreadPool> frame_thread_pool_;
+
+  // In frame parallel mode, there are two primary points of failure:
+  //  1) ParseAndSchedule()
+  //  2) DecodeTiles()
+  // Both of these functions have to respond to the other one failing by
+  // aborting whatever they are doing. This variable is used to accomplish that.
+  // If |failure_status_| is not kStatusOk, then the two functions will try to
+  // abort as early as they can.
+  StatusCode failure_status_ = kStatusOk LIBGAV1_GUARDED_BY(mutex_);
+
+  ObuSequenceHeader sequence_header_ = {};
+  // If true, sequence_header is valid.
+  bool has_sequence_header_ = false;
 
   const DecoderSettings& settings_;
+  bool seen_first_frame_ = false;
 };
 
 }  // namespace libgav1
diff --git a/libgav1/src/decoder_scratch_buffer.h b/libgav1/src/decoder_scratch_buffer.h
deleted file mode 100644
index 54ee1b7..0000000
--- a/libgav1/src/decoder_scratch_buffer.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright 2019 The libgav1 Authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBGAV1_SRC_DECODER_SCRATCH_BUFFER_H_
-#define LIBGAV1_SRC_DECODER_SCRATCH_BUFFER_H_
-
-#include <cstdint>
-#include <mutex>  // NOLINT (unapproved c++11 header)
-
-#include "src/dsp/constants.h"
-#include "src/utils/compiler_attributes.h"
-#include "src/utils/constants.h"
-#include "src/utils/memory.h"
-#include "src/utils/stack.h"
-
-namespace libgav1 {
-
-// Buffer to facilitate decoding a superblock.
-struct DecoderScratchBuffer : public Allocable {
-  static constexpr int kBlockDecodedStride = 34;
-
- private:
-#if LIBGAV1_MAX_BITDEPTH >= 10
-  static constexpr int kPixelSize = 2;
-#else
-  static constexpr int kPixelSize = 1;
-#endif
-
- public:
-  // The following prediction modes need a prediction mask:
-  // kCompoundPredictionTypeDiffWeighted, kCompoundPredictionTypeWedge,
-  // kCompoundPredictionTypeIntra. They are mutually exclusive. This buffer is
-  // used to store the prediction mask during the inter prediction process. The
-  // mask only needs to be created for the Y plane and is used for the U & V
-  // planes.
-  alignas(kMaxAlignment) uint8_t
-      prediction_mask[kMaxSuperBlockSizeSquareInPixels];
-
-  // For each instance of the DecoderScratchBuffer, only one of the following
-  // buffers will be used at any given time, so it is ok to share them in a
-  // union.
-  union {
-    // Union usage note: This is used only by functions in the "inter"
-    // prediction path.
-    //
-    // Buffers used for inter prediction process.
-    alignas(kMaxAlignment) uint16_t
-        prediction_buffer[2][kMaxSuperBlockSizeSquareInPixels];
-
-    struct {
-      // Union usage note: This is used only by functions in the "intra"
-      // prediction path.
-      //
-      // Buffer used for storing subsampled luma samples needed for CFL
-      // prediction. This buffer is used to avoid repetition of the subsampling
-      // for the V plane when it is already done for the U plane.
-      int16_t cfl_luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
-
-      // Union usage note: This is used only by the
-      // Tile::ReadTransformCoefficients() function (and the helper functions
-      // that it calls). This cannot be shared with |cfl_luma_buffer| since
-      // |cfl_luma_buffer| has to live across the 3 plane loop in
-      // Tile::TransformBlock.
-      //
-      // Buffer used by Tile::ReadTransformCoefficients() to store the quantized
-      // coefficients until the dequantization process is performed.
-      int32_t quantized_buffer[kQuantizedCoefficientBufferSize];
-    };
-  };
-
-  // Buffer used for convolve. The maximum size required for this buffer is:
-  //  maximum block height (with scaling) = 2 * 128 = 256.
-  //  maximum block stride (with scaling and border aligned to 16) =
-  //     (2 * 128 + 7 + 9) * pixel_size = 272 * pixel_size.
-  alignas(kMaxAlignment) uint8_t
-      convolve_block_buffer[256 * 272 * DecoderScratchBuffer::kPixelSize];
-
-  // Flag indicating whether the data in |cfl_luma_buffer| is valid.
-  bool cfl_luma_buffer_valid;
-
-  // Equivalent to BlockDecoded array in the spec. This stores the decoded
-  // state of every 4x4 block in a superblock. It has 1 row/column border on
-  // all 4 sides (hence the 34x34 dimension instead of 32x32). Note that the
-  // spec uses "-1" as an index to access the left and top borders. In the
-  // code, we treat the index (1, 1) as equivalent to the spec's (0, 0). So
-  // all accesses into this array will be offset by +1 when compared with the
-  // spec.
-  bool block_decoded[kMaxPlanes][kBlockDecodedStride][kBlockDecodedStride];
-};
-
-class DecoderScratchBufferPool {
- public:
-  std::unique_ptr<DecoderScratchBuffer> Get() {
-    std::lock_guard<std::mutex> lock(mutex_);
-    if (buffers_.Empty()) {
-      std::unique_ptr<DecoderScratchBuffer> scratch_buffer(
-          new (std::nothrow) DecoderScratchBuffer);
-      return scratch_buffer;
-    }
-    return buffers_.Pop();
-  }
-
-  void Release(std::unique_ptr<DecoderScratchBuffer> scratch_buffer) {
-    std::lock_guard<std::mutex> lock(mutex_);
-    buffers_.Push(std::move(scratch_buffer));
-  }
-
- private:
-  std::mutex mutex_;
-  // We will never need more than kMaxThreads scratch buffers since that is the
-  // maximum amount of work that will be done at any given time.
-  Stack<std::unique_ptr<DecoderScratchBuffer>, kMaxThreads> buffers_
-      LIBGAV1_GUARDED_BY(mutex_);
-};
-
-}  // namespace libgav1
-
-#endif  // LIBGAV1_SRC_DECODER_SCRATCH_BUFFER_H_
diff --git a/libgav1/src/decoder_settings.cc b/libgav1/src/decoder_settings.cc
new file mode 100644
index 0000000..9399073
--- /dev/null
+++ b/libgav1/src/decoder_settings.cc
@@ -0,0 +1,33 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder_settings.h"
+
+extern "C" {
+
+void Libgav1DecoderSettingsInitDefault(Libgav1DecoderSettings* settings) {
+  settings->threads = 1;
+  settings->frame_parallel = 0;    // false
+  settings->blocking_dequeue = 0;  // false
+  settings->on_frame_buffer_size_changed = nullptr;
+  settings->get_frame_buffer = nullptr;
+  settings->release_frame_buffer = nullptr;
+  settings->release_input_buffer = nullptr;
+  settings->callback_private_data = nullptr;
+  settings->output_all_layers = 0;  // false
+  settings->operating_point = 0;
+  settings->post_filter_mask = 0x1f;
+}
+
+}  // extern "C"
diff --git a/libgav1/src/decoder_settings.h b/libgav1/src/decoder_settings.h
deleted file mode 100644
index 6c6f21d..0000000
--- a/libgav1/src/decoder_settings.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright 2019 The libgav1 Authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBGAV1_SRC_DECODER_SETTINGS_H_
-#define LIBGAV1_SRC_DECODER_SETTINGS_H_
-
-#include <cstdint>
-
-#include "src/frame_buffer.h"
-
-// All the declarations in this file are part of the public ABI.
-
-namespace libgav1 {
-
-// Applications must populate this structure before creating a decoder instance.
-struct DecoderSettings {
-  // Number of threads to use when decoding. Must be greater than 0. The
-  // library will create at most |threads|-1 new threads, the calling thread is
-  // considered part of the library's thread count. Defaults to 1 (no new
-  // threads will be created).
-  int threads = 1;
-  // Do frame parallel decoding.
-  bool frame_parallel = false;
-  // Get frame buffer callback.
-  GetFrameBufferCallback get = nullptr;
-  // Release frame buffer callback.
-  ReleaseFrameBufferCallback release = nullptr;
-  // Passed as the private_data argument to the callbacks.
-  void* callback_private_data = nullptr;
-  // Mask indicating the post processing filters that need to be applied to the
-  // reconstructed frame. From LSB:
-  //   Bit 0: Loop filter (deblocking filter).
-  //   Bit 1: Cdef.
-  //   Bit 2: Superres.
-  //   Bit 3: Loop restoration.
-  //   Bit 4: Film grain synthesis.
-  //   All the bits other than the last 5 are ignored.
-  uint8_t post_filter_mask = 0x1f;
-};
-
-}  // namespace libgav1
-#endif  // LIBGAV1_SRC_DECODER_SETTINGS_H_
diff --git a/libgav1/src/decoder_state.h b/libgav1/src/decoder_state.h
new file mode 100644
index 0000000..897c99f
--- /dev/null
+++ b/libgav1/src/decoder_state.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_STATE_H_
+#define LIBGAV1_SRC_DECODER_STATE_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/buffer_pool.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+struct DecoderState {
+  // Section 7.20. Updates frames in the reference_frame array with
+  // |current_frame|, based on the |refresh_frame_flags| bitmask.
+  void UpdateReferenceFrames(const RefCountedBufferPtr& current_frame,
+                             int refresh_frame_flags) {
+    for (int ref_index = 0, mask = refresh_frame_flags; mask != 0;
+         ++ref_index, mask >>= 1) {
+      if ((mask & 1) != 0) {
+        reference_valid[ref_index] = true;
+        reference_frame_id[ref_index] = current_frame_id;
+        reference_frame[ref_index] = current_frame;
+        reference_order_hint[ref_index] = order_hint;
+      }
+    }
+  }
+
+  // Clears all the reference frames.
+  void ClearReferenceFrames() {
+    reference_valid = {};
+    reference_frame_id = {};
+    reference_order_hint = {};
+    for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) {
+      reference_frame[ref_index] = nullptr;
+    }
+  }
+
+  // reference_valid and reference_frame_id are used only if
+  // sequence_header_.frame_id_numbers_present is true.
+  // The reference_valid array is indexed by a reference picture slot number.
+  // A value (boolean) in the array signifies whether the corresponding
+  // reference picture slot is valid for use as a reference picture.
+  std::array<bool, kNumReferenceFrameTypes> reference_valid = {};
+  std::array<uint16_t, kNumReferenceFrameTypes> reference_frame_id = {};
+  // A valid value of current_frame_id is an unsigned integer of at most 16
+  // bits. -1 indicates current_frame_id is not initialized.
+  int current_frame_id = -1;
+  // The RefOrderHint array variable in the spec.
+  std::array<uint8_t, kNumReferenceFrameTypes> reference_order_hint = {};
+  // The OrderHint variable in the spec. Its value comes from either the
+  // order_hint syntax element in the uncompressed header (if
+  // show_existing_frame is false) or RefOrderHint[ frame_to_show_map_idx ]
+  // (if show_existing_frame is true and frame_type is KEY_FRAME). See Section
+  // 5.9.2 and Section 7.4.
+  //
+  // NOTE: When show_existing_frame is false, it is often more convenient to
+  // just use the order_hint field of the frame header as OrderHint. So this
+  // field is mainly used to update the reference_order_hint array in
+  // UpdateReferenceFrames().
+  uint8_t order_hint = 0;
+  // reference_frame_sign_bias[i] (a boolean) specifies the intended direction
+  // of the motion vector in time for each reference frame.
+  // * |false| indicates that the reference frame is a forwards reference (i.e.
+  //   the reference frame is expected to be output before the current frame);
+  // * |true| indicates that the reference frame is a backwards reference.
+  // Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not used.
+  std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {};
+  std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame;
+};
+
+}  // namespace libgav1
+
+#endif  // LIBGAV1_SRC_DECODER_STATE_H_
diff --git a/libgav1/src/dsp/arm/average_blend_neon.cc b/libgav1/src/dsp/arm/average_blend_neon.cc
index 94fad54..d946d70 100644
--- a/libgav1/src/dsp/arm/average_blend_neon.cc
+++ b/libgav1/src/dsp/arm/average_blend_neon.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "src/dsp/average_blend.h"
-#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
 
 #if LIBGAV1_ENABLE_NEON
 
@@ -24,83 +24,61 @@
 #include <cstdint>
 
 #include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
 #include "src/utils/common.h"
 
 namespace libgav1 {
 namespace dsp {
 namespace {
 
-constexpr int kBitdepth8 = 8;
-constexpr int kInterPostRoundBit = 4;
-// An offset to cancel offsets used in compound predictor generation that
-// make intermediate computations non negative.
-const int16x8_t kCompoundRoundOffset =
-    vdupq_n_s16((2 << (kBitdepth8 + 4)) + (2 << (kBitdepth8 + 3)));
+constexpr int kInterPostRoundBit =
+    kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
 
-inline void AverageBlend4Row(const uint16_t* prediction_0,
-                             const uint16_t* prediction_1, uint8_t* dest) {
-  const int16x4_t pred0 = vreinterpret_s16_u16(vld1_u16(prediction_0));
-  const int16x4_t pred1 = vreinterpret_s16_u16(vld1_u16(prediction_1));
-  int16x4_t res = vadd_s16(pred0, pred1);
-  res = vsub_s16(res, vget_low_s16(kCompoundRoundOffset));
-  StoreLo4(dest,
-           vqrshrun_n_s16(vcombine_s16(res, res), kInterPostRoundBit + 1));
+inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0,
+                                  const int16_t* prediction_1) {
+  const int16x8_t pred0 = vld1q_s16(prediction_0);
+  const int16x8_t pred1 = vld1q_s16(prediction_1);
+  const int16x8_t res = vaddq_s16(pred0, pred1);
+  return vqrshrun_n_s16(res, kInterPostRoundBit + 1);
 }
 
-inline void AverageBlend8Row(const uint16_t* prediction_0,
-                             const uint16_t* prediction_1, uint8_t* dest) {
-  const int16x8_t pred0 = vreinterpretq_s16_u16(vld1q_u16(prediction_0));
-  const int16x8_t pred1 = vreinterpretq_s16_u16(vld1q_u16(prediction_1));
-  int16x8_t res = vaddq_s16(pred0, pred1);
-  res = vsubq_s16(res, kCompoundRoundOffset);
-  vst1_u8(dest, vqrshrun_n_s16(res, kInterPostRoundBit + 1));
-}
-
-inline void AverageBlendLargeRow(const uint16_t* prediction_0,
-                                 const uint16_t* prediction_1, const int width,
+inline void AverageBlendLargeRow(const int16_t* prediction_0,
+                                 const int16_t* prediction_1, const int width,
                                  uint8_t* dest) {
   int x = 0;
   do {
-    const int16x8_t pred_00 =
-        vreinterpretq_s16_u16(vld1q_u16(&prediction_0[x]));
-    const int16x8_t pred_01 =
-        vreinterpretq_s16_u16(vld1q_u16(&prediction_1[x]));
-    int16x8_t res0 = vaddq_s16(pred_00, pred_01);
-    res0 = vsubq_s16(res0, kCompoundRoundOffset);
+    const int16x8_t pred_00 = vld1q_s16(&prediction_0[x]);
+    const int16x8_t pred_01 = vld1q_s16(&prediction_1[x]);
+    const int16x8_t res0 = vaddq_s16(pred_00, pred_01);
     const uint8x8_t res_out0 = vqrshrun_n_s16(res0, kInterPostRoundBit + 1);
-    const int16x8_t pred_10 =
-        vreinterpretq_s16_u16(vld1q_u16(&prediction_0[x + 8]));
-    const int16x8_t pred_11 =
-        vreinterpretq_s16_u16(vld1q_u16(&prediction_1[x + 8]));
-    int16x8_t res1 = vaddq_s16(pred_10, pred_11);
-    res1 = vsubq_s16(res1, kCompoundRoundOffset);
+    const int16x8_t pred_10 = vld1q_s16(&prediction_0[x + 8]);
+    const int16x8_t pred_11 = vld1q_s16(&prediction_1[x + 8]);
+    const int16x8_t res1 = vaddq_s16(pred_10, pred_11);
     const uint8x8_t res_out1 = vqrshrun_n_s16(res1, kInterPostRoundBit + 1);
     vst1q_u8(dest + x, vcombine_u8(res_out0, res_out1));
     x += 16;
   } while (x < width);
 }
 
-void AverageBlend_NEON(const uint16_t* prediction_0,
-                       const ptrdiff_t prediction_stride_0,
-                       const uint16_t* prediction_1,
-                       const ptrdiff_t prediction_stride_1, const int width,
-                       const int height, void* const dest,
+void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
+                       const int width, const int height, void* const dest,
                        const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
   int y = height;
 
   if (width == 4) {
     do {
-      AverageBlend4Row(prediction_0, prediction_1, dst);
-      dst += dest_stride;
-      prediction_0 += prediction_stride_0;
-      prediction_1 += prediction_stride_1;
+      const uint8x8_t result = AverageBlend8Row(pred_0, pred_1);
+      pred_0 += 8;
+      pred_1 += 8;
 
-      AverageBlend4Row(prediction_0, prediction_1, dst);
+      StoreLo4(dst, result);
       dst += dest_stride;
-      prediction_0 += prediction_stride_0;
-      prediction_1 += prediction_stride_1;
-
+      StoreHi4(dst, result);
+      dst += dest_stride;
       y -= 2;
     } while (y != 0);
     return;
@@ -108,15 +86,15 @@
 
   if (width == 8) {
     do {
-      AverageBlend8Row(prediction_0, prediction_1, dst);
+      vst1_u8(dst, AverageBlend8Row(pred_0, pred_1));
       dst += dest_stride;
-      prediction_0 += prediction_stride_0;
-      prediction_1 += prediction_stride_1;
+      pred_0 += 8;
+      pred_1 += 8;
 
-      AverageBlend8Row(prediction_0, prediction_1, dst);
+      vst1_u8(dst, AverageBlend8Row(pred_0, pred_1));
       dst += dest_stride;
-      prediction_0 += prediction_stride_0;
-      prediction_1 += prediction_stride_1;
+      pred_0 += 8;
+      pred_1 += 8;
 
       y -= 2;
     } while (y != 0);
@@ -124,22 +102,22 @@
   }
 
   do {
-    AverageBlendLargeRow(prediction_0, prediction_1, width, dst);
+    AverageBlendLargeRow(pred_0, pred_1, width, dst);
     dst += dest_stride;
-    prediction_0 += prediction_stride_0;
-    prediction_1 += prediction_stride_1;
+    pred_0 += width;
+    pred_1 += width;
 
-    AverageBlendLargeRow(prediction_0, prediction_1, width, dst);
+    AverageBlendLargeRow(pred_0, pred_1, width, dst);
     dst += dest_stride;
-    prediction_0 += prediction_stride_0;
-    prediction_1 += prediction_stride_1;
+    pred_0 += width;
+    pred_1 += width;
 
     y -= 2;
   } while (y != 0);
 }
 
 void Init8bpp() {
-  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);
   dsp->average_blend = AverageBlend_NEON;
 }
@@ -151,7 +129,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else   // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
 
 namespace libgav1 {
 namespace dsp {
diff --git a/libgav1/src/dsp/arm/average_blend_neon.h b/libgav1/src/dsp/arm/average_blend_neon.h
index 569da64..d13bcd6 100644
--- a/libgav1/src/dsp/arm/average_blend_neon.h
+++ b/libgav1/src/dsp/arm/average_blend_neon.h
@@ -17,8 +17,8 @@
 #ifndef LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
 #define LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
 
-#include "src/dsp/cpu.h"
 #include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
 
 namespace libgav1 {
 namespace dsp {
@@ -30,7 +30,7 @@
 }  // namespace libgav1
 
 #if LIBGAV1_ENABLE_NEON
-#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_NEON
 #endif  // LIBGAV1_ENABLE_NEON
 
 #endif  // LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
diff --git a/libgav1/src/dsp/arm/cdef_neon.cc b/libgav1/src/dsp/arm/cdef_neon.cc
new file mode 100644
index 0000000..968b0ff
--- /dev/null
+++ b/libgav1/src/dsp/arm/cdef_neon.cc
@@ -0,0 +1,697 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+//   for (int j = 0; j < 8; ++j) {
+//     const int x = 1;
+//     partial[0][i + j] += x;
+//     partial[1][i + j / 2] += x;
+//     partial[2][i] += x;
+//     partial[3][3 + i - j / 2] += x;
+//     partial[4][7 + i - j] += x;
+//     partial[5][3 - i / 2 + j] += x;
+//     partial[6][j] += x;
+//     partial[7][i / 2 + j] += x;
+//   }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16  17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25  26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34  35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43  44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52  53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61  62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70  71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(uint8x8_t* v_src,
+                                            uint16x8_t* partial_lo,
+                                            uint16x8_t* partial_hi) {
+  const uint8x8_t v_zero = vdup_n_u8(0);
+  // 00 01 02 03 04 05 06 07
+  // 00 10 11 12 13 14 15 16
+  *partial_lo = vaddl_u8(v_src[0], vext_u8(v_zero, v_src[1], 7));
+
+  // 00 00 20 21 22 23 24 25
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[2], 6));
+  // 17 00 00 00 00 00 00 00
+  // 26 27 00 00 00 00 00 00
+  *partial_hi =
+      vaddl_u8(vext_u8(v_src[1], v_zero, 7), vext_u8(v_src[2], v_zero, 6));
+
+  // 00 00 00 30 31 32 33 34
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[3], 5));
+  // 35 36 37 00 00 00 00 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[3], v_zero, 5));
+
+  // 00 00 00 00 40 41 42 43
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[4], 4));
+  // 44 45 46 47 00 00 00 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[4], v_zero, 4));
+
+  // 00 00 00 00 00 50 51 52
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[5], 3));
+  // 53 54 55 56 57 00 00 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[5], v_zero, 3));
+
+  // 00 00 00 00 00 00 60 61
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[6], 2));
+  // 62 63 64 65 66 67 00 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[6], v_zero, 2));
+
+  // 00 00 00 00 00 00 00 70
+  *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[7], 1));
+  // 71 72 73 74 75 76 77 00
+  *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[7], v_zero, 1));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00  00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00  00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00  00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00  00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3  00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2  F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1  G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0  H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(uint8x8_t* v_src,
+                                            uint16x8_t* partial_lo,
+                                            uint16x8_t* partial_hi) {
+  uint8x16_t v_d1_temp[8];
+  const uint8x8_t v_zero = vdup_n_u8(0);
+  const uint8x16_t v_zero_16 = vdupq_n_u8(0);
+
+  for (int i = 0; i < 8; ++i) {
+    v_d1_temp[i] = vcombine_u8(v_src[i], v_zero);
+  }
+
+  *partial_lo = *partial_hi = vdupq_n_u16(0);
+  // A0 A1 A2 A3 00 00 00 00
+  *partial_lo = vpadalq_u8(*partial_lo, v_d1_temp[0]);
+
+  // 00 B0 B1 B2 B3 00 00 00
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[1], 14));
+
+  // 00 00 C0 C1 C2 C3 00 00
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[2], 12));
+  // 00 00 00 D0 D1 D2 D3 00
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[3], 10));
+  // 00 00 00 00 E0 E1 E2 E3
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[4], 8));
+
+  // 00 00 00 00 00 F0 F1 F2
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[5], 6));
+  // F3 00 00 00 00 00 00 00
+  *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[5], v_zero_16, 6));
+
+  // 00 00 00 00 00 00 G0 G1
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[6], 4));
+  // G2 G3 00 00 00 00 00 00
+  *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[6], v_zero_16, 4));
+
+  // 00 00 00 00 00 00 00 H0
+  *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[7], 2));
+  // H1 H2 H3 00 00 00 00 00
+  *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[7], v_zero_16, 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17  00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26  27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36  37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45  46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55  56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64  65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74  75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(uint8x8_t* v_src,
+                                            uint16x8_t* partial_lo,
+                                            uint16x8_t* partial_hi) {
+  const uint16x8_t v_zero = vdupq_n_u16(0);
+  uint16x8_t v_pair_add[4];
+  // Add vertical source pairs.
+  v_pair_add[0] = vaddl_u8(v_src[0], v_src[1]);
+  v_pair_add[1] = vaddl_u8(v_src[2], v_src[3]);
+  v_pair_add[2] = vaddl_u8(v_src[4], v_src[5]);
+  v_pair_add[3] = vaddl_u8(v_src[6], v_src[7]);
+
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  *partial_lo = v_pair_add[0];
+  // 00 00 00 00 00 00 00 00
+  // 00 00 00 00 00 00 00 00
+  *partial_hi = vdupq_n_u16(0);
+
+  // 00 20 21 22 23 24 25 26
+  // 00 30 31 32 33 34 35 36
+  *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[1], 7));
+  // 27 00 00 00 00 00 00 00
+  // 37 00 00 00 00 00 00 00
+  *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[1], v_zero, 7));
+
+  // 00 00 40 41 42 43 44 45
+  // 00 00 50 51 52 53 54 55
+  *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[2], 6));
+  // 46 47 00 00 00 00 00 00
+  // 56 57 00 00 00 00 00 00
+  *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[2], v_zero, 6));
+
+  // 00 00 00 60 61 62 63 64
+  // 00 00 00 70 71 72 73 74
+  *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[3], 5));
+  // 65 66 67 00 00 00 00 00
+  // 75 76 77 00 00 00 00 00
+  *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[3], v_zero, 5));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source,
+                                      ptrdiff_t stride, uint16x8_t* partial_lo,
+                                      uint16x8_t* partial_hi) {
+  const auto* src = static_cast<const uint8_t*>(source);
+
+  // 8x8 input
+  // 00 01 02 03 04 05 06 07
+  // 10 11 12 13 14 15 16 17
+  // 20 21 22 23 24 25 26 27
+  // 30 31 32 33 34 35 36 37
+  // 40 41 42 43 44 45 46 47
+  // 50 51 52 53 54 55 56 57
+  // 60 61 62 63 64 65 66 67
+  // 70 71 72 73 74 75 76 77
+  uint8x8_t v_src[8];
+  for (int i = 0; i < 8; ++i) {
+    v_src[i] = vld1_u8(src);
+    src += stride;
+  }
+
+  // partial for direction 2
+  // --------------------------------------------------------------------------
+  // partial[2][i] += x;
+  // 00 10 20 30 40 50 60 70  00 00 00 00 00 00 00 00
+  // 01 11 21 31 41 51 61 71  00 00 00 00 00 00 00 00
+  // 02 12 22 32 42 52 62 72  00 00 00 00 00 00 00 00
+  // 03 13 23 33 43 53 63 73  00 00 00 00 00 00 00 00
+  // 04 14 24 34 44 54 64 74  00 00 00 00 00 00 00 00
+  // 05 15 25 35 45 55 65 75  00 00 00 00 00 00 00 00
+  // 06 16 26 36 46 56 66 76  00 00 00 00 00 00 00 00
+  // 07 17 27 37 47 57 67 77  00 00 00 00 00 00 00 00
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), partial_lo[2], 0);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[4]), partial_lo[2], 4);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[5]), partial_lo[2], 5);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[6]), partial_lo[2], 6);
+  partial_lo[2] = vsetq_lane_u16(SumVector(v_src[7]), partial_lo[2], 7);
+
+  // partial for direction 6
+  // --------------------------------------------------------------------------
+  // partial[6][j] += x;
+  // 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00 00
+  // 10 11 12 13 14 15 16 17  00 00 00 00 00 00 00 00
+  // 20 21 22 23 24 25 26 27  00 00 00 00 00 00 00 00
+  // 30 31 32 33 34 35 36 37  00 00 00 00 00 00 00 00
+  // 40 41 42 43 44 45 46 47  00 00 00 00 00 00 00 00
+  // 50 51 52 53 54 55 56 57  00 00 00 00 00 00 00 00
+  // 60 61 62 63 64 65 66 67  00 00 00 00 00 00 00 00
+  // 70 71 72 73 74 75 76 77  00 00 00 00 00 00 00 00
+  const uint8x8_t v_zero = vdup_n_u8(0);
+  partial_lo[6] = vaddl_u8(v_zero, v_src[0]);
+  for (int i = 1; i < 8; ++i) {
+    partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]);
+  }
+
+  // partial for direction 0
+  AddPartial_D0_D4(v_src, &partial_lo[0], &partial_hi[0]);
+
+  // partial for direction 1
+  AddPartial_D1_D3(v_src, &partial_lo[1], &partial_hi[1]);
+
+  // partial for direction 7
+  AddPartial_D5_D7(v_src, &partial_lo[7], &partial_hi[7]);
+
+  uint8x8_t v_src_reverse[8];
+  for (int i = 0; i < 8; ++i) {
+    v_src_reverse[i] = vrev64_u8(v_src[i]);
+  }
+
+  // partial for direction 4
+  AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
+
+  // partial for direction 3
+  AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
+
+  // partial for direction 5
+  AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
+}
+
+uint32x4_t Square(uint16x4_t a) { return vmull_u16(a, a); }
+
+uint32x4_t SquareAccumulate(uint32x4_t a, uint16x4_t b) {
+  return vmlal_u16(a, b, b);
+}
+
+// |cost[0]| and |cost[4]| square the input and sum with the corresponding
+// element from the other end of the vector:
+// |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+//             kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
+// Because everything is being summed into a single value the distributive
+// property allows us to mirror the division table and accumulate once.
+uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b,
+                  const uint32x4_t division_table[4]) {
+  uint32x4_t c = vmulq_u32(Square(vget_low_u16(a)), division_table[0]);
+  c = vmlaq_u32(c, Square(vget_high_u16(a)), division_table[1]);
+  c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[2]);
+  c = vmlaq_u32(c, Square(vget_high_u16(b)), division_table[3]);
+  return SumVector(c);
+}
+
+// |cost[2]| and |cost[6]| square the input and accumulate:
+// cost[2] += Square(partial[2][i])
+uint32_t SquareAccumulate(const uint16x8_t a) {
+  uint32x4_t c = Square(vget_low_u16(a));
+  c = SquareAccumulate(c, vget_high_u16(a));
+  c = vmulq_n_u32(c, kCdefDivisionTable[7]);
+  return SumVector(c);
+}
+
+uint32_t CostOdd(const uint16x8_t a, const uint16x8_t b, const uint32x4_t mask,
+                 const uint32x4_t division_table[2]) {
+  // Remove elements 0-2.
+  uint32x4_t c = vandq_u32(mask, Square(vget_low_u16(a)));
+  c = vaddq_u32(c, Square(vget_high_u16(a)));
+  c = vmulq_n_u32(c, kCdefDivisionTable[7]);
+
+  c = vmlaq_u32(c, Square(vget_low_u16(a)), division_table[0]);
+  c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[1]);
+  return SumVector(c);
+}
+
+void CdefDirection_NEON(const void* const source, ptrdiff_t stride,
+                        int* const direction, int* const variance) {
+  assert(direction != nullptr);
+  assert(variance != nullptr);
+  const auto* src = static_cast<const uint8_t*>(source);
+  uint32_t cost[8];
+  uint16x8_t partial_lo[8], partial_hi[8];
+
+  AddPartial(src, stride, partial_lo, partial_hi);
+
+  cost[2] = SquareAccumulate(partial_lo[2]);
+  cost[6] = SquareAccumulate(partial_lo[6]);
+
+  const uint32x4_t division_table[4] = {
+      vld1q_u32(kCdefDivisionTable), vld1q_u32(kCdefDivisionTable + 4),
+      vld1q_u32(kCdefDivisionTable + 8), vld1q_u32(kCdefDivisionTable + 12)};
+
+  cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
+  cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
+
+  const uint32x4_t division_table_odd[2] = {
+      vld1q_u32(kCdefDivisionTableOdd), vld1q_u32(kCdefDivisionTableOdd + 4)};
+
+  const uint32x4_t element_3_mask = {0, 0, 0, static_cast<uint32_t>(-1)};
+
+  cost[1] =
+      CostOdd(partial_lo[1], partial_hi[1], element_3_mask, division_table_odd);
+  cost[3] =
+      CostOdd(partial_lo[3], partial_hi[3], element_3_mask, division_table_odd);
+  cost[5] =
+      CostOdd(partial_lo[5], partial_hi[5], element_3_mask, division_table_odd);
+  cost[7] =
+      CostOdd(partial_lo[7], partial_hi[7], element_3_mask, division_table_odd);
+
+  uint32_t best_cost = 0;
+  *direction = 0;
+  for (int i = 0; i < 8; ++i) {
+    if (cost[i] > best_cost) {
+      best_cost = cost[i];
+      *direction = i;
+    }
+  }
+  *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
+                   uint16x8_t* output, const int direction) {
+  // Each |direction| describes a different set of source values. Expand this
+  // set by negating each set. For |direction| == 0 this gives a diagonal line
+  // from top right to bottom left. The first value is y, the second x. Negative
+  // y values move up.
+  //    a       b         c       d
+  // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+  //         c
+  //       a
+  //     0
+  //   b
+  // d
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = vld1q_u16(src + y_0 * stride + x_0);
+  output[1] = vld1q_u16(src - y_0 * stride - x_0);
+  output[2] = vld1q_u16(src + y_1 * stride + x_1);
+  output[3] = vld1q_u16(src - y_1 * stride - x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
+                    uint16x8_t* output, const int direction) {
+  const int y_0 = kCdefDirections[direction][0][0];
+  const int x_0 = kCdefDirections[direction][0][1];
+  const int y_1 = kCdefDirections[direction][1][0];
+  const int x_1 = kCdefDirections[direction][1][1];
+  output[0] = vcombine_u16(vld1_u16(src + y_0 * stride + x_0),
+                           vld1_u16(src + y_0 * stride + stride + x_0));
+  output[1] = vcombine_u16(vld1_u16(src - y_0 * stride - x_0),
+                           vld1_u16(src - y_0 * stride + stride - x_0));
+  output[2] = vcombine_u16(vld1_u16(src + y_1 * stride + x_1),
+                           vld1_u16(src + y_1 * stride + stride + x_1));
+  output[3] = vcombine_u16(vld1_u16(src - y_1 * stride - x_1),
+                           vld1_u16(src - y_1 * stride + stride - x_1));
+}
+
+int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
+                    const uint16x8_t threshold, const int16x8_t damping) {
+  // If reference > pixel, the difference will be negative, so convert to 0 or
+  // -1.
+  const uint16x8_t sign = vcgtq_u16(reference, pixel);
+  const uint16x8_t abs_diff = vabdq_u16(pixel, reference);
+  const uint16x8_t shifted_diff = vshlq_u16(abs_diff, damping);
+  // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+  // [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always be
+  // larger than threshold. Subtract using saturation will return 0 when pixel
+  // == kCdefLargeValue.
+  static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+  const uint16x8_t thresh_minus_shifted_diff =
+      vqsubq_u16(threshold, shifted_diff);
+  const uint16x8_t clamp_abs_diff =
+      vminq_u16(thresh_minus_shifted_diff, abs_diff);
+  // Restore the sign.
+  return vreinterpretq_s16_u16(
+      vsubq_u16(veorq_u16(clamp_abs_diff, sign), sign));
+}
+
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_NEON(const uint16_t* src, const ptrdiff_t src_stride,
+                     const int height, const int primary_strength,
+                     const int secondary_strength, const int damping,
+                     const int direction, void* dest,
+                     const ptrdiff_t dst_stride) {
+  static_assert(width == 8 || width == 4, "");
+  static_assert(enable_primary || enable_secondary, "");
+  constexpr bool clipping_required = enable_primary && enable_secondary;
+  auto* dst = static_cast<uint8_t*>(dest);
+  const uint16x8_t cdef_large_value_mask =
+      vdupq_n_u16(static_cast<uint16_t>(~kCdefLargeValue));
+  const uint16x8_t primary_threshold = vdupq_n_u16(primary_strength);
+  const uint16x8_t secondary_threshold = vdupq_n_u16(secondary_strength);
+
+  int16x8_t primary_damping_shift, secondary_damping_shift;
+
+  // FloorLog2() requires input to be > 0.
+  // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+  if (enable_primary) {
+    // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+    // for UV filtering.
+    primary_damping_shift =
+        vdupq_n_s16(-std::max(0, damping - FloorLog2(primary_strength)));
+  }
+  if (enable_secondary) {
+    // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+    // necessary.
+    assert(damping - FloorLog2(secondary_strength) >= 0);
+    secondary_damping_shift =
+        vdupq_n_s16(-(damping - FloorLog2(secondary_strength)));
+  }
+
+  const int primary_tap_0 = kCdefPrimaryTaps[primary_strength & 1][0];
+  const int primary_tap_1 = kCdefPrimaryTaps[primary_strength & 1][1];
+
+  int y = height;
+  do {
+    uint16x8_t pixel;
+    if (width == 8) {
+      pixel = vld1q_u16(src);
+    } else {
+      pixel = vcombine_u16(vld1_u16(src), vld1_u16(src + src_stride));
+    }
+
+    uint16x8_t min = pixel;
+    uint16x8_t max = pixel;
+    int16x8_t sum;
+
+    if (enable_primary) {
+      // Primary |direction|.
+      uint16x8_t primary_val[4];
+      if (width == 8) {
+        LoadDirection(src, src_stride, primary_val, direction);
+      } else {
+        LoadDirection4(src, src_stride, primary_val, direction);
+      }
+
+      if (clipping_required) {
+        min = vminq_u16(min, primary_val[0]);
+        min = vminq_u16(min, primary_val[1]);
+        min = vminq_u16(min, primary_val[2]);
+        min = vminq_u16(min, primary_val[3]);
+
+        // The source is 16 bits, however, we only really care about the lower
+        // 8 bits.  The upper 8 bits contain the "large" flag.  After the final
+        // primary max has been calculated, zero out the upper 8 bits.  Use this
+        // to find the "16 bit" max.
+        const uint8x16_t max_p01 =
+            vmaxq_u8(vreinterpretq_u8_u16(primary_val[0]),
+                     vreinterpretq_u8_u16(primary_val[1]));
+        const uint8x16_t max_p23 =
+            vmaxq_u8(vreinterpretq_u8_u16(primary_val[2]),
+                     vreinterpretq_u8_u16(primary_val[3]));
+        const uint16x8_t max_p =
+            vreinterpretq_u16_u8(vmaxq_u8(max_p01, max_p23));
+        max = vmaxq_u16(max, vandq_u16(max_p, cdef_large_value_mask));
+      }
+
+      sum = Constrain(primary_val[0], pixel, primary_threshold,
+                      primary_damping_shift);
+      sum = vmulq_n_s16(sum, primary_tap_0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(primary_val[1], pixel, primary_threshold,
+                                  primary_damping_shift),
+                        primary_tap_0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(primary_val[2], pixel, primary_threshold,
+                                  primary_damping_shift),
+                        primary_tap_1);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(primary_val[3], pixel, primary_threshold,
+                                  primary_damping_shift),
+                        primary_tap_1);
+    } else {
+      sum = vdupq_n_s16(0);
+    }
+
+    if (enable_secondary) {
+      // Secondary |direction| values (+/- 2). Clamp |direction|.
+      uint16x8_t secondary_val[8];
+      if (width == 8) {
+        LoadDirection(src, src_stride, secondary_val, direction + 2);
+        LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
+      } else {
+        LoadDirection4(src, src_stride, secondary_val, direction + 2);
+        LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
+      }
+
+      if (clipping_required) {
+        min = vminq_u16(min, secondary_val[0]);
+        min = vminq_u16(min, secondary_val[1]);
+        min = vminq_u16(min, secondary_val[2]);
+        min = vminq_u16(min, secondary_val[3]);
+        min = vminq_u16(min, secondary_val[4]);
+        min = vminq_u16(min, secondary_val[5]);
+        min = vminq_u16(min, secondary_val[6]);
+        min = vminq_u16(min, secondary_val[7]);
+
+        const uint8x16_t max_s01 =
+            vmaxq_u8(vreinterpretq_u8_u16(secondary_val[0]),
+                     vreinterpretq_u8_u16(secondary_val[1]));
+        const uint8x16_t max_s23 =
+            vmaxq_u8(vreinterpretq_u8_u16(secondary_val[2]),
+                     vreinterpretq_u8_u16(secondary_val[3]));
+        const uint8x16_t max_s45 =
+            vmaxq_u8(vreinterpretq_u8_u16(secondary_val[4]),
+                     vreinterpretq_u8_u16(secondary_val[5]));
+        const uint8x16_t max_s67 =
+            vmaxq_u8(vreinterpretq_u8_u16(secondary_val[6]),
+                     vreinterpretq_u8_u16(secondary_val[7]));
+        const uint16x8_t max_s = vreinterpretq_u16_u8(
+            vmaxq_u8(vmaxq_u8(max_s01, max_s23), vmaxq_u8(max_s45, max_s67)));
+        max = vmaxq_u16(max, vandq_u16(max_s, cdef_large_value_mask));
+      }
+
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[0], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[1], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[2], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap1);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[3], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap1);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[4], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[5], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap0);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[6], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap1);
+      sum = vmlaq_n_s16(sum,
+                        Constrain(secondary_val[7], pixel, secondary_threshold,
+                                  secondary_damping_shift),
+                        kCdefSecondaryTap1);
+    }
+    // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max)
+    const int16x8_t sum_lt_0 = vshrq_n_s16(sum, 15);
+    sum = vaddq_s16(sum, sum_lt_0);
+    int16x8_t result = vrsraq_n_s16(vreinterpretq_s16_u16(pixel), sum, 4);
+    if (clipping_required) {
+      result = vminq_s16(result, vreinterpretq_s16_u16(max));
+      result = vmaxq_s16(result, vreinterpretq_s16_u16(min));
+    }
+
+    const uint8x8_t dst_pixel = vqmovun_s16(result);
+    if (width == 8) {
+      src += src_stride;
+      vst1_u8(dst, dst_pixel);
+      dst += dst_stride;
+      --y;
+    } else {
+      src += src_stride << 1;
+      StoreLo4(dst, dst_pixel);
+      dst += dst_stride;
+      StoreHi4(dst, dst_pixel);
+      dst += dst_stride;
+      y -= 2;
+    }
+  } while (y != 0);
+}
+
+void Init8bpp() {
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+  assert(dsp != nullptr);
+  dsp->cdef_direction = CdefDirection_NEON;
+  dsp->cdef_filters[0][0] = CdefFilter_NEON<4>;
+  dsp->cdef_filters[0][1] =
+      CdefFilter_NEON<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
+  dsp->cdef_filters[0][2] = CdefFilter_NEON<4, /*enable_primary=*/false>;
+  dsp->cdef_filters[1][0] = CdefFilter_NEON<8>;
+  dsp->cdef_filters[1][1] =
+      CdefFilter_NEON<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+  dsp->cdef_filters[1][2] = CdefFilter_NEON<8, /*enable_primary=*/false>;
+}
+
+}  // namespace
+}  // namespace low_bitdepth
+
+void CdefInit_NEON() { low_bitdepth::Init8bpp(); }
+
+}  // namespace dsp
+}  // namespace libgav1
+#else  // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_NEON() {}
+
+}  // namespace dsp
+}  // namespace libgav1
+#endif  // LIBGAV1_ENABLE_NEON
diff --git a/libgav1/src/dsp/arm/cdef_neon.h b/libgav1/src/dsp/arm/cdef_neon.h
new file mode 100644
index 0000000..53d5f86
--- /dev/null
+++ b/libgav1/src/dsp/arm/cdef_neon.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_NEON();
+
+}  // namespace dsp
+}  // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_NEON
+#endif  // LIBGAV1_ENABLE_NEON
+
+#endif  // LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
diff --git a/libgav1/src/dsp/arm/common_neon.h b/libgav1/src/dsp/arm/common_neon.h
index e0667f9..e8367ab 100644
--- a/libgav1/src/dsp/arm/common_neon.h
+++ b/libgav1/src/dsp/arm/common_neon.h
@@ -17,7 +17,7 @@
 #ifndef LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
 #define LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
 
-#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
 
 #if LIBGAV1_ENABLE_NEON
 
@@ -29,6 +29,8 @@
 #if 0
 #include <cstdio>
 
+#include "absl/strings/str_cat.h"
+
 constexpr bool kEnablePrintRegs = true;
 
 union DebugRegister {
@@ -82,6 +84,16 @@
   }
 }
 
+inline void PrintReg(const int32x4x2_t val, const std::string& name) {
+  DebugRegisterQ r;
+  vst1q_u32(r.u32, val.val[0]);
+  const std::string name0 = absl::StrCat(name, ".val[0]").c_str();
+  PrintVectQ(r, name0.c_str(), 32);
+  vst1q_u32(r.u32, val.val[1]);
+  const std::string name1 = absl::StrCat(name, ".val[1]").c_str();
+  PrintVectQ(r, name1.c_str(), 32);
+}
+
 inline void PrintReg(const uint32x4_t val, const char* name) {
   DebugRegisterQ r;
   vst1q_u32(r.u32, val);
@@ -180,49 +192,89 @@
 //------------------------------------------------------------------------------
 // Load functions.
 
-// Load 4 uint8_t values into the low half of a uint8x8_t register.
-inline uint8x8_t LoadLo4(const uint8_t* const buf, uint8x8_t val) {
-  uint32_t temp;
-  memcpy(&temp, buf, 4);
-  return vreinterpret_u8_u32(vld1_lane_u32(&temp, vreinterpret_u32_u8(val), 0));
+// Load 2 uint8_t values into lanes 0 and 1. Zeros the register before loading
+// the values. Use caution when using this in loops because it will re-zero the
+// register before loading on every iteration.
+inline uint8x8_t Load2(const void* const buf) {
+  const uint16x4_t zero = vdup_n_u16(0);
+  uint16_t temp;
+  memcpy(&temp, buf, 2);
+  return vreinterpret_u8_u16(vld1_lane_u16(&temp, zero, 0));
 }
 
-// Load 4 uint8_t values into the high half of a uint8x8_t register.
-inline uint8x8_t LoadHi4(const uint8_t* const buf, uint8x8_t val) {
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline uint8x8_t Load2(const void* const buf, uint8x8_t val) {
+  uint16_t temp;
+  memcpy(&temp, buf, 2);
+  return vreinterpret_u8_u16(
+      vld1_lane_u16(&temp, vreinterpret_u16_u8(val), lane));
+}
+
+// Load 4 uint8_t values into the low half of a uint8x8_t register. Zeros the
+// register before loading the values. Use caution when using this in loops
+// because it will re-zero the register before loading on every iteration.
+inline uint8x8_t Load4(const void* const buf) {
+  const uint32x2_t zero = vdup_n_u32(0);
   uint32_t temp;
   memcpy(&temp, buf, 4);
-  return vreinterpret_u8_u32(vld1_lane_u32(&temp, vreinterpret_u32_u8(val), 1));
+  return vreinterpret_u8_u32(vld1_lane_u32(&temp, zero, 0));
+}
+
+// Load 4 uint8_t values into 4 lanes starting with |lane| * 4.
+template <int lane>
+inline uint8x8_t Load4(const void* const buf, uint8x8_t val) {
+  uint32_t temp;
+  memcpy(&temp, buf, 4);
+  return vreinterpret_u8_u32(
+      vld1_lane_u32(&temp, vreinterpret_u32_u8(val), lane));
 }
 
 //------------------------------------------------------------------------------
 // Store functions.
 
 // Propagate type information to the compiler. Without this the compiler may
-// assume the required alignment of uint32_t (4 bytes) and add alignment hints
-// to the memory access.
-inline void Uint32ToMem(uint8_t* const buf, uint32_t val) {
-  memcpy(buf, &val, 4);
+// assume the required alignment of the type (4 bytes in the case of uint32_t)
+// and add alignment hints to the memory access.
+template <typename T>
+inline void ValueToMem(void* const buf, T val) {
+  memcpy(buf, &val, sizeof(val));
 }
 
-inline void Uint32ToMem(uint16_t* const buf, uint32_t val) {
-  memcpy(buf, &val, 4);
+// Store 4 int8_t values from the low half of an int8x8_t register.
+inline void StoreLo4(void* const buf, const int8x8_t val) {
+  ValueToMem<int32_t>(buf, vget_lane_s32(vreinterpret_s32_s8(val), 0));
 }
 
 // Store 4 uint8_t values from the low half of a uint8x8_t register.
-inline void StoreLo4(uint8_t* const buf, const uint8x8_t val) {
-  Uint32ToMem(buf, vget_lane_u32(vreinterpret_u32_u8(val), 0));
+inline void StoreLo4(void* const buf, const uint8x8_t val) {
+  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 0));
 }
 
 // Store 4 uint8_t values from the high half of a uint8x8_t register.
-inline void StoreHi4(uint8_t* const buf, const uint8x8_t val) {
-  Uint32ToMem(buf, vget_lane_u32(vreinterpret_u32_u8(val), 1));
+inline void StoreHi4(void* const buf, const uint8x8_t val) {
+  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 1));
+}
+
+// Store 2 uint8_t values from |lane| * 2 and |lane| * 2 + 1 of a uint8x8_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint8x8_t val) {
+  ValueToMem<uint16_t>(buf, vget_lane_u16(vreinterpret_u16_u8(val), lane));
 }
 
 // Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x8_t
 // register.
 template <int lane>
-inline void Store2(uint16_t* const buf, const uint16x8_t val) {
-  Uint32ToMem(buf, vgetq_lane_u32(vreinterpretq_u32_u16(val), lane));
+inline void Store2(void* const buf, const uint16x8_t val) {
+  ValueToMem<uint32_t>(buf, vgetq_lane_u32(vreinterpretq_u32_u16(val), lane));
+}
+
+// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x4_t
+// register.
+template <int lane>
+inline void Store2(uint16_t* const buf, const uint16x4_t val) {
+  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
 }
 
 //------------------------------------------------------------------------------
@@ -230,6 +282,11 @@
 
 // vshXX_n_XX() requires an immediate.
 template <int shift>
+inline uint8x8_t LeftShift(const uint8x8_t vector) {
+  return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
+}
+
+template <int shift>
 inline uint8x8_t RightShift(const uint8x8_t vector) {
   return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
 }
@@ -249,6 +306,16 @@
 #endif
 }
 
+// Shim vqtbl1_s8 for armv7.
+inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+  return vqtbl1_s8(a, index);
+#else
+  const int8x8x2_t b = {vget_low_s8(a), vget_high_s8(a)};
+  return vtbl2_s8(b, vreinterpret_s8_u8(index));
+#endif
+}
+
 //------------------------------------------------------------------------------
 // Interleave.
 
@@ -307,6 +374,30 @@
 }
 
 //------------------------------------------------------------------------------
+// Sum.
+
+inline uint16_t SumVector(const uint8x8_t a) {
+#if defined(__aarch64__)
+  return vaddlv_u8(a);
+#else
+  const uint16x4_t c = vpaddl_u8(a);
+  const uint32x2_t d = vpaddl_u16(c);
+  const uint64x1_t e = vpaddl_u32(d);
+  return static_cast<uint16_t>(vget_lane_u64(e, 0));
+#endif  // defined(__aarch64__)
+}
+
+inline uint32_t SumVector(const uint32x4_t a) {
+#if defined(__aarch64__)
+  return vaddvq_u32(a);
+#else
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
+  return static_cast<uint32_t>(vget_lane_u64(c, 0));
+#endif
+}
+
+//------------------------------------------------------------------------------
 // Transpose.
 
 // Transpose 32 bit elements such that:
@@ -497,76 +588,24 @@
 }
 
 // Input:
-// a0: 00 01 02 03 04 05 06 07
-// a1: 10 11 12 13 14 15 16 17
-// a2: 20 21 22 23 24 25 26 27
-// a3: 30 31 32 33 34 35 36 37
-// a4: 40 41 42 43 44 45 46 47
-// a5: 50 51 52 53 54 55 56 57
-// a6: 60 61 62 63 64 65 66 67
-// a7: 70 71 72 73 74 75 76 77
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// a[4]: 40 41 42 43 44 45 46 47
+// a[5]: 50 51 52 53 54 55 56 57
+// a[6]: 60 61 62 63 64 65 66 67
+// a[7]: 70 71 72 73 74 75 76 77
 
 // Output:
-// a0: 00 10 20 30 40 50 60 70
-// a1: 01 11 21 31 41 51 61 71
-// a2: 02 12 22 32 42 52 62 72
-// a3: 03 13 23 33 43 53 63 73
-// a4: 04 14 24 34 44 54 64 74
-// a5: 05 15 25 35 45 55 65 75
-// a6: 06 16 26 36 46 56 66 76
-// a7: 07 17 27 37 47 57 67 77
-inline void Transpose8x8(int16x8_t* a0, int16x8_t* a1, int16x8_t* a2,
-                         int16x8_t* a3, int16x8_t* a4, int16x8_t* a5,
-                         int16x8_t* a6, int16x8_t* a7) {
-  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
-  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
-  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
-  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
-
-  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
-                                   vreinterpretq_s32_s16(b1.val[0]));
-  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
-                                   vreinterpretq_s32_s16(b1.val[1]));
-  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
-                                   vreinterpretq_s32_s16(b3.val[0]));
-  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
-                                   vreinterpretq_s32_s16(b3.val[1]));
-
-  const int16x8x2_t d0 = VtrnqS64(c0.val[0], c2.val[0]);
-  const int16x8x2_t d1 = VtrnqS64(c1.val[0], c3.val[0]);
-  const int16x8x2_t d2 = VtrnqS64(c0.val[1], c2.val[1]);
-  const int16x8x2_t d3 = VtrnqS64(c1.val[1], c3.val[1]);
-
-  *a0 = d0.val[0];
-  *a1 = d1.val[0];
-  *a2 = d2.val[0];
-  *a3 = d3.val[0];
-  *a4 = d0.val[1];
-  *a5 = d1.val[1];
-  *a6 = d2.val[1];
-  *a7 = d3.val[1];
-}
-
-// Input:
-// a0: 00 01 02 03 04 05 06 07
-// a1: 10 11 12 13 14 15 16 17
-// a2: 20 21 22 23 24 25 26 27
-// a3: 30 31 32 33 34 35 36 37
-// a4: 40 41 42 43 44 45 46 47
-// a5: 50 51 52 53 54 55 56 57
-// a6: 60 61 62 63 64 65 66 67
-// a7: 70 71 72 73 74 75 76 77
-
-// Output:
-// a0: 00 10 20 30 40 50 60 70
-// a1: 01 11 21 31 41 51 61 71
-// a2: 02 12 22 32 42 52 62 72
-// a3: 03 13 23 33 43 53 63 73
-// a4: 04 14 24 34 44 54 64 74
-// a5: 05 15 25 35 45 55 65 75
-// a6: 06 16 26 36 46 56 66 76
-// a7: 07 17 27 37 47 57 67 77
-// TODO(johannkoenig): Switch users of the above transpose to this one.
+// a[0]: 00 10 20 30 40 50 60 70
+// a[1]: 01 11 21 31 41 51 61 71
+// a[2]: 02 12 22 32 42 52 62 72
+// a[3]: 03 13 23 33 43 53 63 73
+// a[4]: 04 14 24 34 44 54 64 74
+// a[5]: 05 15 25 35 45 55 65 75
+// a[6]: 06 16 26 36 46 56 66 76
+// a[7]: 07 17 27 37 47 57 67 77
 inline void Transpose8x8(int16x8_t a[8]) {
   const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
   const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
@@ -628,125 +667,8 @@
   a[7] = d3.val[1];
 }
 
-// Input:
-// i0: 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
-// i1: 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
-// i2: 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
-// i3: 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
-// i4: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
-// i5: 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
-// i6: 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
-// i7: 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
-
-// Output:
-// o00: 00 10 20 30 40 50 60 70
-// o01: 01 11 21 31 41 51 61 71
-// o02: 02 12 22 32 42 52 62 72
-// o03: 03 13 23 33 43 53 63 73
-// o04: 04 14 24 34 44 54 64 74
-// o05: 05 15 25 35 45 55 65 75
-// o06: 06 16 26 36 46 56 66 76
-// o07: 07 17 27 37 47 57 67 77
-// o08: 08 18 28 38 48 58 68 78
-// o09: 09 19 29 39 49 59 69 79
-// o0a: 0a 1a 2a 3a 4a 5a 6a 7a
-// o0b: 0b 1b 2b 3b 4b 5b 6b 7b
-// o0c: 0c 1c 2c 3c 4c 5c 6c 7c
-// o0d: 0d 1d 2d 3d 4d 5d 6d 7d
-// o0e: 0e 1e 2e 3e 4e 5e 6e 7e
-// o0f: 0f 1f 2f 3f 4f 5f 6f 7f
-inline void Transpose16x8(const uint8x16_t i0, const uint8x16_t i1,
-                          const uint8x16_t i2, const uint8x16_t i3,
-                          const uint8x16_t i4, const uint8x16_t i5,
-                          const uint8x16_t i6, const uint8x16_t i7,
-                          uint8x8_t* o00, uint8x8_t* o01, uint8x8_t* o02,
-                          uint8x8_t* o03, uint8x8_t* o04, uint8x8_t* o05,
-                          uint8x8_t* o06, uint8x8_t* o07, uint8x8_t* o08,
-                          uint8x8_t* o09, uint8x8_t* o10, uint8x8_t* o11,
-                          uint8x8_t* o12, uint8x8_t* o13, uint8x8_t* o14,
-                          uint8x8_t* o15) {
-  const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
-  const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
-  const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
-  const uint8x16x2_t b3 = vtrnq_u8(i6, i7);
-
-  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
-                                    vreinterpretq_u16_u8(b1.val[0]));
-  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
-                                    vreinterpretq_u16_u8(b1.val[1]));
-  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
-                                    vreinterpretq_u16_u8(b3.val[0]));
-  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
-                                    vreinterpretq_u16_u8(b3.val[1]));
-
-  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
-                                    vreinterpretq_u32_u16(c2.val[0]));
-  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
-                                    vreinterpretq_u32_u16(c2.val[1]));
-  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
-                                    vreinterpretq_u32_u16(c3.val[0]));
-  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
-                                    vreinterpretq_u32_u16(c3.val[1]));
-
-  *o00 = vget_low_u8(vreinterpretq_u8_u32(d0.val[0]));
-  *o01 = vget_low_u8(vreinterpretq_u8_u32(d2.val[0]));
-  *o02 = vget_low_u8(vreinterpretq_u8_u32(d1.val[0]));
-  *o03 = vget_low_u8(vreinterpretq_u8_u32(d3.val[0]));
-  *o04 = vget_low_u8(vreinterpretq_u8_u32(d0.val[1]));
-  *o05 = vget_low_u8(vreinterpretq_u8_u32(d2.val[1]));
-  *o06 = vget_low_u8(vreinterpretq_u8_u32(d1.val[1]));
-  *o07 = vget_low_u8(vreinterpretq_u8_u32(d3.val[1]));
-  *o08 = vget_high_u8(vreinterpretq_u8_u32(d0.val[0]));
-  *o09 = vget_high_u8(vreinterpretq_u8_u32(d2.val[0]));
-  *o10 = vget_high_u8(vreinterpretq_u8_u32(d1.val[0]));
-  *o11 = vget_high_u8(vreinterpretq_u8_u32(d3.val[0]));
-  *o12 = vget_high_u8(vreinterpretq_u8_u32(d0.val[1]));
-  *o13 = vget_high_u8(vreinterpretq_u8_u32(d2.val[1]));
-  *o14 = vget_high_u8(vreinterpretq_u8_u32(d1.val[1]));
-  *o15 = vget_high_u8(vreinterpretq_u8_u32(d3.val[1]));
-}
-
-// TODO(johannkoenig): Replace usage of the above transpose with this one.
-inline void Transpose16x8(const uint8x16_t input[8], uint8x8_t output[16]) {
-  const uint8x16x2_t b0 = vtrnq_u8(input[0], input[1]);
-  const uint8x16x2_t b1 = vtrnq_u8(input[2], input[3]);
-  const uint8x16x2_t b2 = vtrnq_u8(input[4], input[5]);
-  const uint8x16x2_t b3 = vtrnq_u8(input[6], input[7]);
-
-  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
-                                    vreinterpretq_u16_u8(b1.val[0]));
-  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
-                                    vreinterpretq_u16_u8(b1.val[1]));
-  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
-                                    vreinterpretq_u16_u8(b3.val[0]));
-  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
-                                    vreinterpretq_u16_u8(b3.val[1]));
-
-  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
-                                    vreinterpretq_u32_u16(c2.val[0]));
-  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
-                                    vreinterpretq_u32_u16(c2.val[1]));
-  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
-                                    vreinterpretq_u32_u16(c3.val[0]));
-  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
-                                    vreinterpretq_u32_u16(c3.val[1]));
-
-  output[0] = vget_low_u8(vreinterpretq_u8_u32(d0.val[0]));
-  output[1] = vget_low_u8(vreinterpretq_u8_u32(d2.val[0]));
-  output[2] = vget_low_u8(vreinterpretq_u8_u32(d1.val[0]));
-  output[3] = vget_low_u8(vreinterpretq_u8_u32(d3.val[0]));
-  output[4] = vget_low_u8(vreinterpretq_u8_u32(d0.val[1]));
-  output[5] = vget_low_u8(vreinterpretq_u8_u32(d2.val[1]));
-  output[6] = vget_low_u8(vreinterpretq_u8_u32(d1.val[1]));
-  output[7] = vget_low_u8(vreinterpretq_u8_u32(d3.val[1]));
-  output[8] = vget_high_u8(vreinterpretq_u8_u32(d0.val[0]));
-  output[9] = vget_high_u8(vreinterpretq_u8_u32(d2.val[0]));
-  output[10] = vget_high_u8(vreinterpretq_u8_u32(d1.val[0]));
-  output[11] = vget_high_u8(vreinterpretq_u8_u32(d3.val[0]));
-  output[12] = vget_high_u8(vreinterpretq_u8_u32(d0.val[1]));
-  output[13] = vget_high_u8(vreinterpretq_u8_u32(d2.val[1]));
-  output[14] = vget_high_u8(vreinterpretq_u8_u32(d1.val[1]));
-  output[15] = vget_high_u8(vreinterpretq_u8_u32(d3.val[1]));
+inline int16x8_t ZeroExtend(const uint8x8_t in) {
+  return vreinterpretq_s16_u16(vmovl_u8(in));
 }
 
 }  // namespace dsp
diff --git a/libgav1/src/dsp/arm/convolve_neon.cc b/libgav1/src/dsp/arm/convolve_neon.cc
index 5f7eef7..2c2557f 100644
--- a/libgav1/src/dsp/arm/convolve_neon.cc
+++ b/libgav1/src/dsp/arm/convolve_neon.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "src/dsp/convolve.h"
-#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
 
 #if LIBGAV1_ENABLE_NEON
 
@@ -25,325 +25,231 @@
 #include <cstdint>
 
 #include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
 #include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
 
 namespace libgav1 {
 namespace dsp {
 namespace low_bitdepth {
 namespace {
 
-constexpr int kBitdepth8 = 8;
 constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
-constexpr int kSubPixelMask = (1 << kSubPixelBits) - 1;
 constexpr int kHorizontalOffset = 3;
-constexpr int kVerticalOffset = 3;
-constexpr int kInterRoundBitsVertical = 11;
+constexpr int kFilterIndexShift = 6;
 
-int GetFilterIndex(const int filter_index, const int length) {
-  if (length <= 4) {
-    if (filter_index == kInterpolationFilterEightTap ||
-        filter_index == kInterpolationFilterEightTapSharp) {
-      return 4;
-    }
-    if (filter_index == kInterpolationFilterEightTapSmooth) {
-      return 5;
-    }
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from outranging int16_t.
+template <int filter_index, bool negative_outside_taps = false>
+int16x8_t SumOnePassTaps(const uint8x8_t* const src,
+                         const uint8x8_t* const taps) {
+  uint16x8_t sum;
+  if (filter_index == 0) {
+    // 6 taps. + - + + - +
+    sum = vmull_u8(src[0], taps[0]);
+    // Unsigned overflow will result in a valid int16_t value.
+    sum = vmlsl_u8(sum, src[1], taps[1]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
+    sum = vmlsl_u8(sum, src[4], taps[4]);
+    sum = vmlal_u8(sum, src[5], taps[5]);
+  } else if (filter_index == 1 && negative_outside_taps) {
+    // 6 taps. - + + + + -
+    // Set a base we can subtract from.
+    sum = vmull_u8(src[1], taps[1]);
+    sum = vmlsl_u8(sum, src[0], taps[0]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
+    sum = vmlal_u8(sum, src[4], taps[4]);
+    sum = vmlsl_u8(sum, src[5], taps[5]);
+  } else if (filter_index == 1) {
+    // 6 taps. All are positive.
+    sum = vmull_u8(src[0], taps[0]);
+    sum = vmlal_u8(sum, src[1], taps[1]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
+    sum = vmlal_u8(sum, src[4], taps[4]);
+    sum = vmlal_u8(sum, src[5], taps[5]);
+  } else if (filter_index == 2) {
+    // 8 taps. - + - + + - + -
+    sum = vmull_u8(src[1], taps[1]);
+    sum = vmlsl_u8(sum, src[0], taps[0]);
+    sum = vmlsl_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
+    sum = vmlal_u8(sum, src[4], taps[4]);
+    sum = vmlsl_u8(sum, src[5], taps[5]);
+    sum = vmlal_u8(sum, src[6], taps[6]);
+    sum = vmlsl_u8(sum, src[7], taps[7]);
+  } else if (filter_index == 3) {
+    // 2 taps. All are positive.
+    sum = vmull_u8(src[0], taps[0]);
+    sum = vmlal_u8(sum, src[1], taps[1]);
+  } else if (filter_index == 4) {
+    // 4 taps. - + + -
+    sum = vmull_u8(src[1], taps[1]);
+    sum = vmlsl_u8(sum, src[0], taps[0]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlsl_u8(sum, src[3], taps[3]);
+  } else if (filter_index == 5) {
+    // 4 taps. All are positive.
+    sum = vmull_u8(src[0], taps[0]);
+    sum = vmlal_u8(sum, src[1], taps[1]);
+    sum = vmlal_u8(sum, src[2], taps[2]);
+    sum = vmlal_u8(sum, src[3], taps[3]);
   }
-  return filter_index;
+  return vreinterpretq_s16_u16(sum);
 }
 
-inline int16x8_t ZeroExtend(const uint8x8_t in) {
-  return vreinterpretq_s16_u16(vmovl_u8(in));
-}
-
-inline void Load8x8(const uint8_t* s, const ptrdiff_t p, int16x8_t* dst) {
-  dst[0] = ZeroExtend(vld1_u8(s));
-  s += p;
-  dst[1] = ZeroExtend(vld1_u8(s));
-  s += p;
-  dst[2] = ZeroExtend(vld1_u8(s));
-  s += p;
-  dst[3] = ZeroExtend(vld1_u8(s));
-  s += p;
-  dst[4] = ZeroExtend(vld1_u8(s));
-  s += p;
-  dst[5] = ZeroExtend(vld1_u8(s));
-  s += p;
-  dst[6] = ZeroExtend(vld1_u8(s));
-  s += p;
-  dst[7] = ZeroExtend(vld1_u8(s));
-}
-
-// Multiply every entry in |src[]| by the corresponding lane in |taps| and sum.
-// The sum of the entries in |taps| is always 128. In some situations negative
-// values are used. This creates a situation where the positive taps sum to more
-// than 128. An example is:
-// {-4, 10, -24, 100, 60, -20, 8, -2}
-// The negative taps never sum to < -128
-// The center taps are always positive. The remaining positive taps never sum
-// to > 128.
-// Summing these naively can overflow int16_t. This can be avoided by adding the
-// center taps last and saturating the result.
-// We do not need to expand to int32_t because later in the function the value
-// is shifted by |kFilterBits| (7) and saturated to uint8_t. This means any
-// value over 255 << 7 (32576 because of rounding) is clamped.
-template <int num_taps>
-int16x8_t SumTaps(const int16x8_t* const src, const int16x8_t taps) {
+template <int filter_index, bool negative_outside_taps>
+int16x8_t SumHorizontalTaps(const uint8_t* const src,
+                            const uint8x8_t* const v_tap) {
+  uint8x8_t v_src[8];
+  const uint8x16_t src_long = vld1q_u8(src);
   int16x8_t sum;
-  if (num_taps == 8) {
-    const int16x4_t taps_lo = vget_low_s16(taps);
-    const int16x4_t taps_hi = vget_high_s16(taps);
-    sum = vmulq_lane_s16(src[0], taps_lo, 0);
-    sum = vmlaq_lane_s16(sum, src[1], taps_lo, 1);
-    sum = vmlaq_lane_s16(sum, src[2], taps_lo, 2);
-    sum = vmlaq_lane_s16(sum, src[5], taps_hi, 1);
-    sum = vmlaq_lane_s16(sum, src[6], taps_hi, 2);
-    sum = vmlaq_lane_s16(sum, src[7], taps_hi, 3);
 
-    // Center taps.
-    sum = vqaddq_s16(sum, vmulq_lane_s16(src[3], taps_lo, 3));
-    sum = vqaddq_s16(sum, vmulq_lane_s16(src[4], taps_hi, 0));
-  } else if (num_taps == 6) {
-    const int16x4_t taps_lo = vget_low_s16(taps);
-    const int16x4_t taps_hi = vget_high_s16(taps);
-    sum = vmulq_lane_s16(src[0], taps_lo, 1);
-    sum = vmlaq_lane_s16(sum, src[1], taps_lo, 2);
-    sum = vmlaq_lane_s16(sum, src[4], taps_hi, 1);
-    sum = vmlaq_lane_s16(sum, src[5], taps_hi, 2);
-
-    // Center taps.
-    sum = vqaddq_s16(sum, vmulq_lane_s16(src[2], taps_lo, 3));
-    sum = vqaddq_s16(sum, vmulq_lane_s16(src[3], taps_hi, 0));
-  } else if (num_taps == 4) {
-    const int16x4_t taps_lo = vget_low_s16(taps);
-    sum = vmulq_lane_s16(src[0], taps_lo, 0);
-    sum = vmlaq_lane_s16(sum, src[3], taps_lo, 3);
-
-    // Center taps.
-    sum = vqaddq_s16(sum, vmulq_lane_s16(src[1], taps_lo, 1));
-    sum = vqaddq_s16(sum, vmulq_lane_s16(src[2], taps_lo, 2));
-  } else {
-    assert(num_taps == 2);
-    // All the taps are positive so there is no concern regarding saturation.
-    const int16x4_t taps_lo = vget_low_s16(taps);
-    sum = vmulq_lane_s16(src[0], taps_lo, 1);
-    sum = vmlaq_lane_s16(sum, src[1], taps_lo, 2);
+  if (filter_index < 2) {
+    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+    v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+    v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1);
+  } else if (filter_index == 2) {
+    v_src[0] = vget_low_u8(src_long);
+    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+    v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+    v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+    v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+    v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
+  } else if (filter_index == 3) {
+    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3);
+  } else if (filter_index > 3) {
+    v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+    v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+    v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+    v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+    sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2);
   }
-
   return sum;
 }
 
-// Add an offset to ensure the sum is positive and it fits within uint16_t.
-template <int num_taps>
-uint16x8_t SumTaps8To16(const int16x8_t* const src, const int16x8_t taps) {
-  // The worst case sum of negative taps is -56. The worst case sum of positive
-  // taps is 184. With the single pass versions of the Convolve we could safely
-  // saturate to int16_t because it outranged the final shift and narrow to
-  // uint8_t. For the 2D Convolve the intermediate values are 16 bits so we
-  // don't have that option.
-  // 184 * 255 = 46920 which is greater than int16_t can hold, but not uint16_t.
-  // The minimum value we need to handle is -56 * 255 = -14280.
-  // By offsetting the sum with 1 << 14 = 16384 we ensure that the sum is never
-  // negative and that 46920 + 16384 = 63304 fits comfortably in uint16_t. This
-  // allows us to use 16 bit registers instead of 32 bit registers.
-  // When considering the bit operations it is safe to ignore signedness. Due to
-  // the magic of 2's complement and well defined rollover rules the bit
-  // representations are equivalent.
-  const int16x4_t taps_lo = vget_low_s16(taps);
-  const int16x4_t taps_hi = vget_high_s16(taps);
-  // |offset| == 1 << (bitdepth + kFilterBits - 1);
-  int16x8_t sum = vdupq_n_s16(1 << 14);
-  if (num_taps == 8) {
-    sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
-    sum = vmlaq_lane_s16(sum, src[1], taps_lo, 1);
-    sum = vmlaq_lane_s16(sum, src[2], taps_lo, 2);
-    sum = vmlaq_lane_s16(sum, src[3], taps_lo, 3);
-    sum = vmlaq_lane_s16(sum, src[4], taps_hi, 0);
-    sum = vmlaq_lane_s16(sum, src[5], taps_hi, 1);
-    sum = vmlaq_lane_s16(sum, src[6], taps_hi, 2);
-    sum = vmlaq_lane_s16(sum, src[7], taps_hi, 3);
-  } else if (num_taps == 6) {
-    sum = vmlaq_lane_s16(sum, src[0], taps_lo, 1);
-    sum = vmlaq_lane_s16(sum, src[1], taps_lo, 2);
-    sum = vmlaq_lane_s16(sum, src[2], taps_lo, 3);
-    sum = vmlaq_lane_s16(sum, src[3], taps_hi, 0);
-    sum = vmlaq_lane_s16(sum, src[4], taps_hi, 1);
-    sum = vmlaq_lane_s16(sum, src[5], taps_hi, 2);
-  } else if (num_taps == 4) {
-    sum = vmlaq_lane_s16(sum, src[0], taps_lo, 2);
-    sum = vmlaq_lane_s16(sum, src[1], taps_lo, 3);
-    sum = vmlaq_lane_s16(sum, src[2], taps_hi, 0);
-    sum = vmlaq_lane_s16(sum, src[3], taps_hi, 1);
-  } else if (num_taps == 2) {
-    sum = vmlaq_lane_s16(sum, src[0], taps_lo, 3);
-    sum = vmlaq_lane_s16(sum, src[1], taps_hi, 0);
-  }
+template <int filter_index, bool negative_outside_taps>
+uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
+                               const uint8x8_t* const v_tap) {
+  int16x8_t sum =
+      SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
 
-  // This is guaranteed to be positive. Convert it for the final shift.
-  return vreinterpretq_u16_s16(sum);
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+  return vqrshrun_n_s16(sum, kFilterBits - 1);
 }
 
-template <int num_taps, int filter_index, bool negative_outside_taps = true>
-uint16x8_t SumCompoundHorizontalTaps(const uint8_t* const src,
-                                     const uint8x8_t* const v_tap) {
-  // Start with an offset to guarantee the sum is non negative.
-  uint16x8_t v_sum = vdupq_n_u16(1 << 14);
-  uint8x16_t v_src[8];
-  v_src[0] = vld1q_u8(&src[0]);
-  if (num_taps == 8) {
-    v_src[1] = vextq_u8(v_src[0], v_src[0], 1);
-    v_src[2] = vextq_u8(v_src[0], v_src[0], 2);
-    v_src[3] = vextq_u8(v_src[0], v_src[0], 3);
-    v_src[4] = vextq_u8(v_src[0], v_src[0], 4);
-    v_src[5] = vextq_u8(v_src[0], v_src[0], 5);
-    v_src[6] = vextq_u8(v_src[0], v_src[0], 6);
-    v_src[7] = vextq_u8(v_src[0], v_src[0], 7);
+template <int filter_index, bool negative_outside_taps>
+uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
+                               const uint8x8_t* const v_tap) {
+  const int16x8_t sum =
+      SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
 
-    // tap signs : - + - + + - + -
-    v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[0]), v_tap[0]);
-    v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[1]), v_tap[1]);
-    v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
-    v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
-    v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
-    v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
-    v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[6]), v_tap[6]);
-    v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[7]), v_tap[7]);
-  } else if (num_taps == 6) {
-    v_src[1] = vextq_u8(v_src[0], v_src[0], 1);
-    v_src[2] = vextq_u8(v_src[0], v_src[0], 2);
-    v_src[3] = vextq_u8(v_src[0], v_src[0], 3);
-    v_src[4] = vextq_u8(v_src[0], v_src[0], 4);
-    v_src[5] = vextq_u8(v_src[0], v_src[0], 5);
-    v_src[6] = vextq_u8(v_src[0], v_src[0], 6);
-    if (filter_index == 0) {
-      // tap signs : + - + + - +
-      v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[1]), v_tap[1]);
-      v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
-      v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
-      v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
-      v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
-      v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[6]), v_tap[6]);
-    } else {
-      if (negative_outside_taps) {
-        // tap signs : - + + + + -
-        v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[1]), v_tap[1]);
-        v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
-        v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
-        v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
-        v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
-        v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[6]), v_tap[6]);
-      } else {
-        // tap signs : + + + + + +
-        v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[1]), v_tap[1]);
-        v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
-        v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
-        v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
-        v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
-        v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[6]), v_tap[6]);
-      }
-    }
-  } else if (num_taps == 4) {
-    v_src[2] = vextq_u8(v_src[0], v_src[0], 2);
-    v_src[3] = vextq_u8(v_src[0], v_src[0], 3);
-    v_src[4] = vextq_u8(v_src[0], v_src[0], 4);
-    v_src[5] = vextq_u8(v_src[0], v_src[0], 5);
-    if (filter_index == 4) {
-      // tap signs : - + + -
-      v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
-      v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
-      v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
-      v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
-    } else {
-      // tap signs : + + + +
-      v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
-      v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
-      v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
-      v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
-    }
-  } else {
-    assert(num_taps == 2);
-    v_src[3] = vextq_u8(v_src[0], v_src[0], 3);
-    v_src[4] = vextq_u8(v_src[0], v_src[0], 4);
-    // tap signs : + +
-    v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
-    v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
-  }
-
-  return v_sum;
+  return vreinterpretq_u16_s16(
+      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
 }
 
-template <int num_taps, int filter_index>
-uint16x8_t SumHorizontalTaps2xH(const uint8_t* src, const ptrdiff_t src_stride,
-                                const uint8x8_t* const v_tap) {
-  constexpr int positive_offset_bits = kBitdepth8 + kFilterBits - 1;
-  uint16x8_t sum = vdupq_n_u16(1 << positive_offset_bits);
-  uint8x8_t input0 = vld1_u8(src);
+template <int filter_index>
+int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+                               const uint8x8_t* const v_tap) {
+  uint16x8_t sum;
+  const uint8x8_t input0 = vld1_u8(src);
   src += src_stride;
-  uint8x8_t input1 = vld1_u8(src);
+  const uint8x8_t input1 = vld1_u8(src);
   uint8x8x2_t input = vzip_u8(input0, input1);
 
-  if (num_taps == 2) {
+  if (filter_index == 3) {
     // tap signs : + +
-    sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
+    sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
     sum = vmlal_u8(sum, input.val[1], v_tap[4]);
   } else if (filter_index == 4) {
     // tap signs : - + + -
+    sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
     sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
-    sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
     sum = vmlal_u8(sum, input.val[1], v_tap[4]);
     sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
   } else {
     // tap signs : + + + +
-    sum = vmlal_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
+    sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
     sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
     sum = vmlal_u8(sum, input.val[1], v_tap[4]);
     sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
   }
 
-  return vrshrq_n_u16(sum, kInterRoundBitsHorizontal);
+  return vreinterpretq_s16_u16(sum);
 }
 
-// TODO(johannkoenig): Rename this function. It works for more than just
-// compound convolutions.
+template <int filter_index>
+uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
+                                  const ptrdiff_t src_stride,
+                                  const uint8x8_t* const v_tap) {
+  int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+  // Normally the Horizontal pass does the downshift in two passes:
+  // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+  // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+  // requires adding the rounding offset from the skipped shift.
+  constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+  sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+  return vqrshrun_n_s16(sum, kFilterBits - 1);
+}
+
+template <int filter_index>
+uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
+                                   const ptrdiff_t src_stride,
+                                   const uint8x8_t* const v_tap) {
+  const int16x8_t sum =
+      SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+  return vreinterpretq_u16_s16(
+      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
+
 template <int num_taps, int step, int filter_index,
           bool negative_outside_taps = true, bool is_2d = false,
-          bool is_8bit = false>
-void ConvolveCompoundHorizontalBlock(const uint8_t* src,
-                                     const ptrdiff_t src_stride,
-                                     void* const dest,
-                                     const ptrdiff_t pred_stride,
-                                     const int width, const int height,
-                                     const uint8x8_t* const v_tap) {
-  const uint16x8_t v_compound_round_offset = vdupq_n_u16(1 << (kBitdepth8 + 4));
-  const int16x8_t v_inter_round_bits_0 =
-      vdupq_n_s16(-kInterRoundBitsHorizontal);
-
+          bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+                      void* const dest, const ptrdiff_t pred_stride,
+                      const int width, const int height,
+                      const uint8x8_t* const v_tap) {
   auto* dest8 = static_cast<uint8_t*>(dest);
   auto* dest16 = static_cast<uint16_t*>(dest);
 
-  if (width > 4) {
+  // 4 tap filters are never used when width > 4.
+  if (num_taps != 4 && width > 4) {
     int y = 0;
     do {
       int x = 0;
       do {
-        uint16x8_t v_sum =
-            SumCompoundHorizontalTaps<num_taps, filter_index,
-                                      negative_outside_taps>(&src[x], v_tap);
-        if (is_8bit) {
-          // Split shifts the way they are in C. They can be combined but that
-          // makes removing the 1 << 14 offset much more difficult.
-          v_sum = vrshrq_n_u16(v_sum, kInterRoundBitsHorizontal);
-          int16x8_t v_sum_signed = vreinterpretq_s16_u16(vsubq_u16(
-              v_sum, vdupq_n_u16(1 << (14 - kInterRoundBitsHorizontal))));
-          uint8x8_t result = vqrshrun_n_s16(
-              v_sum_signed, kFilterBits - kInterRoundBitsHorizontal);
-          vst1_u8(&dest8[x], result);
-        } else {
-          v_sum = vrshlq_u16(v_sum, v_inter_round_bits_0);
-          if (!is_2d) {
-            v_sum = vaddq_u16(v_sum, v_compound_round_offset);
-          }
+        if (is_2d || is_compound) {
+          const uint16x8_t v_sum =
+              HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x],
+                                                                       v_tap);
           vst1q_u16(&dest16[x], v_sum);
+        } else {
+          const uint8x8_t result =
+              SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x],
+                                                                        v_tap);
+          vst1_u8(&dest8[x], result);
         }
         x += step;
       } while (x < width);
@@ -352,135 +258,142 @@
       dest16 += pred_stride;
     } while (++y < height);
     return;
-  } else if (width == 4) {
-    int y = 0;
-    do {
-      uint16x8_t v_sum =
-          SumCompoundHorizontalTaps<num_taps, filter_index,
-                                    negative_outside_taps>(&src[0], v_tap);
-      if (is_8bit) {
-        v_sum = vrshrq_n_u16(v_sum, kInterRoundBitsHorizontal);
-        int16x8_t v_sum_signed = vreinterpretq_s16_u16(vsubq_u16(
-            v_sum, vdupq_n_u16(1 << (14 - kInterRoundBitsHorizontal))));
-        uint8x8_t result = vqrshrun_n_s16(
-            v_sum_signed, kFilterBits - kInterRoundBitsHorizontal);
-        StoreLo4(&dest8[0], result);
-      } else {
-        v_sum = vrshlq_u16(v_sum, v_inter_round_bits_0);
-        if (!is_2d) {
-          v_sum = vaddq_u16(v_sum, v_compound_round_offset);
-        }
-        vst1_u16(&dest16[0], vget_low_u16(v_sum));
-      }
-      src += src_stride;
-      dest8 += pred_stride;
-      dest16 += pred_stride;
-    } while (++y < height);
-    return;
   }
 
   // Horizontal passes only needs to account for |num_taps| 2 and 4 when
-  // |width| == 2.
-  assert(width == 2);
+  // |width| <= 4.
+  assert(width <= 4);
   assert(num_taps <= 4);
-
-  constexpr int positive_offset_bits = kBitdepth8 + kFilterBits - 1;
-  // Leave off + 1 << (kBitdepth8 + 3).
-  constexpr int compound_round_offset = 1 << (kBitdepth8 + 4);
-
   if (num_taps <= 4) {
-    int y = 0;
-    do {
-      // TODO(johannkoenig): Re-order the values for storing.
-      uint16x8_t sum =
-          SumHorizontalTaps2xH<num_taps, filter_index>(src, src_stride, v_tap);
+    if (width == 4) {
+      int y = 0;
+      do {
+        if (is_2d || is_compound) {
+          const uint16x8_t v_sum =
+              HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
+                                                                       v_tap);
+          vst1_u16(dest16, vget_low_u16(v_sum));
+        } else {
+          const uint8x8_t result =
+              SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
+                                                                        v_tap);
+          StoreLo4(&dest8[0], result);
+        }
+        src += src_stride;
+        dest8 += pred_stride;
+        dest16 += pred_stride;
+      } while (++y < height);
+      return;
+    }
 
+    if (!is_compound) {
+      int y = 0;
+      do {
+        if (is_2d) {
+          const uint16x8_t sum =
+              HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+          dest16[0] = vgetq_lane_u16(sum, 0);
+          dest16[1] = vgetq_lane_u16(sum, 2);
+          dest16 += pred_stride;
+          dest16[0] = vgetq_lane_u16(sum, 1);
+          dest16[1] = vgetq_lane_u16(sum, 3);
+          dest16 += pred_stride;
+        } else {
+          const uint8x8_t sum =
+              SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+          dest8[0] = vget_lane_u8(sum, 0);
+          dest8[1] = vget_lane_u8(sum, 2);
+          dest8 += pred_stride;
+
+          dest8[0] = vget_lane_u8(sum, 1);
+          dest8[1] = vget_lane_u8(sum, 3);
+          dest8 += pred_stride;
+        }
+
+        src += src_stride << 1;
+        y += 2;
+      } while (y < height - 1);
+
+      // The 2d filters have an odd |height| because the horizontal pass
+      // generates context for the vertical pass.
       if (is_2d) {
-        dest16[0] = vgetq_lane_u16(sum, 0);
-        dest16[1] = vgetq_lane_u16(sum, 2);
-        dest16 += pred_stride;
-        dest16[0] = vgetq_lane_u16(sum, 1);
-        dest16[1] = vgetq_lane_u16(sum, 3);
-        dest16 += pred_stride;
-      } else if (!is_8bit) {
-        // None of the test vectors hit this path but the unit tests do.
-        sum = vaddq_u16(sum, vdupq_n_u16(compound_round_offset));
-
-        dest16[0] = vgetq_lane_u16(sum, 0);
-        dest16[1] = vgetq_lane_u16(sum, 2);
-        dest16 += pred_stride;
-        dest16[0] = vgetq_lane_u16(sum, 1);
-        dest16[1] = vgetq_lane_u16(sum, 3);
-        dest16 += pred_stride;
-      } else {
-        // Split shifts the way they are in C. They can be combined but that
-        // makes removing the 1 << 14 offset much more difficult.
-        int16x8_t sum_signed = vreinterpretq_s16_u16(vsubq_u16(
-            sum, vdupq_n_u16(
-                     1 << (positive_offset_bits - kInterRoundBitsHorizontal))));
-        uint8x8_t result =
-            vqrshrun_n_s16(sum_signed, kFilterBits - kInterRoundBitsHorizontal);
-
-        // Could de-interleave and vst1_lane_u16().
-        dest8[0] = vget_lane_u8(result, 0);
-        dest8[1] = vget_lane_u8(result, 2);
-        dest8 += pred_stride;
-
-        dest8[0] = vget_lane_u8(result, 1);
-        dest8[1] = vget_lane_u8(result, 3);
-        dest8 += pred_stride;
+        assert(height % 2 == 1);
+        uint16x8_t sum;
+        const uint8x8_t input = vld1_u8(src);
+        if (filter_index == 3) {  // |num_taps| == 2
+          sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
+          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+        } else if (filter_index == 4) {
+          sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
+          sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
+          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+          sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
+        } else {
+          assert(filter_index == 5);
+          sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
+          sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
+          sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+          sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
+        }
+        // |sum| contains an int16_t value.
+        sum = vreinterpretq_u16_s16(vrshrq_n_s16(
+            vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
+        Store2<0>(dest16, sum);
       }
-
-      src += src_stride << 1;
-      y += 2;
-    } while (y < height - 1);
-
-    // The 2d filters have an odd |height| because the horizontal pass generates
-    // context for the vertical pass.
-    if (is_2d) {
-      assert(height % 2 == 1);
-      uint16x8_t sum = vdupq_n_u16(1 << positive_offset_bits);
-      uint8x8_t input = vld1_u8(src);
-      if (filter_index == 3) {  // |num_taps| == 2
-        sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
-        sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
-      } else if (filter_index == 4) {
-        sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
-        sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
-        sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
-        sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
-      } else {
-        assert(filter_index == 5);
-        sum = vmlal_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
-        sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
-        sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
-        sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
-        sum = vrshrq_n_u16(sum, kInterRoundBitsHorizontal);
-      }
-      Store2<0>(dest16, sum);
     }
   }
 }
 
 // Process 16 bit inputs and output 32 bits.
-template <int num_taps>
-uint32x4x2_t Sum2DVerticalTaps(const int16x8_t* const src,
-                               const int16x8_t taps) {
-  // In order to get the rollover correct with the lengthening instruction we
-  // need to treat these as signed so that they sign extend properly.
+template <int num_taps, bool is_compound>
+inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
+                                    const int16x8_t taps) {
   const int16x4_t taps_lo = vget_low_s16(taps);
   const int16x4_t taps_hi = vget_high_s16(taps);
-  // An offset to guarantee the sum is non negative. Captures 56 * -4590 =
-  // 257040 (worst case negative value from horizontal pass). It should be
-  // possible to use 1 << 18 (262144) instead of 1 << 19 but there probably
-  // isn't any benefit.
-  // |offset_bits| = bitdepth + 2 * kFilterBits - kInterRoundBitsHorizontal
-  // == 19.
-  int32x4_t sum_lo = vdupq_n_s32(1 << 19);
-  int32x4_t sum_hi = sum_lo;
+  int32x4_t sum;
   if (num_taps == 8) {
-    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[0]), taps_lo, 0);
-    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[0]), taps_lo, 0);
+    sum = vmull_lane_s16(src[0], taps_lo, 0);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
+    sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
+    sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
+    sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
+  } else if (num_taps == 6) {
+    sum = vmull_lane_s16(src[0], taps_lo, 1);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
+    sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
+  } else if (num_taps == 4) {
+    sum = vmull_lane_s16(src[0], taps_lo, 2);
+    sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
+    sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
+  } else if (num_taps == 2) {
+    sum = vmull_lane_s16(src[0], taps_lo, 3);
+    sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
+  }
+
+  if (is_compound) {
+    return vqrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
+  }
+
+  return vqrshrn_n_s32(sum, kInterRoundBitsVertical - 1);
+}
+
+template <int num_taps, bool is_compound>
+int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
+                                  const int16x8_t taps) {
+  const int16x4_t taps_lo = vget_low_s16(taps);
+  const int16x4_t taps_hi = vget_high_s16(taps);
+  int32x4_t sum_lo, sum_hi;
+  if (num_taps == 8) {
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
     sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
     sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
     sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
@@ -497,8 +410,8 @@
     sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
     sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
   } else if (num_taps == 6) {
-    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[0]), taps_lo, 1);
-    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[0]), taps_lo, 1);
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
     sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
     sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
     sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
@@ -511,8 +424,8 @@
     sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
     sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
   } else if (num_taps == 4) {
-    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[0]), taps_lo, 2);
-    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[0]), taps_lo, 2);
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
     sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
     sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);
 
@@ -521,384 +434,273 @@
     sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
     sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
   } else if (num_taps == 2) {
-    sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[0]), taps_lo, 3);
-    sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[0]), taps_lo, 3);
+    sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
+    sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);
 
     sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
     sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
   }
 
-  // This is guaranteed to be positive. Convert it for the final shift.
-  const uint32x4x2_t return_val = {vreinterpretq_u32_s32(sum_lo),
-                                   vreinterpretq_u32_s32(sum_hi)};
-  return return_val;
-}
-
-// Process 16 bit inputs and output 32 bits.
-template <int num_taps>
-uint32x4_t Sum2DVerticalTaps(const int16x4_t* const src, const int16x8_t taps) {
-  // In order to get the rollover correct with the lengthening instruction we
-  // need to treat these as signed so that they sign extend properly.
-  const int16x4_t taps_lo = vget_low_s16(taps);
-  const int16x4_t taps_hi = vget_high_s16(taps);
-  // An offset to guarantee the sum is non negative. Captures 56 * -4590 =
-  // 257040 (worst case negative value from horizontal pass). It should be
-  // possible to use 1 << 18 (262144) instead of 1 << 19 but there probably
-  // isn't any benefit.
-  // |offset_bits| = bitdepth + 2 * kFilterBits - kInterRoundBitsHorizontal
-  // == 19.
-  int32x4_t sum = vdupq_n_s32(1 << 19);
-  if (num_taps == 8) {
-    sum = vmlal_lane_s16(sum, src[0], taps_lo, 0);
-    sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
-    sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
-    sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
-
-    sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
-    sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
-    sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
-    sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
-  } else if (num_taps == 6) {
-    sum = vmlal_lane_s16(sum, src[0], taps_lo, 1);
-    sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
-    sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
-
-    sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
-    sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
-    sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
-  } else if (num_taps == 4) {
-    sum = vmlal_lane_s16(sum, src[0], taps_lo, 2);
-    sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
-
-    sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
-    sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
-  } else if (num_taps == 2) {
-    sum = vmlal_lane_s16(sum, src[0], taps_lo, 3);
-
-    sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
+  if (is_compound) {
+    return vcombine_s16(
+        vqrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+        vqrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
   }
 
-  // This is guaranteed to be positive. Convert it for the final shift.
-  return vreinterpretq_u32_s32(sum);
+  return vcombine_s16(vqrshrn_n_s32(sum_lo, kInterRoundBitsVertical - 1),
+                      vqrshrn_n_s32(sum_hi, kInterRoundBitsVertical - 1));
 }
 
 template <int num_taps, bool is_compound = false>
-void Filter2DVertical(const uint16_t* src, const ptrdiff_t src_stride,
-                      void* const dst, const ptrdiff_t dst_stride,
-                      const int width, const int height, const int16x8_t taps,
-                      const int inter_round_bits_vertical) {
+void Filter2DVertical(const uint16_t* src, void* const dst,
+                      const ptrdiff_t dst_stride, const int width,
+                      const int height, const int16x8_t taps) {
+  assert(width >= 8);
   constexpr int next_row = num_taps - 1;
-  const int32x4_t v_inter_round_bits_vertical =
-      vdupq_n_s32(-inter_round_bits_vertical);
+  // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+  const ptrdiff_t src_stride = width;
 
   auto* dst8 = static_cast<uint8_t*>(dst);
   auto* dst16 = static_cast<uint16_t*>(dst);
 
-  if (width > 4) {
-    int x = 0;
+  int x = 0;
+  do {
+    int16x8_t srcs[8];
+    const uint16_t* src_x = src + x;
+    srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+    src_x += src_stride;
+    if (num_taps >= 4) {
+      srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+      src_x += src_stride;
+      srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+      src_x += src_stride;
+      if (num_taps >= 6) {
+        srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+        src_x += src_stride;
+        srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+        src_x += src_stride;
+        if (num_taps == 8) {
+          srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+          src_x += src_stride;
+          srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+          src_x += src_stride;
+        }
+      }
+    }
+
+    int y = 0;
     do {
-      int16x8_t srcs[8];
-      srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src + x));
+      srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+      src_x += src_stride;
+
+      const int16x8_t sum =
+          SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+      if (is_compound) {
+        vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
+      } else {
+        vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
+      }
+
+      srcs[0] = srcs[1];
       if (num_taps >= 4) {
-        srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src + x + src_stride));
-        srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src + x + 2 * src_stride));
+        srcs[1] = srcs[2];
+        srcs[2] = srcs[3];
         if (num_taps >= 6) {
-          srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src + x + 3 * src_stride));
-          srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src + x + 4 * src_stride));
+          srcs[3] = srcs[4];
+          srcs[4] = srcs[5];
           if (num_taps == 8) {
-            srcs[5] =
-                vreinterpretq_s16_u16(vld1q_u16(src + x + 5 * src_stride));
-            srcs[6] =
-                vreinterpretq_s16_u16(vld1q_u16(src + x + 6 * src_stride));
+            srcs[5] = srcs[6];
+            srcs[6] = srcs[7];
           }
         }
       }
+    } while (++y < height);
+    x += 8;
+  } while (x < width);
+}
 
-      int y = 0;
-      do {
-        srcs[next_row] = vreinterpretq_s16_u16(
-            vld1q_u16(src + x + (y + next_row) * src_stride));
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+                         const ptrdiff_t dst_stride, const int height,
+                         const int16x8_t taps) {
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
 
-        const uint32x4x2_t sums = Sum2DVerticalTaps<num_taps>(srcs, taps);
-        if (is_compound) {
-          const uint16x8_t results = vcombine_u16(
-              vmovn_u32(vqrshlq_u32(sums.val[0], v_inter_round_bits_vertical)),
-              vmovn_u32(vqrshlq_u32(sums.val[1], v_inter_round_bits_vertical)));
-          vst1q_u16(dst16 + x + y * dst_stride, results);
-        } else {
-          const uint16x8_t first_shift =
-              vcombine_u16(vqrshrn_n_u32(sums.val[0], kInterRoundBitsVertical),
-                           vqrshrn_n_u32(sums.val[1], kInterRoundBitsVertical));
-          // |single_round_offset| == (1 << bitdepth) + (1 << (bitdepth - 1)) ==
-          // 384
-          const uint8x8_t results =
-              vqmovn_u16(vqsubq_u16(first_shift, vdupq_n_u16(384)));
-
-          vst1_u8(dst8 + x + y * dst_stride, results);
-        }
-
-        srcs[0] = srcs[1];
-        if (num_taps >= 4) {
-          srcs[1] = srcs[2];
-          srcs[2] = srcs[3];
-          if (num_taps >= 6) {
-            srcs[3] = srcs[4];
-            srcs[4] = srcs[5];
-            if (num_taps == 8) {
-              srcs[5] = srcs[6];
-              srcs[6] = srcs[7];
-            }
-          }
-        }
-      } while (++y < height);
-      x += 8;
-    } while (x < width);
-    return;
-  }
-
-  assert(width == 4);
-  int16x4_t srcs[8];
-  srcs[0] = vreinterpret_s16_u16(vld1_u16(src));
-  src += src_stride;
+  int16x8_t srcs[9];
+  srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+  src += 8;
   if (num_taps >= 4) {
-    srcs[1] = vreinterpret_s16_u16(vld1_u16(src));
-    src += src_stride;
-    srcs[2] = vreinterpret_s16_u16(vld1_u16(src));
-    src += src_stride;
+    srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
     if (num_taps >= 6) {
-      srcs[3] = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
-      srcs[4] = vreinterpret_s16_u16(vld1_u16(src));
-      src += src_stride;
+      srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+      src += 8;
+      srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
       if (num_taps == 8) {
-        srcs[5] = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
-        srcs[6] = vreinterpret_s16_u16(vld1_u16(src));
-        src += src_stride;
+        srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+        src += 8;
+        srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
       }
     }
   }
 
   int y = 0;
   do {
-    srcs[next_row] = vreinterpret_s16_u16(vld1_u16(src));
-    src += src_stride;
+    srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
+                                      vget_low_s16(srcs[num_taps]));
 
-    const uint32x4_t sums = Sum2DVerticalTaps<num_taps>(srcs, taps);
+    const int16x8_t sum =
+        SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
     if (is_compound) {
-      const uint16x4_t results =
-          vmovn_u32(vqrshlq_u32(sums, v_inter_round_bits_vertical));
-      vst1_u16(dst16, results);
-      dst16 += dst_stride;
+      const uint16x8_t results = vreinterpretq_u16_s16(sum);
+      vst1q_u16(dst16, results);
+      dst16 += 4 << 1;
     } else {
-      const uint16x4_t first_shift =
-          vqrshrn_n_u32(sums, kInterRoundBitsVertical);
-      // |single_round_offset| == (1 << bitdepth) + (1 << (bitdepth - 1)) ==
-      // 384
-      const uint8x8_t results = vqmovn_u16(
-          vcombine_u16(vqsub_u16(first_shift, vdup_n_u16(384)), vdup_n_u16(0)));
+      const uint8x8_t results = vqmovun_s16(sum);
 
       StoreLo4(dst8, results);
       dst8 += dst_stride;
+      StoreHi4(dst8, results);
+      dst8 += dst_stride;
     }
 
-    srcs[0] = srcs[1];
+    srcs[0] = srcs[2];
     if (num_taps >= 4) {
-      srcs[1] = srcs[2];
-      srcs[2] = srcs[3];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
       if (num_taps >= 6) {
-        srcs[3] = srcs[4];
-        srcs[4] = srcs[5];
+        srcs[3] = srcs[5];
+        srcs[4] = srcs[6];
         if (num_taps == 8) {
-          srcs[5] = srcs[6];
-          srcs[6] = srcs[7];
+          srcs[5] = srcs[7];
+          srcs[6] = srcs[8];
         }
       }
     }
-  } while (++y < height);
+    y += 2;
+  } while (y < height);
 }
 
-template <bool is_2d = false, bool is_8bit = false>
-void HorizontalPass(const uint8_t* const src, const ptrdiff_t src_stride,
-                    void* const dst, const ptrdiff_t dst_stride,
-                    const int width, const int height, const int subpixel,
-                    const int filter_index) {
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+                         const ptrdiff_t dst_stride, const int height,
+                         const int16x8_t taps) {
+  constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  int16x8_t srcs[9];
+  srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+  src += 8;
+  if (num_taps >= 6) {
+    srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+    if (num_taps == 8) {
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+    }
+  }
+
+  int y = 0;
+  do {
+    srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+    src += 8;
+    if (num_taps == 2) {
+      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+    } else if (num_taps == 4) {
+      srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+    } else if (num_taps == 6) {
+      srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+      srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+    } else if (num_taps == 8) {
+      srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+      srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
+      srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
+    }
+
+    const int16x8_t sum =
+        SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+    const uint8x8_t results = vqmovun_s16(sum);
+
+    Store2<0>(dst8, results);
+    dst8 += dst_stride;
+    Store2<1>(dst8, results);
+    // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+    // Therefore we don't need to check this condition when |height| > 4.
+    if (num_taps <= 4 && height == 2) return;
+    dst8 += dst_stride;
+    Store2<2>(dst8, results);
+    dst8 += dst_stride;
+    Store2<3>(dst8, results);
+    dst8 += dst_stride;
+
+    srcs[0] = srcs[4];
+    if (num_taps == 6) {
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+    } else if (num_taps == 8) {
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+    }
+
+    y += 4;
+  } while (y < height);
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+    const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+    const ptrdiff_t dst_stride, const int width, const int height,
+    const int subpixel, const int filter_index) {
   // Duplicate the absolute value for each tap.  Negative taps are corrected
   // by using the vmlsl_u8 instruction.  Positive taps use vmlal_u8.
   uint8x8_t v_tap[kSubPixelTaps];
   const int filter_id = (subpixel >> 6) & kSubPixelMask;
+  assert(filter_id != 0);
+
   for (int k = 0; k < kSubPixelTaps; ++k) {
-    v_tap[k] = vreinterpret_u8_s8(
-        vabs_s8(vdup_n_s8(kSubPixelFilters[filter_index][filter_id][k])));
+    v_tap[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
   }
 
   if (filter_index == 2) {  // 8 tap.
-    ConvolveCompoundHorizontalBlock<8, 8, 2, true, is_2d, is_8bit>(
+    FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
         src, src_stride, dst, dst_stride, width, height, v_tap);
   } else if (filter_index == 1) {  // 6 tap.
     // Check if outside taps are positive.
     if ((filter_id == 1) | (filter_id == 15)) {
-      ConvolveCompoundHorizontalBlock<6, 8, 1, false, is_2d, is_8bit>(
+      FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
           src, src_stride, dst, dst_stride, width, height, v_tap);
     } else {
-      ConvolveCompoundHorizontalBlock<6, 8, 1, true, is_2d, is_8bit>(
+      FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
           src, src_stride, dst, dst_stride, width, height, v_tap);
     }
   } else if (filter_index == 0) {  // 6 tap.
-    ConvolveCompoundHorizontalBlock<6, 8, 0, true, is_2d, is_8bit>(
+    FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
         src, src_stride, dst, dst_stride, width, height, v_tap);
   } else if (filter_index == 4) {  // 4 tap.
-    ConvolveCompoundHorizontalBlock<4, 8, 4, true, is_2d, is_8bit>(
+    FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
         src, src_stride, dst, dst_stride, width, height, v_tap);
   } else if (filter_index == 5) {  // 4 tap.
-    ConvolveCompoundHorizontalBlock<4, 8, 5, true, is_2d, is_8bit>(
+    FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
         src, src_stride, dst, dst_stride, width, height, v_tap);
   } else {  // 2 tap.
-    ConvolveCompoundHorizontalBlock<2, 8, 3, true, is_2d, is_8bit>(
+    FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
         src, src_stride, dst, dst_stride, width, height, v_tap);
   }
 }
 
-// There are three forms of this function:
-// 2D: input 8bit, output 16bit. |is_compound| has no effect.
-// 1D Horizontal: input 8bit, output 8bit.
-// 1D Compound Horizontal: input 8bit, output 16bit. Different rounding from 2D.
-// |width| is guaranteed to be 2 because all other cases are handled in neon.
-template <bool is_2d = true, bool is_compound = false>
-void HorizontalPass2xH(const uint8_t* src, const ptrdiff_t src_stride,
-                       void* const dst, const ptrdiff_t dst_stride,
-                       const int height, const int filter_index, const int taps,
-                       const int subpixel) {
-  // Even though |is_compound| has no effect when |is_2d| is true we block this
-  // combination in case the compiler gets confused.
-  static_assert(!is_2d || !is_compound, "|is_compound| is ignored.");
-  // Since this only handles |width| == 2, we only need to be concerned with
-  // 2 or 4 tap filters.
-  assert(taps == 2 || taps == 4);
-  auto* dst8 = static_cast<uint8_t*>(dst);
-  auto* dst16 = static_cast<uint16_t*>(dst);
-
-  const int compound_round_offset =
-      (1 << (kBitdepth8 + 4)) + (1 << (kBitdepth8 + 3));
-
-  const int filter_id = (subpixel >> 6) & kSubPixelMask;
-  const int taps_start = (kSubPixelTaps - taps) / 2;
-  int y = 0;
-  do {
-    int x = 0;
-    do {
-      int sum;
-      if (is_2d) {
-        // An offset to guarantee the sum is non negative.
-        sum = 1 << (kBitdepth8 + kFilterBits - 1);
-      } else if (is_compound) {
-        sum = 0;
-      } else {
-        // 1D non-Compound. The C uses a two stage shift with rounding. Here the
-        // shifts are combined and the rounding bit from the first stage is
-        // added in.
-        // (sum + 4 >> 3) + 8) >> 4 == (sum + 64 + 4) >> 7
-        sum = 4;
-      }
-      for (int k = 0; k < taps; ++k) {
-        const int tap = k + taps_start;
-        sum += kSubPixelFilters[filter_index][filter_id][tap] * src[x + k];
-      }
-      if (is_2d) {
-        dst16[x] = static_cast<int16_t>(
-            RightShiftWithRounding(sum, kInterRoundBitsHorizontal));
-      } else if (is_compound) {
-        sum = RightShiftWithRounding(sum, kInterRoundBitsHorizontal);
-        dst16[x] = sum + compound_round_offset;
-      } else {
-        // 1D non-Compound.
-        dst8[x] = static_cast<uint8_t>(
-            Clip3(RightShiftWithRounding(sum, kFilterBits), 0, 255));
-      }
-    } while (++x < 2);
-
-    src += src_stride;
-    dst8 += dst_stride;
-    dst16 += dst_stride;
-  } while (++y < height);
-}
-
-// This will always need to handle all |filter_index| values. Even with |width|
-// restricted to 2 the value of |height| can go up to at least 16.
-template <bool is_2d = true, bool is_compound = false>
-void VerticalPass2xH(const void* const src, const ptrdiff_t src_stride,
-                     void* const dst, const ptrdiff_t dst_stride,
-                     const int height, const int inter_round_bits_vertical,
-                     const int filter_index, const int taps,
-                     const int subpixel) {
-  const auto* src8 = static_cast<const uint8_t*>(src);
-  const auto* src16 = static_cast<const uint16_t*>(src);
-  auto* dst8 = static_cast<uint8_t*>(dst);
-  auto* dst16 = static_cast<uint16_t*>(dst);
-  const int filter_id = (subpixel >> 6) & kSubPixelMask;
-  const int taps_start = (kSubPixelTaps - taps) / 2;
-  constexpr int max_pixel_value = (1 << kBitdepth8) - 1;
-
-  int y = 0;
-  do {
-    int x = 0;
-    do {
-      int sum;
-      if (is_2d) {
-        sum = 1 << (kBitdepth8 + 2 * kFilterBits - kInterRoundBitsHorizontal);
-      } else if (is_compound) {
-        // TODO(johannkoenig): Keeping the sum positive is valuable for neon but
-        // may not actually help the C implementation. Investigate removing
-        // this.
-        // Use this offset to cancel out 1 << (kBitdepth8 + 3) >> 3 from
-        // |compound_round_offset|.
-        sum = (1 << (kBitdepth8 + 3)) << 3;
-      } else {
-        sum = 0;
-      }
-
-      for (int k = 0; k < taps; ++k) {
-        const int tap = k + taps_start;
-        if (is_2d) {
-          sum += kSubPixelFilters[filter_index][filter_id][tap] *
-                 src16[x + k * src_stride];
-        } else {
-          sum += kSubPixelFilters[filter_index][filter_id][tap] *
-                 src8[x + k * src_stride];
-        }
-      }
-
-      if (is_2d) {
-        if (is_compound) {
-          dst16[x] = static_cast<uint16_t>(
-              RightShiftWithRounding(sum, inter_round_bits_vertical));
-        } else {
-          constexpr int single_round_offset =
-              (1 << kBitdepth8) + (1 << (kBitdepth8 - 1));
-          dst8[x] = static_cast<uint8_t>(
-              Clip3(RightShiftWithRounding(sum, kInterRoundBitsVertical) -
-                        single_round_offset,
-                    0, max_pixel_value));
-        }
-      } else if (is_compound) {
-        // Leave off + 1 << (kBitdepth8 + 3).
-        constexpr int compound_round_offset = 1 << (kBitdepth8 + 4);
-        dst16[x] = RightShiftWithRounding(sum, 3) + compound_round_offset;
-      } else {
-        // 1D non-compound.
-        dst8[x] = static_cast<uint8_t>(Clip3(
-            RightShiftWithRounding(sum, kFilterBits), 0, max_pixel_value));
-      }
-    } while (++x < 2);
-
-    src8 += src_stride;
-    src16 += src_stride;
-    dst8 += dst_stride;
-    dst16 += dst_stride;
-  } while (++y < height);
-}
-
-int NumTapsInFilter(const int filter_index) {
+int GetNumTapsInFilter(const int filter_index) {
   if (filter_index < 2) {
     // Despite the names these only use 6 taps.
     // kInterpolationFilterEightTap
@@ -930,255 +732,135 @@
 void Convolve2D_NEON(const void* const reference,
                      const ptrdiff_t reference_stride,
                      const int horizontal_filter_index,
-                     const int vertical_filter_index,
-                     const int /*inter_round_bits_vertical*/,
-                     const int subpixel_x, const int subpixel_y,
-                     const int /*step_x*/, const int /*step_y*/,
-                     const int width, const int height, void* prediction,
-                     const ptrdiff_t pred_stride) {
+                     const int vertical_filter_index, const int subpixel_x,
+                     const int subpixel_y, const int width, const int height,
+                     void* prediction, const ptrdiff_t pred_stride) {
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int horizontal_taps = NumTapsInFilter(horiz_filter_index);
-  const int vertical_taps = NumTapsInFilter(vert_filter_index);
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
 
   // The output of the horizontal filter is guaranteed to fit in 16 bits.
   uint16_t
       intermediate_result[kMaxSuperBlockSizeInPixels *
                           (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
-  const int intermediate_stride = width;
   const int intermediate_height = height + vertical_taps - 1;
 
-  if (width >= 4) {
-    const ptrdiff_t src_stride = reference_stride;
-    const auto* src = static_cast<const uint8_t*>(reference) -
-                      (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+  const ptrdiff_t src_stride = reference_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
 
-    HorizontalPass<true>(src, src_stride, intermediate_result,
-                         intermediate_stride, width, intermediate_height,
-                         subpixel_x, horiz_filter_index);
+  DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+                                   width, intermediate_height, subpixel_x,
+                                   horiz_filter_index);
 
-    // Vertical filter.
-    auto* dest = static_cast<uint8_t*>(prediction);
-    const ptrdiff_t dest_stride = pred_stride;
-    const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
-    const int16x8_t taps =
-        vld1q_s16(kSubPixelFilters[vert_filter_index][filter_id]);
+  // Vertical filter.
+  auto* dest = static_cast<uint8_t*>(prediction);
+  const ptrdiff_t dest_stride = pred_stride;
+  const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
+  assert(filter_id != 0);
 
-    if (vertical_taps == 8) {
-      Filter2DVertical<8>(intermediate_result, intermediate_stride, dest,
-                          dest_stride, width, height, taps, 0);
-    } else if (vertical_taps == 6) {
-      Filter2DVertical<6>(intermediate_result, intermediate_stride, dest,
-                          dest_stride, width, height, taps, 0);
-    } else if (vertical_taps == 4) {
-      Filter2DVertical<4>(intermediate_result, intermediate_stride, dest,
-                          dest_stride, width, height, taps, 0);
-    } else {  // |vertical_taps| == 2
-      Filter2DVertical<2>(intermediate_result, intermediate_stride, dest,
-                          dest_stride, width, height, taps, 0);
+  const int16x8_t taps =
+      vmovl_s8(vld1_s8(kHalfSubPixelFilters[vert_filter_index][filter_id]));
+
+  if (vertical_taps == 8) {
+    if (width == 2) {
+      Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
+                          taps);
     }
-  } else {
-    assert(width == 2);
-    // Horizontal filter.
-    const auto* const src = static_cast<const uint8_t*>(reference) -
-                            ((vertical_taps / 2) - 1) * reference_stride -
-                            ((horizontal_taps / 2) - 1);
-
-    HorizontalPass2xH(src, reference_stride, intermediate_result,
-                      intermediate_stride, intermediate_height,
-                      horiz_filter_index, horizontal_taps, subpixel_x);
-
-    // Vertical filter.
-    auto* dest = static_cast<uint8_t*>(prediction);
-    const ptrdiff_t dest_stride = pred_stride;
-
-    VerticalPass2xH(intermediate_result, intermediate_stride, dest, dest_stride,
-                    height, 0, vert_filter_index, vertical_taps, subpixel_y);
+  } else if (vertical_taps == 6) {
+    if (width == 2) {
+      Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
+                          taps);
+    }
+  } else if (vertical_taps == 4) {
+    if (width == 2) {
+      Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
+                          taps);
+    }
+  } else {  // |vertical_taps| == 2
+    if (width == 2) {
+      Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else if (width == 4) {
+      Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+                             taps);
+    } else {
+      Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
+                          taps);
+    }
   }
 }
 
-template <int tap_lane0, int tap_lane1>
-inline int16x8_t CombineFilterTapsLong(const int16x8_t sum,
-                                       const int16x8_t src0, int16x8_t src1,
-                                       int16x4_t taps0, int16x4_t taps1) {
-  int32x4_t sum_lo = vmovl_s16(vget_low_s16(sum));
-  int32x4_t sum_hi = vmovl_s16(vget_high_s16(sum));
-  const int16x8_t product0 = vmulq_lane_s16(src0, taps0, tap_lane0);
-  const int16x8_t product1 = vmulq_lane_s16(src1, taps1, tap_lane1);
-  const int32x4_t center_vals_lo =
-      vaddl_s16(vget_low_s16(product0), vget_low_s16(product1));
-  const int32x4_t center_vals_hi =
-      vaddl_s16(vget_high_s16(product0), vget_high_s16(product1));
-
-  sum_lo = vaddq_s32(sum_lo, center_vals_lo);
-  sum_hi = vaddq_s32(sum_hi, center_vals_hi);
-  return vcombine_s16(vrshrn_n_s32(sum_lo, 3), vrshrn_n_s32(sum_hi, 3));
-}
-
-// TODO(b/133525024): Replace usage of this function with version that uses
-// unsigned trick, once cl/263050071 is submitted.
-template <int num_taps>
-inline int16x8_t SumTapsCompound(const int16x8_t* const src,
-                                 const int16x8_t taps) {
-  int16x8_t sum = vdupq_n_s16(1 << (kBitdepth8 + kFilterBits - 1));
-  if (num_taps == 8) {
-    const int16x4_t taps_lo = vget_low_s16(taps);
-    const int16x4_t taps_hi = vget_high_s16(taps);
-    sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
-    sum = vmlaq_lane_s16(sum, src[1], taps_lo, 1);
-    sum = vmlaq_lane_s16(sum, src[2], taps_lo, 2);
-    sum = vmlaq_lane_s16(sum, src[5], taps_hi, 1);
-    sum = vmlaq_lane_s16(sum, src[6], taps_hi, 2);
-    sum = vmlaq_lane_s16(sum, src[7], taps_hi, 3);
-
-    // Center taps may sum to as much as 160, which pollutes the sign bit in
-    // int16 types.
-    sum = CombineFilterTapsLong<3, 0>(sum, src[3], src[4], taps_lo, taps_hi);
-  } else if (num_taps == 6) {
-    const int16x4_t taps_lo = vget_low_s16(taps);
-    const int16x4_t taps_hi = vget_high_s16(taps);
-    sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
-    sum = vmlaq_lane_s16(sum, src[1], taps_lo, 1);
-    sum = vmlaq_lane_s16(sum, src[4], taps_hi, 0);
-    sum = vmlaq_lane_s16(sum, src[5], taps_hi, 1);
-
-    // Center taps in filter 0 may sum to as much as 148, which pollutes the
-    // sign bit in int16 types. This is not true of filter 1.
-    sum = CombineFilterTapsLong<2, 3>(sum, src[2], src[3], taps_lo, taps_lo);
-  } else if (num_taps == 4) {
-    const int16x4_t taps_lo = vget_low_s16(taps);
-    sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
-    sum = vmlaq_lane_s16(sum, src[3], taps_lo, 3);
-
-    // Center taps.
-    sum = vqaddq_s16(sum, vmulq_lane_s16(src[1], taps_lo, 1));
-    sum = vrshrq_n_s16(vqaddq_s16(sum, vmulq_lane_s16(src[2], taps_lo, 2)),
-                       kInterRoundBitsHorizontal);
-  } else {
-    assert(num_taps == 2);
-    // All the taps are positive so there is no concern regarding saturation.
-    const int16x4_t taps_lo = vget_low_s16(taps);
-    sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
-    sum = vrshrq_n_s16(vmlaq_lane_s16(sum, src[1], taps_lo, 1),
-                       kInterRoundBitsHorizontal);
+// There are many opportunities for overreading in scaled convolve, because the
+// range of starting points for filter windows is anywhere from 0 to 16 for 8
+// destination pixels, and the window sizes range from 2 to 8. To accommodate
+// this range concisely, we use |grade_x| to mean the most steps in src that can
+// be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2,
+// we are guaranteed to exceed 8 whole steps in src for every 8 |step_x|
+// increments. The first load covers the initial elements of src_x, while the
+// final load covers the taps.
+template <int grade_x>
+inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
+  uint8x8x3_t ret;
+  const uint8x16_t src_val = vld1q_u8(src_x);
+  ret.val[0] = vget_low_u8(src_val);
+  ret.val[1] = vget_high_u8(src_val);
+  if (grade_x > 1) {
+    ret.val[2] = vld1_u8(src_x + 16);
   }
-  return sum;
+  return ret;
 }
 
-// |grade_x| determines an upper limit on how many whole-pixel steps will be
-// realized with 8 |step_x| increments.
-template <int filter_index, int num_taps, int grade_x>
-inline void ConvolveHorizontalScaled_NEON(const uint8_t* src,
-                                          const ptrdiff_t src_stride,
-                                          const int width, const int subpixel_x,
-                                          const int step_x,
-                                          const int intermediate_height,
-                                          int16_t* dst) {
-  const int dst_stride = kMaxSuperBlockSizeInPixels;
-  const int kernel_offset = (8 - num_taps) / 2;
-  const int ref_x = subpixel_x >> kScaleSubPixelBits;
-  int y = intermediate_height;
-  do {  // y > 0
-    int p = subpixel_x;
-    int prev_p = p;
-    int x = 0;
-    int16x8_t s[(grade_x + 1) * 8];
-    const uint8_t* src_x =
-        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
-    // TODO(petersonab,b/139707209): Fix source buffer overreads.
-    // For example, when |height| == 2 and |num_taps| == 8 then
-    // |intermediate_height| == 9. On the second pass this will load and
-    // transpose 7 rows past where |src| may end.
-    Load8x8(src_x, src_stride, s);
-    Transpose8x8(s);
-    if (grade_x > 1) {
-      Load8x8(src_x + 8, src_stride, &s[8]);
-      Transpose8x8(&s[8]);
-    }
-
-    do {  // x < width
-      int16x8_t result[8];
-      src_x = &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
-      // process 8 src_x steps
-      Load8x8(src_x + 8, src_stride, &s[8]);
-      Transpose8x8(&s[8]);
-      if (grade_x > 1) {
-        Load8x8(src_x + 16, src_stride, &s[16]);
-        Transpose8x8(&s[16]);
-      }
-      // Remainder after whole index increments.
-      int pixel_offset = p & ((1 << kScaleSubPixelBits) - 1);
-      for (int z = 0; z < 8; ++z) {
-        const int16x8_t filter = vld1q_s16(
-            &kSubPixelFilters[filter_index][(p >> 6) & 0xF][kernel_offset]);
-        result[z] = SumTapsCompound<num_taps>(
-            &s[pixel_offset >> kScaleSubPixelBits], filter);
-        pixel_offset += step_x;
-        p += step_x;
-      }
-
-      // Transpose the 8x8 filtered values back to dst.
-      Transpose8x8(result);
-
-      vst1q_s16(&dst[x + 0 * dst_stride], result[0]);
-      vst1q_s16(&dst[x + 1 * dst_stride], result[1]);
-      vst1q_s16(&dst[x + 2 * dst_stride], result[2]);
-      vst1q_s16(&dst[x + 3 * dst_stride], result[3]);
-      vst1q_s16(&dst[x + 4 * dst_stride], result[4]);
-      vst1q_s16(&dst[x + 5 * dst_stride], result[5]);
-      vst1q_s16(&dst[x + 6 * dst_stride], result[6]);
-      vst1q_s16(&dst[x + 7 * dst_stride], result[7]);
-
-      for (int i = 0; i < 8; ++i) {
-        s[i] =
-            s[(p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits) + i];
-        if (grade_x > 1) {
-          s[i + 8] = s[(p >> kScaleSubPixelBits) -
-                       (prev_p >> kScaleSubPixelBits) + i + 8];
-        }
-      }
-
-      prev_p = p;
-      x += 8;
-    } while (x < width);
-
-    src += src_stride * 8;
-    dst += dst_stride * 8;
-    y -= 8;
-  } while (y > 0);
-}
-
+// Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3]
 inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
   assert(tap_index < 2);
-  constexpr uint8_t kSubPixel2TapFilterColumns[2][16] = {
-      {128, 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8},
-      {0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120}};
+  alignas(
+      16) static constexpr uint8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
+      {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+      {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
 
-  return vld1q_u8(kSubPixel2TapFilterColumns[tap_index]);
+  return vld1q_u8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
 }
 
+template <int grade_x>
 inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
                                          const ptrdiff_t src_stride,
                                          const int width, const int subpixel_x,
                                          const int step_x,
                                          const int intermediate_height,
                                          int16_t* intermediate) {
-  const int kIntermediateStride = kMaxSuperBlockSizeInPixels;
   // Account for the 0-taps that precede the 2 nonzero taps.
   const int kernel_offset = 3;
   const int ref_x = subpixel_x >> kScaleSubPixelBits;
   const int step_x8 = step_x << 3;
   const uint8x16_t filter_taps0 = GetPositive2TapFilter(0);
   const uint8x16_t filter_taps1 = GetPositive2TapFilter(1);
-  const uint16x8_t sum = vdupq_n_u16(1 << (kBitdepth8 + kFilterBits - 1));
-  uint16x8_t index_steps = vmulq_n_u16(vmovl_u8(vcreate_u8(0x0706050403020100)),
-                                       static_cast<uint16_t>(step_x));
-
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
   const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
-  for (int x = 0, p = subpixel_x; x < width; x += 8, p += step_x8) {
+
+  int p = subpixel_x;
+  if (width <= 4) {
     const uint8_t* src_x =
         &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
-    int16_t* intermediate_x = intermediate + x;
     // Only add steps to the 10-bit truncated p to avoid overflow.
     const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
     const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
@@ -1189,45 +871,86 @@
     // For each x, a lane of tapsK has
     // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
     // on x.
-    const uint8x8_t taps0 = VQTbl1U8(filter_taps0, filter_indices);
-    const uint8x8_t taps1 = VQTbl1U8(filter_taps1, filter_indices);
-    for (int y = 0; y < intermediate_height; ++y) {
+    const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+                               VQTbl1U8(filter_taps1, filter_indices)};
+    int y = 0;
+    do {
       // Load a pool of samples to select from using stepped indices.
-      uint8x16_t src_vals = vld1q_u8(src_x);
+      const uint8x16_t src_vals = vld1q_u8(src_x);
       const uint8x8_t src_indices =
           vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
 
       // For each x, a lane of srcK contains src_x[k].
-      const uint8x8_t src0 = VQTbl1U8(src_vals, src_indices);
-      const uint8x8_t src1 =
-          VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)));
+      const uint8x8_t src[2] = {
+          VQTbl1U8(src_vals, src_indices),
+          VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
 
-      const uint16x8_t product0 = vmlal_u8(sum, taps0, src0);
-      // product0 + product1
-      const uint16x8_t result = vmlal_u8(product0, taps1, src1);
+      vst1q_s16(intermediate,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate += kIntermediateStride;
+    } while (++y < intermediate_height);
+    return;
+  }
 
-      vst1q_s16(intermediate_x, vreinterpretq_s16_u16(vrshrq_n_u16(result, 3)));
+  // |width| >= 8
+  int x = 0;
+  do {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    int16_t* intermediate_x = intermediate + x;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // This is a special case. The 2-tap filter has no negative taps, so we
+    // can use unsigned values.
+    // For each x, a lane of tapsK has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+                               VQTbl1U8(filter_taps1, filter_indices)};
+    int y = 0;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+      const uint8x8_t src_indices =
+          vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+
+      // For each x, a lane of srcK contains src_x[k].
+      const uint8x8_t src[2] = {
+          vtbl3_u8(src_vals, src_indices),
+          vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
+
+      vst1q_s16(intermediate_x,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
       src_x += src_stride;
       intermediate_x += kIntermediateStride;
-    }
-  }
+    } while (++y < intermediate_height);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
 }
 
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
 inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
   assert(tap_index < 4);
-  constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
-      {0, 30, 26, 22, 20, 18, 16, 14, 12, 12, 10, 8, 6, 4, 4, 2},
-      {128, 62, 62, 62, 60, 58, 56, 54, 52, 48, 46, 44, 42, 40, 36, 34},
-      {0, 34, 36, 40, 42, 44, 46, 48, 52, 54, 56, 58, 60, 62, 62, 62},
-      {0, 2, 4, 4, 6, 8, 10, 12, 12, 14, 16, 18, 20, 22, 26, 30}};
+  alignas(
+      16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+      {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+      {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+      {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+      {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
 
-  uint8x16_t filter_taps =
-      vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
-  return filter_taps;
+  return vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
 }
 
 // This filter is only possible when width <= 4.
-inline void ConvolveKernelHorizontalPositive4Tap(
+void ConvolveKernelHorizontalPositive4Tap(
     const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
     const int step_x, const int intermediate_height, int16_t* intermediate) {
   const int kernel_offset = 2;
@@ -1237,69 +960,60 @@
   const uint8x16_t filter_taps1 = GetPositive4TapFilter(1);
   const uint8x16_t filter_taps2 = GetPositive4TapFilter(2);
   const uint8x16_t filter_taps3 = GetPositive4TapFilter(3);
-  uint16x8_t index_steps = vmulq_n_u16(vmovl_u8(vcreate_u8(0x0706050403020100)),
-                                       static_cast<uint16_t>(step_x));
-  int p = subpixel_x;
-  const uint16x8_t base = vdupq_n_u16(1 << (kBitdepth8 + kFilterBits - 1));
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+  const int p = subpixel_x;
   // First filter is special, just a 128 tap on the center.
   const uint8_t* src_x =
       &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
   // Only add steps to the 10-bit truncated p to avoid overflow.
   const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
   const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
-  const uint8x8_t filter_indices =
-      vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
+  const uint8x8_t filter_indices = vand_u8(
+      vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask);
   // Note that filter_id depends on x.
   // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
-  const uint8x8_t taps0 = VQTbl1U8(filter_taps0, filter_indices);
-  const uint8x8_t taps1 = VQTbl1U8(filter_taps1, filter_indices);
-  const uint8x8_t taps2 = VQTbl1U8(filter_taps2, filter_indices);
-  const uint8x8_t taps3 = VQTbl1U8(filter_taps3, filter_indices);
+  const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+                             VQTbl1U8(filter_taps1, filter_indices),
+                             VQTbl1U8(filter_taps2, filter_indices),
+                             VQTbl1U8(filter_taps3, filter_indices)};
 
   const uint8x8_t src_indices =
       vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
-  for (int y = 0; y < intermediate_height; ++y) {
+  int y = 0;
+  do {
     // Load a pool of samples to select from using stepped index vectors.
-    uint8x16_t src_vals = vld1q_u8(src_x);
+    const uint8x16_t src_vals = vld1q_u8(src_x);
 
     // For each x, srcK contains src_x[k] where k=1.
     // Whereas taps come from different arrays, src pixels are drawn from the
     // same contiguous line.
-    const uint8x8_t src0 = VQTbl1U8(src_vals, src_indices);
-    const uint8x8_t src1 =
-        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)));
-    const uint8x8_t src2 =
-        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2)));
-    const uint8x8_t src3 =
-        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)));
+    const uint8x8_t src[4] = {
+        VQTbl1U8(src_vals, src_indices),
+        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1))),
+        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2))),
+        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)))};
 
-    uint16x8_t sum = vmlal_u8(base, taps0, src0);
-    sum = vmlal_u8(sum, taps1, src1);
-    sum = vmlal_u8(sum, taps2, src2);
-    sum = vmlal_u8(sum, taps3, src3);
-
-    vst1_s16(intermediate,
-             vreinterpret_s16_u16(vrshr_n_u16(vget_low_u16(sum), 3)));
+    vst1q_s16(intermediate,
+              vrshrq_n_s16(SumOnePassTaps</*filter_index=*/5>(src, taps),
+                           kInterRoundBitsHorizontal - 1));
 
     src_x += src_stride;
     intermediate += kIntermediateStride;
-  }
+  } while (++y < intermediate_height);
 }
 
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
 inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
   assert(tap_index < 4);
-  // The first and fourth taps of each filter are negative. However
-  // 128 does not fit in an 8-bit signed integer. Thus we use subtraction to
-  // keep everything unsigned.
-  constexpr uint8_t kSubPixel4TapSignedFilterColumns[4][16] = {
-      {0, 4, 8, 10, 12, 12, 14, 12, 12, 10, 10, 10, 8, 6, 4, 2},
-      {128, 126, 122, 116, 110, 102, 94, 84, 76, 66, 58, 48, 38, 28, 18, 8},
-      {0, 8, 18, 28, 38, 48, 58, 66, 76, 84, 94, 102, 110, 116, 122, 126},
-      {0, 2, 4, 6, 8, 10, 10, 10, 12, 12, 14, 12, 12, 10, 8, 4}};
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
+          {0, 2, 4, 5, 6, 6, 7, 6, 6, 5, 5, 5, 4, 3, 2, 1},
+          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+          {0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 6, 6, 5, 4, 2}};
 
-  uint8x16_t filter_taps =
-      vld1q_u8(kSubPixel4TapSignedFilterColumns[tap_index]);
-  return filter_taps;
+  return vld1q_u8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
 }
 
 // This filter is only possible when width <= 4.
@@ -1313,66 +1027,480 @@
   const uint8x16_t filter_taps1 = GetSigned4TapFilter(1);
   const uint8x16_t filter_taps2 = GetSigned4TapFilter(2);
   const uint8x16_t filter_taps3 = GetSigned4TapFilter(3);
-  const uint16x8_t index_steps = vmulq_n_u16(vmovl_u8(vcreate_u8(0x03020100)),
-                                             static_cast<uint16_t>(step_x));
+  const uint16x4_t index_steps = vmul_n_u16(vcreate_u16(0x0003000200010000),
+                                            static_cast<uint16_t>(step_x));
 
-  const uint16x8_t base = vdupq_n_u16(1 << (kBitdepth8 + kFilterBits - 1));
-  int p = subpixel_x;
+  const int p = subpixel_x;
   const uint8_t* src_x =
       &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
   // Only add steps to the 10-bit truncated p to avoid overflow.
-  const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
-  const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+  const uint16x4_t p_fraction = vdup_n_u16(p & 1023);
+  const uint16x4_t subpel_index_offsets = vadd_u16(index_steps, p_fraction);
+  const uint8x8_t filter_index_offsets = vshrn_n_u16(
+      vcombine_u16(subpel_index_offsets, vdup_n_u16(0)), kFilterIndexShift);
   const uint8x8_t filter_indices =
-      vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
+      vand_u8(filter_index_offsets, filter_index_mask);
   // Note that filter_id depends on x.
   // For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
-  const uint8x8_t taps0 = VQTbl1U8(filter_taps0, filter_indices);
-  const uint8x8_t taps1 = VQTbl1U8(filter_taps1, filter_indices);
-  const uint8x8_t taps2 = VQTbl1U8(filter_taps2, filter_indices);
-  const uint8x8_t taps3 = VQTbl1U8(filter_taps3, filter_indices);
-  for (int y = 0; y < intermediate_height; ++y) {
+  const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+                             VQTbl1U8(filter_taps1, filter_indices),
+                             VQTbl1U8(filter_taps2, filter_indices),
+                             VQTbl1U8(filter_taps3, filter_indices)};
+
+  const uint8x8_t src_indices_base =
+      vshr_n_u8(filter_index_offsets, kScaleSubPixelBits - kFilterIndexShift);
+
+  const uint8x8_t src_indices[4] = {src_indices_base,
+                                    vadd_u8(src_indices_base, vdup_n_u8(1)),
+                                    vadd_u8(src_indices_base, vdup_n_u8(2)),
+                                    vadd_u8(src_indices_base, vdup_n_u8(3))};
+
+  int y = 0;
+  do {
     // Load a pool of samples to select from using stepped indices.
-    uint8x16_t src_vals = vld1q_u8(src_x);
-    const uint8x8_t src_indices =
-        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+    const uint8x16_t src_vals = vld1q_u8(src_x);
 
     // For each x, srcK contains src_x[k] where k=1.
     // Whereas taps come from different arrays, src pixels are drawn from the
     // same contiguous line.
-    const uint8x8_t src0 = VQTbl1U8(src_vals, src_indices);
-    const uint8x8_t src1 =
-        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)));
-    const uint8x8_t src2 =
-        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2)));
-    const uint8x8_t src3 =
-        VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)));
+    const uint8x8_t src[4] = {
+        VQTbl1U8(src_vals, src_indices[0]), VQTbl1U8(src_vals, src_indices[1]),
+        VQTbl1U8(src_vals, src_indices[2]), VQTbl1U8(src_vals, src_indices[3])};
 
-    // Offsetting by base permits a guaranteed positive.
-    uint16x8_t sum = vmlsl_u8(base, taps0, src0);
-    sum = vmlal_u8(sum, taps1, src1);
-    sum = vmlal_u8(sum, taps2, src2);
-    sum = vmlsl_u8(sum, taps3, src3);
-
-    vst1_s16(intermediate,
-             vreinterpret_s16_u16(vrshr_n_u16(vget_low_u16(sum), 3)));
+    vst1q_s16(intermediate,
+              vrshrq_n_s16(SumOnePassTaps</*filter_index=*/4>(src, taps),
+                           kInterRoundBitsHorizontal - 1));
     src_x += src_stride;
     intermediate += kIntermediateStride;
-  }
+  } while (++y < intermediate_height);
 }
 
-void ConvolveCompoundScale2D_NEON(
-    const void* const reference, const ptrdiff_t reference_stride,
-    const int horizontal_filter_index, const int vertical_filter_index,
-    const int inter_round_bits_vertical, const int subpixel_x,
-    const int subpixel_y, const int step_x, const int step_y, const int width,
-    const int height, void* prediction, const ptrdiff_t pred_stride) {
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
+inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
+  assert(tap_index < 6);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
+          {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+          {0, 3, 5, 6, 7, 7, 8, 7, 7, 6, 6, 6, 5, 4, 2, 1},
+          {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+          {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+          {0, 1, 2, 4, 5, 6, 6, 6, 7, 7, 8, 7, 7, 6, 5, 3},
+          {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+  return vld1q_u8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned6Tap(
+    const uint8_t* src, const ptrdiff_t src_stride, const int width,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* intermediate) {
+  const int kernel_offset = 1;
+  const uint8x8_t one = vdup_n_u8(1);
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  uint8x16_t filter_taps[6];
+  for (int i = 0; i < 6; ++i) {
+    filter_taps[i] = GetSigned6TapFilter(i);
+  }
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    // Avoid overreading outside the reference boundaries. This means
+    // |trailing_width| can be up to 24.
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    int16_t* intermediate_x = intermediate + x;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t src_indices =
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+    uint8x8_t src_lookup[6];
+    src_lookup[0] = src_indices;
+    for (int i = 1; i < 6; ++i) {
+      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+    }
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // For each x, a lane of taps[k] has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    uint8x8_t taps[6];
+    for (int i = 0; i < 6; ++i) {
+      taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+    }
+    int y = 0;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+      const uint8x8_t src[6] = {
+          vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+          vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+          vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5])};
+
+      vst1q_s16(intermediate_x,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/0>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (++y < intermediate_height);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
+// has mixed positive and negative outer taps which are handled in
+// GetMixed6TapFilter().
+inline uint8x16_t GetPositive6TapFilter(const int tap_index) {
+  assert(tap_index < 6);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel6TapPositiveFilterColumns[4][16] = {
+          {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+          {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+          {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+          {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}};
+
+  return vld1q_u8(kAbsHalfSubPixel6TapPositiveFilterColumns[tap_index]);
+}
+
+inline int8x16_t GetMixed6TapFilter(const int tap_index) {
+  assert(tap_index < 2);
+  alignas(
+      16) static constexpr int8_t kHalfSubPixel6TapMixedFilterColumns[2][16] = {
+      {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+      {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+
+  return vld1q_s8(kHalfSubPixel6TapMixedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalMixed6Tap(
+    const uint8_t* src, const ptrdiff_t src_stride, const int width,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* intermediate) {
+  const int kernel_offset = 1;
+  const uint8x8_t one = vdup_n_u8(1);
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  uint8x8_t taps[4];
+  int16x8_t mixed_taps[2];
+  uint8x16_t positive_filter_taps[4];
+  for (int i = 0; i < 4; ++i) {
+    positive_filter_taps[i] = GetPositive6TapFilter(i);
+  }
+  int8x16_t mixed_filter_taps[2];
+  mixed_filter_taps[0] = GetMixed6TapFilter(0);
+  mixed_filter_taps[1] = GetMixed6TapFilter(1);
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    const uint8_t* src_x =
+        &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+    int16_t* intermediate_x = intermediate + x;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t src_indices =
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+    uint8x8_t src_lookup[6];
+    src_lookup[0] = src_indices;
+    for (int i = 1; i < 6; ++i) {
+      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+    }
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // For each x, a lane of taps[k] has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    for (int i = 0; i < 4; ++i) {
+      taps[i] = VQTbl1U8(positive_filter_taps[i], filter_indices);
+    }
+    mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
+    mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
+
+    int y = 0;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+      int16x8_t sum_mixed = vmulq_s16(
+          mixed_taps[0], ZeroExtend(vtbl3_u8(src_vals, src_lookup[0])));
+      sum_mixed = vmlaq_s16(sum_mixed, mixed_taps[1],
+                            ZeroExtend(vtbl3_u8(src_vals, src_lookup[5])));
+      uint16x8_t sum = vreinterpretq_u16_s16(sum_mixed);
+      sum = vmlal_u8(sum, taps[0], vtbl3_u8(src_vals, src_lookup[1]));
+      sum = vmlal_u8(sum, taps[1], vtbl3_u8(src_vals, src_lookup[2]));
+      sum = vmlal_u8(sum, taps[2], vtbl3_u8(src_vals, src_lookup[3]));
+      sum = vmlal_u8(sum, taps[3], vtbl3_u8(src_vals, src_lookup[4]));
+
+      vst1q_s16(intermediate_x, vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+                                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (++y < intermediate_height);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
+inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
+  assert(tap_index < 8);
+  alignas(16) static constexpr uint8_t
+      kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
+          {0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0},
+          {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+          {0, 3, 6, 9, 11, 11, 12, 12, 12, 11, 10, 9, 7, 5, 3, 1},
+          {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+          {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+          {0, 1, 3, 5, 7, 9, 10, 11, 12, 12, 12, 11, 11, 9, 6, 3},
+          {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+          {0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1}};
+
+  return vld1q_u8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned8Tap(
+    const uint8_t* src, const ptrdiff_t src_stride, const int width,
+    const int subpixel_x, const int step_x, const int intermediate_height,
+    int16_t* intermediate) {
+  const uint8x8_t one = vdup_n_u8(1);
+  const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+  const int ref_x = subpixel_x >> kScaleSubPixelBits;
+  const int step_x8 = step_x << 3;
+  uint8x8_t taps[8];
+  uint8x16_t filter_taps[8];
+  for (int i = 0; i < 8; ++i) {
+    filter_taps[i] = GetSigned8TapFilter(i);
+  }
+  const uint16x8_t index_steps = vmulq_n_u16(
+      vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+  int x = 0;
+  int p = subpixel_x;
+  do {
+    const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+    int16_t* intermediate_x = intermediate + x;
+    // Only add steps to the 10-bit truncated p to avoid overflow.
+    const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+    const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+    const uint8x8_t src_indices =
+        vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+    uint8x8_t src_lookup[8];
+    src_lookup[0] = src_indices;
+    for (int i = 1; i < 8; ++i) {
+      src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+    }
+
+    const uint8x8_t filter_indices =
+        vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+                filter_index_mask);
+    // For each x, a lane of taps[k] has
+    // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+    // on x.
+    for (int i = 0; i < 8; ++i) {
+      taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+    }
+
+    int y = 0;
+    do {
+      // Load a pool of samples to select from using stepped indices.
+      const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+      const uint8x8_t src[8] = {
+          vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+          vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+          vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5]),
+          vtbl3_u8(src_vals, src_lookup[6]), vtbl3_u8(src_vals, src_lookup[7])};
+
+      vst1q_s16(intermediate_x,
+                vrshrq_n_s16(SumOnePassTaps</*filter_index=*/2>(src, taps),
+                             kInterRoundBitsHorizontal - 1));
+      src_x += src_stride;
+      intermediate_x += kIntermediateStride;
+    } while (++y < intermediate_height);
+    x += 8;
+    p += step_x8;
+  } while (x < width);
+}
+
+// This function handles blocks of width 2 or 4.
+template <int num_taps, int grade_y, int width, bool is_compound>
+void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
+                              const int filter_index, const int step_y,
+                              const int height, void* dest,
+                              const ptrdiff_t dest_stride) {
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+  const int16_t* src_y = src;
+  // |dest| is 16-bit in compound mode, Pixel otherwise.
+  uint16_t* dest16_y = static_cast<uint16_t*>(dest);
+  uint8_t* dest_y = static_cast<uint8_t*>(dest);
+  int16x4_t s[num_taps + grade_y];
+
+  int p = subpixel_y & 1023;
+  int prev_p = p;
+  int y = 0;
+  do {  // y < height
+    for (int i = 0; i < num_taps; ++i) {
+      s[i] = vld1_s16(src_y + i * src_stride);
+    }
+    int filter_id = (p >> 6) & kSubPixelMask;
+    int16x8_t filter =
+        vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+    int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
+    if (is_compound) {
+      assert(width != 2);
+      const uint16x4_t result = vreinterpret_u16_s16(sums);
+      vst1_u16(dest16_y, result);
+    } else {
+      const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+      if (width == 2) {
+        Store2<0>(dest_y, result);
+      } else {
+        StoreLo4(dest_y, result);
+      }
+    }
+    p += step_y;
+    const int p_diff =
+        (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+    prev_p = p;
+    // Here we load extra source in case it is needed. If |p_diff| == 0, these
+    // values will be unused, but it's faster to load than to branch.
+    s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
+    if (grade_y > 1) {
+      s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
+    }
+    dest16_y += dest_stride;
+    dest_y += dest_stride;
+
+    filter_id = (p >> 6) & kSubPixelMask;
+    filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+    sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
+    if (is_compound) {
+      assert(width != 2);
+      const uint16x4_t result = vreinterpret_u16_s16(sums);
+      vst1_u16(dest16_y, result);
+    } else {
+      const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+      if (width == 2) {
+        Store2<0>(dest_y, result);
+      } else {
+        StoreLo4(dest_y, result);
+      }
+    }
+    p += step_y;
+    src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+    prev_p = p;
+    dest16_y += dest_stride;
+    dest_y += dest_stride;
+
+    y += 2;
+  } while (y < height);
+}
+
+template <int num_taps, int grade_y, bool is_compound>
+inline void ConvolveVerticalScale(const int16_t* src, const int width,
+                                  const int subpixel_y, const int filter_index,
+                                  const int step_y, const int height,
+                                  void* dest, const ptrdiff_t dest_stride) {
+  constexpr ptrdiff_t src_stride = kIntermediateStride;
+  // A possible improvement is to use arithmetic to decide how many times to
+  // apply filters to same source before checking whether to load new srcs.
+  // However, this will only improve performance with very small step sizes.
+  int16x8_t s[num_taps + grade_y];
+  // |dest| is 16-bit in compound mode, Pixel otherwise.
+  uint16_t* dest16_y;
+  uint8_t* dest_y;
+
+  int x = 0;
+  do {  // x < width
+    const int16_t* src_x = src + x;
+    const int16_t* src_y = src_x;
+    dest16_y = static_cast<uint16_t*>(dest) + x;
+    dest_y = static_cast<uint8_t*>(dest) + x;
+    int p = subpixel_y & 1023;
+    int prev_p = p;
+    int y = 0;
+    do {  // y < height
+      for (int i = 0; i < num_taps; ++i) {
+        s[i] = vld1q_s16(src_y + i * src_stride);
+      }
+      int filter_id = (p >> 6) & kSubPixelMask;
+      int16x8_t filter =
+          vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+      int16x8_t sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
+      if (is_compound) {
+        vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+      } else {
+        vst1_u8(dest_y, vqmovun_s16(sum));
+      }
+      p += step_y;
+      const int p_diff =
+          (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+      // |grade_y| > 1 always means p_diff > 0, so load vectors that may be
+      // needed. Otherwise, we only need to load one vector because |p_diff|
+      // can't exceed 1.
+      s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
+      if (grade_y > 1) {
+        s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
+      }
+      dest16_y += dest_stride;
+      dest_y += dest_stride;
+
+      filter_id = (p >> 6) & kSubPixelMask;
+      filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+      sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
+      if (is_compound) {
+        vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+      } else {
+        vst1_u8(dest_y, vqmovun_s16(sum));
+      }
+      p += step_y;
+      src_y = src_x + (p >> kScaleSubPixelBits) * src_stride;
+      prev_p = p;
+      dest16_y += dest_stride;
+      dest_y += dest_stride;
+
+      y += 2;
+    } while (y < height);
+    x += 8;
+  } while (x < width);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_NEON(const void* const reference,
+                          const ptrdiff_t reference_stride,
+                          const int horizontal_filter_index,
+                          const int vertical_filter_index, const int subpixel_x,
+                          const int subpixel_y, const int step_x,
+                          const int step_y, const int width, const int height,
+                          void* prediction, const ptrdiff_t pred_stride) {
+  const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+  const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+  assert(step_x <= 2048);
+  const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
   const int intermediate_height =
       (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
        kScaleSubPixelBits) +
-      kSubPixelTaps;
-  // TODO(b/133525024): Decide whether it's worth branching to a special case
-  // when step_x or step_y is 1024.
+      num_vert_taps;
   assert(step_x <= 2048);
   // The output of the horizontal filter, i.e. the intermediate_result, is
   // guaranteed to fit in int16_t.
@@ -1384,49 +1512,71 @@
   // When width > 4, the valid filter index range is always [0, 3].
   // When width <= 4, the valid filter index range is always [3, 5].
   // Similarly for height.
-  const int kIntermediateStride = kMaxSuperBlockSizeInPixels;
   int filter_index = GetFilterIndex(horizontal_filter_index, width);
   int16_t* intermediate = intermediate_result;
-  const auto* src = static_cast<const uint8_t*>(reference);
   const ptrdiff_t src_stride = reference_stride;
-  auto* dest = static_cast<uint16_t*>(prediction);
+  const auto* src = static_cast<const uint8_t*>(reference);
+  const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+  src += vert_kernel_offset * src_stride;
+
+  // Derive the maximum value of |step_x| at which all source values fit in one
+  // 16-byte load. Final index is src_x + |num_taps| - 1 < 16
+  // step_x*7 is the final base subpel index for the shuffle mask for filter
+  // inputs in each iteration on large blocks. When step_x is large, we need a
+  // larger structure and use a larger table lookup in order to gather all
+  // filter inputs.
+  // |num_taps| - 1 is the shuffle index of the final filter input.
+  const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+  const int kernel_start_ceiling = 16 - num_horiz_taps;
+  // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+  // (step_x * 7) >> kScaleSubPixelBits < single load limit
+  const int grade_x_threshold =
+      (kernel_start_ceiling << kScaleSubPixelBits) / 7;
   switch (filter_index) {
     case 0:
-      if (step_x < 1024) {
-        ConvolveHorizontalScaled_NEON<0, 6, 1>(
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalSigned6Tap<2>(
             src, src_stride, width, subpixel_x, step_x, intermediate_height,
             intermediate);
       } else {
-        ConvolveHorizontalScaled_NEON<0, 6, 2>(
+        ConvolveKernelHorizontalSigned6Tap<1>(
             src, src_stride, width, subpixel_x, step_x, intermediate_height,
             intermediate);
       }
       break;
     case 1:
-      if (step_x < 1024) {
-        ConvolveHorizontalScaled_NEON<1, 6, 1>(
-            src, src_stride, width, subpixel_x, step_x, intermediate_height,
-            intermediate);
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
+                                             step_x, intermediate_height,
+                                             intermediate);
+
       } else {
-        ConvolveHorizontalScaled_NEON<1, 6, 2>(
-            src, src_stride, width, subpixel_x, step_x, intermediate_height,
-            intermediate);
+        ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x,
+                                             step_x, intermediate_height,
+                                             intermediate);
       }
       break;
     case 2:
-      if (step_x <= 1024) {
-        ConvolveHorizontalScaled_NEON<2, 8, 1>(
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontalSigned8Tap<2>(
             src, src_stride, width, subpixel_x, step_x, intermediate_height,
             intermediate);
       } else {
-        ConvolveHorizontalScaled_NEON<2, 8, 2>(
+        ConvolveKernelHorizontalSigned8Tap<1>(
             src, src_stride, width, subpixel_x, step_x, intermediate_height,
             intermediate);
       }
       break;
     case 3:
-      ConvolveKernelHorizontal2Tap(src, src_stride, width, subpixel_x, step_x,
-                                   intermediate_height, intermediate);
+      if (step_x > grade_x_threshold) {
+        ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
+                                        step_x, intermediate_height,
+                                        intermediate);
+      } else {
+        ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
+                                        step_x, intermediate_height,
+                                        intermediate);
+      }
       break;
     case 4:
       assert(width <= 4);
@@ -1441,23 +1591,135 @@
   // Vertical filter.
   filter_index = GetFilterIndex(vertical_filter_index, height);
   intermediate = intermediate_result;
-  const int offset_bits = kBitdepth8 + 2 * kFilterBits - 3;
-  for (int y = 0, p = subpixel_y & 1023; y < height; ++y, p += step_y) {
-    const int filter_id = (p >> 6) & kSubPixelMask;
-    for (int x = 0; x < width; ++x) {
-      // An offset to guarantee the sum is non negative.
-      int sum = 1 << offset_bits;
-      for (int k = 0; k < kSubPixelTaps; ++k) {
-        sum +=
-            kSubPixelFilters[filter_index][filter_id][k] *
-            intermediate[((p >> kScaleSubPixelBits) + k) * kIntermediateStride +
-                         x];
+
+  switch (filter_index) {
+    case 0:
+    case 1:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<6, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<6, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<6, 1, is_compound>(
+              intermediate, width, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<6, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<6, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<6, 2, is_compound>(
+              intermediate, width, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        }
       }
-      assert(sum >= 0 && sum < (1 << (offset_bits + 2)));
-      dest[x] = static_cast<uint16_t>(
-          RightShiftWithRounding(sum, inter_round_bits_vertical));
-    }
-    dest += pred_stride;
+      break;
+    case 2:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<8, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<8, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<8, 1, is_compound>(
+              intermediate, width, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<8, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<8, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<8, 2, is_compound>(
+              intermediate, width, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        }
+      }
+      break;
+    case 3:
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<2, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<2, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<2, 1, is_compound>(
+              intermediate, width, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<2, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<2, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<2, 2, is_compound>(
+              intermediate, width, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        }
+      }
+      break;
+    case 4:
+    default:
+      assert(filter_index == 4 || filter_index == 5);
+      assert(height <= 4);
+      if (step_y <= 1024) {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<4, 1, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<4, 1, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<4, 1, is_compound>(
+              intermediate, width, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        }
+      } else {
+        if (!is_compound && width == 2) {
+          ConvolveVerticalScale4xH<4, 2, 2, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else if (width == 4) {
+          ConvolveVerticalScale4xH<4, 2, 4, is_compound>(
+              intermediate, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        } else {
+          ConvolveVerticalScale<4, 2, is_compound>(
+              intermediate, width, subpixel_y, filter_index, step_y, height,
+              prediction, pred_stride);
+        }
+      }
   }
 }
 
@@ -1465,65 +1727,75 @@
                              const ptrdiff_t reference_stride,
                              const int horizontal_filter_index,
                              const int /*vertical_filter_index*/,
-                             const int /*inter_round_bits_vertical*/,
                              const int subpixel_x, const int /*subpixel_y*/,
-                             const int /*step_x*/, const int /*step_y*/,
                              const int width, const int height,
                              void* prediction, const ptrdiff_t pred_stride) {
-  // For 8 (and 10) bit calculations |inter_round_bits_horizontal| is 3.
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   // Set |src| to the outermost tap.
   const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
   auto* dest = static_cast<uint8_t*>(prediction);
 
-  HorizontalPass<false, true>(src, reference_stride, dest, pred_stride, width,
-                              height, subpixel_x, filter_index);
+  DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+                   subpixel_x, filter_index);
 }
 
-template <int min_width, int num_taps>
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+uint16x8_t Compound1DShift(const int16x8_t sum) {
+  return vreinterpretq_u16_s16(
+      vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
+
+template <int filter_index, bool is_compound = false,
+          bool negative_outside_taps = false>
 void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
-                    uint8_t* dst, const ptrdiff_t dst_stride, const int width,
-                    const int height, const int16x8_t taps) {
-  constexpr int next_row = num_taps - 1;
-  // |src| points to the outermost tap of the first value. When doing fewer than
-  // 8 taps it needs to be adjusted.
-  if (num_taps == 6) {
-    src += src_stride;
-  } else if (num_taps == 4) {
-    src += 2 * src_stride;
-  } else if (num_taps == 2) {
-    src += 3 * src_stride;
-  }
+                    void* const dst, const ptrdiff_t dst_stride,
+                    const int width, const int height,
+                    const uint8x8_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  const int next_row = num_taps - 1;
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+  assert(width >= 8);
 
   int x = 0;
   do {
-    int16x8_t srcs[8];
-    srcs[0] = ZeroExtend(vld1_u8(src + x));
+    const uint8_t* src_x = src + x;
+    uint8x8_t srcs[8];
+    srcs[0] = vld1_u8(src_x);
+    src_x += src_stride;
     if (num_taps >= 4) {
-      srcs[1] = ZeroExtend(vld1_u8(src + x + src_stride));
-      srcs[2] = ZeroExtend(vld1_u8(src + x + 2 * src_stride));
+      srcs[1] = vld1_u8(src_x);
+      src_x += src_stride;
+      srcs[2] = vld1_u8(src_x);
+      src_x += src_stride;
       if (num_taps >= 6) {
-        srcs[3] = ZeroExtend(vld1_u8(src + x + 3 * src_stride));
-        srcs[4] = ZeroExtend(vld1_u8(src + x + 4 * src_stride));
+        srcs[3] = vld1_u8(src_x);
+        src_x += src_stride;
+        srcs[4] = vld1_u8(src_x);
+        src_x += src_stride;
         if (num_taps == 8) {
-          srcs[5] = ZeroExtend(vld1_u8(src + x + 5 * src_stride));
-          srcs[6] = ZeroExtend(vld1_u8(src + x + 6 * src_stride));
+          srcs[5] = vld1_u8(src_x);
+          src_x += src_stride;
+          srcs[6] = vld1_u8(src_x);
+          src_x += src_stride;
         }
       }
     }
 
     int y = 0;
     do {
-      srcs[next_row] =
-          ZeroExtend(vld1_u8(src + x + (y + next_row) * src_stride));
+      srcs[next_row] = vld1_u8(src_x);
+      src_x += src_stride;
 
-      const int16x8_t sums = SumTaps<num_taps>(srcs, taps);
-      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits);
-
-      if (min_width == 4) {
-        StoreLo4(dst + x + y * dst_stride, results);
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+        vst1q_u16(dst16 + x + y * dst_stride, results);
       } else {
-        vst1_u8(dst + x + y * dst_stride, results);
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+        vst1_u8(dst8 + x + y * dst_stride, results);
       }
 
       srcs[0] = srcs[1];
@@ -1544,6 +1816,394 @@
   } while (x < width);
 }
 
+template <int filter_index, bool is_compound = false,
+          bool negative_outside_taps = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+                       void* const dst, const ptrdiff_t dst_stride,
+                       const int height, const uint8x8_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  auto* dst8 = static_cast<uint8_t*>(dst);
+  auto* dst16 = static_cast<uint16_t*>(dst);
+
+  uint8x8_t srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+
+    int y = 0;
+    do {
+      srcs[0] = Load4<1>(src, srcs[0]);
+      src += src_stride;
+      srcs[2] = Load4<0>(src, srcs[2]);
+      src += src_stride;
+      srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      y += 2;
+    } while (y < height);
+  } else if (num_taps == 4) {
+    srcs[4] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+    srcs[0] = Load4<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[2] = Load4(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+    int y = 0;
+    do {
+      srcs[2] = Load4<1>(src, srcs[2]);
+      src += src_stride;
+      srcs[4] = Load4<0>(src, srcs[4]);
+      src += src_stride;
+      srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      y += 2;
+    } while (y < height);
+  } else if (num_taps == 6) {
+    srcs[6] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+    srcs[0] = Load4<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[2] = Load4(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+    srcs[2] = Load4<1>(src, srcs[2]);
+    src += src_stride;
+    srcs[4] = Load4(src);
+    src += src_stride;
+    srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+    int y = 0;
+    do {
+      srcs[4] = Load4<1>(src, srcs[4]);
+      src += src_stride;
+      srcs[6] = Load4<0>(src, srcs[6]);
+      src += src_stride;
+      srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      srcs[3] = srcs[5];
+      srcs[4] = srcs[6];
+      y += 2;
+    } while (y < height);
+  } else if (num_taps == 8) {
+    srcs[8] = vdup_n_u8(0);
+
+    srcs[0] = Load4(src);
+    src += src_stride;
+    srcs[0] = Load4<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[2] = Load4(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+    srcs[2] = Load4<1>(src, srcs[2]);
+    src += src_stride;
+    srcs[4] = Load4(src);
+    src += src_stride;
+    srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+    srcs[4] = Load4<1>(src, srcs[4]);
+    src += src_stride;
+    srcs[6] = Load4(src);
+    src += src_stride;
+    srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+    int y = 0;
+    do {
+      srcs[6] = Load4<1>(src, srcs[6]);
+      src += src_stride;
+      srcs[8] = Load4<0>(src, srcs[8]);
+      src += src_stride;
+      srcs[7] = vext_u8(srcs[6], srcs[8], 4);
+
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      if (is_compound) {
+        const uint16x8_t results = Compound1DShift(sums);
+
+        vst1q_u16(dst16, results);
+        dst16 += 4 << 1;
+      } else {
+        const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+        StoreLo4(dst8, results);
+        dst8 += dst_stride;
+        StoreHi4(dst8, results);
+        dst8 += dst_stride;
+      }
+
+      srcs[0] = srcs[2];
+      srcs[1] = srcs[3];
+      srcs[2] = srcs[4];
+      srcs[3] = srcs[5];
+      srcs[4] = srcs[6];
+      srcs[5] = srcs[7];
+      srcs[6] = srcs[8];
+      y += 2;
+    } while (y < height);
+  }
+}
+
+template <int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+                       void* const dst, const ptrdiff_t dst_stride,
+                       const int height, const uint8x8_t* const taps) {
+  const int num_taps = GetNumTapsInFilter(filter_index);
+  auto* dst8 = static_cast<uint8_t*>(dst);
+
+  uint8x8_t srcs[9];
+
+  if (num_taps == 2) {
+    srcs[2] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+
+    int y = 0;
+    do {
+      srcs[0] = Load2<1>(src, srcs[0]);
+      src += src_stride;
+      srcs[0] = Load2<2>(src, srcs[0]);
+      src += src_stride;
+      srcs[0] = Load2<3>(src, srcs[0]);
+      src += src_stride;
+      srcs[2] = Load2<0>(src, srcs[2]);
+      src += src_stride;
+      srcs[1] = vext_u8(srcs[0], srcs[2], 2);
+
+      // This uses srcs[0]..srcs[1].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      if (height == 2) return;
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[2];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 4) {
+    srcs[4] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+
+    int y = 0;
+    do {
+      srcs[0] = Load2<3>(src, srcs[0]);
+      src += src_stride;
+      srcs[4] = Load2<0>(src, srcs[4]);
+      src += src_stride;
+      srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+      srcs[4] = Load2<1>(src, srcs[4]);
+      src += src_stride;
+      srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+      srcs[4] = Load2<2>(src, srcs[4]);
+      src += src_stride;
+      srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+      // This uses srcs[0]..srcs[3].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      if (height == 2) return;
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 6) {
+    // During the vertical pass the number of taps is restricted when
+    // |height| <= 4.
+    assert(height > 4);
+    srcs[8] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<3>(src, srcs[0]);
+    src += src_stride;
+    srcs[4] = Load2(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+
+    int y = 0;
+    do {
+      srcs[4] = Load2<1>(src, srcs[4]);
+      src += src_stride;
+      srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+      srcs[4] = Load2<2>(src, srcs[4]);
+      src += src_stride;
+      srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+      srcs[4] = Load2<3>(src, srcs[4]);
+      src += src_stride;
+      srcs[8] = Load2<0>(src, srcs[8]);
+      src += src_stride;
+      srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+
+      // This uses srcs[0]..srcs[5].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      srcs[1] = srcs[5];
+      srcs[4] = srcs[8];
+      y += 4;
+    } while (y < height);
+  } else if (num_taps == 8) {
+    // During the vertical pass the number of taps is restricted when
+    // |height| <= 4.
+    assert(height > 4);
+    srcs[8] = vdup_n_u8(0);
+
+    srcs[0] = Load2(src);
+    src += src_stride;
+    srcs[0] = Load2<1>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<2>(src, srcs[0]);
+    src += src_stride;
+    srcs[0] = Load2<3>(src, srcs[0]);
+    src += src_stride;
+    srcs[4] = Load2(src);
+    src += src_stride;
+    srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+    srcs[4] = Load2<1>(src, srcs[4]);
+    src += src_stride;
+    srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+    srcs[4] = Load2<2>(src, srcs[4]);
+    src += src_stride;
+    srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+    int y = 0;
+    do {
+      srcs[4] = Load2<3>(src, srcs[4]);
+      src += src_stride;
+      srcs[8] = Load2<0>(src, srcs[8]);
+      src += src_stride;
+      srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+      srcs[8] = Load2<1>(src, srcs[8]);
+      src += src_stride;
+      srcs[6] = vext_u8(srcs[4], srcs[8], 4);
+      srcs[8] = Load2<2>(src, srcs[8]);
+      src += src_stride;
+      srcs[7] = vext_u8(srcs[4], srcs[8], 6);
+
+      // This uses srcs[0]..srcs[7].
+      const int16x8_t sums =
+          SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+      const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+      Store2<0>(dst8, results);
+      dst8 += dst_stride;
+      Store2<1>(dst8, results);
+      dst8 += dst_stride;
+      Store2<2>(dst8, results);
+      dst8 += dst_stride;
+      Store2<3>(dst8, results);
+      dst8 += dst_stride;
+
+      srcs[0] = srcs[4];
+      srcs[1] = srcs[5];
+      srcs[2] = srcs[6];
+      srcs[3] = srcs[7];
+      srcs[4] = srcs[8];
+      y += 4;
+    } while (y < height);
+  }
+}
+
 // This function is a simplified version of Convolve2D_C.
 // It is called when it is single prediction mode, where only vertical
 // filtering is required.
@@ -1553,107 +2213,129 @@
                            const ptrdiff_t reference_stride,
                            const int /*horizontal_filter_index*/,
                            const int vertical_filter_index,
-                           const int /*inter_round_bits_vertical*/,
                            const int /*subpixel_x*/, const int subpixel_y,
-                           const int /*step_x*/, const int /*step_y*/,
                            const int width, const int height, void* prediction,
                            const ptrdiff_t pred_stride) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
-  const auto* src =
-      static_cast<const uint8_t*>(reference) - kVerticalOffset * src_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
   auto* dest = static_cast<uint8_t*>(prediction);
   const ptrdiff_t dest_stride = pred_stride;
   const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
-  // First filter is always a copy.
-  if (filter_id == 0) {
-    // Move |src| down the actual values and not the start of the context.
-    src = static_cast<const uint8_t*>(reference);
-    int y = 0;
-    do {
-      memcpy(dest, src, width * sizeof(src[0]));
-      src += src_stride;
-      dest += dest_stride;
-    } while (++y < height);
-    return;
+  assert(filter_id != 0);
+
+  uint8x8_t taps[8];
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    taps[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
   }
 
-  // Break up by # of taps
-  // |filter_index| taps  enum InterpolationFilter
-  //        0       6     kInterpolationFilterEightTap
-  //        1       6     kInterpolationFilterEightTapSmooth
-  //        2       8     kInterpolationFilterEightTapSharp
-  //        3       2     kInterpolationFilterBilinear
-  //        4       4     kInterpolationFilterSwitchable
-  //        5       4     !!! SECRET FILTER !!! only for Wx4.
-  if (width >= 4) {
-    if (filter_index == 2) {  // 8 tap.
-      const int16x8_t taps =
-          vld1q_s16(kSubPixelFilters[filter_index][filter_id]);
-      if (width == 4) {
-        FilterVertical<4, 8>(src, src_stride, dest, dest_stride, width, height,
-                             taps);
-      } else {
-        FilterVertical<8, 8>(src, src_stride, dest, dest_stride, width, height,
-                             taps);
-      }
-    } else if (filter_index < 2) {  // 6 tap.
-      const int16x8_t taps =
-          vld1q_s16(kSubPixelFilters[filter_index][filter_id]);
-      if (width == 4) {
-        FilterVertical<4, 6>(src, src_stride, dest, dest_stride, width, height,
-                             taps);
-      } else {
-        FilterVertical<8, 6>(src, src_stride, dest, dest_stride, width, height,
-                             taps);
-      }
-    } else if (filter_index > 3) {  // 4 tap.
-      // Store taps in vget_low_s16(taps).
-      const int16x8_t taps =
-          vld1q_s16(kSubPixelFilters[filter_index][filter_id] + 2);
-      if (width == 4) {
-        FilterVertical<4, 4>(src, src_stride, dest, dest_stride, width, height,
-                             taps);
-      } else {
-        FilterVertical<8, 4>(src, src_stride, dest, dest_stride, width, height,
-                             taps);
-      }
-    } else {  // 2 tap.
-      // Store taps in vget_low_s16(taps).
-      const int16x8_t taps =
-          vld1q_s16(kSubPixelFilters[filter_index][filter_id] + 2);
-      if (width == 4) {
-        FilterVertical<4, 2>(src, src_stride, dest, dest_stride, width, height,
-                             taps);
-      } else {
-        FilterVertical<8, 2>(src, src_stride, dest, dest_stride, width, height,
-                             taps);
-      }
+  if (filter_index == 0) {  // 6 tap.
+    if (width == 2) {
+      FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else if (width == 4) {
+      FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else {
+      FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 1);
+    }
+  } else if ((filter_index == 1) &
+             ((filter_id == 1) | (filter_id == 15))) {  // 5 tap.
+    if (width == 2) {
+      FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else if (width == 4) {
+      FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
+                           taps + 1);
+    } else {
+      FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 1);
+    }
+  } else if ((filter_index == 1) &
+             ((filter_id == 7) | (filter_id == 8) |
+              (filter_id == 9))) {  // 6 tap with weird negative taps.
+    if (width == 2) {
+      FilterVertical2xH<1,
+                        /*negative_outside_taps=*/true>(
+          src, src_stride, dest, dest_stride, height, taps + 1);
+    } else if (width == 4) {
+      FilterVertical4xH<1, /*is_compound=*/false,
+                        /*negative_outside_taps=*/true>(
+          src, src_stride, dest, dest_stride, height, taps + 1);
+    } else {
+      FilterVertical<1, /*is_compound=*/false, /*negative_outside_taps=*/true>(
+          src, src_stride, dest, dest_stride, width, height, taps + 1);
+    }
+  } else if (filter_index == 2) {  // 8 tap.
+    if (width == 2) {
+      FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+    } else if (width == 4) {
+      FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+    } else {
+      FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+                        taps);
+    }
+  } else if (filter_index == 3) {  // 2 tap.
+    if (width == 2) {
+      FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
+                           taps + 3);
+    } else if (width == 4) {
+      FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
+                           taps + 3);
+    } else {
+      FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 3);
+    }
+  } else if (filter_index == 4) {  // 4 tap.
+    // Outside taps are negative.
+    if (width == 2) {
+      FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else if (width == 4) {
+      FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else {
+      FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 2);
     }
   } else {
-    assert(width == 2);
-    const int taps = NumTapsInFilter(filter_index);
-    src =
-        static_cast<const uint8_t*>(reference) - ((taps / 2) - 1) * src_stride;
-    VerticalPass2xH</*is_2d=*/false>(src, src_stride, dest, pred_stride, height,
-                                     0, filter_index, taps, subpixel_y);
+    // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
+    // to 4 tap filters.
+    assert(filter_index == 5 ||
+           (filter_index == 1 &&
+            (filter_id == 2 || filter_id == 3 || filter_id == 4 ||
+             filter_id == 5 || filter_id == 6 || filter_id == 10 ||
+             filter_id == 11 || filter_id == 12 || filter_id == 13 ||
+             filter_id == 14)));
+    // According to GetNumTapsInFilter() this has 6 taps but here we are
+    // treating it as though it has 4.
+    if (filter_index == 1) src += src_stride;
+    if (width == 2) {
+      FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else if (width == 4) {
+      FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
+                           taps + 2);
+    } else {
+      FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+                        taps + 2);
+    }
   }
 }
 
 void ConvolveCompoundCopy_NEON(
     const void* const reference, const ptrdiff_t reference_stride,
     const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
-    const int /*inter_round_bits_vertical*/, const int /*subpixel_x*/,
-    const int /*subpixel_y*/, const int /*step_x*/, const int /*step_y*/,
-    const int width, const int height, void* prediction,
-    const ptrdiff_t pred_stride) {
+    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
+    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
   const auto* src = static_cast<const uint8_t*>(reference);
   const ptrdiff_t src_stride = reference_stride;
   auto* dest = static_cast<uint16_t*>(prediction);
-  const int bitdepth = 8;
-  const int compound_round_offset =
-      (1 << (bitdepth + 4)) + (1 << (bitdepth + 3));
-  const uint16x8_t v_compound_round_offset = vdupq_n_u16(compound_round_offset);
+  constexpr int final_shift =
+      kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
 
   if (width >= 16) {
     int y = 0;
@@ -1661,226 +2343,161 @@
       int x = 0;
       do {
         const uint8x16_t v_src = vld1q_u8(&src[x]);
-        const uint16x8_t v_src_x16_lo = vshll_n_u8(vget_low_u8(v_src), 4);
-        const uint16x8_t v_src_x16_hi = vshll_n_u8(vget_high_u8(v_src), 4);
         const uint16x8_t v_dest_lo =
-            vaddq_u16(v_src_x16_lo, v_compound_round_offset);
+            vshll_n_u8(vget_low_u8(v_src), final_shift);
         const uint16x8_t v_dest_hi =
-            vaddq_u16(v_src_x16_hi, v_compound_round_offset);
+            vshll_n_u8(vget_high_u8(v_src), final_shift);
         vst1q_u16(&dest[x], v_dest_lo);
         x += 8;
         vst1q_u16(&dest[x], v_dest_hi);
         x += 8;
       } while (x < width);
       src += src_stride;
-      dest += pred_stride;
+      dest += width;
     } while (++y < height);
   } else if (width == 8) {
     int y = 0;
     do {
       const uint8x8_t v_src = vld1_u8(&src[0]);
-      const uint16x8_t v_src_x16 = vshll_n_u8(v_src, 4);
-      vst1q_u16(&dest[0], vaddq_u16(v_src_x16, v_compound_round_offset));
+      const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+      vst1q_u16(&dest[0], v_dest);
       src += src_stride;
-      dest += pred_stride;
+      dest += width;
     } while (++y < height);
-  } else if (width == 4) {
-    const uint8x8_t zero = vdup_n_u8(0);
+  } else { /* width == 4 */
+    uint8x8_t v_src = vdup_n_u8(0);
+
     int y = 0;
     do {
-      const uint8x8_t v_src = LoadLo4(&src[0], zero);
-      const uint16x8_t v_src_x16 = vshll_n_u8(v_src, 4);
-      const uint16x8_t v_dest = vaddq_u16(v_src_x16, v_compound_round_offset);
-      vst1_u16(&dest[0], vget_low_u16(v_dest));
+      v_src = Load4<0>(&src[0], v_src);
       src += src_stride;
-      dest += pred_stride;
-    } while (++y < height);
-  } else {  // width == 2
-    assert(width == 2);
-    int y = 0;
-    do {
-      dest[0] = (src[0] << 4) + compound_round_offset;
-      dest[1] = (src[1] << 4) + compound_round_offset;
+      v_src = Load4<1>(&src[0], v_src);
       src += src_stride;
-      dest += pred_stride;
-    } while (++y < height);
+      const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+      vst1q_u16(&dest[0], v_dest);
+      dest += 4 << 1;
+      y += 2;
+    } while (y < height);
   }
 }
 
-// Input 8 bits and output 16 bits.
-template <int min_width, int num_taps>
-void FilterCompoundVertical(const uint8_t* src, const ptrdiff_t src_stride,
-                            uint16_t* dst, const ptrdiff_t dst_stride,
-                            const int width, const int height,
-                            const int16x8_t taps) {
-  constexpr int next_row = num_taps - 1;
-  // |src| points to the outermost tap of the first value. When doing fewer than
-  // 8 taps it needs to be adjusted.
-  if (num_taps == 6) {
-    src += src_stride;
-  } else if (num_taps == 4) {
-    src += 2 * src_stride;
-  } else if (num_taps == 2) {
-    src += 3 * src_stride;
-  }
-
-  const uint16x8_t compound_round_offset = vdupq_n_u16(1 << 12);
-
-  int x = 0;
-  do {
-    int16x8_t srcs[8];
-    srcs[0] = ZeroExtend(vld1_u8(src + x));
-    if (num_taps >= 4) {
-      srcs[1] = ZeroExtend(vld1_u8(src + x + src_stride));
-      srcs[2] = ZeroExtend(vld1_u8(src + x + 2 * src_stride));
-      if (num_taps >= 6) {
-        srcs[3] = ZeroExtend(vld1_u8(src + x + 3 * src_stride));
-        srcs[4] = ZeroExtend(vld1_u8(src + x + 4 * src_stride));
-        if (num_taps == 8) {
-          srcs[5] = ZeroExtend(vld1_u8(src + x + 5 * src_stride));
-          srcs[6] = ZeroExtend(vld1_u8(src + x + 6 * src_stride));
-        }
-      }
-    }
-
-    int y = 0;
-    do {
-      srcs[next_row] =
-          ZeroExtend(vld1_u8(src + x + (y + next_row) * src_stride));
-
-      const uint16x8_t sums = SumTaps8To16<num_taps>(srcs, taps);
-      const uint16x8_t shifted = vrshrq_n_u16(sums, 3);
-      // In order to keep the sum in 16 bits we add an offset to the sum
-      // (1 << (bitdepth + kFilterBits - 1) == 1 << 14). This ensures that the
-      // results will never be negative.
-      // Normally ConvolveCompoundVertical would add |compound_round_offset| at
-      // the end. Instead we use that to compensate for the initial offset.
-      // (1 << (bitdepth + 4)) + (1 << (bitdepth + 3)) == (1 << 12) + (1 << 11)
-      // After taking into account the shift above:
-      // RightShiftWithRounding(LeftShift(sum, bits_shift),
-      //                        inter_round_bits_vertical)
-      // where bits_shift == kFilterBits - kInterRoundBitsHorizontal == 4
-      // and inter_round_bits_vertical == 7
-      // and simplifying it to RightShiftWithRounding(sum, 3)
-      // we see that the initial offset of 1 << 14 >> 3 == 1 << 11 and
-      // |compound_round_offset| can be simplified to 1 << 12.
-      const uint16x8_t offset = vaddq_u16(shifted, compound_round_offset);
-
-      if (min_width == 4) {
-        vst1_u16(dst + x + y * dst_stride, vget_low_u16(offset));
-      } else {
-        vst1q_u16(dst + x + y * dst_stride, offset);
-      }
-
-      srcs[0] = srcs[1];
-      if (num_taps >= 4) {
-        srcs[1] = srcs[2];
-        srcs[2] = srcs[3];
-        if (num_taps >= 6) {
-          srcs[3] = srcs[4];
-          srcs[4] = srcs[5];
-          if (num_taps == 8) {
-            srcs[5] = srcs[6];
-            srcs[6] = srcs[7];
-          }
-        }
-      }
-    } while (++y < height);
-    x += 8;
-  } while (x < width);
-}
-
 void ConvolveCompoundVertical_NEON(
     const void* const reference, const ptrdiff_t reference_stride,
     const int /*horizontal_filter_index*/, const int vertical_filter_index,
-    const int /*inter_round_bits_vertical*/, const int /*subpixel_x*/,
-    const int subpixel_y, const int /*step_x*/, const int /*step_y*/,
-    const int width, const int height, void* prediction,
-    const ptrdiff_t pred_stride) {
+    const int /*subpixel_x*/, const int subpixel_y, const int width,
+    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(vertical_filter_index, height);
+  const int vertical_taps = GetNumTapsInFilter(filter_index);
   const ptrdiff_t src_stride = reference_stride;
-  const auto* src =
-      static_cast<const uint8_t*>(reference) - kVerticalOffset * src_stride;
+  const auto* src = static_cast<const uint8_t*>(reference) -
+                    (vertical_taps / 2 - 1) * src_stride;
   auto* dest = static_cast<uint16_t*>(prediction);
   const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
+  assert(filter_id != 0);
 
-  if (width >= 4) {
-    const int16x8_t taps = vld1q_s16(kSubPixelFilters[filter_index][filter_id]);
+  uint8x8_t taps[8];
+  for (int k = 0; k < kSubPixelTaps; ++k) {
+    taps[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
+  }
 
-    if (filter_index == 2) {  // 8 tap.
-      if (width == 4) {
-        FilterCompoundVertical<4, 8>(src, src_stride, dest, pred_stride, width,
-                                     height, taps);
-      } else {
-        FilterCompoundVertical<8, 8>(src, src_stride, dest, pred_stride, width,
-                                     height, taps);
-      }
-    } else if (filter_index < 2) {  // 6 tap.
-      if (width == 4) {
-        FilterCompoundVertical<4, 6>(src, src_stride, dest, pred_stride, width,
-                                     height, taps);
-      } else {
-        FilterCompoundVertical<8, 6>(src, src_stride, dest, pred_stride, width,
-                                     height, taps);
-      }
-    } else if (filter_index == 3) {  // 2 tap.
-      if (width == 4) {
-        FilterCompoundVertical<4, 2>(src, src_stride, dest, pred_stride, width,
-                                     height, taps);
-      } else {
-        FilterCompoundVertical<8, 2>(src, src_stride, dest, pred_stride, width,
-                                     height, taps);
-      }
-    } else if (filter_index > 3) {  // 4 tap.
-      if (width == 4) {
-        FilterCompoundVertical<4, 4>(src, src_stride, dest, pred_stride, width,
-                                     height, taps);
-      } else {
-        FilterCompoundVertical<8, 4>(src, src_stride, dest, pred_stride, width,
-                                     height, taps);
-      }
+  if (filter_index == 0) {  // 6 tap.
+    if (width == 4) {
+      FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 1);
+    } else {
+      FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 1);
+    }
+  } else if ((filter_index == 1) &
+             ((filter_id == 1) | (filter_id == 15))) {  // 5 tap.
+    if (width == 4) {
+      FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 1);
+    } else {
+      FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 1);
+    }
+  } else if ((filter_index == 1) &
+             ((filter_id == 7) | (filter_id == 8) |
+              (filter_id == 9))) {  // 6 tap with weird negative taps.
+    if (width == 4) {
+      FilterVertical4xH<1, /*is_compound=*/true,
+                        /*negative_outside_taps=*/true>(src, src_stride, dest,
+                                                        4, height, taps + 1);
+    } else {
+      FilterVertical<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(
+          src, src_stride, dest, width, width, height, taps + 1);
+    }
+  } else if (filter_index == 2) {  // 8 tap.
+    if (width == 4) {
+      FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps);
+    } else {
+      FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps);
+    }
+  } else if (filter_index == 3) {  // 2 tap.
+    if (width == 4) {
+      FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 3);
+    } else {
+      FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 3);
+    }
+  } else if (filter_index == 4) {  // 4 tap.
+    if (width == 4) {
+      FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 2);
+    } else {
+      FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 2);
     }
   } else {
-    assert(width == 2);
-    const int taps = NumTapsInFilter(filter_index);
-    src =
-        static_cast<const uint8_t*>(reference) - ((taps / 2) - 1) * src_stride;
-    VerticalPass2xH</*is_2d=*/false, /*is_compound=*/true>(
-        src, src_stride, dest, pred_stride, height, 0, filter_index, taps,
-        subpixel_y);
+    // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
+    // to 4 tap filters.
+    assert(filter_index == 5 ||
+           (filter_index == 1 &&
+            (filter_id == 2 || filter_id == 3 || filter_id == 4 ||
+             filter_id == 5 || filter_id == 6 || filter_id == 10 ||
+             filter_id == 11 || filter_id == 12 || filter_id == 13 ||
+             filter_id == 14)));
+    // According to GetNumTapsInFilter() this has 6 taps but here we are
+    // treating it as though it has 4.
+    if (filter_index == 1) src += src_stride;
+    if (width == 4) {
+      FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+                                                 height, taps + 2);
+    } else {
+      FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+                                              width, height, taps + 2);
+    }
   }
 }
 
 void ConvolveCompoundHorizontal_NEON(
     const void* const reference, const ptrdiff_t reference_stride,
     const int horizontal_filter_index, const int /*vertical_filter_index*/,
-    const int /*inter_round_bits_vertical*/, const int subpixel_x,
-    const int /*subpixel_y*/, const int /*step_x*/, const int /*step_y*/,
-    const int width, const int height, void* prediction,
-    const ptrdiff_t pred_stride) {
+    const int subpixel_x, const int /*subpixel_y*/, const int width,
+    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
   const int filter_index = GetFilterIndex(horizontal_filter_index, width);
   const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
   auto* dest = static_cast<uint16_t*>(prediction);
 
-  HorizontalPass(src, reference_stride, dest, pred_stride, width, height,
-                 subpixel_x, filter_index);
+  DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+      src, reference_stride, dest, width, width, height, subpixel_x,
+      filter_index);
 }
 
-void ConvolveCompound2D_NEON(const void* const reference,
-                             const ptrdiff_t reference_stride,
-                             const int horizontal_filter_index,
-                             const int vertical_filter_index,
-                             const int inter_round_bits_vertical,
-                             const int subpixel_x, const int subpixel_y,
-                             const int /*step_x*/, const int /*step_y*/,
-                             const int width, const int height,
-                             void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveCompound2D_NEON(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int horizontal_filter_index, const int vertical_filter_index,
+    const int subpixel_x, const int subpixel_y, const int width,
+    const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
   // The output of the horizontal filter, i.e. the intermediate_result, is
   // guaranteed to fit in int16_t.
   uint16_t
       intermediate_result[kMaxSuperBlockSizeInPixels *
                           (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
-  const int intermediate_stride = kMaxSuperBlockSizeInPixels;
 
   // Horizontal filter.
   // Filter types used for width <= 4 are different from those for width > 4.
@@ -1889,66 +2506,586 @@
   // Similarly for height.
   const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
   const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
-  const int horizontal_taps = NumTapsInFilter(horiz_filter_index);
-  const int vertical_taps = NumTapsInFilter(vert_filter_index);
-  uint16_t* intermediate = intermediate_result;
+  const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
   const int intermediate_height = height + vertical_taps - 1;
   const ptrdiff_t src_stride = reference_stride;
-  const auto* src = static_cast<const uint8_t*>(reference) -
-                    kVerticalOffset * src_stride - kHorizontalOffset;
+  const auto* const src = static_cast<const uint8_t*>(reference) -
+                          (vertical_taps / 2 - 1) * src_stride -
+                          kHorizontalOffset;
+
+  DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+      src, src_stride, intermediate_result, width, width, intermediate_height,
+      subpixel_x, horiz_filter_index);
+
+  // Vertical filter.
   auto* dest = static_cast<uint16_t*>(prediction);
-  int filter_id = (subpixel_x >> 6) & kSubPixelMask;
+  const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
+  assert(filter_id != 0);
 
-  if (width >= 4) {
-    // TODO(johannkoenig): Use |width| for |intermediate_stride|.
-    src = static_cast<const uint8_t*>(reference) -
-          (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
-    HorizontalPass<true>(src, src_stride, intermediate_result,
-                         intermediate_stride, width, intermediate_height,
-                         subpixel_x, horiz_filter_index);
+  const ptrdiff_t dest_stride = width;
+  const int16x8_t taps =
+      vmovl_s8(vld1_s8(kHalfSubPixelFilters[vert_filter_index][filter_id]));
 
-    // Vertical filter.
-    intermediate = intermediate_result;
-    filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
-
-    const ptrdiff_t dest_stride = pred_stride;
-    const int16x8_t taps =
-        vld1q_s16(kSubPixelFilters[vert_filter_index][filter_id]);
-
-    if (vertical_taps == 8) {
+  if (vertical_taps == 8) {
+    if (width == 4) {
+      Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
       Filter2DVertical<8, /*is_compound=*/true>(
-          intermediate, intermediate_stride, dest, dest_stride, width, height,
-          taps, inter_round_bits_vertical);
-    } else if (vertical_taps == 6) {
-      Filter2DVertical<6, /*is_compound=*/true>(
-          intermediate, intermediate_stride, dest, dest_stride, width, height,
-          taps, inter_round_bits_vertical);
-    } else if (vertical_taps == 4) {
-      Filter2DVertical<4, /*is_compound=*/true>(
-          intermediate, intermediate_stride, dest, dest_stride, width, height,
-          taps, inter_round_bits_vertical);
-    } else {  // |vertical_taps| == 2
-      Filter2DVertical<2, /*is_compound=*/true>(
-          intermediate, intermediate_stride, dest, dest_stride, width, height,
-          taps, inter_round_bits_vertical);
+          intermediate_result, dest, dest_stride, width, height, taps);
     }
+  } else if (vertical_taps == 6) {
+    if (width == 4) {
+      Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<6, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else if (vertical_taps == 4) {
+    if (width == 4) {
+      Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<4, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  } else {  // |vertical_taps| == 2
+    if (width == 4) {
+      Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+                                                   dest_stride, height, taps);
+    } else {
+      Filter2DVertical<2, /*is_compound=*/true>(
+          intermediate_result, dest, dest_stride, width, height, taps);
+    }
+  }
+}
+
+inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+  const uint8x16_t left = vld1q_u8(src);
+  const uint8x16_t right = vld1q_u8(src + 1);
+  vst1q_u8(dst, vrhaddq_u8(left, right));
+}
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* src,
+                                     const ptrdiff_t src_stride,
+                                     const int height, uint8_t* dst,
+                                     const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+
+  int y = 0;
+  do {
+    HalfAddHorizontal(src, dst);
+    if (width >= 32) {
+      src += 16;
+      dst += 16;
+      HalfAddHorizontal(src, dst);
+      if (width >= 64) {
+        src += 16;
+        dst += 16;
+        HalfAddHorizontal(src, dst);
+        src += 16;
+        dst += 16;
+        HalfAddHorizontal(src, dst);
+        if (width == 128) {
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+          src += 16;
+          dst += 16;
+          HalfAddHorizontal(src, dst);
+        }
+      }
+    }
+    src += src_remainder_stride;
+    dst += dst_remainder_stride;
+  } while (++y < height);
+}
+
+void ConvolveIntraBlockCopyHorizontal_NEON(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
+    const int height, void* const prediction, const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width == 128) {
+    IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+                                  pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+                                 pred_stride);
+  } else if (width == 8) {
+    int y = 0;
+    do {
+      const uint8x8_t left = vld1_u8(src);
+      const uint8x8_t right = vld1_u8(src + 1);
+      vst1_u8(dest, vrhadd_u8(left, right));
+
+      src += reference_stride;
+      dest += pred_stride;
+    } while (++y < height);
+  } else if (width == 4) {
+    uint8x8_t left = vdup_n_u8(0);
+    uint8x8_t right = vdup_n_u8(0);
+    int y = 0;
+    do {
+      left = Load4<0>(src, left);
+      right = Load4<0>(src + 1, right);
+      src += reference_stride;
+      left = Load4<1>(src, left);
+      right = Load4<1>(src + 1, right);
+      src += reference_stride;
+
+      const uint8x8_t result = vrhadd_u8(left, right);
+
+      StoreLo4(dest, result);
+      dest += pred_stride;
+      StoreHi4(dest, result);
+      dest += pred_stride;
+      y += 2;
+    } while (y < height);
   } else {
-    src = static_cast<const uint8_t*>(reference) -
-          ((vertical_taps / 2) - 1) * src_stride - ((horizontal_taps / 2) - 1);
+    assert(width == 2);
+    uint8x8_t left = vdup_n_u8(0);
+    uint8x8_t right = vdup_n_u8(0);
+    int y = 0;
+    do {
+      left = Load2<0>(src, left);
+      right = Load2<0>(src + 1, right);
+      src += reference_stride;
+      left = Load2<1>(src, left);
+      right = Load2<1>(src + 1, right);
+      src += reference_stride;
 
-    HorizontalPass2xH(src, src_stride, intermediate_result, intermediate_stride,
-                      intermediate_height, horiz_filter_index, horizontal_taps,
-                      subpixel_x);
+      const uint8x8_t result = vrhadd_u8(left, right);
 
-    VerticalPass2xH</*is_2d=*/true, /*is_compound=*/true>(
-        intermediate_result, intermediate_stride, dest, pred_stride, height,
-        inter_round_bits_vertical, vert_filter_index, vertical_taps,
-        subpixel_y);
+      Store2<0>(dest, result);
+      dest += pred_stride;
+      Store2<1>(dest, result);
+      dest += pred_stride;
+      y += 2;
+    } while (y < height);
+  }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* src,
+                                   const ptrdiff_t src_stride, const int height,
+                                   uint8_t* dst, const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+  uint8x16_t row[8], below[8];
+
+  row[0] = vld1q_u8(src);
+  if (width >= 32) {
+    src += 16;
+    row[1] = vld1q_u8(src);
+    if (width >= 64) {
+      src += 16;
+      row[2] = vld1q_u8(src);
+      src += 16;
+      row[3] = vld1q_u8(src);
+      if (width == 128) {
+        src += 16;
+        row[4] = vld1q_u8(src);
+        src += 16;
+        row[5] = vld1q_u8(src);
+        src += 16;
+        row[6] = vld1q_u8(src);
+        src += 16;
+        row[7] = vld1q_u8(src);
+      }
+    }
+  }
+  src += src_remainder_stride;
+
+  int y = 0;
+  do {
+    below[0] = vld1q_u8(src);
+    if (width >= 32) {
+      src += 16;
+      below[1] = vld1q_u8(src);
+      if (width >= 64) {
+        src += 16;
+        below[2] = vld1q_u8(src);
+        src += 16;
+        below[3] = vld1q_u8(src);
+        if (width == 128) {
+          src += 16;
+          below[4] = vld1q_u8(src);
+          src += 16;
+          below[5] = vld1q_u8(src);
+          src += 16;
+          below[6] = vld1q_u8(src);
+          src += 16;
+          below[7] = vld1q_u8(src);
+        }
+      }
+    }
+    src += src_remainder_stride;
+
+    vst1q_u8(dst, vrhaddq_u8(row[0], below[0]));
+    row[0] = below[0];
+    if (width >= 32) {
+      dst += 16;
+      vst1q_u8(dst, vrhaddq_u8(row[1], below[1]));
+      row[1] = below[1];
+      if (width >= 64) {
+        dst += 16;
+        vst1q_u8(dst, vrhaddq_u8(row[2], below[2]));
+        row[2] = below[2];
+        dst += 16;
+        vst1q_u8(dst, vrhaddq_u8(row[3], below[3]));
+        row[3] = below[3];
+        if (width >= 128) {
+          dst += 16;
+          vst1q_u8(dst, vrhaddq_u8(row[4], below[4]));
+          row[4] = below[4];
+          dst += 16;
+          vst1q_u8(dst, vrhaddq_u8(row[5], below[5]));
+          row[5] = below[5];
+          dst += 16;
+          vst1q_u8(dst, vrhaddq_u8(row[6], below[6]));
+          row[6] = below[6];
+          dst += 16;
+          vst1q_u8(dst, vrhaddq_u8(row[7], below[7]));
+          row[7] = below[7];
+        }
+      }
+    }
+    dst += dst_remainder_stride;
+  } while (++y < height);
+}
+
+void ConvolveIntraBlockCopyVertical_NEON(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
+    const int height, void* const prediction, const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+
+  if (width == 128) {
+    IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+                                pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+                               pred_stride);
+  } else if (width == 8) {
+    uint8x8_t row, below;
+    row = vld1_u8(src);
+    src += reference_stride;
+
+    int y = 0;
+    do {
+      below = vld1_u8(src);
+      src += reference_stride;
+
+      vst1_u8(dest, vrhadd_u8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (++y < height);
+  } else if (width == 4) {
+    uint8x8_t row = Load4(src);
+    uint8x8_t below = vdup_n_u8(0);
+    src += reference_stride;
+
+    int y = 0;
+    do {
+      below = Load4<0>(src, below);
+      src += reference_stride;
+
+      StoreLo4(dest, vrhadd_u8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (++y < height);
+  } else {
+    assert(width == 2);
+    uint8x8_t row = Load2(src);
+    uint8x8_t below = vdup_n_u8(0);
+    src += reference_stride;
+
+    int y = 0;
+    do {
+      below = Load2<0>(src, below);
+      src += reference_stride;
+
+      Store2<0>(dest, vrhadd_u8(row, below));
+      dest += pred_stride;
+
+      row = below;
+    } while (++y < height);
+  }
+}
+
+template <int width>
+inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
+                             const int height, uint8_t* dst,
+                             const ptrdiff_t dst_stride) {
+  const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+  const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
+  uint16x8_t row[16];
+  row[0] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+  if (width >= 16) {
+    src += 8;
+    row[1] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+    if (width >= 32) {
+      src += 8;
+      row[2] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+      src += 8;
+      row[3] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+      if (width >= 64) {
+        src += 8;
+        row[4] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        src += 8;
+        row[5] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        src += 8;
+        row[6] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        src += 8;
+        row[7] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        if (width == 128) {
+          src += 8;
+          row[8] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[9] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[10] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[11] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[12] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[13] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[14] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          src += 8;
+          row[15] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        }
+      }
+    }
+  }
+  src += src_remainder_stride;
+
+  int y = 0;
+  do {
+    const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+    vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
+    row[0] = below_0;
+    if (width >= 16) {
+      src += 8;
+      dst += 8;
+
+      const uint16x8_t below_1 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+      vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[1], below_1), 2));
+      row[1] = below_1;
+      if (width >= 32) {
+        src += 8;
+        dst += 8;
+
+        const uint16x8_t below_2 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[2], below_2), 2));
+        row[2] = below_2;
+        src += 8;
+        dst += 8;
+
+        const uint16x8_t below_3 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+        vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[3], below_3), 2));
+        row[3] = below_3;
+        if (width >= 64) {
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_4 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[4], below_4), 2));
+          row[4] = below_4;
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_5 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[5], below_5), 2));
+          row[5] = below_5;
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_6 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[6], below_6), 2));
+          row[6] = below_6;
+          src += 8;
+          dst += 8;
+
+          const uint16x8_t below_7 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+          vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[7], below_7), 2));
+          row[7] = below_7;
+          if (width == 128) {
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_8 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[8], below_8), 2));
+            row[8] = below_8;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_9 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[9], below_9), 2));
+            row[9] = below_9;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_10 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[10], below_10), 2));
+            row[10] = below_10;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_11 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[11], below_11), 2));
+            row[11] = below_11;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_12 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[12], below_12), 2));
+            row[12] = below_12;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_13 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[13], below_13), 2));
+            row[13] = below_13;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_14 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[14], below_14), 2));
+            row[14] = below_14;
+            src += 8;
+            dst += 8;
+
+            const uint16x8_t below_15 =
+                vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+            vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[15], below_15), 2));
+            row[15] = below_15;
+          }
+        }
+      }
+    }
+    src += src_remainder_stride;
+    dst += dst_remainder_stride;
+  } while (++y < height);
+}
+
+void ConvolveIntraBlockCopy2D_NEON(
+    const void* const reference, const ptrdiff_t reference_stride,
+    const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
+    const int height, void* const prediction, const ptrdiff_t pred_stride) {
+  const auto* src = static_cast<const uint8_t*>(reference);
+  auto* dest = static_cast<uint8_t*>(prediction);
+  // Note: allow vertical access to height + 1. Because this function is only
+  // for u/v plane of intra block copy, such access is guaranteed to be within
+  // the prediction block.
+
+  if (width == 128) {
+    IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 64) {
+    IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 32) {
+    IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 16) {
+    IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 8) {
+    IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
+  } else if (width == 4) {
+    uint8x8_t left = Load4(src);
+    uint8x8_t right = Load4(src + 1);
+    src += reference_stride;
+
+    uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
+
+    int y = 0;
+    do {
+      left = Load4<0>(src, left);
+      right = Load4<0>(src + 1, right);
+      src += reference_stride;
+      left = Load4<1>(src, left);
+      right = Load4<1>(src + 1, right);
+      src += reference_stride;
+
+      const uint16x8_t below = vaddl_u8(left, right);
+
+      const uint8x8_t result = vrshrn_n_u16(
+          vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
+      StoreLo4(dest, result);
+      dest += pred_stride;
+      StoreHi4(dest, result);
+      dest += pred_stride;
+
+      row = vget_high_u16(below);
+      y += 2;
+    } while (y < height);
+  } else {
+    uint8x8_t left = Load2(src);
+    uint8x8_t right = Load2(src + 1);
+    src += reference_stride;
+
+    uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
+
+    int y = 0;
+    do {
+      left = Load2<0>(src, left);
+      right = Load2<0>(src + 1, right);
+      src += reference_stride;
+      left = Load2<2>(src, left);
+      right = Load2<2>(src + 1, right);
+      src += reference_stride;
+
+      const uint16x8_t below = vaddl_u8(left, right);
+
+      const uint8x8_t result = vrshrn_n_u16(
+          vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
+      Store2<0>(dest, result);
+      dest += pred_stride;
+      Store2<2>(dest, result);
+      dest += pred_stride;
+
+      row = vget_high_u16(below);
+      y += 2;
+    } while (y < height);
   }
 }
 
 void Init8bpp() {
-  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);
   dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
   dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
@@ -1959,9 +3096,12 @@
   dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
   dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;
 
-  // TODO(petersonab,b/139707209): Fix source buffer overreads.
-  // dsp->convolve_scale[1] = ConvolveCompoundScale2D_NEON;
-  static_cast<void>(ConvolveCompoundScale2D_NEON);
+  dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
+  dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
+  dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;
+
+  dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>;
+  dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>;
 }
 
 }  // namespace
@@ -1972,7 +3112,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else   // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
 
 namespace libgav1 {
 namespace dsp {
diff --git a/libgav1/src/dsp/arm/convolve_neon.h b/libgav1/src/dsp/arm/convolve_neon.h
index a537650..948ef4d 100644
--- a/libgav1/src/dsp/arm/convolve_neon.h
+++ b/libgav1/src/dsp/arm/convolve_neon.h
@@ -17,8 +17,8 @@
 #ifndef LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
 #define LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
 
-#include "src/dsp/cpu.h"
 #include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
 
 namespace libgav1 {
 namespace dsp {
@@ -30,17 +30,21 @@
 }  // namespace libgav1
 
 #if LIBGAV1_ENABLE_NEON
-#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_DSP_NEON
-#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_DSP_NEON
-#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_NEON
 
-#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_DSP_NEON
-#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_DSP_NEON
-#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_DSP_NEON
-#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_NEON
 
-// TODO(petersonab,b/139707209): Fix source buffer overreads.
-// #define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_NEON
 #endif  // LIBGAV1_ENABLE_NEON
 
 #endif  // LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
diff --git a/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc b/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
index 39b34a9..04952ab 100644
--- a/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
+++ b/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "src/dsp/distance_weighted_blend.h"
-#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
 
 #if LIBGAV1_ENABLE_NEON
 
@@ -24,138 +24,93 @@
 #include <cstdint>
 
 #include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
 #include "src/utils/common.h"
 
 namespace libgav1 {
 namespace dsp {
 namespace {
 
-constexpr int kBitdepth8 = 8;
 constexpr int kInterPostRoundBit = 4;
 
-const int16x8_t kCompoundRoundOffset =
-    vdupq_n_s16((1 << kBitdepth8) + (1 << (kBitdepth8 - 1)));
+inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0,
+                                         const int16x8_t pred1,
+                                         const int16x4_t weights[2]) {
+  // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
+  const int32x4_t wpred0_lo = vmull_s16(weights[0], vget_low_s16(pred0));
+  const int32x4_t wpred0_hi = vmull_s16(weights[0], vget_high_s16(pred0));
+  const int32x4_t blended_lo =
+      vmlal_s16(wpred0_lo, weights[1], vget_low_s16(pred1));
+  const int32x4_t blended_hi =
+      vmlal_s16(wpred0_hi, weights[1], vget_high_s16(pred1));
 
-inline int16x8_t ComputeWeightedAverage8(const uint16x8_t pred0,
-                                         const uint16x8_t pred1,
-                                         const uint16x4_t weights[2]) {
-  const uint32x4_t wpred0_lo = vmull_u16(weights[0], vget_low_u16(pred0));
-  const uint32x4_t wpred0_hi = vmull_u16(weights[0], vget_high_u16(pred0));
-  const uint32x4_t blended_lo =
-      vmlal_u16(wpred0_lo, weights[1], vget_low_u16(pred1));
-  const uint32x4_t blended_hi =
-      vmlal_u16(wpred0_hi, weights[1], vget_high_u16(pred1));
-
-  const uint16x4_t result_lo =
-      vqrshrn_n_u32(blended_lo, kInterPostRoundBit + 4);
-  const uint16x4_t result_hi =
-      vqrshrn_n_u32(blended_hi, kInterPostRoundBit + 4);
-  return vsubq_s16(vreinterpretq_s16_u16(vcombine_u16(result_lo, result_hi)),
-                   kCompoundRoundOffset);
+  return vcombine_s16(vqrshrn_n_s32(blended_lo, kInterPostRoundBit + 4),
+                      vqrshrn_n_s32(blended_hi, kInterPostRoundBit + 4));
 }
 
-template <int height>
-inline void DistanceWeightedBlend4xH_NEON(const uint16_t* prediction_0,
-                                          const ptrdiff_t prediction_stride_0,
-                                          const uint16_t* prediction_1,
-                                          const ptrdiff_t prediction_stride_1,
-                                          const uint16x4_t weights[2],
-                                          void* const dest,
-                                          const ptrdiff_t dest_stride) {
+template <int width, int height>
+inline void DistanceWeightedBlendSmall_NEON(const int16_t* prediction_0,
+                                            const int16_t* prediction_1,
+                                            const int16x4_t weights[2],
+                                            void* const dest,
+                                            const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
-  const uint16_t* pred_0 = prediction_0;
-  const uint16_t* pred_1 = prediction_1;
+  constexpr int step = 16 / width;
 
-  for (int y = 0; y < height; y += 4) {
-    const uint16x4_t src_00 = vld1_u16(pred_0);
-    const uint16x4_t src_10 = vld1_u16(pred_1);
-    pred_0 += prediction_stride_0;
-    pred_1 += prediction_stride_1;
-    const uint16x4_t src_01 = vld1_u16(pred_0);
-    const uint16x4_t src_11 = vld1_u16(pred_1);
-    pred_0 += prediction_stride_0;
-    pred_1 += prediction_stride_1;
-    const int16x8_t res01 = ComputeWeightedAverage8(
-        vcombine_u16(src_00, src_01), vcombine_u16(src_10, src_11), weights);
-
-    const uint16x4_t src_02 = vld1_u16(pred_0);
-    const uint16x4_t src_12 = vld1_u16(pred_1);
-    pred_0 += prediction_stride_0;
-    pred_1 += prediction_stride_1;
-    const uint16x4_t src_03 = vld1_u16(pred_0);
-    const uint16x4_t src_13 = vld1_u16(pred_1);
-    pred_0 += prediction_stride_0;
-    pred_1 += prediction_stride_1;
-    const int16x8_t res23 = ComputeWeightedAverage8(
-        vcombine_u16(src_02, src_03), vcombine_u16(src_12, src_13), weights);
-
-    const uint8x8_t result_01 = vqmovun_s16(res01);
-    const uint8x8_t result_23 = vqmovun_s16(res23);
-    StoreLo4(dst, result_01);
-    dst += dest_stride;
-    StoreHi4(dst, result_01);
-    dst += dest_stride;
-    StoreLo4(dst, result_23);
-    dst += dest_stride;
-    StoreHi4(dst, result_23);
-    dst += dest_stride;
-  }
-}
-
-template <int height>
-inline void DistanceWeightedBlend8xH_NEON(const uint16_t* prediction_0,
-                                          const ptrdiff_t prediction_stride_0,
-                                          const uint16_t* prediction_1,
-                                          const ptrdiff_t prediction_stride_1,
-                                          const uint16x4_t weights[2],
-                                          void* const dest,
-                                          const ptrdiff_t dest_stride) {
-  auto* dst = static_cast<uint8_t*>(dest);
-  const uint16_t* pred_0 = prediction_0;
-  const uint16_t* pred_1 = prediction_1;
-
-  for (int y = 0; y < height; y += 2) {
-    const uint16x8_t src_00 = vld1q_u16(pred_0);
-    const uint16x8_t src_10 = vld1q_u16(pred_1);
-    pred_0 += prediction_stride_0;
-    pred_1 += prediction_stride_1;
+  for (int y = 0; y < height; y += step) {
+    const int16x8_t src_00 = vld1q_s16(prediction_0);
+    const int16x8_t src_10 = vld1q_s16(prediction_1);
+    prediction_0 += 8;
+    prediction_1 += 8;
     const int16x8_t res0 = ComputeWeightedAverage8(src_00, src_10, weights);
 
-    const uint16x8_t src_01 = vld1q_u16(pred_0);
-    const uint16x8_t src_11 = vld1q_u16(pred_1);
-    pred_0 += prediction_stride_0;
-    pred_1 += prediction_stride_1;
+    const int16x8_t src_01 = vld1q_s16(prediction_0);
+    const int16x8_t src_11 = vld1q_s16(prediction_1);
+    prediction_0 += 8;
+    prediction_1 += 8;
     const int16x8_t res1 = ComputeWeightedAverage8(src_01, src_11, weights);
 
     const uint8x8_t result0 = vqmovun_s16(res0);
     const uint8x8_t result1 = vqmovun_s16(res1);
-    vst1_u8(dst, result0);
-    dst += dest_stride;
-    vst1_u8(dst, result1);
-    dst += dest_stride;
+    if (width == 4) {
+      StoreLo4(dst, result0);
+      dst += dest_stride;
+      StoreHi4(dst, result0);
+      dst += dest_stride;
+      StoreLo4(dst, result1);
+      dst += dest_stride;
+      StoreHi4(dst, result1);
+      dst += dest_stride;
+    } else {
+      assert(width == 8);
+      vst1_u8(dst, result0);
+      dst += dest_stride;
+      vst1_u8(dst, result1);
+      dst += dest_stride;
+    }
   }
 }
 
-inline void DistanceWeightedBlendLarge_NEON(
-    const uint16_t* prediction_0, const ptrdiff_t prediction_stride_0,
-    const uint16_t* prediction_1, const ptrdiff_t prediction_stride_1,
-    const uint16x4_t weights[2], const int width, const int height,
-    void* const dest, const ptrdiff_t dest_stride) {
+inline void DistanceWeightedBlendLarge_NEON(const int16_t* prediction_0,
+                                            const int16_t* prediction_1,
+                                            const int16x4_t weights[2],
+                                            const int width, const int height,
+                                            void* const dest,
+                                            const ptrdiff_t dest_stride) {
   auto* dst = static_cast<uint8_t*>(dest);
-  const uint16_t* pred_0 = prediction_0;
-  const uint16_t* pred_1 = prediction_1;
 
   int y = height;
   do {
     int x = 0;
     do {
-      const uint16x8_t src0_lo = vld1q_u16(pred_0 + x);
-      const uint16x8_t src1_lo = vld1q_u16(pred_1 + x);
+      const int16x8_t src0_lo = vld1q_s16(prediction_0 + x);
+      const int16x8_t src1_lo = vld1q_s16(prediction_1 + x);
       const int16x8_t res_lo =
           ComputeWeightedAverage8(src0_lo, src1_lo, weights);
 
-      const uint16x8_t src0_hi = vld1q_u16(pred_0 + x + 8);
-      const uint16x8_t src1_hi = vld1q_u16(pred_1 + x + 8);
+      const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8);
+      const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8);
       const int16x8_t res_hi =
           ComputeWeightedAverage8(src0_hi, src1_hi, weights);
 
@@ -165,31 +120,33 @@
       x += 16;
     } while (x < width);
     dst += dest_stride;
-    pred_0 += prediction_stride_0;
-    pred_1 += prediction_stride_1;
+    prediction_0 += width;
+    prediction_1 += width;
   } while (--y != 0);
 }
 
-inline void DistanceWeightedBlend_NEON(
-    const uint16_t* prediction_0, const ptrdiff_t prediction_stride_0,
-    const uint16_t* prediction_1, const ptrdiff_t prediction_stride_1,
-    const uint8_t weight_0, const uint8_t weight_1, const int width,
-    const int height, void* const dest, const ptrdiff_t dest_stride) {
-  uint16x4_t weights[2] = {vdup_n_u16(weight_0), vdup_n_u16(weight_1)};
+inline void DistanceWeightedBlend_NEON(const void* prediction_0,
+                                       const void* prediction_1,
+                                       const uint8_t weight_0,
+                                       const uint8_t weight_1, const int width,
+                                       const int height, void* const dest,
+                                       const ptrdiff_t dest_stride) {
+  const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+  const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+  int16x4_t weights[2] = {vdup_n_s16(weight_0), vdup_n_s16(weight_1)};
+  // TODO(johannkoenig): Investigate the branching. May be fine to call with a
+  // variable height.
   if (width == 4) {
     if (height == 4) {
-      DistanceWeightedBlend4xH_NEON<4>(prediction_0, prediction_stride_0,
-                                       prediction_1, prediction_stride_1,
-                                       weights, dest, dest_stride);
+      DistanceWeightedBlendSmall_NEON<4, 4>(pred_0, pred_1, weights, dest,
+                                            dest_stride);
     } else if (height == 8) {
-      DistanceWeightedBlend4xH_NEON<8>(prediction_0, prediction_stride_0,
-                                       prediction_1, prediction_stride_1,
-                                       weights, dest, dest_stride);
+      DistanceWeightedBlendSmall_NEON<4, 8>(pred_0, pred_1, weights, dest,
+                                            dest_stride);
     } else {
       assert(height == 16);
-      DistanceWeightedBlend4xH_NEON<16>(prediction_0, prediction_stride_0,
-                                        prediction_1, prediction_stride_1,
-                                        weights, dest, dest_stride);
+      DistanceWeightedBlendSmall_NEON<4, 16>(pred_0, pred_1, weights, dest,
+                                             dest_stride);
     }
     return;
   }
@@ -197,37 +154,32 @@
   if (width == 8) {
     switch (height) {
       case 4:
-        DistanceWeightedBlend8xH_NEON<4>(prediction_0, prediction_stride_0,
-                                         prediction_1, prediction_stride_1,
-                                         weights, dest, dest_stride);
+        DistanceWeightedBlendSmall_NEON<8, 4>(pred_0, pred_1, weights, dest,
+                                              dest_stride);
         return;
       case 8:
-        DistanceWeightedBlend8xH_NEON<8>(prediction_0, prediction_stride_0,
-                                         prediction_1, prediction_stride_1,
-                                         weights, dest, dest_stride);
+        DistanceWeightedBlendSmall_NEON<8, 8>(pred_0, pred_1, weights, dest,
+                                              dest_stride);
         return;
       case 16:
-        DistanceWeightedBlend8xH_NEON<16>(prediction_0, prediction_stride_0,
-                                          prediction_1, prediction_stride_1,
-                                          weights, dest, dest_stride);
+        DistanceWeightedBlendSmall_NEON<8, 16>(pred_0, pred_1, weights, dest,
+                                               dest_stride);
         return;
       default:
         assert(height == 32);
-        DistanceWeightedBlend8xH_NEON<32>(prediction_0, prediction_stride_0,
-                                          prediction_1, prediction_stride_1,
-                                          weights, dest, dest_stride);
+        DistanceWeightedBlendSmall_NEON<8, 32>(pred_0, pred_1, weights, dest,
+                                               dest_stride);
 
         return;
     }
   }
 
-  DistanceWeightedBlendLarge_NEON(prediction_0, prediction_stride_0,
-                                  prediction_1, prediction_stride_1, weights,
-                                  width, height, dest, dest_stride);
+  DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weights, width, height, dest,
+                                  dest_stride);
 }
 
 void Init8bpp() {
-  Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
   assert(dsp != nullptr);
   dsp->distance_weighted_blend = DistanceWeightedBlend_NEON;
 }
@@ -239,7 +191,7 @@
 }  // namespace dsp
 }  // namespace libgav1
 
-#else   // !LIBGAV1_ENABLE_NEON
+#else  // !LIBGAV1_ENABLE_NEON
 
 namespace libgav1 {
 namespace dsp {
diff --git a/libgav1/src/dsp/arm/distance_weighted_blend_neon.h b/libgav1/src/dsp/arm/distance_weighted_blend_neon.h
index 6d35956..4d8824c 100644
--- a/libgav1/src/dsp/arm/distance_weighted_blend_neon.h
+++ b/libgav1/src/dsp/arm/distance_weighted_blend_neon.h
@@ -17,8 +17,8 @@
 #ifndef LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
 #define LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
 
-#include "src/dsp/cpu.h"
 #include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
 
 namespace libgav1 {
 namespace dsp {
@@ -32,7 +32,7 @@
 // If NEON is enabled signal the NEON implementation should be used instead of
 // normal C.
 #if LIBGAV1_ENABLE_NEON
-#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
 
 #endif  // LIBGAV1_ENABLE_NEON
 
diff --git a/libgav1/src/dsp/arm/film_grain_neon.cc b/libgav1/src/dsp/arm/film_grain_neon.cc
new file mode 100644
index 0000000..2612466
--- /dev/null
+++ b/libgav1/src/dsp/arm/film_grain_neon.cc
@@ -0,0 +1,1188 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/arm/film_grain_neon.h"
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// These functions are overloaded for both possible sizes in order to simplify
+// loading and storing to and from intermediate value types from within a
+// template function.
+inline int16x8_t GetSignedSource8(const int8_t* src) {
+  return vmovl_s8(vld1_s8(src));
+}
+
+inline int16x8_t GetSignedSource8(const uint8_t* src) {
+  return ZeroExtend(vld1_u8(src));
+}
+
+inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) {
+  vst1_u8(dest, vmovn_u16(data));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+inline int16x8_t GetSignedSource8(const int16_t* src) { return vld1q_s16(src); }
+
+inline int16x8_t GetSignedSource8(const uint16_t* src) {
+  return vreinterpretq_s16_u16(vld1q_u16(src));
+}
+
+inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) {
+  vst1q_u16(dest, data);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Each element in |sum| represents one destination value's running
+// autoregression formula. The fixed source values in |grain_lo| and |grain_hi|
+// allow for a sliding window in successive calls to this function.
+template <int position_offset>
+inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
+                                           const int16x8_t grain_hi,
+                                           int16_t coeff, int32x4x2_t sum) {
+  const int16x8_t grain = vextq_s16(grain_lo, grain_hi, position_offset);
+  sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(grain), coeff);
+  sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(grain), coeff);
+  return sum;
+}
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int8_t* grain_cursor, int32x4x2_t sum,
+                                     const int8_t* coeffs, int pos, int shift) {
+  int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+  for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+    result += grain_cursor[lane + delta_col] * coeffs[pos];
+    ++pos;
+  }
+  grain_cursor[lane] =
+      Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+            GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int16_t* grain_cursor, int32x4x2_t sum,
+                                     const int8_t* coeffs, int pos, int shift) {
+  int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+  for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+    result += grain_cursor[lane + delta_col] * coeffs[pos];
+    ++pos;
+  }
+  grain_cursor[lane] =
+      Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+            GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(int8_t* u_grain_cursor,
+                                           int8_t* v_grain_cursor,
+                                           int32x4x2_t sum_u, int32x4x2_t sum_v,
+                                           const int8_t* coeffs_u,
+                                           const int8_t* coeffs_v, int pos,
+                                           int shift) {
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      u_grain_cursor, sum_u, coeffs_u, pos, shift);
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(int16_t* u_grain_cursor,
+                                           int16_t* v_grain_cursor,
+                                           int32x4x2_t sum_u, int32x4x2_t sum_v,
+                                           const int8_t* coeffs_u,
+                                           const int8_t* coeffs_v, int pos,
+                                           int shift) {
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      u_grain_cursor, sum_u, coeffs_u, pos, shift);
+  WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+      v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+#endif  // LIBGAV1_MAX_BITDEPTH >= 10
+
+inline void SetZero(int32x4x2_t* v) {
+  v->val[0] = vdupq_n_s32(0);
+  v->val[1] = vdupq_n_s32(0);
+}
+
+// Computes subsampled luma for use with chroma, by averaging in the x direction
+// or y direction when applicable.
+int16x8_t GetSubsampledLuma(const int8_t* const luma, int subsampling_x,
+                            int subsampling_y, ptrdiff_t stride) {
+  if (subsampling_y != 0) {
+    assert(subsampling_x != 0);
+    const int8x16_t src0 = vld1q_s8(luma);
+    const int8x16_t src1 = vld1q_s8(luma + stride);
+    const int16x8_t ret0 = vcombine_s16(vpaddl_s8(vget_low_s8(src0)),
+                                        vpaddl_s8(vget_high_s8(src0)));
+    const int16x8_t ret1 = vcombine_s16(vpaddl_s8(vget_low_s8(src1)),
+                                        vpaddl_s8(vget_high_s8(src1)));
+    return vrshrq_n_s16(vaddq_s16(ret0, ret1), 2);
+  }
+  if (subsampling_x != 0) {
+    const int8x16_t src = vld1q_s8(luma);
+    return vrshrq_n_s16(
+        vcombine_s16(vpaddl_s8(vget_low_s8(src)), vpaddl_s8(vget_high_s8(src))),
+        1);
+  }
+  return vmovl_s8(vld1_s8(luma));
+}
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
+  if (subsampling_x != 0) {
+    const uint8x16_t src = vld1q_u8(luma);
+    return vrshrq_n_u16(vpaddlq_u8(src), 1);
+  }
+  return vmovl_u8(vld1_u8(luma));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// Computes subsampled luma for use with chroma, by averaging in the x direction
+// or y direction when applicable.
+int16x8_t GetSubsampledLuma(const int16_t* const luma, int subsampling_x,
+                            int subsampling_y, ptrdiff_t stride) {
+  if (subsampling_y != 0) {
+    assert(subsampling_x != 0);
+    int16x8_t src0_lo = vld1q_s16(luma);
+    int16x8_t src0_hi = vld1q_s16(luma + 8);
+    const int16x8_t src1_lo = vld1q_s16(luma + stride);
+    const int16x8_t src1_hi = vld1q_s16(luma + stride + 8);
+    const int16x8_t src0 =
+        vcombine_s16(vpadd_s16(vget_low_s16(src0_lo), vget_high_s16(src0_lo)),
+                     vpadd_s16(vget_low_s16(src0_hi), vget_high_s16(src0_hi)));
+    const int16x8_t src1 =
+        vcombine_s16(vpadd_s16(vget_low_s16(src1_lo), vget_high_s16(src1_lo)),
+                     vpadd_s16(vget_low_s16(src1_hi), vget_high_s16(src1_hi)));
+    return vrshrq_n_s16(vaddq_s16(src0, src1), 2);
+  }
+  if (subsampling_x != 0) {
+    const int16x8_t src_lo = vld1q_s16(luma);
+    const int16x8_t src_hi = vld1q_s16(luma + 8);
+    const int16x8_t ret =
+        vcombine_s16(vpadd_s16(vget_low_s16(src_lo), vget_high_s16(src_lo)),
+                     vpadd_s16(vget_low_s16(src_hi), vget_high_s16(src_hi)));
+    return vrshrq_n_s16(ret, 1);
+  }
+  return vld1q_s16(luma);
+}
+