Merge rvc-qpr-dev-plus-aosp-without-vendor@6881855
Bug: 172690556
Merged-In: Ib7af1dc437feb7676e2c9ebe81075bc9e93f0eb9
Change-Id: I1325e88711b3e467279ebcc168ddb20525b0388a
diff --git a/Android.bp b/Android.bp
index 3d5b91a..e6a47be 100644
--- a/Android.bp
+++ b/Android.bp
@@ -18,6 +18,7 @@
export_include_dirs: [
".",
+ "libgav1/src",
],
cflags: [
@@ -40,10 +41,12 @@
"libgav1/src/buffer_pool.cc",
"libgav1/src/decoder.cc",
"libgav1/src/decoder_impl.cc",
- "libgav1/src/decoder_scratch_buffer.cc",
+ "libgav1/src/decoder_settings.cc",
"libgav1/src/dsp/arm/average_blend_neon.cc",
+ "libgav1/src/dsp/arm/cdef_neon.cc",
"libgav1/src/dsp/arm/convolve_neon.cc",
"libgav1/src/dsp/arm/distance_weighted_blend_neon.cc",
+ "libgav1/src/dsp/arm/film_grain_neon.cc",
"libgav1/src/dsp/arm/intra_edge_neon.cc",
"libgav1/src/dsp/arm/intrapred_cfl_neon.cc",
"libgav1/src/dsp/arm/intrapred_directional_neon.cc",
@@ -54,13 +57,16 @@
"libgav1/src/dsp/arm/loop_filter_neon.cc",
"libgav1/src/dsp/arm/loop_restoration_neon.cc",
"libgav1/src/dsp/arm/mask_blend_neon.cc",
+ "libgav1/src/dsp/arm/motion_field_projection_neon.cc",
+ "libgav1/src/dsp/arm/motion_vector_search_neon.cc",
"libgav1/src/dsp/arm/obmc_neon.cc",
+ "libgav1/src/dsp/arm/super_res_neon.cc",
"libgav1/src/dsp/arm/warp_neon.cc",
+ "libgav1/src/dsp/arm/weight_mask_neon.cc",
"libgav1/src/dsp/average_blend.cc",
"libgav1/src/dsp/cdef.cc",
"libgav1/src/dsp/constants.cc",
"libgav1/src/dsp/convolve.cc",
- "libgav1/src/dsp/cpu.cc",
"libgav1/src/dsp/distance_weighted_blend.cc",
"libgav1/src/dsp/dsp.cc",
"libgav1/src/dsp/film_grain.cc",
@@ -70,9 +76,14 @@
"libgav1/src/dsp/loop_filter.cc",
"libgav1/src/dsp/loop_restoration.cc",
"libgav1/src/dsp/mask_blend.cc",
+ "libgav1/src/dsp/motion_field_projection.cc",
+ "libgav1/src/dsp/motion_vector_search.cc",
"libgav1/src/dsp/obmc.cc",
+ "libgav1/src/dsp/super_res.cc",
"libgav1/src/dsp/warp.cc",
+ "libgav1/src/dsp/weight_mask.cc",
"libgav1/src/dsp/x86/average_blend_sse4.cc",
+ "libgav1/src/dsp/x86/cdef_sse4.cc",
"libgav1/src/dsp/x86/convolve_sse4.cc",
"libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc",
"libgav1/src/dsp/x86/intra_edge_sse4.cc",
@@ -82,17 +93,29 @@
"libgav1/src/dsp/x86/inverse_transform_sse4.cc",
"libgav1/src/dsp/x86/loop_filter_sse4.cc",
"libgav1/src/dsp/x86/loop_restoration_sse4.cc",
+ "libgav1/src/dsp/x86/mask_blend_sse4.cc",
+ "libgav1/src/dsp/x86/motion_field_projection_sse4.cc",
+ "libgav1/src/dsp/x86/motion_vector_search_sse4.cc",
"libgav1/src/dsp/x86/obmc_sse4.cc",
+ "libgav1/src/dsp/x86/super_res_sse4.cc",
+ "libgav1/src/dsp/x86/warp_sse4.cc",
+ "libgav1/src/dsp/x86/weight_mask_sse4.cc",
+ "libgav1/src/film_grain.cc",
+ "libgav1/src/frame_buffer.cc",
"libgav1/src/internal_frame_buffer_list.cc",
- "libgav1/src/loop_filter_mask.cc",
"libgav1/src/loop_restoration_info.cc",
"libgav1/src/motion_vector.cc",
"libgav1/src/obu_parser.cc",
- "libgav1/src/post_filter.cc",
+ "libgav1/src/post_filter/cdef.cc",
+ "libgav1/src/post_filter/deblock.cc",
+ "libgav1/src/post_filter/loop_restoration.cc",
+ "libgav1/src/post_filter/post_filter.cc",
+ "libgav1/src/post_filter/super_res.cc",
"libgav1/src/prediction_mask.cc",
"libgav1/src/quantizer.cc",
"libgav1/src/reconstruction.cc",
"libgav1/src/residual_buffer_pool.cc",
+ "libgav1/src/status_code.cc",
"libgav1/src/symbol_decoder_context.cc",
"libgav1/src/threading_strategy.cc",
"libgav1/src/tile/bitstream/mode_info.cc",
@@ -100,10 +123,12 @@
"libgav1/src/tile/bitstream/partition.cc",
"libgav1/src/tile/bitstream/transform_size.cc",
"libgav1/src/tile/prediction.cc",
+ "libgav1/src/tile_scratch_buffer.cc",
"libgav1/src/tile/tile.cc",
"libgav1/src/utils/bit_reader.cc",
"libgav1/src/utils/block_parameters_holder.cc",
"libgav1/src/utils/constants.cc",
+ "libgav1/src/utils/cpu.cc",
"libgav1/src/utils/entropy_decoder.cc",
"libgav1/src/utils/executor.cc",
"libgav1/src/utils/logging.cc",
@@ -112,6 +137,7 @@
"libgav1/src/utils/segmentation.cc",
"libgav1/src/utils/segmentation_map.cc",
"libgav1/src/utils/threadpool.cc",
+ "libgav1/src/version.cc",
"libgav1/src/warp_prediction.cc",
"libgav1/src/yuv_buffer.cc",
],
diff --git a/README.version b/README.version
index 5d15f7f..b65b65a 100644
--- a/README.version
+++ b/README.version
@@ -1,11 +1,5 @@
URL: https://chromium.googlesource.com/codecs/libgav1
-Version: cl/267700628
+Version: v0.16.0
BugComponent: 324837
Local Modifications:
-- ab3390a external/libgav1,cosmetics: add license headers
-- backport cl/281117442: Fully use the frame border for reference block.
-- backport cl/289984918: convolve: Use the correct subsampling for ref frames
-- backport cl/289966078: Move initial_display_delay out of OperatingParamet
-- backport cl/290784565: Handle a change of sequence header parameters.
-- backport cl/291222461: Disallow change of sequence header during a frame.
-- backport cl/289910031: obu: Check for size validity in SetTileDataOffset
+None
diff --git a/libgav1/.gitignore b/libgav1/.gitignore
new file mode 100644
index 0000000..87ccf24
--- /dev/null
+++ b/libgav1/.gitignore
@@ -0,0 +1,2 @@
+/build
+/third_party
diff --git a/libgav1/AUTHORS b/libgav1/AUTHORS
new file mode 100644
index 0000000..d92ea0a
--- /dev/null
+++ b/libgav1/AUTHORS
@@ -0,0 +1,6 @@
+# This is the list of libgav1 authors for copyright purposes.
+#
+# This does not necessarily list everyone who has contributed code, since in
+# some cases, their employer may be the copyright holder. To see the full list
+# of contributors, see the revision history in source control.
+Google LLC
diff --git a/libgav1/CMakeLists.txt b/libgav1/CMakeLists.txt
new file mode 100644
index 0000000..f033bae
--- /dev/null
+++ b/libgav1/CMakeLists.txt
@@ -0,0 +1,124 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# libgav1 requires modern CMake.
+cmake_minimum_required(VERSION 3.7.1 FATAL_ERROR)
+
+# libgav1 requires C++11.
+set(CMAKE_CXX_STANDARD 11)
+set(ABSL_CXX_STANDARD 11)
+
+project(libgav1 CXX)
+
+set(libgav1_root "${CMAKE_CURRENT_SOURCE_DIR}")
+set(libgav1_build "${CMAKE_BINARY_DIR}")
+
+if("${libgav1_root}" STREQUAL "${libgav1_build}")
+ message(
+ FATAL_ERROR
+ "Building from within the libgav1 source tree is not supported.\n"
+ "Hint: Run these commands\n" "$ rm -rf CMakeCache.txt CMakeFiles\n"
+ "$ mkdir -p ../libgav1_build\n" "$ cd ../libgav1_build\n"
+ "And re-run CMake from the libgav1_build directory.")
+endif()
+
+set(libgav1_examples "${libgav1_root}/examples")
+set(libgav1_source "${libgav1_root}/src")
+
+include(FindThreads)
+
+include("${libgav1_examples}/libgav1_examples.cmake")
+include("${libgav1_root}/cmake/libgav1_build_definitions.cmake")
+include("${libgav1_root}/cmake/libgav1_cpu_detection.cmake")
+include("${libgav1_root}/cmake/libgav1_flags.cmake")
+include("${libgav1_root}/cmake/libgav1_helpers.cmake")
+include("${libgav1_root}/cmake/libgav1_install.cmake")
+include("${libgav1_root}/cmake/libgav1_intrinsics.cmake")
+include("${libgav1_root}/cmake/libgav1_options.cmake")
+include("${libgav1_root}/cmake/libgav1_sanitizer.cmake")
+include("${libgav1_root}/cmake/libgav1_targets.cmake")
+include("${libgav1_root}/cmake/libgav1_variables.cmake")
+include("${libgav1_source}/dsp/libgav1_dsp.cmake")
+include("${libgav1_source}/libgav1_decoder.cmake")
+include("${libgav1_source}/utils/libgav1_utils.cmake")
+
+libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING
+ "Enables optimized code." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations."
+ VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING
+ "Enables sse4.1 optimizations." VALUE ON)
+libgav1_option(
+ NAME LIBGAV1_VERBOSE HELPSTRING
+ "Enables verbose build system output. Higher numbers are more verbose." VALUE
+ OFF)
+
+if(NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE Release)
+endif()
+
+libgav1_optimization_detect()
+libgav1_set_build_definitions()
+libgav1_set_cxx_flags()
+libgav1_configure_sanitizer()
+
+# Supported bit depth.
+libgav1_track_configuration_variable(LIBGAV1_MAX_BITDEPTH)
+
+# C++ and linker flags.
+libgav1_track_configuration_variable(LIBGAV1_CXX_FLAGS)
+libgav1_track_configuration_variable(LIBGAV1_EXE_LINKER_FLAGS)
+
+# Sanitizer integration.
+libgav1_track_configuration_variable(LIBGAV1_SANITIZE)
+
+# Generated source file directory.
+libgav1_track_configuration_variable(LIBGAV1_GENERATED_SOURCES_DIRECTORY)
+
+# Controls use of std::mutex and absl::Mutex in ThreadPool.
+libgav1_track_configuration_variable(LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+
+if(LIBGAV1_VERBOSE)
+ libgav1_dump_cmake_flag_variables()
+ libgav1_dump_tracked_configuration_variables()
+ libgav1_dump_options()
+endif()
+
+set(libgav1_abseil_build "${libgav1_build}/abseil")
+set(libgav1_gtest_build "${libgav1_build}/gtest")
+
+# Compiler/linker flags must be lists, but come in from the environment as
+# strings. Break them up:
+if(NOT "${LIBGAV1_CXX_FLAGS}" STREQUAL "")
+ separate_arguments(LIBGAV1_CXX_FLAGS)
+endif()
+if(NOT "${LIBGAV1_EXE_LINKER_FLAGS}" STREQUAL "")
+ separate_arguments(LIBGAV1_EXE_LINKER_FLAGS)
+endif()
+
+add_subdirectory("${libgav1_root}/third_party/abseil-cpp"
+ "${libgav1_abseil_build}" EXCLUDE_FROM_ALL)
+
+libgav1_reset_target_lists()
+libgav1_add_dsp_targets()
+libgav1_add_decoder_targets()
+libgav1_add_examples_targets()
+libgav1_add_utils_targets()
+libgav1_setup_install_target()
+
+if(LIBGAV1_VERBOSE)
+ libgav1_dump_cmake_flag_variables()
+ libgav1_dump_tracked_configuration_variables()
+ libgav1_dump_options()
+endif()
diff --git a/libgav1/CONTRIBUTING.md b/libgav1/CONTRIBUTING.md
new file mode 100644
index 0000000..69140ff
--- /dev/null
+++ b/libgav1/CONTRIBUTING.md
@@ -0,0 +1,27 @@
+# How to Contribute
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use a [Gerrit](https://www.gerritcodereview.com) instance hosted at
+https://chromium-review.googlesource.com for this purpose.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
diff --git a/libgav1/README.md b/libgav1/README.md
new file mode 100644
index 0000000..b935679
--- /dev/null
+++ b/libgav1/README.md
@@ -0,0 +1,165 @@
+# libgav1 -- an AV1 decoder
+
+libgav1 is a Main profile (0) & High profile (1) compliant AV1 decoder. More
+information on the AV1 video format can be found at
+[aomedia.org](https://aomedia.org).
+
+[TOC]
+
+## Building
+
+### Prerequisites
+
+1. A C++11 compiler. gcc 6+, clang 7+ or Microsoft Visual Studio 2017+ are
+ recommended.
+
+2. [CMake >= 3.7.1](https://cmake.org/download/)
+
+3. [Abseil](https://abseil.io)
+
+ From within the libgav1 directory:
+
+ ```shell
+ $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp
+ ```
+
+### Compile
+
+```shell
+ $ mkdir build && cd build
+ $ cmake -G "Unix Makefiles" ..
+ $ make
+```
+
+Configuration options:
+
+* `LIBGAV1_MAX_BITDEPTH`: defines the maximum supported bitdepth (8, 10;
+ default: 10).
+* `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS`: define to a non-zero value to disable
+ [symbol reduction](#symbol-reduction) in an optimized build to keep all
+ versions of dsp functions available. Automatically defined in
+ `src/dsp/dsp.h` if unset.
+* `LIBGAV1_ENABLE_NEON`: define to a non-zero value to enable NEON
+ optimizations. Automatically defined in `src/dsp/dsp.h` if unset.
+* `LIBGAV1_ENABLE_SSE4_1`: define to a non-zero value to enable sse4.1
+ optimizations. Automatically defined in `src/dsp/dsp.h` if unset.
+* `LIBGAV1_ENABLE_LOGGING`: define to 0/1 to control debug logging.
+ Automatically defined in `src/utils/logging.h` if unset.
+* `LIBGAV1_EXAMPLES_ENABLE_LOGGING`: define to 0/1 to control error logging in
+ the examples. Automatically defined in `examples/logging.h` if unset.
+* `LIBGAV1_ENABLE_TRANSFORM_RANGE_CHECK`: define to 1 to enable transform
+ coefficient range checks.
+* `LIBGAV1_LOG_LEVEL`: controls the maximum allowed log level, see `enum
+ LogSeverity` in `src/utils/logging.h`. Automatically defined in
+ `src/utils/logging.cc` if unset.
+* `LIBGAV1_THREADPOOL_USE_STD_MUTEX`: controls use of std::mutex and
+ absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil
+ dependency from the core library. Automatically defined in
+ `src/utils/threadpool.h` if unset.
+* `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is
+ allowed to create. Has to be an integer > 0. Otherwise this is ignored.
+ The default value is 128.
+* `LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER`: the threshold multiplier that
+ is used to determine when to use frame parallel decoding. Frame parallel
+ decoding will be used if |threads| > |tile_count| * this multiplier. Has to
+ be an integer > 0. The default value is 4. This is an advanced setting
+ intended for testing purposes.
+
+For additional options see:
+
+```shell
+ $ cmake .. -LH
+```
+
+## Testing
+
+* `gav1_decode` can be used to decode IVF files, see `gav1_decode --help` for
+ options. Note: tools like [FFmpeg](https://ffmpeg.org) can be used to
+ convert other container formats to IVF.
+
+## Development
+
+### Contributing
+
+See [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to submit patches.
+
+### Style
+
+libgav1 follows the
+[Google C++ style guide](https://google.github.io/styleguide/cppguide.html) with
+formatting enforced by `clang-format`.
+
+### Comments
+
+Comments of the form '`// X.Y(.Z).`', '`Section X.Y(.Z).`' or '`... in the
+spec`' reference the relevant section(s) in the
+[AV1 specification](http://aomediacodec.github.io/av1-spec/av1-spec.pdf).
+
+### DSP structure
+
+* `src/dsp/dsp.cc` defines the main entry point: `libgav1::dsp::DspInit()`.
+ This handles cpu-detection and initializing each logical unit which populate
+ `libgav1::dsp::Dsp` function tables.
+* `src/dsp/dsp.h` contains function and type definitions for all logical units
+ (e.g., intra-predictors)
+* `src/utils/cpu.h` contains definitions for cpu-detection
+* base implementations are located in `src/dsp/*.{h,cc}` with platform
+ specific optimizations in sub-folders
+* unit tests define `DISABLED_Speed` test(s) to allow timing of individual
+ functions
+
+#### Symbol reduction
+
+Based on the build configuration unneeded lesser optimizations are removed using
+a hierarchical include and define system. Each logical unit in `src/dsp` should
+include all platform specific headers in descending order to allow higher level
+optimizations to disable lower level ones. See `src/dsp/loop_filter.h` for an
+example.
+
+Each function receives a new define which can be checked in platform specific
+headers. The format is: `LIBGAV1_<Dsp-table>_FunctionName` or
+`LIBGAV1_<Dsp-table>_[sub-table-index1][...-indexN]`, e.g.,
+`LIBGAV1_Dsp8bpp_AverageBlend`,
+`LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc`. The Dsp-table name is of
+the form `Dsp<bitdepth>bpp` e.g. `Dsp10bpp` for bitdepth == 10 (bpp stands for
+bits per pixel). The indices correspond to enum values used as lookups with
+leading 'k' removed. Platform specific headers then should first check if the
+symbol is defined and if not set the value to the corresponding
+`LIBGAV1_CPU_<arch>` value from `src/utils/cpu.h`.
+
+```
+ #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc
+ #define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_SSE4_1
+ #endif
+```
+
+Within each module the code should check if the symbol is defined to its
+specific architecture or forced via `LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS` before
+defining the function. The `DSP_ENABLED_(8|10)BPP_*` macros are available to
+simplify this check for optimized code.
+
+```
+ #if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDc)
+ ...
+
+ // In unoptimized code use the following structure; there's no equivalent
+ // define for LIBGAV1_CPU_C as it would require duplicating the function
+ // defines used in optimized code for only a small benefit to this
+ // boilerplate.
+ #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ ...
+ #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
+ ...
+```
+
+## Bugs
+
+Please report all bugs to the issue tracker:
+https://issuetracker.google.com/issues/new?component=750480&template=1355007
+
+## Discussion
+
+Email: gav1-devel@googlegroups.com
+
+Web: https://groups.google.com/forum/#!forum/gav1-devel
diff --git a/libgav1/cmake/libgav1-config.cmake.template b/libgav1/cmake/libgav1-config.cmake.template
new file mode 100644
index 0000000..dc253d3
--- /dev/null
+++ b/libgav1/cmake/libgav1-config.cmake.template
@@ -0,0 +1,2 @@
+set(LIBGAV1_INCLUDE_DIRS "@LIBGAV1_INCLUDE_DIRS@")
+set(LIBGAV1_LIBRARIES "gav1")
diff --git a/libgav1/cmake/libgav1.pc.template b/libgav1/cmake/libgav1.pc.template
new file mode 100644
index 0000000..c571a43
--- /dev/null
+++ b/libgav1/cmake/libgav1.pc.template
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: @PROJECT_NAME@
+Description: AV1 decoder library (@LIBGAV1_MAX_BITDEPTH@-bit).
+Version: @LIBGAV1_VERSION@
+Cflags: -I${includedir}
+Libs: -L${libdir} -lgav1
+Libs.private: @CMAKE_THREAD_LIBS_INIT@
diff --git a/libgav1/cmake/libgav1_build_definitions.cmake b/libgav1/cmake/libgav1_build_definitions.cmake
new file mode 100644
index 0000000..930d8f5
--- /dev/null
+++ b/libgav1/cmake/libgav1_build_definitions.cmake
@@ -0,0 +1,149 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_BUILD_DEFINITIONS_CMAKE_ 1)
+
+macro(libgav1_set_build_definitions)
+ string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase)
+
+ libgav1_load_version_info()
+ set(LIBGAV1_SOVERSION 0)
+
+ list(APPEND libgav1_include_paths "${libgav1_root}" "${libgav1_root}/src"
+ "${libgav1_build}" "${libgav1_root}/third_party/abseil-cpp")
+ list(APPEND libgav1_gtest_include_paths
+ "third_party/googletest/googlemock/include"
+ "third_party/googletest/googletest/include"
+ "third_party/googletest/googletest")
+ list(APPEND libgav1_test_include_paths ${libgav1_include_paths}
+ ${libgav1_gtest_include_paths})
+ list(APPEND libgav1_defines "LIBGAV1_CMAKE=1"
+ "LIBGAV1_FLAGS_SRCDIR=\"${libgav1_root}\""
+ "LIBGAV1_FLAGS_TMPDIR=\"/tmp\"")
+
+ if(MSVC OR WIN32)
+ list(APPEND libgav1_defines "_CRT_SECURE_NO_DEPRECATE=1" "NOMINMAX=1")
+ endif()
+
+ if(ANDROID)
+ if(CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a")
+ set(CMAKE_ANDROID_ARM_MODE ON)
+ endif()
+
+ if(build_type_lowercase MATCHES "rel")
+ list(APPEND libgav1_base_cxx_flags "-fno-stack-protector")
+ endif()
+ endif()
+
+ list(APPEND libgav1_base_cxx_flags "-Wall" "-Wextra" "-Wmissing-declarations"
+ "-Wno-sign-compare" "-fvisibility=hidden"
+ "-fvisibility-inlines-hidden")
+
+ if(BUILD_SHARED_LIBS)
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+ set(libgav1_dependency libgav1_shared)
+ else()
+ set(libgav1_dependency libgav1_static)
+ endif()
+
+ list(APPEND libgav1_clang_cxx_flags "-Wextra-semi" "-Wmissing-prototypes"
+ "-Wshorten-64-to-32")
+
+ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6")
+ # Quiet warnings in copy-list-initialization where {} elision has always
+ # been allowed.
+ list(APPEND libgav1_clang_cxx_flags "-Wno-missing-braces")
+ endif()
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8)
+ list(APPEND libgav1_clang_cxx_flags "-Wextra-semi-stmt")
+ endif()
+ endif()
+
+ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "7")
+ # Quiet warnings due to potential snprintf() truncation in threadpool.cc.
+ list(APPEND libgav1_base_cxx_flags "-Wno-format-truncation")
+
+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7")
+ # Quiet gcc 6 vs 7 abi warnings:
+ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
+ list(APPEND libgav1_base_cxx_flags "-Wno-psabi")
+ list(APPEND ABSL_GCC_FLAGS "-Wno-psabi")
+ endif()
+ endif()
+ endif()
+
+ if(build_type_lowercase MATCHES "rel")
+ # TODO(tomfinegan): this value is only a concern for the core library and
+ # can be made smaller if the test targets are avoided.
+ list(APPEND libgav1_base_cxx_flags "-Wstack-usage=196608")
+ endif()
+
+ list(APPEND libgav1_msvc_cxx_flags
+ # Warning level 3.
+ "/W3"
+ # Disable warning C4018:
+ # '<comparison operator>' signed/unsigned mismatch
+ "/wd4018"
+ # Disable warning C4244:
+ # 'argument': conversion from '<double/int>' to
+ # '<float/smaller int type>', possible loss of data
+ "/wd4244"
+ # Disable warning C4267:
+ # '=': conversion from '<double/int>' to
+ # '<float/smaller int type>', possible loss of data
+ "/wd4267"
+ # Disable warning C4309:
+ # 'argument': truncation of constant value
+ "/wd4309"
+ # Disable warning C4551:
+ # function call missing argument list
+ "/wd4551")
+
+ if(BUILD_SHARED_LIBS)
+ list(APPEND libgav1_msvc_cxx_flags
+ # Disable warning C4251:
+ # 'libgav1::DecoderImpl class member' needs to have
+ # dll-interface to be used by clients of class
+ # 'libgav1::Decoder'.
+ "/wd4251")
+ endif()
+
+ if(NOT LIBGAV1_MAX_BITDEPTH)
+ set(LIBGAV1_MAX_BITDEPTH 10)
+ elseif(NOT LIBGAV1_MAX_BITDEPTH EQUAL 8 AND NOT LIBGAV1_MAX_BITDEPTH EQUAL 10)
+ libgav1_die("LIBGAV1_MAX_BITDEPTH must be 8 or 10.")
+ endif()
+
+ list(APPEND libgav1_defines "LIBGAV1_MAX_BITDEPTH=${LIBGAV1_MAX_BITDEPTH}")
+
+ if(DEFINED LIBGAV1_THREADPOOL_USE_STD_MUTEX)
+ if(NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 0
+ AND NOT LIBGAV1_THREADPOOL_USE_STD_MUTEX EQUAL 1)
+ libgav1_die("LIBGAV1_THREADPOOL_USE_STD_MUTEX must be 0 or 1.")
+ endif()
+
+ list(APPEND libgav1_defines
+ "LIBGAV1_THREADPOOL_USE_STD_MUTEX=${LIBGAV1_THREADPOOL_USE_STD_MUTEX}")
+ endif()
+
+ # Source file names ending in these suffixes will have the appropriate
+ # compiler flags added to their compile commands to enable intrinsics.
+ set(libgav1_neon_source_file_suffix "neon.cc")
+ set(libgav1_sse4_source_file_suffix "sse4.cc")
+endmacro()
diff --git a/libgav1/cmake/libgav1_cpu_detection.cmake b/libgav1/cmake/libgav1_cpu_detection.cmake
new file mode 100644
index 0000000..6972d34
--- /dev/null
+++ b/libgav1/cmake/libgav1_cpu_detection.cmake
@@ -0,0 +1,42 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_CPU_DETECTION_CMAKE_ 1)
+
+# Detect optimizations available for the current target CPU.
+macro(libgav1_optimization_detect)
+ if(LIBGAV1_ENABLE_OPTIMIZATIONS)
+ string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase)
+ if(cpu_lowercase MATCHES "^arm|^aarch64")
+ set(libgav1_have_neon ON)
+ elseif(cpu_lowercase MATCHES "^x86|amd64")
+ set(libgav1_have_sse4 ON)
+ endif()
+ endif()
+
+ if(libgav1_have_neon AND LIBGAV1_ENABLE_NEON)
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=1")
+ else()
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=0")
+ endif()
+
+ if(libgav1_have_sse4 AND LIBGAV1_ENABLE_SSE4_1)
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=1")
+ else()
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_SSE4_1=0")
+ endif()
+endmacro()
diff --git a/libgav1/cmake/libgav1_flags.cmake b/libgav1/cmake/libgav1_flags.cmake
new file mode 100644
index 0000000..0b8df60
--- /dev/null
+++ b/libgav1/cmake/libgav1_flags.cmake
@@ -0,0 +1,245 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_FLAGS_CMAKE_ 1)
+
+include(CheckCXXCompilerFlag)
+include(CheckCXXSourceCompiles)
+
+# Adds compiler flags specified by FLAGS to the sources specified by SOURCES:
+#
+# libgav1_set_compiler_flags_for_sources(SOURCES <sources> FLAGS <flags>)
+macro(libgav1_set_compiler_flags_for_sources)
+ unset(compiler_SOURCES)
+ unset(compiler_FLAGS)
+ unset(optional_args)
+ unset(single_value_args)
+ set(multi_value_args SOURCES FLAGS)
+ cmake_parse_arguments(compiler "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT (compiler_SOURCES AND compiler_FLAGS))
+ libgav1_die("libgav1_set_compiler_flags_for_sources: SOURCES and "
+ "FLAGS required.")
+ endif()
+
+ set_source_files_properties(${compiler_SOURCES} PROPERTIES COMPILE_FLAGS
+ ${compiler_FLAGS})
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ foreach(source ${compiler_SOURCES})
+ foreach(flag ${compiler_FLAGS})
+ message("libgav1_set_compiler_flags_for_sources: source:${source} "
+ "flag:${flag}")
+ endforeach()
+ endforeach()
+ endif()
+endmacro()
+
+# Tests compiler flags stored in list(s) specified by FLAG_LIST_VAR_NAMES, adds
+# flags to $LIBGAV1_CXX_FLAGS when tests pass. Terminates configuration if
+# FLAG_REQUIRED is specified and any flag check fails.
+#
+# ~~~
+# libgav1_test_cxx_flag(<FLAG_LIST_VAR_NAMES <flag list variable(s)>>
+# [FLAG_REQUIRED])
+# ~~~
+macro(libgav1_test_cxx_flag)
+ unset(cxx_test_FLAG_LIST_VAR_NAMES)
+ unset(cxx_test_FLAG_REQUIRED)
+ unset(single_value_args)
+ set(optional_args FLAG_REQUIRED)
+ set(multi_value_args FLAG_LIST_VAR_NAMES)
+ cmake_parse_arguments(cxx_test "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT cxx_test_FLAG_LIST_VAR_NAMES)
+ libgav1_die("libgav1_test_cxx_flag: FLAG_LIST_VAR_NAMES required")
+ endif()
+
+ unset(cxx_flags)
+ foreach(list_var ${cxx_test_FLAG_LIST_VAR_NAMES})
+ if(LIBGAV1_VERBOSE)
+ message("libgav1_test_cxx_flag: adding ${list_var} to cxx_flags")
+ endif()
+ list(APPEND cxx_flags ${${list_var}})
+ endforeach()
+
+ if(LIBGAV1_VERBOSE)
+ message("CXX test: all flags: ${cxx_flags}")
+ endif()
+
+ unset(all_cxx_flags)
+ list(APPEND all_cxx_flags ${LIBGAV1_CXX_FLAGS} ${cxx_flags})
+
+ # Turn off output from check_cxx_source_compiles. Print status directly
+ # instead since the logging messages from check_cxx_source_compiles can be
+ # quite confusing.
+ set(CMAKE_REQUIRED_QUIET TRUE)
+
+ # Run the actual compile test.
+ unset(libgav1_all_cxx_flags_pass CACHE)
+ message("--- Running combined CXX flags test, flags: ${all_cxx_flags}")
+ check_cxx_compiler_flag("${all_cxx_flags}" libgav1_all_cxx_flags_pass)
+
+ if(cxx_test_FLAG_REQUIRED AND NOT libgav1_all_cxx_flags_pass)
+ libgav1_die("Flag test failed for required flag(s): "
+ "${all_cxx_flags} and FLAG_REQUIRED specified.")
+ endif()
+
+ if(libgav1_all_cxx_flags_pass)
+ # Test passed: update the global flag list used by the libgav1 target
+ # creation wrappers.
+ set(LIBGAV1_CXX_FLAGS ${cxx_flags})
+ list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS)
+
+ if(LIBGAV1_VERBOSE)
+ message("LIBGAV1_CXX_FLAGS=${LIBGAV1_CXX_FLAGS}")
+ endif()
+
+ message("--- Passed combined CXX flags test")
+ else()
+ message("--- Failed combined CXX flags test, testing flags individually.")
+
+ if(cxx_flags)
+ message("--- Testing flags from $cxx_flags: " "${cxx_flags}")
+ foreach(cxx_flag ${cxx_flags})
+ unset(cxx_flag_test_passed CACHE)
+ message("--- Testing flag: ${cxx_flag}")
+ check_cxx_compiler_flag("${cxx_flag}" cxx_flag_test_passed)
+
+ if(cxx_flag_test_passed)
+ message("--- Passed test for ${cxx_flag}")
+ else()
+ list(REMOVE_ITEM cxx_flags ${cxx_flag})
+ message("--- Failed test for ${cxx_flag}, flag removed.")
+ endif()
+ endforeach()
+
+ set(LIBGAV1_CXX_FLAGS ${cxx_flags})
+ endif()
+ endif()
+
+ if(LIBGAV1_CXX_FLAGS)
+ list(REMOVE_DUPLICATES LIBGAV1_CXX_FLAGS)
+ endif()
+endmacro()
+
+# Tests executable linker flags stored in list specified by FLAG_LIST_VAR_NAME,
+# adds flags to $LIBGAV1_EXE_LINKER_FLAGS when test passes. Terminates
+# configuration when flag check fails. libgav1_set_cxx_flags() must be called
+# before calling this macro because it assumes $LIBGAV1_CXX_FLAGS contains only
+# valid CXX flags.
+#
+# libgav1_test_exe_linker_flag(<FLAG_LIST_VAR_NAME <flag list variable)>)
+macro(libgav1_test_exe_linker_flag)
+ unset(link_FLAG_LIST_VAR_NAME)
+ unset(optional_args)
+ unset(multi_value_args)
+ set(single_value_args FLAG_LIST_VAR_NAME)
+ cmake_parse_arguments(link "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT link_FLAG_LIST_VAR_NAME)
+ libgav1_die("libgav1_test_link_flag: FLAG_LIST_VAR_NAME required")
+ endif()
+
+ libgav1_set_and_stringify(DEST linker_flags SOURCE_VARS
+ ${link_FLAG_LIST_VAR_NAME})
+
+ if(LIBGAV1_VERBOSE)
+ message("EXE LINKER test: all flags: ${linker_flags}")
+ endif()
+
+ # Tests of $LIBGAV1_CXX_FLAGS have already passed. Include them with the
+ # linker test.
+ libgav1_set_and_stringify(DEST CMAKE_REQUIRED_FLAGS SOURCE_VARS
+ LIBGAV1_CXX_FLAGS)
+
+ # Cache the global exe linker flags.
+ if(CMAKE_EXE_LINKER_FLAGS)
+ set(cached_CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS})
+ libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE
+ ${linker_flags})
+ endif()
+
+ libgav1_set_and_stringify(DEST CMAKE_EXE_LINKER_FLAGS SOURCE ${linker_flags}
+ ${CMAKE_EXE_LINKER_FLAGS})
+
+ # Turn off output from check_cxx_source_compiles. Print status directly
+ # instead since the logging messages from check_cxx_source_compiles can be
+ # quite confusing.
+ set(CMAKE_REQUIRED_QUIET TRUE)
+
+ message("--- Running EXE LINKER test for flags: ${linker_flags}")
+
+ unset(linker_flag_test_passed CACHE)
+ set(libgav1_cxx_main "\nint main() { return 0; }")
+ check_cxx_source_compiles("${libgav1_cxx_main}" linker_flag_test_passed)
+
+ if(NOT linker_flag_test_passed)
+ libgav1_die("EXE LINKER test failed.")
+ endif()
+
+ message("--- Passed EXE LINKER flag test.")
+
+ # Restore cached global exe linker flags.
+ if(cached_CMAKE_EXE_LINKER_FLAGS)
+ set(CMAKE_EXE_LINKER_FLAGS cached_CMAKE_EXE_LINKER_FLAGS)
+ else()
+ unset(CMAKE_EXE_LINKER_FLAGS)
+ endif()
+endmacro()
+
+# Runs the libgav1 compiler tests. This macro builds up the list of list var(s)
+# that is passed to libgav1_test_cxx_flag().
+#
+# Note: libgav1_set_build_definitions() must be called before this macro.
+macro(libgav1_set_cxx_flags)
+ unset(cxx_flag_lists)
+
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+ list(APPEND cxx_flag_lists libgav1_base_cxx_flags)
+ endif()
+
+ # Append clang flags after the base set to allow -Wno* overrides to take
+ # effect. Some of the base flags may enable a large set of warnings, e.g.,
+ # -Wall.
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ list(APPEND cxx_flag_lists libgav1_clang_cxx_flags)
+ endif()
+
+ if(MSVC)
+ list(APPEND cxx_flag_lists libgav1_msvc_cxx_flags)
+ endif()
+
+ if(LIBGAV1_VERBOSE)
+ if(cxx_flag_lists)
+ libgav1_set_and_stringify(DEST cxx_flags SOURCE_VARS ${cxx_flag_lists})
+ message("libgav1_set_cxx_flags: internal CXX flags: ${cxx_flags}")
+ endif()
+ endif()
+
+ if(LIBGAV1_CXX_FLAGS)
+ list(APPEND cxx_flag_lists LIBGAV1_CXX_FLAGS)
+ if(LIBGAV1_VERBOSE)
+ message("libgav1_set_cxx_flags: user CXX flags: ${LIBGAV1_CXX_FLAGS}")
+ endif()
+ endif()
+
+ libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES ${cxx_flag_lists})
+endmacro()
diff --git a/libgav1/cmake/libgav1_helpers.cmake b/libgav1/cmake/libgav1_helpers.cmake
new file mode 100644
index 0000000..76d8d67
--- /dev/null
+++ b/libgav1/cmake/libgav1_helpers.cmake
@@ -0,0 +1,134 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_HELPERS_CMAKE_ 1)
+
+# Kills build generation using message(FATAL_ERROR) and outputs all data passed
+# to the console via use of $ARGN.
+macro(libgav1_die)
+ message(FATAL_ERROR ${ARGN})
+endmacro()
+
+# Converts semi-colon delimited list variable(s) to string. Output is written to
+# variable supplied via the DEST parameter. Input is from an expanded variable
+# referenced by SOURCE and/or variable(s) referenced by SOURCE_VARS.
+macro(libgav1_set_and_stringify)
+ set(optional_args)
+ set(single_value_args DEST SOURCE_VAR)
+ set(multi_value_args SOURCE SOURCE_VARS)
+ cmake_parse_arguments(sas "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT sas_DEST OR NOT (sas_SOURCE OR sas_SOURCE_VARS))
+ libgav1_die("libgav1_set_and_stringify: DEST and at least one of SOURCE "
+ "SOURCE_VARS required.")
+ endif()
+
+ unset(${sas_DEST})
+
+ if(sas_SOURCE)
+ # $sas_SOURCE is one or more expanded variables, just copy the values to
+ # $sas_DEST.
+ set(${sas_DEST} "${sas_SOURCE}")
+ endif()
+
+ if(sas_SOURCE_VARS)
+ # $sas_SOURCE_VARS is one or more variable names. Each iteration expands a
+ # variable and appends it to $sas_DEST.
+ foreach(source_var ${sas_SOURCE_VARS})
+ set(${sas_DEST} "${${sas_DEST}} ${${source_var}}")
+ endforeach()
+
+ # Because $sas_DEST can be empty when entering this scope leading whitespace
+ # can be introduced to $sas_DEST on the first iteration of the above loop.
+ # Remove it:
+ string(STRIP "${${sas_DEST}}" ${sas_DEST})
+ endif()
+
+ # Lists in CMake are simply semicolon delimited strings, so stringification is
+ # just a find and replace of the semicolon.
+ string(REPLACE ";" " " ${sas_DEST} "${${sas_DEST}}")
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("libgav1_set_and_stringify: ${sas_DEST}=${${sas_DEST}}")
+ endif()
+endmacro()
+
+# Creates a dummy source file in $LIBGAV1_GENERATED_SOURCES_DIRECTORY and adds
+# it to the specified target. Optionally adds its path to a list variable.
+#
+# libgav1_create_dummy_source_file(<TARGET <target> BASENAME <basename of file>>
+# [LISTVAR <list variable>])
+macro(libgav1_create_dummy_source_file)
+ set(optional_args)
+ set(single_value_args TARGET BASENAME LISTVAR)
+ set(multi_value_args)
+ cmake_parse_arguments(cdsf "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT cdsf_TARGET OR NOT cdsf_BASENAME)
+ libgav1_die(
+ "libgav1_create_dummy_source_file: TARGET and BASENAME required.")
+ endif()
+
+ if(NOT LIBGAV1_GENERATED_SOURCES_DIRECTORY)
+ set(LIBGAV1_GENERATED_SOURCES_DIRECTORY "${libgav1_build}/gen_src")
+ endif()
+
+ set(dummy_source_dir "${LIBGAV1_GENERATED_SOURCES_DIRECTORY}")
+ set(dummy_source_file
+ "${dummy_source_dir}/libgav1_${cdsf_TARGET}_${cdsf_BASENAME}.cc")
+ set(dummy_source_code
+ "// Generated file. DO NOT EDIT!\n"
+ "// C++ source file created for target ${cdsf_TARGET}. \n"
+ "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void);\n"
+ "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void) {}\n")
+ file(WRITE "${dummy_source_file}" "${dummy_source_code}")
+
+ target_sources(${cdsf_TARGET} PRIVATE ${dummy_source_file})
+
+ if(cdsf_LISTVAR)
+ list(APPEND ${cdsf_LISTVAR} "${dummy_source_file}")
+ endif()
+endmacro()
+
+# Loads the version components from $libgav1_source/gav1/version.h and sets the
+# corresponding CMake variables:
+# - LIBGAV1_MAJOR_VERSION
+# - LIBGAV1_MINOR_VERSION
+# - LIBGAV1_PATCH_VERSION
+# - LIBGAV1_VERSION, which is:
+# - $LIBGAV1_MAJOR_VERSION.$LIBGAV1_MINOR_VERSION.$LIBGAV1_PATCH_VERSION
+macro(libgav1_load_version_info)
+ file(STRINGS "${libgav1_source}/gav1/version.h" version_file_strings)
+ foreach(str ${version_file_strings})
+ if(str MATCHES "#define LIBGAV1_")
+ if(str MATCHES "#define LIBGAV1_MAJOR_VERSION ")
+ string(REPLACE "#define LIBGAV1_MAJOR_VERSION " "" LIBGAV1_MAJOR_VERSION
+ "${str}")
+ elseif(str MATCHES "#define LIBGAV1_MINOR_VERSION ")
+ string(REPLACE "#define LIBGAV1_MINOR_VERSION " "" LIBGAV1_MINOR_VERSION
+ "${str}")
+ elseif(str MATCHES "#define LIBGAV1_PATCH_VERSION ")
+ string(REPLACE "#define LIBGAV1_PATCH_VERSION " "" LIBGAV1_PATCH_VERSION
+ "${str}")
+ endif()
+ endif()
+ endforeach()
+ set(LIBGAV1_VERSION "${LIBGAV1_MAJOR_VERSION}.${LIBGAV1_MINOR_VERSION}")
+ set(LIBGAV1_VERSION "${LIBGAV1_VERSION}.${LIBGAV1_PATCH_VERSION}")
+endmacro()
diff --git a/libgav1/cmake/libgav1_install.cmake b/libgav1/cmake/libgav1_install.cmake
new file mode 100644
index 0000000..b7f6006
--- /dev/null
+++ b/libgav1/cmake/libgav1_install.cmake
@@ -0,0 +1,60 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_INSTALL_CMAKE_ 1)
+
+# Sets up the Libgav1 install targets. Must be called after the static library
+# target is created.
+macro(libgav1_setup_install_target)
+ if(NOT (MSVC OR XCODE))
+ include(GNUInstallDirs)
+
+ # pkg-config: libgav1.pc
+ set(prefix "${CMAKE_INSTALL_PREFIX}")
+ set(exec_prefix "\${prefix}")
+ set(libdir "\${prefix}/${CMAKE_INSTALL_LIBDIR}")
+ set(includedir "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+ set(libgav1_lib_name "libgav1")
+
+ configure_file("${libgav1_root}/cmake/libgav1.pc.template"
+ "${libgav1_build}/libgav1.pc" @ONLY NEWLINE_STYLE UNIX)
+ install(FILES "${libgav1_build}/libgav1.pc"
+ DESTINATION "${prefix}/${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+
+ # CMake config: libgav1-config.cmake
+ set(LIBGAV1_INCLUDE_DIRS "${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+ configure_file("${libgav1_root}/cmake/libgav1-config.cmake.template"
+ "${libgav1_build}/libgav1-config.cmake" @ONLY
+ NEWLINE_STYLE UNIX)
+ install(
+ FILES "${libgav1_build}/libgav1-config.cmake"
+ DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_DATAROOTDIR}/cmake")
+
+ install(
+ FILES ${libgav1_api_includes}
+ DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gav1")
+
+ install(TARGETS gav1_decode DESTINATION
+ "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}")
+ install(TARGETS libgav1_static DESTINATION
+ "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+ if(BUILD_SHARED_LIBS)
+ install(TARGETS libgav1_shared DESTINATION
+ "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
+ endif()
+ endif()
+endmacro()
diff --git a/libgav1/cmake/libgav1_intrinsics.cmake b/libgav1/cmake/libgav1_intrinsics.cmake
new file mode 100644
index 0000000..039ef35
--- /dev/null
+++ b/libgav1/cmake/libgav1_intrinsics.cmake
@@ -0,0 +1,110 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_INTRINSICS_CMAKE_ 1)
+
+# Returns the compiler flag for the SIMD intrinsics suffix specified by the
+# SUFFIX argument via the variable specified by the VARIABLE argument:
+# libgav1_get_intrinsics_flag_for_suffix(SUFFIX <suffix> VARIABLE <var name>)
+macro(libgav1_get_intrinsics_flag_for_suffix)
+ unset(intrinsics_SUFFIX)
+ unset(intrinsics_VARIABLE)
+ unset(optional_args)
+ unset(multi_value_args)
+ set(single_value_args SUFFIX VARIABLE)
+ cmake_parse_arguments(intrinsics "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT (intrinsics_SUFFIX AND intrinsics_VARIABLE))
+ message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: SUFFIX and "
+ "VARIABLE required.")
+ endif()
+
+ if(intrinsics_SUFFIX MATCHES "neon")
+ if(NOT MSVC)
+ set(${intrinsics_VARIABLE} "${LIBGAV1_NEON_INTRINSICS_FLAG}")
+ endif()
+ elseif(intrinsics_SUFFIX MATCHES "sse4")
+ if(NOT MSVC)
+ set(${intrinsics_VARIABLE} "-msse4.1")
+ endif()
+ else()
+ message(FATAL_ERROR "libgav1_get_intrinsics_flag_for_suffix: Unknown "
+ "intrinsics suffix: ${intrinsics_SUFFIX}")
+ endif()
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("libgav1_get_intrinsics_flag_for_suffix: "
+ "suffix:${intrinsics_SUFFIX} flag:${${intrinsics_VARIABLE}}")
+ endif()
+endmacro()
+
+# Processes source files specified by SOURCES and adds intrinsics flags as
+# necessary: libgav1_process_intrinsics_sources(SOURCES <sources>)
+#
+# Detects requirement for intrinsics flags using source file name suffix.
+# Currently supports NEON and SSE4.1.
+macro(libgav1_process_intrinsics_sources)
+ unset(arg_TARGET)
+ unset(arg_SOURCES)
+ unset(optional_args)
+ set(single_value_args TARGET)
+ set(multi_value_args SOURCES)
+ cmake_parse_arguments(arg "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+ if(NOT (arg_TARGET AND arg_SOURCES))
+ message(FATAL_ERROR "libgav1_process_intrinsics_sources: TARGET and "
+ "SOURCES required.")
+ endif()
+
+ if(LIBGAV1_ENABLE_SSE4_1 AND libgav1_have_sse4)
+ unset(sse4_sources)
+ list(APPEND sse4_sources ${arg_SOURCES})
+
+ list(FILTER sse4_sources INCLUDE REGEX
+ "${libgav1_sse4_source_file_suffix}$")
+
+ if(sse4_sources)
+ unset(sse4_flags)
+ libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+ ${libgav1_sse4_source_file_suffix}
+ VARIABLE sse4_flags)
+ if(sse4_flags)
+ libgav1_set_compiler_flags_for_sources(SOURCES ${sse4_sources} FLAGS
+ ${sse4_flags})
+ endif()
+ endif()
+ endif()
+
+ if(LIBGAV1_ENABLE_NEON AND libgav1_have_neon)
+ unset(neon_sources)
+ list(APPEND neon_sources ${arg_SOURCES})
+ list(FILTER neon_sources INCLUDE REGEX
+ "${libgav1_neon_source_file_suffix}$")
+
+ if(neon_sources AND LIBGAV1_NEON_INTRINSICS_FLAG)
+ unset(neon_flags)
+ libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+ ${libgav1_neon_source_file_suffix}
+ VARIABLE neon_flags)
+ if(neon_flags)
+ libgav1_set_compiler_flags_for_sources(SOURCES ${neon_sources} FLAGS
+ ${neon_flags})
+ endif()
+ endif()
+ endif()
+endmacro()
diff --git a/libgav1/cmake/libgav1_options.cmake b/libgav1/cmake/libgav1_options.cmake
new file mode 100644
index 0000000..6327bee
--- /dev/null
+++ b/libgav1/cmake/libgav1_options.cmake
@@ -0,0 +1,55 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_OPTIONS_CMAKE_ 1)
+
+# Simple wrapper for CMake's builtin option command that tracks libgav1's build
+# options in the list variable $libgav1_options.
+macro(libgav1_option)
+ unset(option_NAME)
+ unset(option_HELPSTRING)
+ unset(option_VALUE)
+ unset(optional_args)
+ unset(multi_value_args)
+ set(single_value_args NAME HELPSTRING VALUE)
+ cmake_parse_arguments(option "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(NOT (option_NAME AND option_HELPSTRING AND DEFINED option_VALUE))
+ message(FATAL_ERROR "libgav1_option: NAME HELPSTRING and VALUE required.")
+ endif()
+
+ option(${option_NAME} ${option_HELPSTRING} ${option_VALUE})
+
+ if(LIBGAV1_VERBOSE GREATER 2)
+ message("--------- libgav1_option ---------\n"
+ "option_NAME=${option_NAME}\n"
+ "option_HELPSTRING=${option_HELPSTRING}\n"
+ "option_VALUE=${option_VALUE}\n"
+ "------------------------------------------\n")
+ endif()
+
+ list(APPEND libgav1_options ${option_NAME})
+ list(REMOVE_DUPLICATES libgav1_options)
+endmacro()
+
+# Dumps the $libgav1_options list via CMake message command.
+macro(libgav1_dump_options)
+ foreach(option_name ${libgav1_options})
+ message("${option_name}: ${${option_name}}")
+ endforeach()
+endmacro()
diff --git a/libgav1/cmake/libgav1_sanitizer.cmake b/libgav1/cmake/libgav1_sanitizer.cmake
new file mode 100644
index 0000000..4bb2263
--- /dev/null
+++ b/libgav1/cmake/libgav1_sanitizer.cmake
@@ -0,0 +1,45 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_SANITIZER_CMAKE_ 1)
+
+macro(libgav1_configure_sanitizer)
+ if(LIBGAV1_SANITIZE AND NOT MSVC)
+ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ if(LIBGAV1_SANITIZE MATCHES "cfi")
+ list(APPEND LIBGAV1_CXX_FLAGS "-flto" "-fno-sanitize-trap=cfi")
+ list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-flto" "-fno-sanitize-trap=cfi"
+ "-fuse-ld=gold")
+ endif()
+
+ if(${CMAKE_SIZEOF_VOID_P} EQUAL 4
+ AND LIBGAV1_SANITIZE MATCHES "integer|undefined")
+ list(APPEND LIBGAV1_EXE_LINKER_FLAGS "--rtlib=compiler-rt" "-lgcc_s")
+ endif()
+ endif()
+
+ list(APPEND LIBGAV1_CXX_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}")
+ list(APPEND LIBGAV1_EXE_LINKER_FLAGS "-fsanitize=${LIBGAV1_SANITIZE}")
+
+ # Make sanitizer callstacks accurate.
+ list(APPEND LIBGAV1_CXX_FLAGS "-fno-omit-frame-pointer"
+ "-fno-optimize-sibling-calls")
+
+ libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED)
+ libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME LIBGAV1_EXE_LINKER_FLAGS)
+ endif()
+endmacro()
diff --git a/libgav1/cmake/libgav1_targets.cmake b/libgav1/cmake/libgav1_targets.cmake
new file mode 100644
index 0000000..78b4865
--- /dev/null
+++ b/libgav1/cmake/libgav1_targets.cmake
@@ -0,0 +1,347 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_
+set(LIBGAV1_CMAKE_GAV1_TARGETS_CMAKE_ 1)
+
+# Resets list variables used to track libgav1 targets.
+macro(libgav1_reset_target_lists)
+ unset(libgav1_targets)
+ unset(libgav1_exe_targets)
+ unset(libgav1_lib_targets)
+ unset(libgav1_objlib_targets)
+ unset(libgav1_sources)
+ unset(libgav1_test_targets)
+endmacro()
+
+# Creates an executable target. The target name is passed as a parameter to the
+# NAME argument, and the sources passed as a parameter to the SOURCES argument:
+# libgav1_add_test(NAME <name> SOURCES <sources> [optional args])
+#
+# Optional args:
+# cmake-format: off
+# - OUTPUT_NAME: Override output file basename. Target basename defaults to
+# NAME.
+# - TEST: Flag. Presence means treat executable as a test.
+# - DEFINES: List of preprocessor macro definitions.
+# - INCLUDES: list of include directories for the target.
+# - COMPILE_FLAGS: list of compiler flags for the target.
+# - LINK_FLAGS: List of linker flags for the target.
+# - OBJLIB_DEPS: List of CMake object library target dependencies.
+# - LIB_DEPS: List of CMake library dependencies.
+# cmake-format: on
+#
+# Sources passed to this macro are added to $libgav1_test_sources when TEST is
+# specified. Otherwise sources are added to $libgav1_sources.
+#
+# Targets passed to this macro are always added $libgav1_targets. When TEST is
+# specified targets are also added to list $libgav1_test_targets. Otherwise
+# targets are added to $libgav1_exe_targets.
+macro(libgav1_add_executable)
+ unset(exe_TEST)
+ unset(exe_TEST_DEFINES_MAIN)
+ unset(exe_NAME)
+ unset(exe_OUTPUT_NAME)
+ unset(exe_SOURCES)
+ unset(exe_DEFINES)
+ unset(exe_INCLUDES)
+ unset(exe_COMPILE_FLAGS)
+ unset(exe_LINK_FLAGS)
+ unset(exe_OBJLIB_DEPS)
+ unset(exe_LIB_DEPS)
+ set(optional_args TEST)
+ set(single_value_args NAME OUTPUT_NAME)
+ set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+ OBJLIB_DEPS LIB_DEPS)
+
+ cmake_parse_arguments(exe "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("--------- libgav1_add_executable ---------\n"
+ "exe_TEST=${exe_TEST}\n"
+ "exe_TEST_DEFINES_MAIN=${exe_TEST_DEFINES_MAIN}\n"
+ "exe_NAME=${exe_NAME}\n"
+ "exe_OUTPUT_NAME=${exe_OUTPUT_NAME}\n"
+ "exe_SOURCES=${exe_SOURCES}\n"
+ "exe_DEFINES=${exe_DEFINES}\n"
+ "exe_INCLUDES=${exe_INCLUDES}\n"
+ "exe_COMPILE_FLAGS=${exe_COMPILE_FLAGS}\n"
+ "exe_LINK_FLAGS=${exe_LINK_FLAGS}\n"
+ "exe_OBJLIB_DEPS=${exe_OBJLIB_DEPS}\n"
+ "exe_LIB_DEPS=${exe_LIB_DEPS}\n"
+ "------------------------------------------\n")
+ endif()
+
+ if(NOT (exe_NAME AND exe_SOURCES))
+ message(FATAL_ERROR "libgav1_add_executable: NAME and SOURCES required.")
+ endif()
+
+ list(APPEND libgav1_targets ${exe_NAME})
+ if(exe_TEST)
+ list(APPEND libgav1_test_targets ${exe_NAME})
+ list(APPEND libgav1_test_sources ${exe_SOURCES})
+ else()
+ list(APPEND libgav1_exe_targets ${exe_NAME})
+ list(APPEND libgav1_sources ${exe_SOURCES})
+ endif()
+
+ add_executable(${exe_NAME} ${exe_SOURCES})
+
+ if(exe_OUTPUT_NAME)
+ set_target_properties(${exe_NAME} PROPERTIES OUTPUT_NAME ${exe_OUTPUT_NAME})
+ endif()
+
+ libgav1_process_intrinsics_sources(TARGET ${exe_NAME} SOURCES ${exe_SOURCES})
+
+ if(exe_DEFINES)
+ target_compile_definitions(${exe_NAME} PRIVATE ${exe_DEFINES})
+ endif()
+
+ if(exe_INCLUDES)
+ target_include_directories(${exe_NAME} PRIVATE ${exe_INCLUDES})
+ endif()
+
+ if(exe_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+ target_compile_options(${exe_NAME}
+ PRIVATE ${exe_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+ endif()
+
+ if(exe_LINK_FLAGS OR LIBGAV1_EXE_LINKER_FLAGS)
+ set_target_properties(${exe_NAME}
+ PROPERTIES LINK_FLAGS ${exe_LINK_FLAGS}
+ ${LIBGAV1_EXE_LINKER_FLAGS})
+ endif()
+
+ if(exe_OBJLIB_DEPS)
+ foreach(objlib_dep ${exe_OBJLIB_DEPS})
+ target_sources(${exe_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+ endforeach()
+ endif()
+
+ if(CMAKE_THREAD_LIBS_INIT)
+ list(APPEND exe_LIB_DEPS ${CMAKE_THREAD_LIBS_INIT})
+ endif()
+
+ if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
+ target_compile_definitions(${exe_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+ endif()
+
+ if(exe_LIB_DEPS)
+ unset(exe_static)
+ if("${CMAKE_EXE_LINKER_FLAGS} ${LIBGAV1_EXE_LINKER_FLAGS}" MATCHES "static")
+ set(exe_static ON)
+ endif()
+
+ if(exe_static AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+ # Third party dependencies can introduce dependencies on system and test
+ # libraries. Since the target created here is an executable, and CMake
+ # does not provide a method of controlling order of link dependencies,
+ # wrap all of the dependencies of this target in start/end group flags to
+ # ensure that dependencies of third party targets can be resolved when
+ # those dependencies happen to be resolved by dependencies of the current
+ # target.
+ list(INSERT exe_LIB_DEPS 0 -Wl,--start-group)
+ list(APPEND exe_LIB_DEPS -Wl,--end-group)
+ endif()
+ target_link_libraries(${exe_NAME} PRIVATE ${exe_LIB_DEPS})
+ endif()
+endmacro()
+
+# Creates a library target of the specified type. The target name is passed as a
+# parameter to the NAME argument, the type as a parameter to the TYPE argument,
+# and the sources passed as a parameter to the SOURCES argument:
+# libgav1_add_library(NAME <name> TYPE <type> SOURCES <sources> [optional args])
+#
+# Optional args:
+# cmake-format: off
+# - OUTPUT_NAME: Override output file basename. Target basename defaults to
+# NAME. OUTPUT_NAME is ignored when BUILD_SHARED_LIBS is enabled and CMake
+# is generating a build for which MSVC or WIN32 are true. This is to avoid
+# output basename collisions with DLL import libraries.
+# - TEST: Flag. Presence means treat library as a test.
+# - DEFINES: List of preprocessor macro definitions.
+# - INCLUDES: list of include directories for the target.
+# - COMPILE_FLAGS: list of compiler flags for the target.
+# - LINK_FLAGS: List of linker flags for the target.
+# - OBJLIB_DEPS: List of CMake object library target dependencies.
+# - LIB_DEPS: List of CMake library dependencies.
+# - PUBLIC_INCLUDES: List of include paths to export to dependents.
+# cmake-format: on
+#
+# Sources passed to the macro are added to the lists tracking libgav1 sources:
+# cmake-format: off
+# - When TEST is specified sources are added to $libgav1_test_sources.
+# - Otherwise sources are added to $libgav1_sources.
+# cmake-format: on
+#
+# Targets passed to this macro are added to the lists tracking libgav1 targets:
+# cmake-format: off
+# - Targets are always added to $libgav1_targets.
+# - When the TEST flag is specified, targets are added to
+# $libgav1_test_targets.
+# - When TEST is not specified:
+# - Libraries of type SHARED are added to $libgav1_dylib_targets.
+# - Libraries of type OBJECT are added to $libgav1_objlib_targets.
+# - Libraries of type STATIC are added to $libgav1_lib_targets.
+# cmake-format: on
+macro(libgav1_add_library)
+ unset(lib_TEST)
+ unset(lib_NAME)
+ unset(lib_OUTPUT_NAME)
+ unset(lib_TYPE)
+ unset(lib_SOURCES)
+ unset(lib_DEFINES)
+ unset(lib_INCLUDES)
+ unset(lib_COMPILE_FLAGS)
+ unset(lib_LINK_FLAGS)
+ unset(lib_OBJLIB_DEPS)
+ unset(lib_LIB_DEPS)
+ unset(lib_PUBLIC_INCLUDES)
+ set(optional_args TEST)
+ set(single_value_args NAME OUTPUT_NAME TYPE)
+ set(multi_value_args SOURCES DEFINES INCLUDES COMPILE_FLAGS LINK_FLAGS
+ OBJLIB_DEPS LIB_DEPS PUBLIC_INCLUDES)
+
+ cmake_parse_arguments(lib "${optional_args}" "${single_value_args}"
+ "${multi_value_args}" ${ARGN})
+
+ if(LIBGAV1_VERBOSE GREATER 1)
+ message("--------- libgav1_add_library ---------\n"
+ "lib_TEST=${lib_TEST}\n"
+ "lib_NAME=${lib_NAME}\n"
+ "lib_OUTPUT_NAME=${lib_OUTPUT_NAME}\n"
+ "lib_TYPE=${lib_TYPE}\n"
+ "lib_SOURCES=${lib_SOURCES}\n"
+ "lib_DEFINES=${lib_DEFINES}\n"
+ "lib_INCLUDES=${lib_INCLUDES}\n"
+ "lib_COMPILE_FLAGS=${lib_COMPILE_FLAGS}\n"
+ "lib_LINK_FLAGS=${lib_LINK_FLAGS}\n"
+ "lib_OBJLIB_DEPS=${lib_OBJLIB_DEPS}\n"
+ "lib_LIB_DEPS=${lib_LIB_DEPS}\n"
+ "lib_PUBLIC_INCLUDES=${lib_PUBLIC_INCLUDES}\n"
+ "---------------------------------------\n")
+ endif()
+
+ if(NOT (lib_NAME AND lib_TYPE AND lib_SOURCES))
+ message(FATAL_ERROR "libgav1_add_library: NAME, TYPE and SOURCES required.")
+ endif()
+
+ list(APPEND libgav1_targets ${lib_NAME})
+ if(lib_TEST)
+ list(APPEND libgav1_test_targets ${lib_NAME})
+ list(APPEND libgav1_test_sources ${lib_SOURCES})
+ else()
+ list(APPEND libgav1_sources ${lib_SOURCES})
+ if(lib_TYPE STREQUAL OBJECT)
+ list(APPEND libgav1_objlib_targets ${lib_NAME})
+ elseif(lib_TYPE STREQUAL SHARED)
+ list(APPEND libgav1_dylib_targets ${lib_NAME})
+ elseif(lib_TYPE STREQUAL STATIC)
+ list(APPEND libgav1_lib_targets ${lib_NAME})
+ else()
+ message(WARNING "libgav1_add_library: Unhandled type: ${lib_TYPE}")
+ endif()
+ endif()
+
+ add_library(${lib_NAME} ${lib_TYPE} ${lib_SOURCES})
+ libgav1_process_intrinsics_sources(TARGET ${lib_NAME} SOURCES ${lib_SOURCES})
+
+ if(lib_OUTPUT_NAME)
+ if(NOT (BUILD_SHARED_LIBS AND (MSVC OR WIN32)))
+ set_target_properties(${lib_NAME}
+ PROPERTIES OUTPUT_NAME ${lib_OUTPUT_NAME})
+ endif()
+ endif()
+
+ if(lib_DEFINES)
+ target_compile_definitions(${lib_NAME} PRIVATE ${lib_DEFINES})
+ endif()
+
+ if(lib_INCLUDES)
+ target_include_directories(${lib_NAME} PRIVATE ${lib_INCLUDES})
+ endif()
+
+ if(lib_PUBLIC_INCLUDES)
+ target_include_directories(${lib_NAME} PUBLIC ${lib_PUBLIC_INCLUDES})
+ endif()
+
+ if(lib_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+ target_compile_options(${lib_NAME}
+ PRIVATE ${lib_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+ endif()
+
+ if(lib_LINK_FLAGS)
+ set_target_properties(${lib_NAME} PROPERTIES LINK_FLAGS ${lib_LINK_FLAGS})
+ endif()
+
+ if(lib_OBJLIB_DEPS)
+ foreach(objlib_dep ${lib_OBJLIB_DEPS})
+ target_sources(${lib_NAME} PRIVATE $<TARGET_OBJECTS:${objlib_dep}>)
+ endforeach()
+ endif()
+
+ if(lib_LIB_DEPS)
+ if(lib_TYPE STREQUAL STATIC)
+ set(link_type PUBLIC)
+ else()
+ set(link_type PRIVATE)
+ if(lib_TYPE STREQUAL SHARED AND CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
+ # The libgav1 shared object uses the static libgav1 as input to turn it
+ # into a shared object. Include everything from the static library in
+ # the shared object.
+ if(APPLE)
+ list(INSERT lib_LIB_DEPS 0 -Wl,-force_load)
+ else()
+ list(INSERT lib_LIB_DEPS 0 -Wl,--whole-archive)
+ list(APPEND lib_LIB_DEPS -Wl,--no-whole-archive)
+ endif()
+ endif()
+ endif()
+ target_link_libraries(${lib_NAME} ${link_type} ${lib_LIB_DEPS})
+ endif()
+
+ if(NOT MSVC AND lib_NAME MATCHES "^lib")
+ # Non-MSVC generators prepend lib to static lib target file names. Libgav1
+ # already includes lib in its name. Avoid naming output files liblib*.
+ set_target_properties(${lib_NAME} PROPERTIES PREFIX "")
+ endif()
+
+ if(lib_TYPE STREQUAL SHARED AND NOT MSVC)
+ set_target_properties(${lib_NAME} PROPERTIES SOVERSION ${LIBGAV1_SOVERSION})
+ endif()
+
+ if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
+ if(lib_TYPE STREQUAL SHARED)
+ target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=1")
+ else()
+ target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+ endif()
+ endif()
+
+ # Determine if $lib_NAME is a header only target.
+ set(sources_list ${lib_SOURCES})
+ list(FILTER sources_list INCLUDE REGEX cc$)
+ if(NOT sources_list)
+ if(NOT XCODE)
+ # This is a header only target. Tell CMake the link language.
+ set_target_properties(${lib_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+ else()
+ # The Xcode generator ignores LINKER_LANGUAGE. Add a dummy cc file.
+ libgav1_create_dummy_source_file(TARGET ${lib_NAME} BASENAME ${lib_NAME})
+ endif()
+ endif()
+endmacro()
diff --git a/libgav1/cmake/libgav1_variables.cmake b/libgav1/cmake/libgav1_variables.cmake
new file mode 100644
index 0000000..0dd0f37
--- /dev/null
+++ b/libgav1/cmake/libgav1_variables.cmake
@@ -0,0 +1,78 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_
+set(LIBGAV1_CMAKE_LIBGAV1_VARIABLES_CMAKE_ 1)
+
+# Halts generation when $variable_name does not refer to a directory that
+# exists.
+macro(libgav1_variable_must_be_directory variable_name)
+ if("${variable_name}" STREQUAL "")
+ message(
+ FATAL_ERROR
+ "Empty variable_name passed to libgav1_variable_must_be_directory.")
+ endif()
+
+ if("${${variable_name}}" STREQUAL "")
+ message(
+ FATAL_ERROR
+ "Empty variable ${variable_name} is required to build libgav1.")
+ endif()
+
+ if(NOT IS_DIRECTORY "${${variable_name}}")
+ message(
+ FATAL_ERROR
+ "${variable_name}, which is ${${variable_name}}, does not refer to a\n"
+ "directory.")
+ endif()
+endmacro()
+
+# Adds $var_name to the tracked variables list.
+macro(libgav1_track_configuration_variable var_name)
+ if(LIBGAV1_VERBOSE GREATER 2)
+ message("---- libgav1_track_configuration_variable ----\n"
+ "var_name=${var_name}\n"
+ "----------------------------------------------\n")
+ endif()
+
+ list(APPEND libgav1_configuration_variables ${var_name})
+ list(REMOVE_DUPLICATES libgav1_configuration_variables)
+endmacro()
+
+# Logs current C++ and executable linker flags via CMake's message command.
+macro(libgav1_dump_cmake_flag_variables)
+ unset(flag_variables)
+ list(APPEND flag_variables "CMAKE_CXX_FLAGS_INIT" "CMAKE_CXX_FLAGS"
+ "CMAKE_EXE_LINKER_FLAGS_INIT" "CMAKE_EXE_LINKER_FLAGS")
+ if(CMAKE_BUILD_TYPE)
+ list(APPEND flag_variables "CMAKE_BUILD_TYPE"
+ "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}_INIT"
+ "CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE}"
+ "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}_INIT"
+ "CMAKE_EXE_LINKER_FLAGS_${CMAKE_BUILD_TYPE}")
+ endif()
+ foreach(flag_variable ${flag_variables})
+ message("${flag_variable}:${${flag_variable}}")
+ endforeach()
+endmacro()
+
+# Dumps the variables tracked in $libgav1_configuration_variables via CMake's
+# message command.
+macro(libgav1_dump_tracked_configuration_variables)
+ foreach(config_variable ${libgav1_configuration_variables})
+ message("${config_variable}:${${config_variable}}")
+ endforeach()
+endmacro()
diff --git a/libgav1/cmake/toolchains/aarch64-linux-gnu.cmake b/libgav1/cmake/toolchains/aarch64-linux-gnu.cmake
new file mode 100644
index 0000000..7ffe397
--- /dev/null
+++ b/libgav1/cmake/toolchains/aarch64-linux-gnu.cmake
@@ -0,0 +1,28 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_
+set(LIBGAV1_CMAKE_TOOLCHAINS_AARCH64_LINUX_GNU_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+ set(CROSS aarch64-linux-gnu-)
+endif()
+
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_CXX_FLAGS_INIT "-march=armv8-a")
+set(CMAKE_SYSTEM_PROCESSOR "aarch64")
diff --git a/libgav1/cmake/toolchains/android.cmake b/libgav1/cmake/toolchains/android.cmake
new file mode 100644
index 0000000..492957b
--- /dev/null
+++ b/libgav1/cmake/toolchains/android.cmake
@@ -0,0 +1,53 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_ANDROID_CMAKE_
+
+# Additional ANDROID_* settings are available, see:
+# https://developer.android.com/ndk/guides/cmake#variables
+
+if(NOT ANDROID_PLATFORM)
+ set(ANDROID_PLATFORM android-21)
+endif()
+
+# Choose target architecture with:
+#
+# -DANDROID_ABI={armeabi-v7a,armeabi-v7a with NEON,arm64-v8a,x86,x86_64}
+if(NOT ANDROID_ABI)
+ set(ANDROID_ABI arm64-v8a)
+endif()
+
+# Force arm mode for 32-bit targets (instead of the default thumb) to improve
+# performance.
+if(NOT ANDROID_ARM_MODE)
+ set(ANDROID_ARM_MODE arm)
+endif()
+
+# Toolchain files don't have access to cached variables:
+# https://gitlab.kitware.com/cmake/cmake/issues/16170. Set an intermediate
+# environment variable when loaded the first time.
+if(LIBGAV1_ANDROID_NDK_PATH)
+ set(ENV{LIBGAV1_ANDROID_NDK_PATH} "${LIBGAV1_ANDROID_NDK_PATH}")
+else()
+ set(LIBGAV1_ANDROID_NDK_PATH "$ENV{LIBGAV1_ANDROID_NDK_PATH}")
+endif()
+
+if(NOT LIBGAV1_ANDROID_NDK_PATH)
+ message(FATAL_ERROR "LIBGAV1_ANDROID_NDK_PATH not set.")
+ return()
+endif()
+
+include("${LIBGAV1_ANDROID_NDK_PATH}/build/cmake/android.toolchain.cmake")
diff --git a/libgav1/cmake/toolchains/arm-linux-gnueabihf.cmake b/libgav1/cmake/toolchains/arm-linux-gnueabihf.cmake
new file mode 100644
index 0000000..8051f0d
--- /dev/null
+++ b/libgav1/cmake/toolchains/arm-linux-gnueabihf.cmake
@@ -0,0 +1,29 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_)
+ return()
+endif() # LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_
+set(LIBGAV1_CMAKE_TOOLCHAINS_ARM_LINUX_GNUEABIHF_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+ set(CROSS arm-linux-gnueabihf-)
+endif()
+
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_CXX_FLAGS_INIT "-march=armv7-a -marm")
+set(CMAKE_SYSTEM_PROCESSOR "armv7")
+set(LIBGAV1_NEON_INTRINSICS_FLAG "-mfpu=neon")
diff --git a/libgav1/codereview.settings b/libgav1/codereview.settings
new file mode 100644
index 0000000..ccba2ee
--- /dev/null
+++ b/libgav1/codereview.settings
@@ -0,0 +1,4 @@
+# This file is used by git cl to get repository specific information.
+GERRIT_HOST: True
+CODE_REVIEW_SERVER: chromium-review.googlesource.com
+GERRIT_SQUASH_UPLOADS: False
diff --git a/libgav1/examples/file_reader.cc b/libgav1/examples/file_reader.cc
new file mode 100644
index 0000000..b096722
--- /dev/null
+++ b/libgav1/examples/file_reader.cc
@@ -0,0 +1,186 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <new>
+#include <string>
+#include <vector>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/file_reader_constants.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/ivf_parser.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+ _setmode(_fileno(stream), _O_BINARY);
+#endif
+ return stream;
+}
+
+} // namespace
+
+bool FileReader::registered_in_factory_ =
+ FileReaderFactory::RegisterReader(FileReader::Open);
+
+FileReader::~FileReader() {
+ if (owns_file_) fclose(file_);
+}
+
+std::unique_ptr<FileReaderInterface> FileReader::Open(
+ const std::string& file_name, const bool error_tolerant) {
+ if (file_name.empty()) return nullptr;
+
+ FILE* raw_file_ptr;
+
+ bool owns_file = true;
+ if (file_name == "-") {
+ raw_file_ptr = SetBinaryMode(stdin);
+ owns_file = false; // stdin is owned by the Standard C Library.
+ } else {
+ raw_file_ptr = fopen(file_name.c_str(), "rb");
+ }
+
+ if (raw_file_ptr == nullptr) {
+ return nullptr;
+ }
+
+ std::unique_ptr<FileReader> file(
+ new (std::nothrow) FileReader(raw_file_ptr, owns_file, error_tolerant));
+ if (file == nullptr) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+ if (owns_file) fclose(raw_file_ptr);
+ return nullptr;
+ }
+
+ if (!file->ReadIvfFileHeader()) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported file type");
+ return nullptr;
+ }
+
+ return file;
+}
+
+// IVF Frame Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3 size of frame in bytes (not including the 12-byte header)
+// bytes 4-11 64-bit presentation timestamp
+// bytes 12.. frame data
+bool FileReader::ReadTemporalUnit(std::vector<uint8_t>* const tu_data,
+ int64_t* const timestamp) {
+ if (tu_data == nullptr) return false;
+ tu_data->clear();
+
+ uint8_t header_buffer[kIvfFrameHeaderSize];
+ const size_t num_read = fread(header_buffer, 1, kIvfFrameHeaderSize, file_);
+
+ if (IsEndOfFile()) {
+ if (num_read != 0) {
+ LIBGAV1_EXAMPLES_LOG_ERROR(
+ "Cannot read IVF frame header: Not enough data available");
+ return false;
+ }
+
+ return true;
+ }
+
+ IvfFrameHeader ivf_frame_header;
+ if (!ParseIvfFrameHeader(header_buffer, &ivf_frame_header)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF frame header");
+ if (error_tolerant_) {
+ ivf_frame_header.frame_size =
+ std::min(ivf_frame_header.frame_size, size_t{kMaxTemporalUnitSize});
+ } else {
+ return false;
+ }
+ }
+
+ if (timestamp != nullptr) *timestamp = ivf_frame_header.timestamp;
+
+ tu_data->resize(ivf_frame_header.frame_size);
+ const size_t size_read =
+ fread(tu_data->data(), 1, ivf_frame_header.frame_size, file_);
+ if (size_read != ivf_frame_header.frame_size) {
+ LIBGAV1_EXAMPLES_LOG_ERROR(
+ "Unexpected EOF or I/O error reading frame data");
+ if (error_tolerant_) {
+ tu_data->resize(size_read);
+ } else {
+ return false;
+ }
+ }
+ return true;
+}
+
+// Attempt to read an IVF file header. Returns true for success, and false for
+// failure.
+//
+// IVF File Header format, from https://wiki.multimedia.cx/index.php/IVF
+// bytes 0-3 signature: 'DKIF'
+// bytes 4-5 version (should be 0)
+// bytes 6-7 length of header in bytes
+// bytes 8-11 codec FourCC (e.g., 'VP80')
+// bytes 12-13 width in pixels
+// bytes 14-15 height in pixels
+// bytes 16-19 frame rate
+// bytes 20-23 time scale
+// bytes 24-27 number of frames in file
+// bytes 28-31 unused
+//
+// Note: The rate and scale fields correspond to the numerator and denominator
+// of frame rate (fps) or time base (the reciprocal of frame rate) as follows:
+//
+// bytes 16-19 frame rate timebase.den framerate.numerator
+// bytes 20-23 time scale timebase.num framerate.denominator
+bool FileReader::ReadIvfFileHeader() {
+ uint8_t header_buffer[kIvfFileHeaderSize];
+ const size_t num_read = fread(header_buffer, 1, kIvfFileHeaderSize, file_);
+ if (num_read != kIvfFileHeaderSize) {
+ LIBGAV1_EXAMPLES_LOG_ERROR(
+ "Cannot read IVF header: Not enough data available");
+ return false;
+ }
+
+ IvfFileHeader ivf_file_header;
+ if (!ParseIvfFileHeader(header_buffer, &ivf_file_header)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Could not parse IVF file header");
+ if (error_tolerant_) {
+ ivf_file_header = {};
+ } else {
+ return false;
+ }
+ }
+
+ width_ = ivf_file_header.width;
+ height_ = ivf_file_header.height;
+ frame_rate_ = ivf_file_header.frame_rate_numerator;
+ time_scale_ = ivf_file_header.frame_rate_denominator;
+ type_ = kFileTypeIvf;
+
+ return true;
+}
+
+} // namespace libgav1
diff --git a/libgav1/examples/file_reader.h b/libgav1/examples/file_reader.h
new file mode 100644
index 0000000..c342a20
--- /dev/null
+++ b/libgav1/examples/file_reader.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+// Temporal Unit based file reader class. Currently supports only IVF files.
+class FileReader : public FileReaderInterface {
+ public:
+ enum FileType {
+ kFileTypeUnknown,
+ kFileTypeIvf,
+ };
+
+ // Creates and returns a FileReader that reads from |file_name|.
+  // If |error_tolerant| is true, format and read errors are ignored,
+ // ReadTemporalUnit() may return truncated data.
+ // Returns nullptr when the file does not exist, cannot be read, or is not an
+ // IVF file.
+ static std::unique_ptr<FileReaderInterface> Open(const std::string& file_name,
+ bool error_tolerant = false);
+
+ FileReader() = delete;
+ FileReader(const FileReader&) = delete;
+ FileReader& operator=(const FileReader&) = delete;
+
+ // Closes |file_|.
+ ~FileReader() override;
+
+ // Reads a temporal unit from |file_| and writes the data to |tu_data|.
+ // Returns true when:
+ // - A temporal unit is read successfully, or
+ // - At end of file.
+ // When ReadTemporalUnit() is called at the end of the file, it will return
+ // true without writing any data to |tu_data|.
+ //
+ // The |timestamp| pointer is optional: callers not interested in timestamps
+ // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+ // the presentation timestamp from the IVF frame header.
+ /*LIBGAV1_MUST_USE_RESULT*/ bool ReadTemporalUnit(
+ std::vector<uint8_t>* tu_data, int64_t* timestamp) override;
+
+ /*LIBGAV1_MUST_USE_RESULT*/ bool IsEndOfFile() const override {
+ return feof(file_) != 0;
+ }
+
+ // The values returned by these accessors are strictly informative. No
+ // validation is performed when they are read from the IVF file header.
+ size_t width() const override { return width_; }
+ size_t height() const override { return height_; }
+ size_t frame_rate() const override { return frame_rate_; }
+ size_t time_scale() const override { return time_scale_; }
+
+ private:
+ FileReader(FILE* file, bool owns_file, bool error_tolerant)
+ : file_(file), owns_file_(owns_file), error_tolerant_(error_tolerant) {}
+
+ bool ReadIvfFileHeader();
+
+ FILE* file_ = nullptr;
+ size_t width_ = 0;
+ size_t height_ = 0;
+ size_t frame_rate_ = 0;
+ size_t time_scale_ = 0;
+ FileType type_ = kFileTypeUnknown;
+ // True if this object owns file_ and is responsible for closing it when
+ // done.
+ const bool owns_file_;
+ const bool error_tolerant_;
+
+ static bool registered_in_factory_;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_H_
diff --git a/libgav1/src/decoder_scratch_buffer.cc b/libgav1/examples/file_reader_constants.cc
similarity index 75%
copy from libgav1/src/decoder_scratch_buffer.cc
copy to libgav1/examples/file_reader_constants.cc
index bb9b5f2..8439071 100644
--- a/libgav1/src/decoder_scratch_buffer.cc
+++ b/libgav1/examples/file_reader_constants.cc
@@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/decoder_scratch_buffer.h"
+#include "examples/file_reader_constants.h"
namespace libgav1 {
-// static
-constexpr int DecoderScratchBuffer::kBlockDecodedStride;
-constexpr int DecoderScratchBuffer::kPixelSize;
+const char kIvfSignature[4] = {'D', 'K', 'I', 'F'};
+const char kAv1FourCcUpper[4] = {'A', 'V', '0', '1'};
+const char kAv1FourCcLower[4] = {'a', 'v', '0', '1'};
} // namespace libgav1
diff --git a/libgav1/examples/file_reader_constants.h b/libgav1/examples/file_reader_constants.h
new file mode 100644
index 0000000..00922b4
--- /dev/null
+++ b/libgav1/examples/file_reader_constants.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
+
+namespace libgav1 {
+
+enum {
+ kIvfHeaderVersion = 0,
+ kIvfFrameHeaderSize = 12,
+ kIvfFileHeaderSize = 32,
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ kMaxTemporalUnitSize = 512 * 1024,
+#else
+ kMaxTemporalUnitSize = 256 * 1024 * 1024,
+#endif
+};
+
+extern const char kIvfSignature[4];
+extern const char kAv1FourCcUpper[4];
+extern const char kAv1FourCcLower[4];
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_CONSTANTS_H_
diff --git a/libgav1/examples/file_reader_factory.cc b/libgav1/examples/file_reader_factory.cc
new file mode 100644
index 0000000..d5260eb
--- /dev/null
+++ b/libgav1/examples/file_reader_factory.cc
@@ -0,0 +1,51 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_reader_factory.h"
+
+#include <new>
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+std::vector<FileReaderFactory::OpenFunction>* GetFileReaderOpenFunctions() {
+ static auto* open_functions =
+ new (std::nothrow) std::vector<FileReaderFactory::OpenFunction>();
+ return open_functions;
+}
+
+} // namespace
+
+bool FileReaderFactory::RegisterReader(OpenFunction open_function) {
+ if (open_function == nullptr) return false;
+ auto* open_functions = GetFileReaderOpenFunctions();
+ const size_t num_readers = open_functions->size();
+ open_functions->push_back(open_function);
+ return open_functions->size() == num_readers + 1;
+}
+
+std::unique_ptr<FileReaderInterface> FileReaderFactory::OpenReader(
+ const std::string& file_name, const bool error_tolerant /*= false*/) {
+ for (auto* open_function : *GetFileReaderOpenFunctions()) {
+ auto reader = open_function(file_name, error_tolerant);
+ if (reader == nullptr) continue;
+ return reader;
+ }
+ LIBGAV1_EXAMPLES_LOG_ERROR("No file reader able to open input");
+ return nullptr;
+}
+
+} // namespace libgav1
diff --git a/libgav1/examples/file_reader_factory.h b/libgav1/examples/file_reader_factory.h
new file mode 100644
index 0000000..0f53484
--- /dev/null
+++ b/libgav1/examples/file_reader_factory.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
+
+#include <memory>
+#include <string>
+
+#include "examples/file_reader_interface.h"
+
+namespace libgav1 {
+
+class FileReaderFactory {
+ public:
+ using OpenFunction = std::unique_ptr<FileReaderInterface> (*)(
+ const std::string& file_name, bool error_tolerant);
+
+ FileReaderFactory() = delete;
+ FileReaderFactory(const FileReaderFactory&) = delete;
+ FileReaderFactory& operator=(const FileReaderFactory&) = delete;
+ ~FileReaderFactory() = default;
+
+ // Registers the OpenFunction for a FileReaderInterface and returns true when
+ // registration succeeds.
+ static bool RegisterReader(OpenFunction open_function);
+
+ // Passes |file_name| to each OpenFunction until one succeeds. Returns nullptr
+ // when no reader is found for |file_name|. Otherwise a FileReaderInterface is
+ // returned. If |error_tolerant| is true and the reader supports it, some
+ // format and read errors may be ignored and partial data returned.
+ static std::unique_ptr<FileReaderInterface> OpenReader(
+ const std::string& file_name, bool error_tolerant = false);
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_FACTORY_H_
diff --git a/libgav1/examples/file_reader_interface.h b/libgav1/examples/file_reader_interface.h
new file mode 100644
index 0000000..d8f7030
--- /dev/null
+++ b/libgav1/examples/file_reader_interface.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+#define LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace libgav1 {
+
+class FileReaderInterface {
+ public:
+ FileReaderInterface() = default;
+ FileReaderInterface(const FileReaderInterface&) = delete;
+ FileReaderInterface& operator=(const FileReaderInterface&) = delete;
+
+ FileReaderInterface(FileReaderInterface&&) = default;
+ FileReaderInterface& operator=(FileReaderInterface&&) = default;
+
+ // Closes the file.
+ virtual ~FileReaderInterface() = default;
+
+ // Reads a temporal unit from the file and writes the data to |tu_data|.
+ // Returns true when:
+ // - A temporal unit is read successfully, or
+ // - At end of file.
+ // When ReadTemporalUnit() is called at the end of the file, it will return
+ // true without writing any data to |tu_data|.
+ //
+ // The |timestamp| pointer is optional: callers not interested in timestamps
+ // can pass nullptr. When |timestamp| is not a nullptr, this function returns
+ // the presentation timestamp of the temporal unit.
+ /*LIBGAV1_MUST_USE_RESULT*/ virtual bool ReadTemporalUnit(
+ std::vector<uint8_t>* tu_data, int64_t* timestamp) = 0;
+
+ /*LIBGAV1_MUST_USE_RESULT*/ virtual bool IsEndOfFile() const = 0;
+
+ // The values returned by these accessors are strictly informative. No
+ // validation is performed when they are read from file.
+ virtual size_t width() const = 0;
+ virtual size_t height() const = 0;
+ virtual size_t frame_rate() const = 0;
+ virtual size_t time_scale() const = 0;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_READER_INTERFACE_H_
diff --git a/libgav1/examples/file_writer.cc b/libgav1/examples/file_writer.cc
new file mode 100644
index 0000000..54afe14
--- /dev/null
+++ b/libgav1/examples/file_writer.cc
@@ -0,0 +1,183 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/file_writer.h"
+
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <new>
+#include <string>
+
+#if defined(_WIN32)
+#include <fcntl.h>
+#include <io.h>
+#endif
+
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+FILE* SetBinaryMode(FILE* stream) {
+#if defined(_WIN32)
+ _setmode(_fileno(stream), _O_BINARY);
+#endif
+ return stream;
+}
+
+std::string GetY4mColorSpaceString(
+ const FileWriter::Y4mParameters& y4m_parameters) {
+ std::string color_space_string;
+ switch (y4m_parameters.image_format) {
+ case kImageFormatMonochrome400:
+ color_space_string = "mono";
+ break;
+ case kImageFormatYuv420:
+ if (y4m_parameters.bitdepth == 8) {
+ if (y4m_parameters.chroma_sample_position ==
+ kChromaSamplePositionVertical) {
+ color_space_string = "420mpeg2";
+ } else if (y4m_parameters.chroma_sample_position ==
+ kChromaSamplePositionColocated) {
+ color_space_string = "420";
+ } else {
+ color_space_string = "420jpeg";
+ }
+ } else {
+ color_space_string = "420";
+ }
+ break;
+ case kImageFormatYuv422:
+ color_space_string = "422";
+ break;
+ case kImageFormatYuv444:
+ color_space_string = "444";
+ break;
+ }
+
+ if (y4m_parameters.bitdepth > 8) {
+ const bool monochrome =
+ y4m_parameters.image_format == kImageFormatMonochrome400;
+ if (!monochrome) color_space_string += "p";
+ color_space_string += std::to_string(y4m_parameters.bitdepth);
+ }
+
+ return color_space_string;
+}
+
+} // namespace
+
+FileWriter::~FileWriter() { fclose(file_); }
+
+std::unique_ptr<FileWriter> FileWriter::Open(
+ const std::string& file_name, FileType file_type,
+ const Y4mParameters* const y4m_parameters) {
+ if (file_name.empty() ||
+ (file_type == kFileTypeY4m && y4m_parameters == nullptr) ||
+ (file_type != kFileTypeRaw && file_type != kFileTypeY4m)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Invalid parameters");
+ return nullptr;
+ }
+
+ FILE* raw_file_ptr;
+
+ if (file_name == "-") {
+ raw_file_ptr = SetBinaryMode(stdout);
+ } else {
+ raw_file_ptr = fopen(file_name.c_str(), "wb");
+ }
+
+ if (raw_file_ptr == nullptr) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unable to open output file");
+ return nullptr;
+ }
+
+ std::unique_ptr<FileWriter> file(new (std::nothrow) FileWriter(raw_file_ptr));
+ if (file == nullptr) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Out of memory");
+ fclose(raw_file_ptr);
+ return nullptr;
+ }
+
+ if (file_type == kFileTypeY4m && !file->WriteY4mFileHeader(*y4m_parameters)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M file header");
+ return nullptr;
+ }
+
+ file->file_type_ = file_type;
+ return file;
+}
+
+bool FileWriter::WriteFrame(const DecoderBuffer& frame_buffer) {
+ if (file_type_ == kFileTypeY4m) {
+ const char kY4mFrameHeader[] = "FRAME\n";
+ if (fwrite(kY4mFrameHeader, 1, strlen(kY4mFrameHeader), file_) !=
+ strlen(kY4mFrameHeader)) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Error writing Y4M frame header");
+ return false;
+ }
+ }
+
+ const size_t pixel_size =
+ (frame_buffer.bitdepth == 8) ? sizeof(uint8_t) : sizeof(uint16_t);
+ for (int plane_index = 0; plane_index < frame_buffer.NumPlanes();
+ ++plane_index) {
+ const int height = frame_buffer.displayed_height[plane_index];
+ const int width = frame_buffer.displayed_width[plane_index];
+ const int stride = frame_buffer.stride[plane_index];
+ const uint8_t* const plane_pointer = frame_buffer.plane[plane_index];
+ for (int row = 0; row < height; ++row) {
+ const uint8_t* const row_pointer = &plane_pointer[row * stride];
+ if (fwrite(row_pointer, pixel_size, width, file_) !=
+ static_cast<size_t>(width)) {
+ char error_string[256];
+ snprintf(error_string, sizeof(error_string),
+ "File write failed: %s (errno=%d)", strerror(errno), errno);
+ LIBGAV1_EXAMPLES_LOG_ERROR(error_string);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+// Writes Y4M file header to |file_| and returns true when successful.
+//
+// A Y4M file begins with a plaintext file signature of 'YUV4MPEG2 '.
+//
+// Following the signature is any number of optional parameters preceded by a
+// space. We always write:
+//
+// Width: 'W' followed by image width in pixels.
+// Height: 'H' followed by image height in pixels.
+// Frame Rate: 'F' followed by frames/second in the form numerator:denominator.
+// Interlacing: 'I' followed by 'p' for progressive.
+// Color space: 'C' followed by a string representation of the color space.
+//
+// More info here: https://wiki.multimedia.cx/index.php/YUV4MPEG2
+bool FileWriter::WriteY4mFileHeader(const Y4mParameters& y4m_parameters) {
+ std::string y4m_header = "YUV4MPEG2";
+ y4m_header += " W" + std::to_string(y4m_parameters.width);
+ y4m_header += " H" + std::to_string(y4m_parameters.height);
+ y4m_header += " F" + std::to_string(y4m_parameters.frame_rate_numerator) +
+ ":" + std::to_string(y4m_parameters.frame_rate_denominator);
+ y4m_header += " Ip C" + GetY4mColorSpaceString(y4m_parameters);
+ y4m_header += "\n";
+ return fwrite(y4m_header.c_str(), 1, y4m_header.length(), file_) ==
+ y4m_header.length();
+}
+
+} // namespace libgav1
diff --git a/libgav1/examples/file_writer.h b/libgav1/examples/file_writer.h
new file mode 100644
index 0000000..00f6cc3
--- /dev/null
+++ b/libgav1/examples/file_writer.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_FILE_WRITER_H_
+#define LIBGAV1_EXAMPLES_FILE_WRITER_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "gav1/decoder_buffer.h"
+
+namespace libgav1 {
+
+// Frame based file writer class. Supports only Y4M (YUV4MPEG2) and RAW output.
+class FileWriter {
+ public:
+ enum FileType : uint8_t {
+ kFileTypeRaw,
+ kFileTypeY4m,
+ };
+
+ struct Y4mParameters {
+ Y4mParameters() = default;
+ Y4mParameters(size_t width, size_t height, size_t frame_rate_numerator,
+ size_t frame_rate_denominator,
+ ChromaSamplePosition chroma_sample_position,
+ ImageFormat image_format, size_t bitdepth)
+ : width(width),
+ height(height),
+ frame_rate_numerator(frame_rate_numerator),
+ frame_rate_denominator(frame_rate_denominator),
+ chroma_sample_position(chroma_sample_position),
+ image_format(image_format),
+ bitdepth(bitdepth) {}
+
+ Y4mParameters(const Y4mParameters& rhs) = default;
+ Y4mParameters& operator=(const Y4mParameters& rhs) = default;
+ Y4mParameters(Y4mParameters&& rhs) = default;
+ Y4mParameters& operator=(Y4mParameters&& rhs) = default;
+
+ size_t width = 0;
+ size_t height = 0;
+ size_t frame_rate_numerator = 30;
+ size_t frame_rate_denominator = 1;
+ ChromaSamplePosition chroma_sample_position = kChromaSamplePositionUnknown;
+ ImageFormat image_format = kImageFormatYuv420;
+ size_t bitdepth = 8;
+ };
+
+ // Opens |file_name|. When |file_type| is kFileTypeY4m the Y4M file header is
+ // written out to |file_| before this method returns.
+ //
+ // Returns a FileWriter instance after the file is opened successfully for
+ // kFileTypeRaw files, and after the Y4M file header bytes are written for
+ // kFileTypeY4m files. Returns nullptr upon failure.
+ static std::unique_ptr<FileWriter> Open(const std::string& file_name,
+ FileType type,
+ const Y4mParameters* y4m_parameters);
+
+ FileWriter() = delete;
+ FileWriter(const FileWriter&) = delete;
+ FileWriter& operator=(const FileWriter&) = delete;
+
+ FileWriter(FileWriter&&) = default;
+ FileWriter& operator=(FileWriter&&) = default;
+
+ // Closes |file_|.
+ ~FileWriter();
+
+ // Writes the frame data in |frame_buffer| to |file_|. Returns true after
+ // successful write of |frame_buffer| data.
+ /*LIBGAV1_MUST_USE_RESULT*/ bool WriteFrame(
+ const DecoderBuffer& frame_buffer);
+
+ private:
+ explicit FileWriter(FILE* file) : file_(file) {}
+
+ bool WriteY4mFileHeader(const Y4mParameters& y4m_parameters);
+
+ FILE* file_ = nullptr;
+ FileType file_type_ = kFileTypeRaw;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_FILE_WRITER_H_
diff --git a/libgav1/examples/gav1_decode.cc b/libgav1/examples/gav1_decode.cc
new file mode 100644
index 0000000..e7d3246
--- /dev/null
+++ b/libgav1/examples/gav1_decode.cc
@@ -0,0 +1,453 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cerrno>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <memory>
+#include <new>
+#include <vector>
+
+#include "absl/strings/numbers.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "examples/file_reader_factory.h"
+#include "examples/file_reader_interface.h"
+#include "examples/file_writer.h"
+#include "gav1/decoder.h"
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+#endif
+
+namespace {
+
+struct Options {
+ const char* input_file_name = nullptr;
+ const char* output_file_name = nullptr;
+ const char* frame_timing_file_name = nullptr;
+ libgav1::FileWriter::FileType output_file_type =
+ libgav1::FileWriter::kFileTypeRaw;
+ uint8_t post_filter_mask = 0x1f;
+ int threads = 1;
+ bool frame_parallel = false;
+ bool output_all_layers = false;
+ int operating_point = 0;
+ int limit = 0;
+ int skip = 0;
+ int verbose = 0;
+};
+
+struct Timing {
+ absl::Duration input;
+ absl::Duration dequeue;
+};
+
+struct FrameTiming {
+ absl::Time enqueue;
+ absl::Time dequeue;
+};
+
+void PrintHelp(FILE* const fout) {
+ fprintf(fout,
+ "Usage: gav1_decode [options] <input file>"
+ " [-o <output file>]\n");
+ fprintf(fout, "\n");
+ fprintf(fout, "Options:\n");
+ fprintf(fout, " -h, --help This help message.\n");
+ fprintf(fout, " --threads <positive integer> (Default 1).\n");
+ fprintf(fout, " --frame_parallel.\n");
+ fprintf(fout,
+ " --limit <integer> Stop decoding after N frames (0 = all).\n");
+ fprintf(fout, " --skip <integer> Skip initial N frames (Default 0).\n");
+ fprintf(fout, " --version.\n");
+ fprintf(fout, " --y4m (Default false).\n");
+ fprintf(fout, " --raw (Default true).\n");
+ fprintf(fout, " -v logging verbosity, can be used multiple times.\n");
+ fprintf(fout, " --all_layers.\n");
+ fprintf(fout,
+ " --operating_point <integer between 0 and 31> (Default 0).\n");
+ fprintf(fout,
+ " --frame_timing <file> Output per-frame timing to <file> in tsv"
+ " format.\n Yields meaningful results only when frame parallel is"
+ " off.\n");
+ fprintf(fout, "\nAdvanced settings:\n");
+ fprintf(fout, " --post_filter_mask <integer> (Default 0x1f).\n");
+ fprintf(fout,
+ " Mask indicating which post filters should be applied to the"
+ " reconstructed\n frame. This may be given as octal, decimal or"
+ " hexadecimal. From LSB:\n");
+ fprintf(fout, " Bit 0: Loop filter (deblocking filter)\n");
+ fprintf(fout, " Bit 1: Cdef\n");
+ fprintf(fout, " Bit 2: SuperRes\n");
+ fprintf(fout, " Bit 3: Loop Restoration\n");
+ fprintf(fout, " Bit 4: Film Grain Synthesis\n");
+}
+
+void ParseOptions(int argc, char* argv[], Options* const options) {
+ for (int i = 1; i < argc; ++i) {
+ int32_t value;
+ if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
+ PrintHelp(stdout);
+ exit(EXIT_SUCCESS);
+ } else if (strcmp(argv[i], "-o") == 0) {
+ if (++i >= argc) {
+ fprintf(stderr, "Missing argument for '-o'\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->output_file_name = argv[i];
+ } else if (strcmp(argv[i], "--frame_timing") == 0) {
+ if (++i >= argc) {
+ fprintf(stderr, "Missing argument for '--frame_timing'\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->frame_timing_file_name = argv[i];
+ } else if (strcmp(argv[i], "--version") == 0) {
+ printf("gav1_decode, a libgav1 based AV1 decoder\n");
+ printf("libgav1 %s\n", libgav1::GetVersionString());
+ printf("max bitdepth: %d\n", libgav1::Decoder::GetMaxBitdepth());
+ printf("build configuration: %s\n", libgav1::GetBuildConfiguration());
+ exit(EXIT_SUCCESS);
+ } else if (strcmp(argv[i], "-v") == 0) {
+ ++options->verbose;
+ } else if (strcmp(argv[i], "--raw") == 0) {
+ options->output_file_type = libgav1::FileWriter::kFileTypeRaw;
+ } else if (strcmp(argv[i], "--y4m") == 0) {
+ options->output_file_type = libgav1::FileWriter::kFileTypeY4m;
+ } else if (strcmp(argv[i], "--threads") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value)) {
+ fprintf(stderr, "Missing/Invalid value for --threads.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->threads = value;
+ } else if (strcmp(argv[i], "--frame_parallel") == 0) {
+ options->frame_parallel = true;
+ } else if (strcmp(argv[i], "--all_layers") == 0) {
+ options->output_all_layers = true;
+ } else if (strcmp(argv[i], "--operating_point") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0 ||
+ value >= 32) {
+ fprintf(stderr, "Missing/Invalid value for --operating_point.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->operating_point = value;
+ } else if (strcmp(argv[i], "--limit") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+ fprintf(stderr, "Missing/Invalid value for --limit.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->limit = value;
+ } else if (strcmp(argv[i], "--skip") == 0) {
+ if (++i >= argc || !absl::SimpleAtoi(argv[i], &value) || value < 0) {
+ fprintf(stderr, "Missing/Invalid value for --skip.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->skip = value;
+ } else if (strcmp(argv[i], "--post_filter_mask") == 0) {
+ errno = 0;
+ char* endptr = nullptr;
+ value = (++i >= argc) ? -1
+ // NOLINTNEXTLINE(runtime/deprecated_fn)
+ : static_cast<int32_t>(strtol(argv[i], &endptr, 0));
+ // Only the last 5 bits of the mask can be set.
+ if ((value & ~31) != 0 || errno != 0 || endptr == argv[i]) {
+ fprintf(stderr, "Invalid value for --post_filter_mask.\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ options->post_filter_mask = value;
+ } else if (strlen(argv[i]) > 1 && argv[i][0] == '-') {
+ fprintf(stderr, "Unknown option '%s'!\n", argv[i]);
+ exit(EXIT_FAILURE);
+ } else {
+ if (options->input_file_name == nullptr) {
+ options->input_file_name = argv[i];
+ } else {
+ fprintf(stderr, "Found invalid parameter: \"%s\".\n", argv[i]);
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+ }
+ }
+
+ if (argc < 2 || options->input_file_name == nullptr) {
+ fprintf(stderr, "Input file is required!\n");
+ PrintHelp(stderr);
+ exit(EXIT_FAILURE);
+ }
+}
+
+using InputBuffer = std::vector<uint8_t>;
+
+class InputBuffers {
+ public:
+ ~InputBuffers() {
+ for (auto buffer : free_buffers_) {
+ delete buffer;
+ }
+ }
+ InputBuffer* GetFreeBuffer() {
+ if (free_buffers_.empty()) {
+ auto* const buffer = new (std::nothrow) InputBuffer();
+ if (buffer == nullptr) {
+ fprintf(stderr, "Failed to create input buffer.\n");
+ return nullptr;
+ }
+ free_buffers_.push_back(buffer);
+ }
+ InputBuffer* const buffer = free_buffers_.front();
+ free_buffers_.pop_front();
+ return buffer;
+ }
+
+ void ReleaseInputBuffer(InputBuffer* buffer) {
+ free_buffers_.push_back(buffer);
+ }
+
+ private:
+ std::deque<InputBuffer*> free_buffers_;
+};
+
+void ReleaseInputBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* const input_buffers = static_cast<InputBuffers*>(callback_private_data);
+ input_buffers->ReleaseInputBuffer(
+ static_cast<InputBuffer*>(buffer_private_data));
+}
+
+int CloseFile(FILE* stream) { return (stream == nullptr) ? 0 : fclose(stream); }
+
+} // namespace
+
+int main(int argc, char* argv[]) {
+ Options options;
+ ParseOptions(argc, argv, &options);
+
+ auto file_reader =
+ libgav1::FileReaderFactory::OpenReader(options.input_file_name);
+ if (file_reader == nullptr) {
+ fprintf(stderr, "Cannot open input file!\n");
+ return EXIT_FAILURE;
+ }
+
+ std::unique_ptr<FILE, decltype(&CloseFile)> frame_timing_file(nullptr,
+ &CloseFile);
+ if (options.frame_timing_file_name != nullptr) {
+ frame_timing_file.reset(fopen(options.frame_timing_file_name, "wb"));
+ if (frame_timing_file == nullptr) {
+ fprintf(stderr, "Cannot open frame timing file '%s'!\n",
+ options.frame_timing_file_name);
+ return EXIT_FAILURE;
+ }
+ }
+
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+ // Reference frames + 1 scratch frame (for either the current frame or the
+ // film grain frame).
+ constexpr int kNumBuffers = 8 + 1;
+ std::unique_ptr<Gav1DecodeCVPixelBufferPool> cv_pixel_buffers =
+ Gav1DecodeCVPixelBufferPool::Create(kNumBuffers);
+ if (cv_pixel_buffers == nullptr) {
+ fprintf(stderr, "Cannot create Gav1DecodeCVPixelBufferPool!\n");
+ return EXIT_FAILURE;
+ }
+#endif
+
+ InputBuffers input_buffers;
+ libgav1::Decoder decoder;
+ libgav1::DecoderSettings settings;
+ settings.post_filter_mask = options.post_filter_mask;
+ settings.threads = options.threads;
+ settings.frame_parallel = options.frame_parallel;
+ settings.output_all_layers = options.output_all_layers;
+ settings.operating_point = options.operating_point;
+ settings.blocking_dequeue = true;
+ settings.callback_private_data = &input_buffers;
+ settings.release_input_buffer = ReleaseInputBuffer;
+#ifdef GAV1_DECODE_USE_CV_PIXEL_BUFFER_POOL
+ settings.on_frame_buffer_size_changed = Gav1DecodeOnCVPixelBufferSizeChanged;
+ settings.get_frame_buffer = Gav1DecodeGetCVPixelBuffer;
+ settings.release_frame_buffer = Gav1DecodeReleaseCVPixelBuffer;
+ settings.callback_private_data = cv_pixel_buffers.get();
+ settings.release_input_buffer = nullptr;
+ // TODO(vigneshv): Support frame parallel mode to be used with
+ // CVPixelBufferPool.
+ settings.frame_parallel = false;
+#endif
+ libgav1::StatusCode status = decoder.Init(&settings);
+ if (status != libgav1::kStatusOk) {
+ fprintf(stderr, "Error initializing decoder: %s\n",
+ libgav1::GetErrorString(status));
+ return EXIT_FAILURE;
+ }
+
+ fprintf(stderr, "decoding '%s'\n", options.input_file_name);
+ if (options.verbose > 0 && options.skip > 0) {
+ fprintf(stderr, "skipping %d frame(s).\n", options.skip);
+ }
+
+ int input_frames = 0;
+ int decoded_frames = 0;
+ Timing timing = {};
+ std::vector<FrameTiming> frame_timing;
+ const bool record_frame_timing = frame_timing_file != nullptr;
+ std::unique_ptr<libgav1::FileWriter> file_writer;
+ InputBuffer* input_buffer = nullptr;
+ bool limit_reached = false;
+ bool dequeue_finished = false;
+ const absl::Time decode_loop_start = absl::Now();
+ do {
+ if (input_buffer == nullptr && !file_reader->IsEndOfFile() &&
+ !limit_reached) {
+ input_buffer = input_buffers.GetFreeBuffer();
+ if (input_buffer == nullptr) return EXIT_FAILURE;
+ const absl::Time read_start = absl::Now();
+ if (!file_reader->ReadTemporalUnit(input_buffer,
+ /*timestamp=*/nullptr)) {
+ fprintf(stderr, "Error reading input file.\n");
+ return EXIT_FAILURE;
+ }
+ timing.input += absl::Now() - read_start;
+ }
+
+ if (++input_frames <= options.skip) {
+ input_buffers.ReleaseInputBuffer(input_buffer);
+ input_buffer = nullptr;
+ continue;
+ }
+
+ if (input_buffer != nullptr) {
+ if (input_buffer->empty()) {
+ input_buffers.ReleaseInputBuffer(input_buffer);
+ input_buffer = nullptr;
+ continue;
+ }
+
+ const absl::Time enqueue_start = absl::Now();
+ status = decoder.EnqueueFrame(input_buffer->data(), input_buffer->size(),
+ static_cast<int64_t>(frame_timing.size()),
+ /*buffer_private_data=*/input_buffer);
+ if (status == libgav1::kStatusOk) {
+ if (options.verbose > 1) {
+ fprintf(stderr, "enqueue frame (length %zu)\n", input_buffer->size());
+ }
+ if (record_frame_timing) {
+ FrameTiming enqueue_time = {enqueue_start, absl::UnixEpoch()};
+ frame_timing.emplace_back(enqueue_time);
+ }
+
+ input_buffer = nullptr;
+ // Continue to enqueue frames until we get a kStatusTryAgain status.
+ continue;
+ }
+ if (status != libgav1::kStatusTryAgain) {
+ fprintf(stderr, "Unable to enqueue frame: %s\n",
+ libgav1::GetErrorString(status));
+ return EXIT_FAILURE;
+ }
+ }
+
+ const libgav1::DecoderBuffer* buffer;
+ status = decoder.DequeueFrame(&buffer);
+ if (status != libgav1::kStatusOk &&
+ status != libgav1::kStatusNothingToDequeue) {
+ fprintf(stderr, "Unable to dequeue frame: %s\n",
+ libgav1::GetErrorString(status));
+ return EXIT_FAILURE;
+ }
+ if (status == libgav1::kStatusNothingToDequeue) {
+ dequeue_finished = true;
+ continue;
+ }
+ dequeue_finished = false;
+ if (buffer == nullptr) continue;
+ ++decoded_frames;
+ if (options.verbose > 1) {
+ fprintf(stderr, "buffer dequeued\n");
+ }
+
+ if (record_frame_timing) {
+ frame_timing[static_cast<int>(buffer->user_private_data)].dequeue =
+ absl::Now();
+ }
+
+ if (options.output_file_name != nullptr && file_writer == nullptr) {
+ libgav1::FileWriter::Y4mParameters y4m_parameters;
+ y4m_parameters.width = buffer->displayed_width[0];
+ y4m_parameters.height = buffer->displayed_height[0];
+ y4m_parameters.frame_rate_numerator = file_reader->frame_rate();
+ y4m_parameters.frame_rate_denominator = file_reader->time_scale();
+ y4m_parameters.chroma_sample_position = buffer->chroma_sample_position;
+ y4m_parameters.image_format = buffer->image_format;
+ y4m_parameters.bitdepth = static_cast<size_t>(buffer->bitdepth);
+ file_writer = libgav1::FileWriter::Open(
+ options.output_file_name, options.output_file_type, &y4m_parameters);
+ if (file_writer == nullptr) {
+ fprintf(stderr, "Cannot open output file!\n");
+ return EXIT_FAILURE;
+ }
+ }
+
+ if (!limit_reached && file_writer != nullptr &&
+ !file_writer->WriteFrame(*buffer)) {
+ fprintf(stderr, "Error writing output file.\n");
+ return EXIT_FAILURE;
+ }
+ if (options.limit > 0 && options.limit == decoded_frames) {
+ limit_reached = true;
+ if (input_buffer != nullptr) {
+ input_buffers.ReleaseInputBuffer(input_buffer);
+ }
+ input_buffer = nullptr;
+ }
+ } while (input_buffer != nullptr ||
+ (!file_reader->IsEndOfFile() && !limit_reached) ||
+ !dequeue_finished);
+ timing.dequeue = absl::Now() - decode_loop_start - timing.input;
+
+ if (record_frame_timing) {
+ // Note timing for frame parallel will be skewed by the time spent queueing
+ // additional frames and in the output queue waiting for previous frames,
+ // the values reported won't be that meaningful.
+ fprintf(frame_timing_file.get(), "frame number\tdecode time us\n");
+ for (size_t i = 0; i < frame_timing.size(); ++i) {
+ const int decode_time_us = static_cast<int>(absl::ToInt64Microseconds(
+ frame_timing[i].dequeue - frame_timing[i].enqueue));
+ fprintf(frame_timing_file.get(), "%zu\t%d\n", i, decode_time_us);
+ }
+ }
+
+ if (options.verbose > 0) {
+ fprintf(stderr, "time to read input: %d us\n",
+ static_cast<int>(absl::ToInt64Microseconds(timing.input)));
+ const int decode_time_us =
+ static_cast<int>(absl::ToInt64Microseconds(timing.dequeue));
+ const double decode_fps =
+ (decode_time_us == 0) ? 0.0 : 1.0e6 * decoded_frames / decode_time_us;
+ fprintf(stderr, "time to decode input: %d us (%d frames, %.2f fps)\n",
+ decode_time_us, decoded_frames, decode_fps);
+ }
+
+ return EXIT_SUCCESS;
+}
diff --git a/libgav1/examples/gav1_decode_cv_pixel_buffer_pool.cc b/libgav1/examples/gav1_decode_cv_pixel_buffer_pool.cc
new file mode 100644
index 0000000..6aa4e61
--- /dev/null
+++ b/libgav1/examples/gav1_decode_cv_pixel_buffer_pool.cc
@@ -0,0 +1,278 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/gav1_decode_cv_pixel_buffer_pool.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <type_traits>
+
+namespace {
+
+struct CFTypeDeleter {
+ void operator()(CFTypeRef cf) const { CFRelease(cf); }
+};
+
+using UniqueCFNumberRef =
+ std::unique_ptr<std::remove_pointer<CFNumberRef>::type, CFTypeDeleter>;
+
+using UniqueCFDictionaryRef =
+ std::unique_ptr<std::remove_pointer<CFDictionaryRef>::type, CFTypeDeleter>;
+
+} // namespace
+
+extern "C" {
+
+libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment) {
+ auto* buffer_pool =
+ static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+ return buffer_pool->OnCVPixelBufferSizeChanged(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment);
+}
+
+libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment,
+ libgav1::FrameBuffer* frame_buffer) {
+ auto* buffer_pool =
+ static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+ return buffer_pool->GetCVPixelBuffer(
+ bitdepth, image_format, width, height, left_border, right_border,
+ top_border, bottom_border, stride_alignment, frame_buffer);
+}
+
+void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+ void* buffer_private_data) {
+ auto* buffer_pool =
+ static_cast<Gav1DecodeCVPixelBufferPool*>(callback_private_data);
+ buffer_pool->ReleaseCVPixelBuffer(buffer_private_data);
+}
+
+} // extern "C"
+
+// static
+std::unique_ptr<Gav1DecodeCVPixelBufferPool>
+Gav1DecodeCVPixelBufferPool::Create(size_t num_buffers) {
+ std::unique_ptr<Gav1DecodeCVPixelBufferPool> buffer_pool(
+ new (std::nothrow) Gav1DecodeCVPixelBufferPool(num_buffers));
+ return buffer_pool;
+}
+
+Gav1DecodeCVPixelBufferPool::Gav1DecodeCVPixelBufferPool(size_t num_buffers)
+ : num_buffers_(static_cast<int>(num_buffers)) {}
+
+Gav1DecodeCVPixelBufferPool::~Gav1DecodeCVPixelBufferPool() {
+ CVPixelBufferPoolRelease(pool_);
+}
+
+libgav1::StatusCode Gav1DecodeCVPixelBufferPool::OnCVPixelBufferSizeChanged(
+ int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment) {
+ if (bitdepth != 8 || (image_format != libgav1::kImageFormatYuv420 &&
+ image_format != libgav1::kImageFormatMonochrome400)) {
+ fprintf(stderr,
+ "Only bitdepth 8, 4:2:0 videos are supported: bitdepth %d, "
+ "image_format: %d.\n",
+ bitdepth, image_format);
+ return libgav1::kStatusUnimplemented;
+ }
+
+ // stride_alignment must be a power of 2.
+ assert((stride_alignment & (stride_alignment - 1)) == 0);
+
+ // The possible keys for CVPixelBufferPool are:
+ // kCVPixelBufferPoolMinimumBufferCountKey
+ // kCVPixelBufferPoolMaximumBufferAgeKey
+ // kCVPixelBufferPoolAllocationThresholdKey
+ const void* pool_keys[] = {kCVPixelBufferPoolMinimumBufferCountKey};
+ const int min_buffer_count = 10;
+ UniqueCFNumberRef cf_min_buffer_count(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &min_buffer_count));
+ if (cf_min_buffer_count == nullptr) {
+ fprintf(stderr, "CFNumberCreate failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+ const void* pool_values[] = {cf_min_buffer_count.get()};
+ UniqueCFDictionaryRef pool_attributes(CFDictionaryCreate(
+ nullptr, pool_keys, pool_values, 1, &kCFTypeDictionaryKeyCallBacks,
+ &kCFTypeDictionaryValueCallBacks));
+ if (pool_attributes == nullptr) {
+ fprintf(stderr, "CFDictionaryCreate failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+
+ // The pixelBufferAttributes argument to CVPixelBufferPoolCreate() cannot be
+ // null and must contain the pixel format, width, and height, otherwise
+ // CVPixelBufferPoolCreate() fails with kCVReturnInvalidPixelBufferAttributes
+ // (-6682).
+
+ // I420: kCVPixelFormatType_420YpCbCr8Planar (video range).
+ const int pixel_format = (image_format == libgav1::kImageFormatYuv420)
+ ? kCVPixelFormatType_420YpCbCr8PlanarFullRange
+ : kCVPixelFormatType_OneComponent8;
+ UniqueCFNumberRef cf_pixel_format(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &pixel_format));
+ UniqueCFNumberRef cf_width(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &width));
+ UniqueCFNumberRef cf_height(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &height));
+ UniqueCFNumberRef cf_left_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &left_border));
+ UniqueCFNumberRef cf_right_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &right_border));
+ UniqueCFNumberRef cf_top_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &top_border));
+ UniqueCFNumberRef cf_bottom_border(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &bottom_border));
+ UniqueCFNumberRef cf_stride_alignment(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &stride_alignment));
+
+ const void* buffer_keys[] = {
+ kCVPixelBufferPixelFormatTypeKey,
+ kCVPixelBufferWidthKey,
+ kCVPixelBufferHeightKey,
+ kCVPixelBufferExtendedPixelsLeftKey,
+ kCVPixelBufferExtendedPixelsRightKey,
+ kCVPixelBufferExtendedPixelsTopKey,
+ kCVPixelBufferExtendedPixelsBottomKey,
+ kCVPixelBufferBytesPerRowAlignmentKey,
+ };
+ const void* buffer_values[] = {
+ cf_pixel_format.get(), cf_width.get(),
+ cf_height.get(), cf_left_border.get(),
+ cf_right_border.get(), cf_top_border.get(),
+ cf_bottom_border.get(), cf_stride_alignment.get(),
+ };
+ UniqueCFDictionaryRef buffer_attributes(CFDictionaryCreate(
+ kCFAllocatorDefault, buffer_keys, buffer_values, 8,
+ &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks));
+ if (buffer_attributes == nullptr) {
+ fprintf(stderr, "CFDictionaryCreate of buffer_attributes failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+ CVPixelBufferPoolRef cv_pool;
+ CVReturn ret = CVPixelBufferPoolCreate(
+ /*allocator=*/nullptr, pool_attributes.get(), buffer_attributes.get(),
+ &cv_pool);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr, "CVPixelBufferPoolCreate failed: %d.\n",
+ static_cast<int>(ret));
+ return libgav1::kStatusOutOfMemory;
+ }
+ CVPixelBufferPoolRelease(pool_);
+ pool_ = cv_pool;
+ return libgav1::kStatusOk;
+}
+
+libgav1::StatusCode Gav1DecodeCVPixelBufferPool::GetCVPixelBuffer(
+ int bitdepth, libgav1::ImageFormat image_format, int /*width*/,
+ int /*height*/, int /*left_border*/, int /*right_border*/,
+ int /*top_border*/, int /*bottom_border*/, int /*stride_alignment*/,
+ libgav1::FrameBuffer* frame_buffer) {
+ static_cast<void>(bitdepth);
+ assert(bitdepth == 8 && (image_format == libgav1::kImageFormatYuv420 ||
+ image_format == libgav1::kImageFormatMonochrome400));
+ const bool is_monochrome =
+ (image_format == libgav1::kImageFormatMonochrome400);
+
+ // The dictionary must have kCVPixelBufferPoolAllocationThresholdKey,
+ // otherwise CVPixelBufferPoolCreatePixelBufferWithAuxAttributes() fails with
+ // kCVReturnWouldExceedAllocationThreshold (-6689).
+ UniqueCFNumberRef cf_num_buffers(
+ CFNumberCreate(kCFAllocatorDefault, kCFNumberIntType, &num_buffers_));
+
+ const void* buffer_keys[] = {
+ kCVPixelBufferPoolAllocationThresholdKey,
+ };
+ const void* buffer_values[] = {
+ cf_num_buffers.get(),
+ };
+ UniqueCFDictionaryRef aux_attributes(CFDictionaryCreate(
+ kCFAllocatorDefault, buffer_keys, buffer_values, 1,
+ &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks));
+ if (aux_attributes == nullptr) {
+ fprintf(stderr, "CFDictionaryCreate of aux_attributes failed.\n");
+ return libgav1::kStatusUnknownError;
+ }
+
+ CVPixelBufferRef pixel_buffer;
+ CVReturn ret = CVPixelBufferPoolCreatePixelBufferWithAuxAttributes(
+ /*allocator=*/nullptr, pool_, aux_attributes.get(), &pixel_buffer);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr,
+ "CVPixelBufferPoolCreatePixelBufferWithAuxAttributes failed: %d.\n",
+ static_cast<int>(ret));
+ return libgav1::kStatusOutOfMemory;
+ }
+
+ ret = CVPixelBufferLockBaseAddress(pixel_buffer, /*lockFlags=*/0);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr, "CVPixelBufferLockBaseAddress failed: %d.\n",
+ static_cast<int>(ret));
+ CFRelease(pixel_buffer);
+ return libgav1::kStatusUnknownError;
+ }
+
+ // If the pixel format type is kCVPixelFormatType_OneComponent8, the pixel
+ // buffer is nonplanar (CVPixelBufferIsPlanar returns false and
+ // CVPixelBufferGetPlaneCount returns 0), but
+ // CVPixelBufferGetBytesPerRowOfPlane and CVPixelBufferGetBaseAddressOfPlane
+ // still work for plane index 0, even though the documentation says they
+ // return NULL for nonplanar pixel buffers.
+ frame_buffer->stride[0] =
+ static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 0));
+ frame_buffer->plane[0] = static_cast<uint8_t*>(
+ CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 0));
+ if (is_monochrome) {
+ frame_buffer->stride[1] = 0;
+ frame_buffer->stride[2] = 0;
+ frame_buffer->plane[1] = nullptr;
+ frame_buffer->plane[2] = nullptr;
+ } else {
+ frame_buffer->stride[1] =
+ static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 1));
+ frame_buffer->stride[2] =
+ static_cast<int>(CVPixelBufferGetBytesPerRowOfPlane(pixel_buffer, 2));
+ frame_buffer->plane[1] = static_cast<uint8_t*>(
+ CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 1));
+ frame_buffer->plane[2] = static_cast<uint8_t*>(
+ CVPixelBufferGetBaseAddressOfPlane(pixel_buffer, 2));
+ }
+ frame_buffer->private_data = pixel_buffer;
+
+ return libgav1::kStatusOk;
+}
+
+void Gav1DecodeCVPixelBufferPool::ReleaseCVPixelBuffer(
+ void* buffer_private_data) {
+ auto const pixel_buffer = static_cast<CVPixelBufferRef>(buffer_private_data);
+ CVReturn ret =
+ CVPixelBufferUnlockBaseAddress(pixel_buffer, /*unlockFlags=*/0);
+ if (ret != kCVReturnSuccess) {
+ fprintf(stderr, "%s:%d: CVPixelBufferUnlockBaseAddress failed: %d.\n",
+ __FILE__, __LINE__, static_cast<int>(ret));
+ abort();
+ }
+ CFRelease(pixel_buffer);
+}
diff --git a/libgav1/examples/gav1_decode_cv_pixel_buffer_pool.h b/libgav1/examples/gav1_decode_cv_pixel_buffer_pool.h
new file mode 100644
index 0000000..7aee324
--- /dev/null
+++ b/libgav1/examples/gav1_decode_cv_pixel_buffer_pool.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+#define LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
+
+#include <CoreVideo/CoreVideo.h>
+
+#include <cstddef>
+#include <memory>
+
+#include "gav1/frame_buffer.h"
+
+extern "C" libgav1::StatusCode Gav1DecodeOnCVPixelBufferSizeChanged(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment);
+
+extern "C" libgav1::StatusCode Gav1DecodeGetCVPixelBuffer(
+ void* callback_private_data, int bitdepth,
+ libgav1::ImageFormat image_format, int width, int height, int left_border,
+ int right_border, int top_border, int bottom_border, int stride_alignment,
+ libgav1::FrameBuffer* frame_buffer);
+
+extern "C" void Gav1DecodeReleaseCVPixelBuffer(void* callback_private_data,
+ void* buffer_private_data);
+
+class Gav1DecodeCVPixelBufferPool {
+ public:
+ static std::unique_ptr<Gav1DecodeCVPixelBufferPool> Create(
+ size_t num_buffers);
+
+ // Not copyable or movable.
+ Gav1DecodeCVPixelBufferPool(const Gav1DecodeCVPixelBufferPool&) = delete;
+ Gav1DecodeCVPixelBufferPool& operator=(const Gav1DecodeCVPixelBufferPool&) =
+ delete;
+
+ ~Gav1DecodeCVPixelBufferPool();
+
+ libgav1::StatusCode OnCVPixelBufferSizeChanged(
+ int bitdepth, libgav1::ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border,
+ int stride_alignment);
+
+ libgav1::StatusCode GetCVPixelBuffer(int bitdepth,
+ libgav1::ImageFormat image_format,
+ int width, int height, int left_border,
+ int right_border, int top_border,
+ int bottom_border, int stride_alignment,
+ libgav1::FrameBuffer* frame_buffer);
+ void ReleaseCVPixelBuffer(void* buffer_private_data);
+
+ private:
+ Gav1DecodeCVPixelBufferPool(size_t num_buffers);
+
+ CVPixelBufferPoolRef pool_ = nullptr;
+ const int num_buffers_;
+};
+
+#endif // LIBGAV1_EXAMPLES_GAV1_DECODE_CV_PIXEL_BUFFER_POOL_H_
diff --git a/libgav1/examples/ivf_parser.cc b/libgav1/examples/ivf_parser.cc
new file mode 100644
index 0000000..f8adb14
--- /dev/null
+++ b/libgav1/examples/ivf_parser.cc
@@ -0,0 +1,96 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "examples/ivf_parser.h"
+
+#include <cstdio>
+#include <cstring>
+
+#include "examples/file_reader_constants.h"
+#include "examples/logging.h"
+
+namespace libgav1 {
+namespace {
+
+size_t ReadLittleEndian16(const uint8_t* const buffer) {
+ size_t value = buffer[1] << 8;
+ value |= buffer[0];
+ return value;
+}
+
+size_t ReadLittleEndian32(const uint8_t* const buffer) {
+ size_t value = buffer[3] << 24;
+ value |= buffer[2] << 16;
+ value |= buffer[1] << 8;
+ value |= buffer[0];
+ return value;
+}
+
+} // namespace
+
+bool ParseIvfFileHeader(const uint8_t* const header_buffer,
+ IvfFileHeader* const ivf_file_header) {
+ if (header_buffer == nullptr || ivf_file_header == nullptr) return false;
+
+ if (memcmp(kIvfSignature, header_buffer, 4) != 0) {
+ return false;
+ }
+
+ // Verify header version and length.
+ const size_t ivf_header_version = ReadLittleEndian16(&header_buffer[4]);
+ if (ivf_header_version != kIvfHeaderVersion) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unexpected IVF version");
+ }
+
+ const size_t ivf_header_size = ReadLittleEndian16(&header_buffer[6]);
+ if (ivf_header_size != kIvfFileHeaderSize) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Invalid IVF file header size");
+ return false;
+ }
+
+ if (memcmp(kAv1FourCcLower, &header_buffer[8], 4) != 0 &&
+ memcmp(kAv1FourCcUpper, &header_buffer[8], 4) != 0) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Unsupported codec 4CC");
+ return false;
+ }
+
+ ivf_file_header->width = ReadLittleEndian16(&header_buffer[12]);
+ ivf_file_header->height = ReadLittleEndian16(&header_buffer[14]);
+ ivf_file_header->frame_rate_numerator =
+ ReadLittleEndian32(&header_buffer[16]);
+ ivf_file_header->frame_rate_denominator =
+ ReadLittleEndian32(&header_buffer[20]);
+
+ return true;
+}
+
+bool ParseIvfFrameHeader(const uint8_t* const header_buffer,
+ IvfFrameHeader* const ivf_frame_header) {
+ if (header_buffer == nullptr || ivf_frame_header == nullptr) return false;
+
+ ivf_frame_header->frame_size = ReadLittleEndian32(header_buffer);
+ if (ivf_frame_header->frame_size > kMaxTemporalUnitSize) {
+ LIBGAV1_EXAMPLES_LOG_ERROR("Temporal Unit size exceeds maximum");
+ return false;
+ }
+
+ ivf_frame_header->timestamp = ReadLittleEndian32(&header_buffer[4]);
+ const uint64_t timestamp_hi =
+ static_cast<uint64_t>(ReadLittleEndian32(&header_buffer[8])) << 32;
+ ivf_frame_header->timestamp |= timestamp_hi;
+
+ return true;
+}
+
+} // namespace libgav1
diff --git a/libgav1/examples/ivf_parser.h b/libgav1/examples/ivf_parser.h
new file mode 100644
index 0000000..b6bbc59
--- /dev/null
+++ b/libgav1/examples/ivf_parser.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_IVF_PARSER_H_
+#define LIBGAV1_EXAMPLES_IVF_PARSER_H_
+
+#include <cstddef>
+#include <cstdint>
+
+namespace libgav1 {
+
+struct IvfFileHeader {
+ IvfFileHeader() = default;
+ IvfFileHeader(const IvfFileHeader& rhs) = default;
+ IvfFileHeader& operator=(const IvfFileHeader& rhs) = default;
+ IvfFileHeader(IvfFileHeader&& rhs) = default;
+ IvfFileHeader& operator=(IvfFileHeader&& rhs) = default;
+
+ size_t width = 0;
+ size_t height = 0;
+ size_t frame_rate_numerator = 0;
+ size_t frame_rate_denominator = 0;
+};
+
+struct IvfFrameHeader {
+ IvfFrameHeader() = default;
+ IvfFrameHeader(const IvfFrameHeader& rhs) = default;
+ IvfFrameHeader& operator=(const IvfFrameHeader& rhs) = default;
+ IvfFrameHeader(IvfFrameHeader&& rhs) = default;
+ IvfFrameHeader& operator=(IvfFrameHeader&& rhs) = default;
+
+ size_t frame_size = 0;
+ int64_t timestamp = 0;
+};
+
+bool ParseIvfFileHeader(const uint8_t* header_buffer,
+ IvfFileHeader* ivf_file_header);
+
+bool ParseIvfFrameHeader(const uint8_t* header_buffer,
+ IvfFrameHeader* ivf_frame_header);
+
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_IVF_PARSER_H_
diff --git a/libgav1/examples/libgav1_examples.cmake b/libgav1/examples/libgav1_examples.cmake
new file mode 100644
index 0000000..1f949f3
--- /dev/null
+++ b/libgav1/examples/libgav1_examples.cmake
@@ -0,0 +1,63 @@
+# Copyright 2019 The libgav1 Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_)
+ return()
+endif() # LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_
+set(LIBGAV1_EXAMPLES_LIBGAV1_EXAMPLES_CMAKE_ 1)
+
+set(libgav1_file_reader_sources "${libgav1_examples}/file_reader.cc"
+ "${libgav1_examples}/file_reader.h"
+ "${libgav1_examples}/file_reader_constants.cc"
+ "${libgav1_examples}/file_reader_constants.h"
+ "${libgav1_examples}/file_reader_factory.cc"
+ "${libgav1_examples}/file_reader_factory.h"
+ "${libgav1_examples}/file_reader_interface.h"
+ "${libgav1_examples}/ivf_parser.cc"
+ "${libgav1_examples}/ivf_parser.h"
+ "${libgav1_examples}/logging.h")
+
+set(libgav1_file_writer_sources "${libgav1_examples}/file_writer.cc"
+ "${libgav1_examples}/file_writer.h"
+ "${libgav1_examples}/logging.h")
+
+set(libgav1_decode_sources "${libgav1_examples}/gav1_decode.cc")
+
+macro(libgav1_add_examples_targets)
+ libgav1_add_library(NAME libgav1_file_reader TYPE OBJECT SOURCES
+ ${libgav1_file_reader_sources} DEFINES ${libgav1_defines}
+ INCLUDES ${libgav1_include_paths})
+
+ libgav1_add_library(NAME libgav1_file_writer TYPE OBJECT SOURCES
+ ${libgav1_file_writer_sources} DEFINES ${libgav1_defines}
+ INCLUDES ${libgav1_include_paths})
+
+ libgav1_add_executable(NAME
+ gav1_decode
+ SOURCES
+ ${libgav1_decode_sources}
+ DEFINES
+ ${libgav1_defines}
+ INCLUDES
+ ${libgav1_include_paths}
+ ${libgav1_gtest_include_paths}
+ OBJLIB_DEPS
+ libgav1_file_reader
+ libgav1_file_writer
+ LIB_DEPS
+ absl::strings
+ absl::str_format_internal
+ absl::time
+ ${libgav1_dependency})
+endmacro()
diff --git a/libgav1/examples/logging.h b/libgav1/examples/logging.h
new file mode 100644
index 0000000..c0bcad7
--- /dev/null
+++ b/libgav1/examples/logging.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_EXAMPLES_LOGGING_H_
+#define LIBGAV1_EXAMPLES_LOGGING_H_
+
+#include <cstddef>
+#include <cstdio>
+
+namespace libgav1 {
+namespace examples {
+
+#if !defined(LIBGAV1_EXAMPLES_ENABLE_LOGGING)
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 0
+#else
+#define LIBGAV1_EXAMPLES_ENABLE_LOGGING 1
+#endif
+#endif
+
+#if LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+// Compile-time function to get the 'base' file_name, that is, the part of
+// a file_name after the last '/' or '\' path separator. The search starts at
+// the end of the string; the second parameter is the length of the string.
+constexpr const char* Basename(const char* file_name, size_t offset) {
+ return (offset == 0 || file_name[offset - 1] == '/' ||
+ file_name[offset - 1] == '\\')
+ ? file_name + offset
+ : Basename(file_name, offset - 1);
+}
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \
+ do { \
+ constexpr const char* libgav1_examples_basename = \
+ ::libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1); \
+ fprintf(stderr, "%s:%d (%s): %s.\n", libgav1_examples_basename, __LINE__, \
+ __func__, error_string); \
+ } while (false)
+
+#else // !LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \
+ do { \
+ } while (false)
+
+#endif // LIBGAV1_EXAMPLES_ENABLE_LOGGING
+
+} // namespace examples
+} // namespace libgav1
+
+#endif // LIBGAV1_EXAMPLES_LOGGING_H_
diff --git a/libgav1/src/buffer_pool.cc b/libgav1/src/buffer_pool.cc
index 63312ef..c1a5606 100644
--- a/libgav1/src/buffer_pool.cc
+++ b/libgav1/src/buffer_pool.cc
@@ -18,6 +18,7 @@
#include <cstring>
#include "src/utils/common.h"
+#include "src/utils/constants.h"
#include "src/utils/logging.h"
namespace libgav1 {
@@ -36,19 +37,28 @@
} // namespace
-RefCountedBuffer::RefCountedBuffer() {
- memset(&raw_frame_buffer_, 0, sizeof(raw_frame_buffer_));
-}
+RefCountedBuffer::RefCountedBuffer() = default;
RefCountedBuffer::~RefCountedBuffer() = default;
bool RefCountedBuffer::Realloc(int bitdepth, bool is_monochrome, int width,
int height, int subsampling_x, int subsampling_y,
- int border, int byte_alignment) {
- return yuv_buffer_.Realloc(bitdepth, is_monochrome, width, height,
- subsampling_x, subsampling_y, border,
- byte_alignment, pool_->get_frame_buffer_,
- pool_->callback_private_data_, &raw_frame_buffer_);
+ int left_border, int right_border,
+ int top_border, int bottom_border) {
+ // The YuvBuffer::Realloc() could call the get frame buffer callback which
+ // will need to be thread safe. So we ensure that we only call Realloc() once
+ // at any given time.
+ std::lock_guard<std::mutex> lock(pool_->mutex_);
+ assert(!buffer_private_data_valid_);
+ if (!yuv_buffer_.Realloc(
+ bitdepth, is_monochrome, width, height, subsampling_x, subsampling_y,
+ left_border, right_border, top_border, bottom_border,
+ pool_->get_frame_buffer_, pool_->callback_private_data_,
+ &buffer_private_data_)) {
+ return false;
+ }
+ buffer_private_data_valid_ = true;
+ return true;
}
bool RefCountedBuffer::SetFrameDimensions(const ObuFrameHeader& frame_header) {
@@ -59,13 +69,13 @@
render_height_ = frame_header.render_height;
rows4x4_ = frame_header.rows4x4;
columns4x4_ = frame_header.columns4x4;
- const int rows4x4_half = DivideBy2(rows4x4_);
- const int columns4x4_half = DivideBy2(columns4x4_);
- if (!motion_field_reference_frame_.Reset(rows4x4_half, columns4x4_half,
- /*zero_initialize=*/false) ||
- !motion_field_mv_.Reset(rows4x4_half, columns4x4_half,
- /*zero_initialize=*/false)) {
- return false;
+ if (frame_header.refresh_frame_flags != 0 &&
+ !IsIntraFrame(frame_header.frame_type)) {
+ const int rows4x4_half = DivideBy2(rows4x4_);
+ const int columns4x4_half = DivideBy2(columns4x4_);
+ if (!reference_info_.Reset(rows4x4_half, columns4x4_half)) {
+ return false;
+ }
}
return segmentation_map_.Allocate(rows4x4_, columns4x4_);
}
@@ -103,55 +113,105 @@
ptr->pool_->ReturnUnusedBuffer(ptr);
}
-// static
-constexpr int BufferPool::kNumBuffers;
-
-BufferPool::BufferPool(const DecoderSettings& settings) {
- if (settings.get != nullptr && settings.release != nullptr) {
- get_frame_buffer_ = settings.get;
- release_frame_buffer_ = settings.release;
- callback_private_data_ = settings.callback_private_data;
+BufferPool::BufferPool(
+ FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+ GetFrameBufferCallback get_frame_buffer,
+ ReleaseFrameBufferCallback release_frame_buffer,
+ void* callback_private_data) {
+ if (get_frame_buffer != nullptr) {
+ // on_frame_buffer_size_changed may be null.
+ assert(release_frame_buffer != nullptr);
+ on_frame_buffer_size_changed_ = on_frame_buffer_size_changed;
+ get_frame_buffer_ = get_frame_buffer;
+ release_frame_buffer_ = release_frame_buffer;
+ callback_private_data_ = callback_private_data;
} else {
- internal_frame_buffers_ = InternalFrameBufferList::Create(kNumBuffers);
- // GetInternalFrameBuffer checks whether its private_data argument is null,
- // so we don't need to check whether internal_frame_buffers_ is null here.
+ on_frame_buffer_size_changed_ = OnInternalFrameBufferSizeChanged;
get_frame_buffer_ = GetInternalFrameBuffer;
release_frame_buffer_ = ReleaseInternalFrameBuffer;
- callback_private_data_ = internal_frame_buffers_.get();
- }
- for (RefCountedBuffer& buffer : buffers_) {
- buffer.SetBufferPool(this);
+ callback_private_data_ = &internal_frame_buffers_;
}
}
BufferPool::~BufferPool() {
- for (const RefCountedBuffer& buffer : buffers_) {
- if (buffer.in_use_) {
- assert(0 && "RefCountedBuffer still in use at destruction time.");
+ for (const auto* buffer : buffers_) {
+ if (buffer->in_use_) {
+ assert(false && "RefCountedBuffer still in use at destruction time.");
LIBGAV1_DLOG(ERROR, "RefCountedBuffer still in use at destruction time.");
}
+ delete buffer;
}
}
+bool BufferPool::OnFrameBufferSizeChanged(int bitdepth,
+ Libgav1ImageFormat image_format,
+ int width, int height,
+ int left_border, int right_border,
+ int top_border, int bottom_border) {
+ if (on_frame_buffer_size_changed_ == nullptr) return true;
+ return on_frame_buffer_size_changed_(callback_private_data_, bitdepth,
+ image_format, width, height, left_border,
+ right_border, top_border, bottom_border,
+ /*stride_alignment=*/16) == kStatusOk;
+}
+
RefCountedBufferPtr BufferPool::GetFreeBuffer() {
- for (RefCountedBuffer& buffer : buffers_) {
- if (!buffer.in_use_) {
- buffer.in_use_ = true;
- return RefCountedBufferPtr(&buffer, RefCountedBuffer::ReturnToBufferPool);
+ // In frame parallel mode, the GetFreeBuffer() calls from ObuParser all happen
+ // from the same thread serially, but the GetFreeBuffer() call in
+ // DecoderImpl::ApplyFilmGrain can happen from multiple threads at the same
+ // time. So this function has to be thread safe.
+ // TODO(b/142583029): Investigate if the GetFreeBuffer() call in
+ // DecoderImpl::ApplyFilmGrain() call can be serialized so that this function
+ // need not be thread safe.
+ std::unique_lock<std::mutex> lock(mutex_);
+ for (auto buffer : buffers_) {
+ if (!buffer->in_use_) {
+ buffer->in_use_ = true;
+ buffer->progress_row_ = -1;
+ buffer->frame_state_ = kFrameStateUnknown;
+ lock.unlock();
+ return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
}
}
+ lock.unlock();
+ auto* const buffer = new (std::nothrow) RefCountedBuffer();
+ if (buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate a new reference counted buffer.");
+ return RefCountedBufferPtr();
+ }
+ buffer->SetBufferPool(this);
+ buffer->in_use_ = true;
+ buffer->progress_row_ = -1;
+ buffer->frame_state_ = kFrameStateUnknown;
+ lock.lock();
+ const bool ok = buffers_.push_back(buffer);
+ lock.unlock();
+ if (!ok) {
+ LIBGAV1_DLOG(
+ ERROR,
+ "Failed to push the new reference counted buffer into the vector.");
+ delete buffer;
+ return RefCountedBufferPtr();
+ }
+ return RefCountedBufferPtr(buffer, RefCountedBuffer::ReturnToBufferPool);
+}
- // We should never run out of free buffers. If we reach here, there is a
- // reference leak.
- return RefCountedBufferPtr();
+void BufferPool::Abort() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ for (auto buffer : buffers_) {
+ if (buffer->in_use_) {
+ buffer->Abort();
+ }
+ }
}
void BufferPool::ReturnUnusedBuffer(RefCountedBuffer* buffer) {
+ std::lock_guard<std::mutex> lock(mutex_);
assert(buffer->in_use_);
buffer->in_use_ = false;
- if (buffer->raw_frame_buffer_.data[0] != nullptr) {
- release_frame_buffer_(callback_private_data_, &buffer->raw_frame_buffer_);
- memset(&buffer->raw_frame_buffer_, 0, sizeof(buffer->raw_frame_buffer_));
+ if (buffer->buffer_private_data_valid_) {
+ release_frame_buffer_(callback_private_data_, buffer->buffer_private_data_);
+ buffer->buffer_private_data_valid_ = false;
}
}
diff --git a/libgav1/src/buffer_pool.h b/libgav1/src/buffer_pool.h
index 4a34e23..f35a633 100644
--- a/libgav1/src/buffer_pool.h
+++ b/libgav1/src/buffer_pool.h
@@ -18,27 +18,38 @@
#define LIBGAV1_SRC_BUFFER_POOL_H_
#include <array>
+#include <cassert>
+#include <climits>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
#include <cstdint>
-#include <memory>
+#include <cstring>
+#include <mutex> // NOLINT (unapproved c++11 header)
-#include "src/decoder_buffer.h"
-#include "src/decoder_settings.h"
#include "src/dsp/common.h"
-#include "src/frame_buffer.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/frame_buffer.h"
#include "src/internal_frame_buffer_list.h"
-#include "src/obu_parser.h"
#include "src/symbol_decoder_context.h"
-#include "src/utils/array_2d.h"
+#include "src/utils/compiler_attributes.h"
#include "src/utils/constants.h"
+#include "src/utils/reference_info.h"
#include "src/utils/segmentation.h"
#include "src/utils/segmentation_map.h"
#include "src/utils/types.h"
+#include "src/utils/vector.h"
#include "src/yuv_buffer.h"
namespace libgav1 {
class BufferPool;
+enum FrameState : uint8_t {
+ kFrameStateUnknown,
+ kFrameStateStarted,
+ kFrameStateParsed,
+ kFrameStateDecoded
+};
+
// A reference-counted frame buffer. Clients should access it via
// RefCountedBufferPtr, which manages reference counting transparently.
class RefCountedBuffer {
@@ -48,34 +59,39 @@
RefCountedBuffer& operator=(const RefCountedBuffer&) = delete;
// Allocates the YUV buffer. Returns true on success. Returns false on
- // failure.
+ // failure. This function ensures the thread safety of the |get_frame_buffer_|
+ // call (i.e.) only one |get_frame_buffer_| call will happen at a given time.
+ // TODO(b/142583029): In frame parallel mode, we can require the callbacks to
+ // be thread safe so that we can remove the thread safety of this function and
+ // applications can have fine grained locks.
//
// * |width| and |height| are the image dimensions in pixels.
// * |subsampling_x| and |subsampling_y| (either 0 or 1) specify the
// subsampling of the width and height of the chroma planes, respectively.
- // * |border| is the size of the borders (on all four sides) in pixels.
- // * |byte_alignment| specifies the additional alignment requirement of the
- // data buffers of the Y, U, and V planes. If |byte_alignment| is 0, there
- // is no additional alignment requirement. Otherwise, |byte_alignment|
- // must be a power of 2 and greater than or equal to 16.
- // NOTE: The strides are a multiple of 16. Therefore only the first row in
- // each plane is aligned to |byte_alignment|. Subsequent rows are only
- // 16-byte aligned.
+ // * |left_border|, |right_border|, |top_border|, and |bottom_border| are
+ // the sizes (in pixels) of the borders on the left, right, top, and
+ // bottom sides, respectively.
+ //
+ // NOTE: The strides are a multiple of 16. Since the first row in each plane
+ // is 16-byte aligned, subsequent rows are also 16-byte aligned.
bool Realloc(int bitdepth, bool is_monochrome, int width, int height,
- int subsampling_x, int subsampling_y, int border,
- int byte_alignment);
+ int subsampling_x, int subsampling_y, int left_border,
+ int right_border, int top_border, int bottom_border);
YuvBuffer* buffer() { return &yuv_buffer_; }
// Returns the buffer private data set by the get frame buffer callback when
// it allocated the YUV buffer.
- void* buffer_private_data() const { return raw_frame_buffer_.private_data; }
+ void* buffer_private_data() const {
+ assert(buffer_private_data_valid_);
+ return buffer_private_data_;
+ }
// NOTE: In the current frame, this is the frame_type syntax element in the
// frame header. In a reference frame, this implements the RefFrameType array
// in the spec.
FrameType frame_type() const { return frame_type_; }
- void set_frame_type(enum FrameType frame_type) { frame_type_ = frame_type; }
+ void set_frame_type(FrameType frame_type) { frame_type_ = frame_type; }
// The sample position for subsampled streams. This is the
// chroma_sample_position syntax element in the sequence header.
@@ -85,8 +101,7 @@
ChromaSamplePosition chroma_sample_position() const {
return chroma_sample_position_;
}
- void set_chroma_sample_position(
- enum ChromaSamplePosition chroma_sample_position) {
+ void set_chroma_sample_position(ChromaSamplePosition chroma_sample_position) {
chroma_sample_position_ = chroma_sample_position;
}
@@ -94,19 +109,11 @@
bool showable_frame() const { return showable_frame_; }
void set_showable_frame(bool value) { showable_frame_ = value; }
- uint8_t order_hint(ReferenceFrameType reference_frame) const {
- return order_hint_[reference_frame];
- }
- void set_order_hint(ReferenceFrameType reference_frame, uint8_t order_hint) {
- order_hint_[reference_frame] = order_hint;
- }
- void ClearOrderHints() { order_hint_.fill(0); }
-
// Sets upscaled_width_, frame_width_, frame_height_, render_width_,
// render_height_, rows4x4_ and columns4x4_ from the corresponding fields
- // in frame_header. Allocates motion_field_reference_frame_,
- // motion_field_mv_, and segmentation_map_. Returns true on success, false
- // on failure.
+ // in frame_header. Allocates reference_info_.motion_field_reference_frame,
+ // reference_info_.motion_field_mv_, and segmentation_map_. Returns true on
+ // success, false on failure.
bool SetFrameDimensions(const ObuFrameHeader& frame_header);
int32_t upscaled_width() const { return upscaled_width_; }
@@ -119,17 +126,10 @@
int32_t rows4x4() const { return rows4x4_; }
int32_t columns4x4() const { return columns4x4_; }
- // Entry at |row|, |column| corresponds to
- // MfRefFrames[row * 2 + 1][column * 2 + 1] in the spec.
- ReferenceFrameType* motion_field_reference_frame(int row, int column) {
- return &motion_field_reference_frame_[row][column];
- }
-
- // Entry at |row|, |column| corresponds to
- // MfMvs[row * 2 + 1][column * 2 + 1] in the spec.
- MotionVector* motion_field_mv(int row, int column) {
- return &motion_field_mv_[row][column];
- }
+ int spatial_id() const { return spatial_id_; }
+ void set_spatial_id(int value) { spatial_id_ = value; }
+ int temporal_id() const { return temporal_id_; }
+ void set_temporal_id(int value) { temporal_id_ = value; }
SegmentationMap* segmentation_map() { return &segmentation_map_; }
const SegmentationMap* segmentation_map() const { return &segmentation_map_; }
@@ -180,6 +180,99 @@
film_grain_params_ = params;
}
+ const ReferenceInfo* reference_info() const { return &reference_info_; }
+ ReferenceInfo* reference_info() { return &reference_info_; }
+
+ // This will wake up the WaitUntil*() functions and make them return false.
+ void Abort() {
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ abort_ = true;
+ }
+ parsed_condvar_.notify_all();
+ decoded_condvar_.notify_all();
+ progress_row_condvar_.notify_all();
+ }
+
+ void SetFrameState(FrameState frame_state) {
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ frame_state_ = frame_state;
+ }
+ if (frame_state == kFrameStateParsed) {
+ parsed_condvar_.notify_all();
+ } else if (frame_state == kFrameStateDecoded) {
+ decoded_condvar_.notify_all();
+ progress_row_condvar_.notify_all();
+ }
+ }
+
+ // Sets the progress of this frame to |progress_row| and notifies any threads
+ // that may be waiting on rows <= |progress_row|.
+ void SetProgress(int progress_row) {
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (progress_row_ >= progress_row) return;
+ progress_row_ = progress_row;
+ }
+ progress_row_condvar_.notify_all();
+ }
+
+ void MarkFrameAsStarted() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (frame_state_ != kFrameStateUnknown) return;
+ frame_state_ = kFrameStateStarted;
+ }
+
+ // All the WaitUntil* functions will return true if the desired wait state was
+ // reached successfully. If the return value is false, then the caller must
+ // assume that the wait was not successful and try to stop whatever they are
+ // doing as early as possible.
+
+ // Waits until the frame has been parsed.
+ bool WaitUntilParsed() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ while (frame_state_ < kFrameStateParsed && !abort_) {
+ parsed_condvar_.wait(lock);
+ }
+ return !abort_;
+ }
+
+ // Waits until the |progress_row| has been decoded (as indicated either by
+ // |progress_row_| or |frame_state_|). |progress_row_cache| must not be
+ // nullptr and will be populated with the value of |progress_row_| after the
+ // wait.
+ //
+ // Typical usage of |progress_row_cache| is as follows:
+ // * Initialize |*progress_row_cache| to INT_MIN.
+ // * Call WaitUntil only if |*progress_row_cache| < |progress_row|.
+ bool WaitUntil(int progress_row, int* progress_row_cache) {
+ // If |progress_row| is negative, it means that the wait is on the top
+ // border to be available. The top border will be available when row 0 has
+ // been decoded. So we can simply wait on row 0 instead.
+ progress_row = std::max(progress_row, 0);
+ std::unique_lock<std::mutex> lock(mutex_);
+ while (progress_row_ < progress_row && frame_state_ != kFrameStateDecoded &&
+ !abort_) {
+ progress_row_condvar_.wait(lock);
+ }
+ // Once |frame_state_| reaches kFrameStateDecoded, |progress_row_| may no
+ // longer be updated. So we set |*progress_row_cache| to INT_MAX in that
+ // case.
+ *progress_row_cache =
+ (frame_state_ != kFrameStateDecoded) ? progress_row_ : INT_MAX;
+ return !abort_;
+ }
+
+ // Waits until the entire frame has been decoded.
+ bool WaitUntilDecoded() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ while (frame_state_ != kFrameStateDecoded && !abort_) {
+ decoded_condvar_.wait(lock);
+ }
+ return !abort_;
+ }
+
private:
friend class BufferPool;
@@ -190,17 +283,26 @@
static void ReturnToBufferPool(RefCountedBuffer* ptr);
BufferPool* pool_ = nullptr;
- FrameBuffer raw_frame_buffer_;
+ bool buffer_private_data_valid_ = false;
+ void* buffer_private_data_ = nullptr;
YuvBuffer yuv_buffer_;
bool in_use_ = false; // Only used by BufferPool.
- enum FrameType frame_type_ = kFrameKey;
- enum ChromaSamplePosition chroma_sample_position_ =
- kChromaSamplePositionUnknown;
- bool showable_frame_ = false;
+ std::mutex mutex_;
+ FrameState frame_state_ = kFrameStateUnknown LIBGAV1_GUARDED_BY(mutex_);
+ int progress_row_ = -1 LIBGAV1_GUARDED_BY(mutex_);
+ // Signaled when progress_row_ is updated or when frame_state_ is set to
+ // kFrameStateDecoded.
+ std::condition_variable progress_row_condvar_;
+ // Signaled when the frame state is set to kFrameStateParsed.
+ std::condition_variable parsed_condvar_;
+ // Signaled when the frame state is set to kFrameStateDecoded.
+ std::condition_variable decoded_condvar_;
+ bool abort_ = false LIBGAV1_GUARDED_BY(mutex_);
- // Note: order_hint_[0] (for kReferenceFrameIntra) is not used.
- std::array<uint8_t, kNumReferenceFrameTypes> order_hint_ = {};
+ FrameType frame_type_ = kFrameKey;
+ ChromaSamplePosition chroma_sample_position_ = kChromaSamplePositionUnknown;
+ bool showable_frame_ = false;
int32_t upscaled_width_ = 0;
int32_t frame_width_ = 0;
@@ -209,13 +311,9 @@
int32_t render_height_ = 0;
int32_t columns4x4_ = 0;
int32_t rows4x4_ = 0;
+ int spatial_id_ = 0;
+ int temporal_id_ = 0;
- // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
- // to MfRefFrames[i * 2 + 1][j * 2 + 1] in the spec.
- Array2D<ReferenceFrameType> motion_field_reference_frame_;
- // Array of size (rows4x4 / 2) x (columns4x4 / 2). Entry at i, j corresponds
- // to MfMvs[i * 2 + 1][j * 2 + 1] in the spec.
- Array2D<MotionVector> motion_field_mv_;
// segmentation_map_ contains a rows4x4_ by columns4x4_ 2D array.
SegmentationMap segmentation_map_;
@@ -233,6 +331,7 @@
// on feature_enabled only, we also save their values as an optimization.
Segmentation segmentation_ = {};
FilmGrainParams film_grain_params_ = {};
+ ReferenceInfo reference_info_;
};
// RefCountedBufferPtr contains a reference to a RefCountedBuffer.
@@ -247,7 +346,10 @@
// BufferPool maintains a pool of RefCountedBuffers.
class BufferPool {
public:
- explicit BufferPool(const DecoderSettings& settings);
+ BufferPool(FrameBufferSizeChangedCallback on_frame_buffer_size_changed,
+ GetFrameBufferCallback get_frame_buffer,
+ ReleaseFrameBufferCallback release_frame_buffer,
+ void* callback_private_data);
// Not copyable or movable.
BufferPool(const BufferPool&) = delete;
@@ -255,26 +357,37 @@
~BufferPool();
- // Finds a free buffer in the buffer pool and returns a reference to the
- // free buffer. If there is no free buffer, returns a null pointer.
+ LIBGAV1_MUST_USE_RESULT bool OnFrameBufferSizeChanged(
+ int bitdepth, Libgav1ImageFormat image_format, int width, int height,
+ int left_border, int right_border, int top_border, int bottom_border);
+
+ // Finds a free buffer in the buffer pool and returns a reference to the free
+ // buffer. If there is no free buffer, returns a null pointer. This function
+ // is thread safe.
RefCountedBufferPtr GetFreeBuffer();
+ // Aborts all the buffers that are in use.
+ void Abort();
+
private:
friend class RefCountedBuffer;
- // Reference frames + 1 scratch frame (for either the current frame or the
- // film grain frame).
- static constexpr int kNumBuffers = kNumReferenceFrameTypes + 1;
-
// Returns an unused buffer to the buffer pool. Called by RefCountedBuffer
- // only.
+ // only. This function is thread safe.
void ReturnUnusedBuffer(RefCountedBuffer* buffer);
- RefCountedBuffer buffers_[kNumBuffers];
+ // Used to make the following functions thread safe: GetFreeBuffer(),
+ // ReturnUnusedBuffer(), RefCountedBuffer::Realloc().
+ std::mutex mutex_;
- std::unique_ptr<InternalFrameBufferList> internal_frame_buffers_;
+ // Storing a RefCountedBuffer object in a Vector is complicated because of the
+ // copy/move semantics. So the simplest way around that is to store a list of
+ // pointers in the vector.
+ Vector<RefCountedBuffer*> buffers_ LIBGAV1_GUARDED_BY(mutex_);
+ InternalFrameBufferList internal_frame_buffers_;
// Frame buffer callbacks.
+ FrameBufferSizeChangedCallback on_frame_buffer_size_changed_;
GetFrameBufferCallback get_frame_buffer_;
ReleaseFrameBufferCallback release_frame_buffer_;
// Private data associated with the frame buffer callbacks.
diff --git a/libgav1/src/decoder.cc b/libgav1/src/decoder.cc
index 9a38dd1..b9e43e0 100644
--- a/libgav1/src/decoder.cc
+++ b/libgav1/src/decoder.cc
@@ -12,10 +12,73 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/decoder.h"
+#include "src/gav1/decoder.h"
+
+#include <memory>
+#include <new>
#include "src/decoder_impl.h"
+extern "C" {
+
+Libgav1StatusCode Libgav1DecoderCreate(const Libgav1DecoderSettings* settings,
+ Libgav1Decoder** decoder_out) {
+ std::unique_ptr<libgav1::Decoder> cxx_decoder(new (std::nothrow)
+ libgav1::Decoder());
+ if (cxx_decoder == nullptr) return kLibgav1StatusOutOfMemory;
+
+ libgav1::DecoderSettings cxx_settings;
+ cxx_settings.threads = settings->threads;
+ cxx_settings.frame_parallel = settings->frame_parallel != 0;
+ cxx_settings.blocking_dequeue = settings->blocking_dequeue != 0;
+ cxx_settings.on_frame_buffer_size_changed =
+ settings->on_frame_buffer_size_changed;
+ cxx_settings.get_frame_buffer = settings->get_frame_buffer;
+ cxx_settings.release_frame_buffer = settings->release_frame_buffer;
+ cxx_settings.release_input_buffer = settings->release_input_buffer;
+ cxx_settings.callback_private_data = settings->callback_private_data;
+ cxx_settings.output_all_layers = settings->output_all_layers != 0;
+ cxx_settings.operating_point = settings->operating_point;
+ cxx_settings.post_filter_mask = settings->post_filter_mask;
+
+ const Libgav1StatusCode status = cxx_decoder->Init(&cxx_settings);
+ if (status == kLibgav1StatusOk) {
+ *decoder_out = reinterpret_cast<Libgav1Decoder*>(cxx_decoder.release());
+ }
+ return status;
+}
+
+void Libgav1DecoderDestroy(Libgav1Decoder* decoder) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ delete cxx_decoder;
+}
+
+Libgav1StatusCode Libgav1DecoderEnqueueFrame(Libgav1Decoder* decoder,
+ const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ return cxx_decoder->EnqueueFrame(data, size, user_private_data,
+ buffer_private_data);
+}
+
+Libgav1StatusCode Libgav1DecoderDequeueFrame(
+ Libgav1Decoder* decoder, const Libgav1DecoderBuffer** out_ptr) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ return cxx_decoder->DequeueFrame(out_ptr);
+}
+
+Libgav1StatusCode Libgav1DecoderSignalEOS(Libgav1Decoder* decoder) {
+ auto* cxx_decoder = reinterpret_cast<libgav1::Decoder*>(decoder);
+ return cxx_decoder->SignalEOS();
+}
+
+int Libgav1DecoderGetMaxBitdepth() {
+ return libgav1::Decoder::GetMaxBitdepth();
+}
+
+} // extern "C"
+
namespace libgav1 {
Decoder::Decoder() = default;
@@ -23,27 +86,31 @@
Decoder::~Decoder() = default;
StatusCode Decoder::Init(const DecoderSettings* const settings) {
- if (initialized_) return kLibgav1StatusAlready;
+ if (impl_ != nullptr) return kStatusAlready;
if (settings != nullptr) settings_ = *settings;
- const StatusCode status = DecoderImpl::Create(&settings_, &impl_);
- if (status != kLibgav1StatusOk) return status;
- initialized_ = true;
- return kLibgav1StatusOk;
+ return DecoderImpl::Create(&settings_, &impl_);
}
StatusCode Decoder::EnqueueFrame(const uint8_t* data, const size_t size,
- int64_t user_private_data) {
- if (!initialized_) return kLibgav1StatusNotInitialized;
- return impl_->EnqueueFrame(data, size, user_private_data);
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ if (impl_ == nullptr) return kStatusNotInitialized;
+ return impl_->EnqueueFrame(data, size, user_private_data,
+ buffer_private_data);
}
StatusCode Decoder::DequeueFrame(const DecoderBuffer** out_ptr) {
- if (!initialized_) return kLibgav1StatusNotInitialized;
+ if (impl_ == nullptr) return kStatusNotInitialized;
return impl_->DequeueFrame(out_ptr);
}
-int Decoder::GetMaxAllowedFrames() const {
- return settings_.frame_parallel ? settings_.threads : 1;
+StatusCode Decoder::SignalEOS() {
+ if (impl_ == nullptr) return kStatusNotInitialized;
+ // In non-frame-parallel mode, we have to release all the references. This
+ // simply means replacing the |impl_| with a new instance so that all the
+ // existing references are released and the state is cleared.
+ impl_ = nullptr;
+ return DecoderImpl::Create(&settings_, &impl_);
}
// static.
diff --git a/libgav1/src/decoder.h b/libgav1/src/decoder.h
deleted file mode 100644
index 1e3ac1a..0000000
--- a/libgav1/src/decoder.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright 2019 The libgav1 Authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBGAV1_SRC_DECODER_H_
-#define LIBGAV1_SRC_DECODER_H_
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-
-#include "src/decoder_buffer.h"
-#include "src/decoder_settings.h"
-#include "src/status_code.h"
-#include "src/symbol_visibility.h"
-
-namespace libgav1 {
-
-// Forward declaration.
-class DecoderImpl;
-
-class LIBGAV1_PUBLIC Decoder {
- public:
- Decoder();
- ~Decoder();
-
- // Init must be called exactly once per instance. Subsequent calls will do
- // nothing. If |settings| is nullptr, the decoder will be initialized with
- // default settings. Returns kLibgav1StatusOk on success, an error status
- // otherwise.
- StatusCode Init(const DecoderSettings* settings);
-
- // Enqueues a compressed frame to be decoded. Applications can continue
- // enqueue'ing up to |GetMaxAllowedFrames()|. The decoder can be thought of as
- // a queue of size |GetMaxAllowedFrames()|. Returns kLibgav1StatusOk on
- // success and an error status otherwise. Returning an error status here isn't
- // a fatal error and the decoder can continue decoding further frames. To
- // signal EOF, call this function with |data| as nullptr and |size| as 0. That
- // will release all the frames held by the decoder.
- //
- // |user_private_data| may be used to asssociate application specific private
- // data with the compressed frame. It will be copied to the user_private_data
- // field of the DecoderBuffer returned by the corresponding |DequeueFrame()|
- // call.
- //
- // NOTE: |EnqueueFrame()| does not copy the data. Therefore, after a
- // successful |EnqueueFrame()| call, the caller must keep the |data| buffer
- // alive until the corresponding |DequeueFrame()| call returns.
- StatusCode EnqueueFrame(const uint8_t* data, size_t size,
- int64_t user_private_data);
-
- // Dequeues a decompressed frame. If there are enqueued compressed frames,
- // decodes one and sets |*out_ptr| to the last displayable frame in the
- // compressed frame. If there are no displayable frames available, sets
- // |*out_ptr| to nullptr. Returns an error status if there is an error.
- StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
-
- // Returns the maximum number of frames allowed to be enqueued at a time. The
- // decoder will reject frames beyond this count. If |settings_.frame_parallel|
- // is false, then this function will always return 1.
- int GetMaxAllowedFrames() const;
-
- // Returns the maximum bitdepth that is supported by this decoder.
- static int GetMaxBitdepth();
-
- private:
- bool initialized_ = false;
- DecoderSettings settings_;
- std::unique_ptr<DecoderImpl> impl_;
-};
-
-} // namespace libgav1
-
-#endif // LIBGAV1_SRC_DECODER_H_
diff --git a/libgav1/src/decoder_buffer.h b/libgav1/src/decoder_buffer.h
deleted file mode 100644
index ecd133d..0000000
--- a/libgav1/src/decoder_buffer.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright 2019 The libgav1 Authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBGAV1_SRC_DECODER_BUFFER_H_
-#define LIBGAV1_SRC_DECODER_BUFFER_H_
-
-#include <cstdint>
-
-#include "src/frame_buffer.h"
-#include "src/symbol_visibility.h"
-
-// All the declarations in this file are part of the public ABI.
-
-namespace libgav1 {
-
-enum ChromaSamplePosition : uint8_t {
- kChromaSamplePositionUnknown,
- kChromaSamplePositionVertical,
- kChromaSamplePositionColocated,
- kChromaSamplePositionReserved
-};
-
-enum ImageFormat : uint8_t {
- kImageFormatYuv420,
- kImageFormatYuv422,
- kImageFormatYuv444,
- kImageFormatMonochrome400
-};
-
-struct LIBGAV1_PUBLIC DecoderBuffer {
- int NumPlanes() const {
- return (image_format == kImageFormatMonochrome400) ? 1 : 3;
- }
-
- ChromaSamplePosition chroma_sample_position;
- ImageFormat image_format;
-
- // TODO(wtc): Add the following members:
- // - color range
- // * studio range: Y [16..235], UV [16..240]
- // * full range: (YUV/RGB [0..255]
- // - CICP Color Primaries (cp)
- // - CICP Transfer Characteristics (tc)
- // - CICP Matrix Coefficients (mc)
-
- // Image storage dimensions.
- // NOTE: These fields are named w and h in vpx_image_t and aom_image_t.
- // uint32_t width; // Stored image width.
- // uint32_t height; // Stored image height.
- int bitdepth; // Stored image bitdepth.
-
- // Image display dimensions.
- // NOTES:
- // 1. These fields are named d_w and d_h in vpx_image_t and aom_image_t.
- // 2. libvpx and libaom clients use d_w and d_h much more often than w and h.
- // 3. These fields can just be stored for the Y plane and the clients can
- // calculate the values for the U and V planes if the image format or
- // subsampling is exposed.
- int displayed_width[3]; // Displayed image width.
- int displayed_height[3]; // Displayed image height.
-
- int stride[3];
- uint8_t* plane[3];
-
- // The |user_private_data| argument passed to Decoder::EnqueueFrame().
- int64_t user_private_data;
- // The |private_data| field of FrameBuffer. Set by the get frame buffer
- // callback when it allocates a frame buffer.
- void* buffer_private_data;
-};
-
-} // namespace libgav1
-
-#endif // LIBGAV1_SRC_DECODER_BUFFER_H_
diff --git a/libgav1/src/decoder_impl.cc b/libgav1/src/decoder_impl.cc
index 5c61993..e40c692 100644
--- a/libgav1/src/decoder_impl.cc
+++ b/libgav1/src/decoder_impl.cc
@@ -24,13 +24,18 @@
#include "src/dsp/common.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
-#include "src/loop_filter_mask.h"
+#include "src/film_grain.h"
+#include "src/frame_buffer_utils.h"
+#include "src/frame_scratch_buffer.h"
#include "src/loop_restoration_info.h"
+#include "src/obu_parser.h"
#include "src/post_filter.h"
#include "src/prediction_mask.h"
#include "src/quantizer.h"
+#include "src/threading_strategy.h"
#include "src/utils/blocking_counter.h"
#include "src/utils/common.h"
+#include "src/utils/constants.h"
#include "src/utils/logging.h"
#include "src/utils/parameter_tree.h"
#include "src/utils/raw_bit_reader.h"
@@ -44,275 +49,1066 @@
constexpr int kMaxBlockWidth4x4 = 32;
constexpr int kMaxBlockHeight4x4 = 32;
-// A cleanup helper class that releases the frame buffer reference held in
-// |frame| in the destructor.
-class RefCountedBufferPtrCleanup {
+// Computes the bottom border size in pixels. If CDEF, loop restoration or
+// SuperRes is enabled, adds extra border pixels to facilitate those steps to
+// happen nearly in-place (a few extra rows instead of an entire frame buffer).
+// The logic in this function should match the corresponding logic for
+// |vertical_shift| in the PostFilter constructor.
+int GetBottomBorderPixels(const bool do_cdef, const bool do_restoration,
+ const bool do_superres, const int subsampling_y) {
+ int extra_border = 0;
+ if (do_cdef) {
+ extra_border += kCdefBorder;
+ } else if (do_restoration) {
+ // If CDEF is enabled, loop restoration is safe without extra border.
+ extra_border += kRestorationVerticalBorder;
+ }
+ if (do_superres) extra_border += kSuperResVerticalBorder;
+ // Double the number of extra bottom border pixels if the bottom border will
+ // be subsampled.
+ extra_border <<= subsampling_y;
+ return Align(kBorderPixels + extra_border, 2); // Must be a multiple of 2.
+}
+
+// Sets |frame_scratch_buffer->tile_decoding_failed| to true (while holding on
+// to |frame_scratch_buffer->superblock_row_mutex|) and notifies the first
+// |count| condition variables in
+// |frame_scratch_buffer->superblock_row_progress_condvar|.
+void SetFailureAndNotifyAll(FrameScratchBuffer* const frame_scratch_buffer,
+ int count) {
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ frame_scratch_buffer->tile_decoding_failed = true;
+ }
+ std::condition_variable* const condvars =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ for (int i = 0; i < count; ++i) {
+ condvars[i].notify_one();
+ }
+}
+
+// Helper class that releases the frame scratch buffer in the destructor.
+class FrameScratchBufferReleaser {
public:
- explicit RefCountedBufferPtrCleanup(RefCountedBufferPtr* frame)
- : frame_(*frame) {}
-
- // Not copyable or movable.
- RefCountedBufferPtrCleanup(const RefCountedBufferPtrCleanup&) = delete;
- RefCountedBufferPtrCleanup& operator=(const RefCountedBufferPtrCleanup&) =
- delete;
-
- ~RefCountedBufferPtrCleanup() { frame_ = nullptr; }
+ FrameScratchBufferReleaser(
+ FrameScratchBufferPool* frame_scratch_buffer_pool,
+ std::unique_ptr<FrameScratchBuffer>* frame_scratch_buffer)
+ : frame_scratch_buffer_pool_(frame_scratch_buffer_pool),
+ frame_scratch_buffer_(frame_scratch_buffer) {}
+ ~FrameScratchBufferReleaser() {
+ frame_scratch_buffer_pool_->Release(std::move(*frame_scratch_buffer_));
+ }
private:
- RefCountedBufferPtr& frame_;
+ FrameScratchBufferPool* const frame_scratch_buffer_pool_;
+ std::unique_ptr<FrameScratchBuffer>* const frame_scratch_buffer_;
};
-} // namespace
-
-void DecoderState::UpdateReferenceFrames(int refresh_frame_flags) {
- for (int ref_index = 0, mask = refresh_frame_flags; mask != 0;
- ++ref_index, mask >>= 1) {
- if ((mask & 1) != 0) {
- reference_valid[ref_index] = true;
- reference_frame_id[ref_index] = current_frame_id;
- reference_frame[ref_index] = current_frame;
- reference_order_hint[ref_index] = order_hint;
+// Sets the |frame|'s segmentation map for two cases. The third case is handled
+// in Tile::DecodeBlock().
+void SetSegmentationMap(const ObuFrameHeader& frame_header,
+ const SegmentationMap* prev_segment_ids,
+ RefCountedBuffer* const frame) {
+ if (!frame_header.segmentation.enabled) {
+ // All segment_id's are 0.
+ frame->segmentation_map()->Clear();
+ } else if (!frame_header.segmentation.update_map) {
+ // Copy from prev_segment_ids.
+ if (prev_segment_ids == nullptr) {
+ // Treat a null prev_segment_ids pointer as if it pointed to a
+ // segmentation map containing all 0s.
+ frame->segmentation_map()->Clear();
+ } else {
+ frame->segmentation_map()->CopyFrom(*prev_segment_ids);
}
}
}
-void DecoderState::ClearReferenceFrames() {
- reference_valid = {};
- reference_frame_id = {};
- reference_order_hint = {};
- for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) {
- reference_frame[ref_index] = nullptr;
+StatusCode DecodeTilesNonFrameParallel(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter) {
+ // Decode in superblock row order.
+ const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+ std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
+ frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+ if (tile_scratch_buffer == nullptr) return kLibgav1StatusOutOfMemory;
+ for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
+ row4x4 += block_width4x4) {
+ for (const auto& tile_ptr : tiles) {
+ if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
+ row4x4, tile_scratch_buffer.get())) {
+ return kLibgav1StatusUnknownError;
+ }
+ }
+ post_filter->ApplyFilteringForOneSuperBlockRow(
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/true);
+ }
+ frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+ std::move(tile_scratch_buffer));
+ return kStatusOk;
+}
+
+StatusCode DecodeTilesThreadedNonFrameParallel(
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter,
+ BlockingCounterWithStatus* const pending_tiles) {
+ ThreadingStrategy& threading_strategy =
+ frame_scratch_buffer->threading_strategy;
+ const int num_workers = threading_strategy.tile_thread_count();
+ BlockingCounterWithStatus pending_workers(num_workers);
+ std::atomic<int> tile_counter(0);
+ const int tile_count = static_cast<int>(tiles.size());
+ bool tile_decoding_failed = false;
+ // Submit tile decoding jobs to the thread pool.
+ for (int i = 0; i < num_workers; ++i) {
+ threading_strategy.tile_thread_pool()->Schedule([&tiles, tile_count,
+ &tile_counter,
+ &pending_workers,
+ &pending_tiles]() {
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->ParseAndDecode()) {
+ LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+ failed = true;
+ }
+ } else {
+ pending_tiles->Decrement(false);
+ }
+ }
+ pending_workers.Decrement(!failed);
+ });
+ }
+ // Have the current thread partake in tile decoding.
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!tile_decoding_failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->ParseAndDecode()) {
+ LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+ tile_decoding_failed = true;
+ }
+ } else {
+ pending_tiles->Decrement(false);
+ }
+ }
+ // Wait until all the workers are done. This ensures that all the tiles have
+ // been parsed.
+ tile_decoding_failed |= !pending_workers.Wait();
+ // Wait until all the tiles have been decoded.
+ tile_decoding_failed |= !pending_tiles->Wait();
+ if (tile_decoding_failed) return kStatusUnknownError;
+ assert(threading_strategy.post_filter_thread_pool() != nullptr);
+ post_filter->ApplyFilteringThreaded();
+ return kStatusOk;
+}
+
+StatusCode DecodeTilesFrameParallel(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ const SymbolDecoderContext& saved_symbol_decoder_context,
+ const SegmentationMap* const prev_segment_ids,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+ // Parse the frame.
+ for (const auto& tile : tiles) {
+ if (!tile->Parse()) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse tile number: %d\n", tile->number());
+ return kStatusUnknownError;
+ }
+ }
+ if (frame_header.enable_frame_end_update_cdf) {
+ frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+ }
+ current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+ SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+ // Mark frame as parsed.
+ current_frame->SetFrameState(kFrameStateParsed);
+ std::unique_ptr<TileScratchBuffer> tile_scratch_buffer =
+ frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+ if (tile_scratch_buffer == nullptr) {
+ return kStatusOutOfMemory;
+ }
+ const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+ // Decode in superblock row order (inter prediction in the Tile class will
+ // block until the required superblocks in the reference frame are decoded).
+ for (int row4x4 = 0; row4x4 < frame_header.rows4x4;
+ row4x4 += block_width4x4) {
+ for (const auto& tile_ptr : tiles) {
+ if (!tile_ptr->ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ row4x4, tile_scratch_buffer.get())) {
+ LIBGAV1_DLOG(ERROR, "Failed to decode tile number: %d\n",
+ tile_ptr->number());
+ return kStatusUnknownError;
+ }
+ }
+ const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/true);
+ if (progress_row >= 0) {
+ current_frame->SetProgress(progress_row);
+ }
+ }
+ // Mark frame as decoded (we no longer care about row-level progress since the
+ // entire frame has been decoded).
+ current_frame->SetFrameState(kFrameStateDecoded);
+ frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+ std::move(tile_scratch_buffer));
+ return kStatusOk;
+}
+
+// Helper function used by DecodeTilesThreadedFrameParallel. Applies the
+// deblocking filter for tile boundaries for the superblock row at |row4x4|.
+void ApplyDeblockingFilterForTileBoundaries(
+ PostFilter* const post_filter, const std::unique_ptr<Tile>* tile_row_base,
+ const ObuFrameHeader& frame_header, int row4x4, int block_width4x4,
+ int tile_columns, bool decode_entire_tiles_in_worker_threads) {
+ // Apply vertical deblock filtering for the first 64 columns of each tile.
+ for (int tile_column = 0; tile_column < tile_columns; ++tile_column) {
+ const Tile& tile = *tile_row_base[tile_column];
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4, tile.column4x4_start(),
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ if (decode_entire_tiles_in_worker_threads &&
+ row4x4 == tile_row_base[0]->row4x4_start()) {
+ // This is the first superblock row of a tile row. In this case, apply
+ // horizontal deblock filtering for the entire superblock row.
+ post_filter->ApplyDeblockFilter(kLoopFilterTypeHorizontal, row4x4, 0,
+ frame_header.columns4x4, block_width4x4);
+ } else {
+ // Apply horizontal deblock filtering for the first 64 columns of the
+ // first tile.
+ const Tile& first_tile = *tile_row_base[0];
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, first_tile.column4x4_start(),
+ first_tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ // Apply horizontal deblock filtering for the last 64 columns of the
+ // previous tile and the first 64 columns of the current tile.
+ for (int tile_column = 1; tile_column < tile_columns; ++tile_column) {
+ const Tile& tile = *tile_row_base[tile_column];
+ // If the previous tile has more than 64 columns, then include those
+ // for the horizontal deblock.
+ const Tile& previous_tile = *tile_row_base[tile_column - 1];
+ const int column4x4_start =
+ tile.column4x4_start() -
+ ((tile.column4x4_start() - kNum4x4InLoopFilterUnit !=
+ previous_tile.column4x4_start())
+ ? kNum4x4InLoopFilterUnit
+ : 0);
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, block_width4x4);
+ }
+ // Apply horizontal deblock filtering for the last 64 columns of the
+ // last tile.
+ const Tile& last_tile = *tile_row_base[tile_columns - 1];
+ // Identify the last column4x4 value and do horizontal filtering for
+ // that column4x4. The value of last column4x4 is the nearest multiple
+ // of 16 that is before tile.column4x4_end().
+ const int column4x4_start = (last_tile.column4x4_end() - 1) & ~15;
+ // If column4x4_start is the same as tile.column4x4_start() then it
+ // means that the last tile has <= 64 columns. So there is nothing left
+ // to deblock (since it was already deblocked in the loop above).
+ if (column4x4_start != last_tile.column4x4_start()) {
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4, column4x4_start,
+ last_tile.column4x4_end(), block_width4x4);
+ }
}
}
+// Helper function used by DecodeTilesThreadedFrameParallel. Decodes the
+// superblock row starting at |row4x4| for tile at index |tile_index| in the
+// list of tiles |tiles|. If the decoding is successful, then it does the
+// following:
+// * Schedule the next superblock row in the current tile column for decoding
+// (the next superblock row may be in a different tile than the current
+// one).
+// * If an entire superblock row of the frame has been decoded, it notifies
+// the waiters (if there are any).
+void DecodeSuperBlockRowInTile(
+ const Vector<std::unique_ptr<Tile>>& tiles, size_t tile_index, int row4x4,
+ const int superblock_size4x4, const int tile_columns,
+ const int superblock_rows, FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter, BlockingCounter* const pending_jobs) {
+ std::unique_ptr<TileScratchBuffer> scratch_buffer =
+ frame_scratch_buffer->tile_scratch_buffer_pool.Get();
+ if (scratch_buffer == nullptr) {
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ return;
+ }
+ Tile& tile = *tiles[tile_index];
+ const bool ok = tile.ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
+ row4x4, scratch_buffer.get());
+ frame_scratch_buffer->tile_scratch_buffer_pool.Release(
+ std::move(scratch_buffer));
+ if (!ok) {
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ return;
+ }
+ if (post_filter->DoDeblock()) {
+ // Apply vertical deblock filtering for all the columns in this tile except
+ // for the first 64 columns.
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeVertical, row4x4,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit, tile.column4x4_end(),
+ superblock_size4x4);
+ // Apply horizontal deblock filtering for all the columns in this tile
+ // except for the first and the last 64 columns.
+ // Note about the last tile of each row: For the last tile, column4x4_end
+ // may not be a multiple of 16. In that case it is still okay to simply
+ // subtract 16 since ApplyDeblockFilter() will only do the filters in
+ // increments of 64 columns (or 32 columns for chroma with subsampling).
+ post_filter->ApplyDeblockFilter(
+ kLoopFilterTypeHorizontal, row4x4,
+ tile.column4x4_start() + kNum4x4InLoopFilterUnit,
+ tile.column4x4_end() - kNum4x4InLoopFilterUnit, superblock_size4x4);
+ }
+ const int superblock_size4x4_log2 = FloorLog2(superblock_size4x4);
+ const int index = row4x4 >> superblock_size4x4_log2;
+ int* const superblock_row_progress =
+ frame_scratch_buffer->superblock_row_progress.get();
+ std::condition_variable* const superblock_row_progress_condvar =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ bool notify;
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ notify = ++superblock_row_progress[index] == tile_columns;
+ }
+ if (notify) {
+ // We are done decoding this superblock row. Notify the post filtering
+ // thread.
+ superblock_row_progress_condvar[index].notify_one();
+ }
+ // Schedule the next superblock row (if one exists).
+ ThreadPool& thread_pool =
+ *frame_scratch_buffer->threading_strategy.thread_pool();
+ const int next_row4x4 = row4x4 + superblock_size4x4;
+ if (!tile.IsRow4x4Inside(next_row4x4)) {
+ tile_index += tile_columns;
+ }
+ if (tile_index >= tiles.size()) return;
+ pending_jobs->IncrementBy(1);
+ thread_pool.Schedule([&tiles, tile_index, next_row4x4, superblock_size4x4,
+ tile_columns, superblock_rows, frame_scratch_buffer,
+ post_filter, pending_jobs]() {
+ DecodeSuperBlockRowInTile(tiles, tile_index, next_row4x4,
+ superblock_size4x4, tile_columns, superblock_rows,
+ frame_scratch_buffer, post_filter, pending_jobs);
+ pending_jobs->Decrement();
+ });
+}
+
+StatusCode DecodeTilesThreadedFrameParallel(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<std::unique_ptr<Tile>>& tiles,
+ const SymbolDecoderContext& saved_symbol_decoder_context,
+ const SegmentationMap* const prev_segment_ids,
+ FrameScratchBuffer* const frame_scratch_buffer,
+ PostFilter* const post_filter, RefCountedBuffer* const current_frame) {
+ // Parse the frame.
+ ThreadPool& thread_pool =
+ *frame_scratch_buffer->threading_strategy.thread_pool();
+ std::atomic<int> tile_counter(0);
+ const int tile_count = static_cast<int>(tiles.size());
+ const int num_workers = thread_pool.num_threads();
+ BlockingCounterWithStatus parse_workers(num_workers);
+ // Submit tile parsing jobs to the thread pool.
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool.Schedule([&tiles, tile_count, &tile_counter, &parse_workers]() {
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Parse()) {
+ LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+ failed = true;
+ }
+ }
+ }
+ parse_workers.Decrement(!failed);
+ });
+ }
+
+ // Have the current thread participate in parsing.
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (!failed) {
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Parse()) {
+ LIBGAV1_DLOG(ERROR, "Error parsing tile #%d", tile_ptr->number());
+ failed = true;
+ }
+ }
+ }
+
+ // Wait until all the parse workers are done. This ensures that all the tiles
+ // have been parsed.
+ if (!parse_workers.Wait() || failed) {
+ return kLibgav1StatusUnknownError;
+ }
+ if (frame_header.enable_frame_end_update_cdf) {
+ frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
+ }
+ current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+ SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+ current_frame->SetFrameState(kFrameStateParsed);
+
+ // Decode the frame.
+ const int block_width4x4 = sequence_header.use_128x128_superblock ? 32 : 16;
+ const int block_width4x4_log2 =
+ sequence_header.use_128x128_superblock ? 5 : 4;
+ const int superblock_rows =
+ (frame_header.rows4x4 + block_width4x4 - 1) >> block_width4x4_log2;
+ if (!frame_scratch_buffer->superblock_row_progress.Resize(superblock_rows) ||
+ !frame_scratch_buffer->superblock_row_progress_condvar.Resize(
+ superblock_rows)) {
+ return kLibgav1StatusOutOfMemory;
+ }
+ int* const superblock_row_progress =
+ frame_scratch_buffer->superblock_row_progress.get();
+ memset(superblock_row_progress, 0,
+ superblock_rows * sizeof(superblock_row_progress[0]));
+ frame_scratch_buffer->tile_decoding_failed = false;
+ const int tile_columns = frame_header.tile_info.tile_columns;
+ const bool decode_entire_tiles_in_worker_threads =
+ num_workers >= tile_columns;
+ BlockingCounter pending_jobs(
+ decode_entire_tiles_in_worker_threads ? num_workers : tile_columns);
+ if (decode_entire_tiles_in_worker_threads) {
+ // Submit tile decoding jobs to the thread pool.
+ tile_counter = 0;
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool.Schedule([&tiles, tile_count, &tile_counter, &pending_jobs,
+ frame_scratch_buffer, superblock_rows]() {
+ bool failed = false;
+ int index;
+ while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
+ tile_count) {
+ if (failed) continue;
+ const auto& tile_ptr = tiles[index];
+ if (!tile_ptr->Decode(
+ &frame_scratch_buffer->superblock_row_mutex,
+ frame_scratch_buffer->superblock_row_progress.get(),
+ frame_scratch_buffer->superblock_row_progress_condvar
+ .get())) {
+ LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
+ failed = true;
+ SetFailureAndNotifyAll(frame_scratch_buffer, superblock_rows);
+ }
+ }
+ pending_jobs.Decrement();
+ });
+ }
+ } else {
+ // Schedule the jobs for first tile row.
+ for (int tile_index = 0; tile_index < tile_columns; ++tile_index) {
+ thread_pool.Schedule([&tiles, tile_index, block_width4x4, tile_columns,
+ superblock_rows, frame_scratch_buffer, post_filter,
+ &pending_jobs]() {
+ DecodeSuperBlockRowInTile(
+ tiles, tile_index, 0, block_width4x4, tile_columns, superblock_rows,
+ frame_scratch_buffer, post_filter, &pending_jobs);
+ pending_jobs.Decrement();
+ });
+ }
+ }
+
+ // Current thread will do the post filters.
+ std::condition_variable* const superblock_row_progress_condvar =
+ frame_scratch_buffer->superblock_row_progress_condvar.get();
+ const std::unique_ptr<Tile>* tile_row_base = &tiles[0];
+ for (int row4x4 = 0, index = 0; row4x4 < frame_header.rows4x4;
+ row4x4 += block_width4x4, ++index) {
+ if (!tile_row_base[0]->IsRow4x4Inside(row4x4)) {
+ tile_row_base += tile_columns;
+ }
+ {
+ std::unique_lock<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ while (superblock_row_progress[index] != tile_columns &&
+ !frame_scratch_buffer->tile_decoding_failed) {
+ superblock_row_progress_condvar[index].wait(lock);
+ }
+ if (frame_scratch_buffer->tile_decoding_failed) break;
+ }
+ if (post_filter->DoDeblock()) {
+ // Apply deblocking filter for the tile boundaries of this superblock row.
+ // The deblocking filter for the internal blocks will be applied in the
+ // tile worker threads. In this thread, we will only have to apply
+ // deblocking filter for the tile boundaries.
+ ApplyDeblockingFilterForTileBoundaries(
+ post_filter, tile_row_base, frame_header, row4x4, block_width4x4,
+ tile_columns, decode_entire_tiles_in_worker_threads);
+ }
+ // Apply all the post filters other than deblocking.
+ const int progress_row = post_filter->ApplyFilteringForOneSuperBlockRow(
+ row4x4, block_width4x4, row4x4 + block_width4x4 >= frame_header.rows4x4,
+ /*do_deblock=*/false);
+ if (progress_row >= 0) {
+ current_frame->SetProgress(progress_row);
+ }
+ }
+ // Wait until all the pending jobs are done. This ensures that all the tiles
+ // have been decoded and wrapped up.
+ pending_jobs.Wait();
+ {
+ std::lock_guard<std::mutex> lock(
+ frame_scratch_buffer->superblock_row_mutex);
+ if (frame_scratch_buffer->tile_decoding_failed) {
+ return kLibgav1StatusUnknownError;
+ }
+ }
+
+ current_frame->SetFrameState(kFrameStateDecoded);
+ return kStatusOk;
+}
+
+} // namespace
+
// static
StatusCode DecoderImpl::Create(const DecoderSettings* settings,
std::unique_ptr<DecoderImpl>* output) {
if (settings->threads <= 0) {
LIBGAV1_DLOG(ERROR, "Invalid settings->threads: %d.", settings->threads);
- return kLibgav1StatusInvalidArgument;
+ return kStatusInvalidArgument;
+ }
+ if (settings->frame_parallel) {
+ if (settings->release_input_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR,
+ "release_input_buffer callback must not be null when "
+ "frame_parallel is true.");
+ return kStatusInvalidArgument;
+ }
}
std::unique_ptr<DecoderImpl> impl(new (std::nothrow) DecoderImpl(settings));
if (impl == nullptr) {
LIBGAV1_DLOG(ERROR, "Failed to allocate DecoderImpl.");
- return kLibgav1StatusOutOfMemory;
+ return kStatusOutOfMemory;
}
const StatusCode status = impl->Init();
- if (status != kLibgav1StatusOk) return status;
+ if (status != kStatusOk) return status;
*output = std::move(impl);
- return kLibgav1StatusOk;
+ return kStatusOk;
}
DecoderImpl::DecoderImpl(const DecoderSettings* settings)
- : buffer_pool_(*settings), settings_(*settings) {
+ : buffer_pool_(settings->on_frame_buffer_size_changed,
+ settings->get_frame_buffer, settings->release_frame_buffer,
+ settings->callback_private_data),
+ settings_(*settings) {
dsp::DspInit();
- GenerateWedgeMask(state_.wedge_master_mask.data(), state_.wedge_masks.data());
}
DecoderImpl::~DecoderImpl() {
- // The frame buffer references need to be released before |buffer_pool_| is
- // destroyed.
+ // Clean up and wait until all the threads have stopped. We just have to pass
+ // in a dummy status that is not kStatusOk or kStatusTryAgain to trigger the
+ // path that clears all the threads and structs.
+ SignalFailure(kStatusUnknownError);
+ // Release any other frame buffer references that we may be holding on to.
ReleaseOutputFrame();
- assert(state_.current_frame == nullptr);
+ output_frame_queue_.Clear();
for (auto& reference_frame : state_.reference_frame) {
reference_frame = nullptr;
}
}
StatusCode DecoderImpl::Init() {
- const int max_allowed_frames =
- settings_.frame_parallel ? settings_.threads : 1;
- assert(max_allowed_frames > 0);
- if (!encoded_frames_.Init(max_allowed_frames)) {
- LIBGAV1_DLOG(ERROR, "encoded_frames_.Init() failed.");
- return kLibgav1StatusOutOfMemory;
+ if (!GenerateWedgeMask(&wedge_masks_)) {
+ LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed.");
+ return kStatusOutOfMemory;
}
- return kLibgav1StatusOk;
+ if (!output_frame_queue_.Init(kMaxLayers)) {
+ LIBGAV1_DLOG(ERROR, "output_frame_queue_.Init() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::InitializeFrameThreadPoolAndTemporalUnitQueue(
+ const uint8_t* data, size_t size) {
+ is_frame_parallel_ = false;
+ if (settings_.frame_parallel) {
+ DecoderState state;
+ std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+ data, size, settings_.operating_point, &buffer_pool_, &state));
+ if (obu == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+ return kStatusOutOfMemory;
+ }
+ RefCountedBufferPtr current_frame;
+ const StatusCode status = obu->ParseOneFrame(&current_frame);
+ if (status != kStatusOk) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+ return status;
+ }
+ current_frame = nullptr;
+ // We assume that the first frame that was parsed will contain the frame
+ // header. This assumption is usually true in practice. So we will simply
+ // not use frame parallel mode if this is not the case.
+ if (settings_.threads > 1 &&
+ !InitializeThreadPoolsForFrameParallel(
+ settings_.threads, obu->frame_header().tile_info.tile_count,
+ obu->frame_header().tile_info.tile_columns, &frame_thread_pool_,
+ &frame_scratch_buffer_pool_)) {
+ return kStatusOutOfMemory;
+ }
+ }
+ const int max_allowed_frames =
+ (frame_thread_pool_ != nullptr) ? frame_thread_pool_->num_threads() : 1;
+ assert(max_allowed_frames > 0);
+ if (!temporal_units_.Init(max_allowed_frames)) {
+ LIBGAV1_DLOG(ERROR, "temporal_units_.Init() failed.");
+ return kStatusOutOfMemory;
+ }
+ is_frame_parallel_ = frame_thread_pool_ != nullptr;
+ return kStatusOk;
}
StatusCode DecoderImpl::EnqueueFrame(const uint8_t* data, size_t size,
- int64_t user_private_data) {
- if (data == nullptr) {
- // This has to actually flush the decoder.
- return kLibgav1StatusOk;
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ if (data == nullptr || size == 0) return kStatusInvalidArgument;
+ if (HasFailure()) return kStatusUnknownError;
+ if (!seen_first_frame_) {
+ seen_first_frame_ = true;
+ const StatusCode status =
+ InitializeFrameThreadPoolAndTemporalUnitQueue(data, size);
+ if (status != kStatusOk) {
+ return SignalFailure(status);
+ }
}
- if (encoded_frames_.Full()) {
- return kLibgav1StatusResourceExhausted;
+ if (temporal_units_.Full()) {
+ return kStatusTryAgain;
}
- encoded_frames_.Push(EncodedFrame(data, size, user_private_data));
- return kLibgav1StatusOk;
+ if (is_frame_parallel_) {
+ return ParseAndSchedule(data, size, user_private_data, buffer_private_data);
+ }
+ TemporalUnit temporal_unit(data, size, user_private_data,
+ buffer_private_data);
+ temporal_units_.Push(std::move(temporal_unit));
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::SignalFailure(StatusCode status) {
+ if (status == kStatusOk || status == kStatusTryAgain) return status;
+ // Set the |failure_status_| first so that any pending jobs in
+ // |frame_thread_pool_| will exit right away when the thread pool is being
+ // released below.
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ failure_status_ = status;
+ }
+ // Make sure all waiting threads exit.
+ buffer_pool_.Abort();
+ frame_thread_pool_ = nullptr;
+ while (!temporal_units_.Empty()) {
+ if (settings_.release_input_buffer != nullptr) {
+ settings_.release_input_buffer(
+ settings_.callback_private_data,
+ temporal_units_.Front().buffer_private_data);
+ }
+ temporal_units_.Pop();
+ }
+ return status;
}
// DequeueFrame() follows the following policy to avoid holding unnecessary
-// frame buffer references in state_.current_frame and output_frame_.
-//
-// 1. state_.current_frame must be null when DequeueFrame() returns (success
-// or failure).
-//
-// 2. output_frame_ must be null when DequeueFrame() returns false.
+// frame buffer references in output_frame_: output_frame_ must be null when
+// DequeueFrame() returns false.
StatusCode DecoderImpl::DequeueFrame(const DecoderBuffer** out_ptr) {
if (out_ptr == nullptr) {
LIBGAV1_DLOG(ERROR, "Invalid argument: out_ptr == nullptr.");
- return kLibgav1StatusInvalidArgument;
+ return kStatusInvalidArgument;
}
- assert(state_.current_frame == nullptr);
// We assume a call to DequeueFrame() indicates that the caller is no longer
// using the previous output frame, so we can release it.
ReleaseOutputFrame();
- if (encoded_frames_.Empty()) {
- // No encoded frame to decode. Not an error.
+ if (temporal_units_.Empty()) {
+ // No input frames to decode.
*out_ptr = nullptr;
- return kLibgav1StatusOk;
+ return kStatusNothingToDequeue;
}
- const EncodedFrame encoded_frame = encoded_frames_.Pop();
- std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
- encoded_frame.data, encoded_frame.size, &state_));
- if (obu == nullptr) {
- LIBGAV1_DLOG(ERROR, "Failed to initialize OBU parser.");
- return kLibgav1StatusOutOfMemory;
- }
- if (state_.has_sequence_header) {
- obu->set_sequence_header(state_.sequence_header);
- }
- RefCountedBufferPtrCleanup current_frame_cleanup(&state_.current_frame);
- RefCountedBufferPtr displayable_frame;
- StatusCode status;
- while (obu->HasData()) {
- state_.current_frame = buffer_pool_.GetFreeBuffer();
- if (state_.current_frame == nullptr) {
- LIBGAV1_DLOG(ERROR, "Could not get current_frame from the buffer pool.");
- return kLibgav1StatusResourceExhausted;
+ TemporalUnit& temporal_unit = temporal_units_.Front();
+ if (!is_frame_parallel_) {
+ // If |output_frame_queue_| is not empty, then return the first frame from
+ // that queue.
+ if (!output_frame_queue_.Empty()) {
+ RefCountedBufferPtr frame = std::move(output_frame_queue_.Front());
+ output_frame_queue_.Pop();
+ buffer_.user_private_data = temporal_unit.user_private_data;
+ if (output_frame_queue_.Empty()) {
+ temporal_units_.Pop();
+ }
+ const StatusCode status = CopyFrameToOutputBuffer(frame);
+ if (status != kStatusOk) {
+ return status;
+ }
+ *out_ptr = &buffer_;
+ return kStatusOk;
}
+ // Decode the next available temporal unit and return.
+ const StatusCode status = DecodeTemporalUnit(temporal_unit, out_ptr);
+ if (status != kStatusOk) {
+ // In case of failure, discard all the output frames that we may be
+ // holding on references to.
+ output_frame_queue_.Clear();
+ }
+ if (settings_.release_input_buffer != nullptr) {
+ settings_.release_input_buffer(settings_.callback_private_data,
+ temporal_unit.buffer_private_data);
+ }
+ if (output_frame_queue_.Empty()) {
+ temporal_units_.Pop();
+ }
+ return status;
+ }
+ {
+ std::unique_lock<std::mutex> lock(mutex_);
+ if (settings_.blocking_dequeue) {
+ while (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+ decoded_condvar_.wait(lock);
+ }
+ } else {
+ if (!temporal_unit.decoded && failure_status_ == kStatusOk) {
+ return kStatusTryAgain;
+ }
+ }
+ if (failure_status_ != kStatusOk) {
+ const StatusCode failure_status = failure_status_;
+ lock.unlock();
+ return SignalFailure(failure_status);
+ }
+ }
+ if (settings_.release_input_buffer != nullptr &&
+ !temporal_unit.released_input_buffer) {
+ temporal_unit.released_input_buffer = true;
+ settings_.release_input_buffer(settings_.callback_private_data,
+ temporal_unit.buffer_private_data);
+ }
+ if (temporal_unit.status != kStatusOk) {
+ temporal_units_.Pop();
+ return SignalFailure(temporal_unit.status);
+ }
+ if (!temporal_unit.has_displayable_frame) {
+ *out_ptr = nullptr;
+ temporal_units_.Pop();
+ return kStatusOk;
+ }
+ assert(temporal_unit.output_layer_count > 0);
+ StatusCode status = CopyFrameToOutputBuffer(
+ temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame);
+ temporal_unit.output_layers[temporal_unit.output_layer_count - 1].frame =
+ nullptr;
+ if (status != kStatusOk) {
+ temporal_units_.Pop();
+ return SignalFailure(status);
+ }
+ buffer_.user_private_data = temporal_unit.user_private_data;
+ *out_ptr = &buffer_;
+ if (--temporal_unit.output_layer_count == 0) {
+ temporal_units_.Pop();
+ }
+ return kStatusOk;
+}
- if (!obu->ParseOneFrame()) {
+StatusCode DecoderImpl::ParseAndSchedule(const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data) {
+ TemporalUnit temporal_unit(data, size, user_private_data,
+ buffer_private_data);
+ std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+ temporal_unit.data, temporal_unit.size, settings_.operating_point,
+ &buffer_pool_, &state_));
+ if (obu == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+ return kStatusOutOfMemory;
+ }
+ if (has_sequence_header_) {
+ obu->set_sequence_header(sequence_header_);
+ }
+ StatusCode status;
+ int position_in_temporal_unit = 0;
+ while (obu->HasData()) {
+ RefCountedBufferPtr current_frame;
+ status = obu->ParseOneFrame(&current_frame);
+ if (status != kStatusOk) {
LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
- return kLibgav1StatusUnknownError;
+ return status;
}
- if (std::find_if(obu->obu_headers().begin(), obu->obu_headers().end(),
- [](const ObuHeader& obu_header) {
- return obu_header.type == kObuSequenceHeader;
- }) != obu->obu_headers().end()) {
- state_.sequence_header = obu->sequence_header();
- state_.has_sequence_header = true;
+ if (IsNewSequenceHeader(*obu)) {
+ const ObuSequenceHeader& sequence_header = obu->sequence_header();
+ const Libgav1ImageFormat image_format =
+ ComposeImageFormat(sequence_header.color_config.is_monochrome,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y);
+ const int max_bottom_border = GetBottomBorderPixels(
+ /*do_cdef=*/true, /*do_restoration=*/true,
+ /*do_superres=*/true, sequence_header.color_config.subsampling_y);
+ // TODO(vigneshv): This may not be the right place to call this callback
+ // for the frame parallel case. Investigate and fix it.
+ if (!buffer_pool_.OnFrameBufferSizeChanged(
+ sequence_header.color_config.bitdepth, image_format,
+ sequence_header.max_frame_width, sequence_header.max_frame_height,
+ kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
+ LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
+ return kStatusUnknownError;
+ }
+ }
+ // This can happen when there are multiple spatial/temporal layers and if
+ // all the layers are outside the current operating point.
+ if (current_frame == nullptr) {
+ continue;
+ }
+ // Note that we cannot set EncodedFrame.temporal_unit here. It will be set
+ // in the code below after |temporal_unit| is std::move'd into the
+ // |temporal_units_| queue.
+ if (!temporal_unit.frames.emplace_back(obu.get(), state_, current_frame,
+ position_in_temporal_unit++)) {
+ LIBGAV1_DLOG(ERROR, "temporal_unit.frames.emplace_back failed.");
+ return kStatusOutOfMemory;
+ }
+ state_.UpdateReferenceFrames(current_frame,
+ obu->frame_header().refresh_frame_flags);
+ }
+ // This function cannot fail after this point. So it is okay to move the
+ // |temporal_unit| into |temporal_units_| queue.
+ temporal_units_.Push(std::move(temporal_unit));
+ if (temporal_units_.Back().frames.empty()) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ temporal_units_.Back().has_displayable_frame = false;
+ temporal_units_.Back().decoded = true;
+ return kStatusOk;
+ }
+ for (auto& frame : temporal_units_.Back().frames) {
+ EncodedFrame* const encoded_frame = &frame;
+ encoded_frame->temporal_unit = &temporal_units_.Back();
+ frame_thread_pool_->Schedule([this, encoded_frame]() {
+ if (HasFailure()) return;
+ const StatusCode status = DecodeFrame(encoded_frame);
+ encoded_frame->state = {};
+ encoded_frame->frame = nullptr;
+ TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (failure_status_ != kStatusOk) return;
+ // temporal_unit's status defaults to kStatusOk. So we need to set it only
+ // on error. If |failure_status_| is not kStatusOk at this point, it means
+ // that there has already been a failure. So we don't care about this
+ // subsequent failure. We will simply return the error code of the first
+ // failure.
+ if (status != kStatusOk) {
+ temporal_unit.status = status;
+ if (failure_status_ == kStatusOk) {
+ failure_status_ = status;
+ }
+ }
+ temporal_unit.decoded =
+ ++temporal_unit.decoded_count == temporal_unit.frames.size();
+ if (temporal_unit.decoded && settings_.output_all_layers &&
+ temporal_unit.output_layer_count > 1) {
+ std::sort(
+ temporal_unit.output_layers,
+ temporal_unit.output_layers + temporal_unit.output_layer_count);
+ }
+ if (temporal_unit.decoded || failure_status_ != kStatusOk) {
+ decoded_condvar_.notify_one();
+ }
+ });
+ }
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::DecodeFrame(EncodedFrame* const encoded_frame) {
+ const ObuSequenceHeader& sequence_header = encoded_frame->sequence_header;
+ const ObuFrameHeader& frame_header = encoded_frame->frame_header;
+ RefCountedBufferPtr current_frame = std::move(encoded_frame->frame);
+
+ std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+ frame_scratch_buffer_pool_.Get();
+ if (frame_scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+ return kStatusOutOfMemory;
+ }
+ // |frame_scratch_buffer| will be released when this local variable goes out
+ // of scope (i.e.) on any return path in this function.
+ FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+ &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
+ StatusCode status;
+ if (!frame_header.show_existing_frame) {
+ if (encoded_frame->tile_buffers.empty()) {
+ // This means that the last call to ParseOneFrame() did not actually
+ // have any tile groups. This could happen in rare cases (for example,
+ // if there is a Metadata OBU after the TileGroup OBU). We currently do
+ // not have a reason to handle those cases, so we simply continue.
+ return kStatusOk;
+ }
+ status = DecodeTiles(sequence_header, frame_header,
+ encoded_frame->tile_buffers, encoded_frame->state,
+ frame_scratch_buffer.get(), current_frame.get());
+ if (status != kStatusOk) {
+ return status;
+ }
+ } else {
+ if (!current_frame->WaitUntilDecoded()) {
+ return kStatusUnknownError;
+ }
+ }
+ if (!frame_header.show_frame && !frame_header.show_existing_frame) {
+ // This frame is not displayable. Not an error.
+ return kStatusOk;
+ }
+ RefCountedBufferPtr film_grain_frame;
+ status = ApplyFilmGrain(
+ sequence_header, frame_header, current_frame, &film_grain_frame,
+ frame_scratch_buffer->threading_strategy.thread_pool());
+ if (status != kStatusOk) {
+ return status;
+ }
+
+ TemporalUnit& temporal_unit = *encoded_frame->temporal_unit;
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (temporal_unit.has_displayable_frame && !settings_.output_all_layers) {
+ assert(temporal_unit.output_frame_position >= 0);
+ // A displayable frame was already found in this temporal unit. This can
+ // happen if there are multiple spatial/temporal layers. Since
+ // |settings_.output_all_layers| is false, we will output only the last
+ // displayable frame.
+ if (temporal_unit.output_frame_position >
+ encoded_frame->position_in_temporal_unit) {
+ return kStatusOk;
+ }
+ // Replace any output frame that we may have seen before with the current
+ // frame.
+ assert(temporal_unit.output_layer_count == 1);
+ --temporal_unit.output_layer_count;
+ }
+ temporal_unit.has_displayable_frame = true;
+ temporal_unit.output_layers[temporal_unit.output_layer_count].frame =
+ std::move(film_grain_frame);
+ temporal_unit.output_layers[temporal_unit.output_layer_count]
+ .position_in_temporal_unit = encoded_frame->position_in_temporal_unit;
+ ++temporal_unit.output_layer_count;
+ temporal_unit.output_frame_position =
+ encoded_frame->position_in_temporal_unit;
+ return kStatusOk;
+}
+
+StatusCode DecoderImpl::DecodeTemporalUnit(const TemporalUnit& temporal_unit,
+ const DecoderBuffer** out_ptr) {
+ std::unique_ptr<ObuParser> obu(new (std::nothrow) ObuParser(
+ temporal_unit.data, temporal_unit.size, settings_.operating_point,
+ &buffer_pool_, &state_));
+ if (obu == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to allocate OBU parser.");
+ return kStatusOutOfMemory;
+ }
+ if (has_sequence_header_) {
+ obu->set_sequence_header(sequence_header_);
+ }
+ StatusCode status;
+ std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
+ frame_scratch_buffer_pool_.Get();
+ if (frame_scratch_buffer == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Error when getting FrameScratchBuffer.");
+ return kStatusOutOfMemory;
+ }
+ // |frame_scratch_buffer| will be released when this local variable goes out
+ // of scope (i.e.) on any return path in this function.
+ FrameScratchBufferReleaser frame_scratch_buffer_releaser(
+ &frame_scratch_buffer_pool_, &frame_scratch_buffer);
+
+ while (obu->HasData()) {
+ RefCountedBufferPtr current_frame;
+ status = obu->ParseOneFrame(&current_frame);
+ if (status != kStatusOk) {
+ LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
+ return status;
+ }
+ if (IsNewSequenceHeader(*obu)) {
+ const ObuSequenceHeader& sequence_header = obu->sequence_header();
+ const Libgav1ImageFormat image_format =
+ ComposeImageFormat(sequence_header.color_config.is_monochrome,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y);
+ const int max_bottom_border = GetBottomBorderPixels(
+ /*do_cdef=*/true, /*do_restoration=*/true,
+ /*do_superres=*/true, sequence_header.color_config.subsampling_y);
+ if (!buffer_pool_.OnFrameBufferSizeChanged(
+ sequence_header.color_config.bitdepth, image_format,
+ sequence_header.max_frame_width, sequence_header.max_frame_height,
+ kBorderPixels, kBorderPixels, kBorderPixels, max_bottom_border)) {
+ LIBGAV1_DLOG(ERROR, "buffer_pool_.OnFrameBufferSizeChanged failed.");
+ return kStatusUnknownError;
+ }
}
if (!obu->frame_header().show_existing_frame) {
- if (obu->tile_groups().empty()) {
+ if (obu->tile_buffers().empty()) {
// This means that the last call to ParseOneFrame() did not actually
// have any tile groups. This could happen in rare cases (for example,
// if there is a Metadata OBU after the TileGroup OBU). We currently do
// not have a reason to handle those cases, so we simply continue.
continue;
}
- status = DecodeTiles(obu.get());
- if (status != kLibgav1StatusOk) {
+ status = DecodeTiles(obu->sequence_header(), obu->frame_header(),
+ obu->tile_buffers(), state_,
+ frame_scratch_buffer.get(), current_frame.get());
+ if (status != kStatusOk) {
return status;
}
}
- state_.UpdateReferenceFrames(obu->frame_header().refresh_frame_flags);
+ state_.UpdateReferenceFrames(current_frame,
+ obu->frame_header().refresh_frame_flags);
if (obu->frame_header().show_frame ||
obu->frame_header().show_existing_frame) {
- if (displayable_frame != nullptr) {
- // This can happen if there are multiple spatial/temporal layers. We
- // don't care about it for now, so simply return the last displayable
- // frame.
- // TODO(b/129153372): Add support for outputting multiple
- // spatial/temporal layers.
- LIBGAV1_DLOG(
- WARNING,
- "More than one displayable frame found. Using the last one.");
+ if (!output_frame_queue_.Empty() && !settings_.output_all_layers) {
+ // There is more than one displayable frame in the current operating
+ // point and |settings_.output_all_layers| is false. In this case, we
+ // simply return the last displayable frame as the output frame and
+ // ignore the rest.
+ assert(output_frame_queue_.Size() == 1);
+ output_frame_queue_.Pop();
}
- displayable_frame = std::move(state_.current_frame);
- if (obu->sequence_header().film_grain_params_present &&
- displayable_frame->film_grain_params().apply_grain &&
- (settings_.post_filter_mask & 0x10) != 0) {
- RefCountedBufferPtr film_grain_frame;
- if (!obu->frame_header().show_existing_frame &&
- obu->frame_header().refresh_frame_flags == 0) {
- // If show_existing_frame is true, then the current frame is a
- // previously saved reference frame. If refresh_frame_flags is
- // nonzero, then the state_.UpdateReferenceFrames() call above has
- // saved the current frame as a reference frame. Therefore, if both
- // of these conditions are false, then the current frame is not
- // saved as a reference frame. displayable_frame should hold the
- // only reference to the current frame.
- assert(displayable_frame.use_count() == 1);
- // Add film grain noise in place.
- film_grain_frame = displayable_frame;
- } else {
- film_grain_frame = buffer_pool_.GetFreeBuffer();
- if (film_grain_frame == nullptr) {
- LIBGAV1_DLOG(
- ERROR, "Could not get film_grain_frame from the buffer pool.");
- return kLibgav1StatusResourceExhausted;
- }
- if (!film_grain_frame->Realloc(
- displayable_frame->buffer()->bitdepth(),
- displayable_frame->buffer()->is_monochrome(),
- displayable_frame->upscaled_width(),
- displayable_frame->frame_height(),
- displayable_frame->buffer()->subsampling_x(),
- displayable_frame->buffer()->subsampling_y(),
- /*border=*/0,
- /*byte_alignment=*/0)) {
- LIBGAV1_DLOG(ERROR, "film_grain_frame->Realloc() failed.");
- return kLibgav1StatusOutOfMemory;
- }
- film_grain_frame->set_chroma_sample_position(
- displayable_frame->chroma_sample_position());
- }
- const dsp::Dsp* const dsp =
- dsp::GetDspTable(displayable_frame->buffer()->bitdepth());
- if (!dsp->film_grain_synthesis(
- displayable_frame->buffer()->data(kPlaneY),
- displayable_frame->buffer()->stride(kPlaneY),
- displayable_frame->buffer()->data(kPlaneU),
- displayable_frame->buffer()->stride(kPlaneU),
- displayable_frame->buffer()->data(kPlaneV),
- displayable_frame->buffer()->stride(kPlaneV),
- displayable_frame->film_grain_params(),
- displayable_frame->buffer()->is_monochrome(),
- obu->sequence_header().color_config.matrix_coefficients ==
- kMatrixCoefficientIdentity,
- displayable_frame->upscaled_width(),
- displayable_frame->frame_height(),
- displayable_frame->buffer()->subsampling_x(),
- displayable_frame->buffer()->subsampling_y(),
- film_grain_frame->buffer()->data(kPlaneY),
- film_grain_frame->buffer()->stride(kPlaneY),
- film_grain_frame->buffer()->data(kPlaneU),
- film_grain_frame->buffer()->stride(kPlaneU),
- film_grain_frame->buffer()->data(kPlaneV),
- film_grain_frame->buffer()->stride(kPlaneV))) {
- LIBGAV1_DLOG(ERROR, "dsp->film_grain_synthesis() failed.");
- return kLibgav1StatusOutOfMemory;
- }
- displayable_frame = std::move(film_grain_frame);
- }
+ RefCountedBufferPtr film_grain_frame;
+ status = ApplyFilmGrain(
+ obu->sequence_header(), obu->frame_header(), current_frame,
+ &film_grain_frame,
+ frame_scratch_buffer->threading_strategy.film_grain_thread_pool());
+ if (status != kStatusOk) return status;
+ output_frame_queue_.Push(std::move(film_grain_frame));
}
}
- if (displayable_frame == nullptr) {
- // No displayable frame in the encoded frame. Not an error.
+ if (output_frame_queue_.Empty()) {
+ // No displayable frame in the temporal unit. Not an error.
*out_ptr = nullptr;
- return kLibgav1StatusOk;
+ return kStatusOk;
}
- status = CopyFrameToOutputBuffer(displayable_frame);
- if (status != kLibgav1StatusOk) {
+ status = CopyFrameToOutputBuffer(output_frame_queue_.Front());
+ output_frame_queue_.Pop();
+ if (status != kStatusOk) {
return status;
}
- buffer_.user_private_data = encoded_frame.user_private_data;
+ buffer_.user_private_data = temporal_unit.user_private_data;
*out_ptr = &buffer_;
- return kLibgav1StatusOk;
-}
-
-bool DecoderImpl::AllocateCurrentFrame(const ObuFrameHeader& frame_header) {
- const ColorConfig& color_config = state_.sequence_header.color_config;
- state_.current_frame->set_chroma_sample_position(
- color_config.chroma_sample_position);
- return state_.current_frame->Realloc(
- color_config.bitdepth, color_config.is_monochrome,
- frame_header.upscaled_width, frame_header.height,
- color_config.subsampling_x, color_config.subsampling_y, kBorderPixels,
- /*byte_alignment=*/0);
+ return kStatusOk;
}
StatusCode DecoderImpl::CopyFrameToOutputBuffer(
@@ -336,9 +1132,15 @@
LIBGAV1_DLOG(ERROR,
"Invalid chroma subsampling values: cannot determine buffer "
"image format.");
- return kLibgav1StatusInvalidArgument;
+ return kStatusInvalidArgument;
}
}
+ buffer_.color_range = sequence_header_.color_config.color_range;
+ buffer_.color_primary = sequence_header_.color_config.color_primary;
+ buffer_.transfer_characteristics =
+ sequence_header_.color_config.transfer_characteristics;
+ buffer_.matrix_coefficients =
+ sequence_header_.color_config.matrix_coefficients;
buffer_.bitdepth = yuv_buffer->bitdepth();
const int num_planes =
@@ -347,8 +1149,8 @@
for (; plane < num_planes; ++plane) {
buffer_.stride[plane] = yuv_buffer->stride(plane);
buffer_.plane[plane] = yuv_buffer->data(plane);
- buffer_.displayed_width[plane] = yuv_buffer->displayed_width(plane);
- buffer_.displayed_height[plane] = yuv_buffer->displayed_height(plane);
+ buffer_.displayed_width[plane] = yuv_buffer->width(plane);
+ buffer_.displayed_height[plane] = yuv_buffer->height(plane);
}
for (; plane < kMaxPlanes; ++plane) {
buffer_.stride[plane] = 0;
@@ -356,9 +1158,11 @@
buffer_.displayed_width[plane] = 0;
buffer_.displayed_height[plane] = 0;
}
+ buffer_.spatial_id = frame->spatial_id();
+ buffer_.temporal_id = frame->temporal_id();
buffer_.buffer_private_data = frame->buffer_private_data();
output_frame_ = frame;
- return kLibgav1StatusOk;
+ return kStatusOk;
}
void DecoderImpl::ReleaseOutputFrame() {
@@ -368,336 +1172,458 @@
output_frame_ = nullptr;
}
-StatusCode DecoderImpl::DecodeTiles(const ObuParser* obu) {
- if (PostFilter::DoDeblock(obu->frame_header(), settings_.post_filter_mask) &&
- !loop_filter_mask_.Reset(obu->frame_header().width,
- obu->frame_header().height)) {
- LIBGAV1_DLOG(ERROR, "Failed to allocate memory for loop filter masks.");
- return kLibgav1StatusOutOfMemory;
- }
- LoopRestorationInfo loop_restoration_info(
- obu->frame_header().loop_restoration, obu->frame_header().upscaled_width,
- obu->frame_header().height,
- obu->sequence_header().color_config.subsampling_x,
- obu->sequence_header().color_config.subsampling_y,
- obu->sequence_header().color_config.is_monochrome);
- if (!loop_restoration_info.Allocate()) {
+StatusCode DecoderImpl::DecodeTiles(
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header, const Vector<TileBuffer>& tile_buffers,
+ const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer,
+ RefCountedBuffer* const current_frame) {
+ frame_scratch_buffer->tile_scratch_buffer_pool.Reset(
+ sequence_header.color_config.bitdepth);
+ if (!frame_scratch_buffer->loop_restoration_info.Reset(
+ &frame_header.loop_restoration, frame_header.upscaled_width,
+ frame_header.height, sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.is_monochrome)) {
LIBGAV1_DLOG(ERROR,
"Failed to allocate memory for loop restoration info units.");
- return kLibgav1StatusOutOfMemory;
+ return kStatusOutOfMemory;
}
- if (!AllocateCurrentFrame(obu->frame_header())) {
+ const bool do_cdef =
+ PostFilter::DoCdef(frame_header, settings_.post_filter_mask);
+ const int num_planes = sequence_header.color_config.is_monochrome
+ ? kMaxPlanesMonochrome
+ : kMaxPlanes;
+ const bool do_restoration = PostFilter::DoRestoration(
+ frame_header.loop_restoration, settings_.post_filter_mask, num_planes);
+ const bool do_superres =
+ PostFilter::DoSuperRes(frame_header, settings_.post_filter_mask);
+ // Use kBorderPixels for the left, right, and top borders. Only the bottom
+ // border may need to be bigger. SuperRes border is needed only if we are
+ // applying SuperRes in-place which is being done only in single threaded
+ // mode.
+ const int bottom_border = GetBottomBorderPixels(
+ do_cdef, do_restoration,
+ do_superres &&
+ frame_scratch_buffer->threading_strategy.post_filter_thread_pool() ==
+ nullptr,
+ sequence_header.color_config.subsampling_y);
+ current_frame->set_chroma_sample_position(
+ sequence_header.color_config.chroma_sample_position);
+ if (!current_frame->Realloc(sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ frame_header.upscaled_width, frame_header.height,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ /*left_border=*/kBorderPixels,
+ /*right_border=*/kBorderPixels,
+ /*top_border=*/kBorderPixels, bottom_border)) {
LIBGAV1_DLOG(ERROR, "Failed to allocate memory for the decoder buffer.");
- return kLibgav1StatusOutOfMemory;
+ return kStatusOutOfMemory;
}
- Array2D<int16_t> cdef_index;
- if (obu->sequence_header().enable_cdef) {
- if (!cdef_index.Reset(
- DivideBy16(obu->frame_header().rows4x4 + kMaxBlockHeight4x4),
- DivideBy16(obu->frame_header().columns4x4 + kMaxBlockWidth4x4))) {
+ if (sequence_header.enable_cdef) {
+ if (!frame_scratch_buffer->cdef_index.Reset(
+ DivideBy16(frame_header.rows4x4 + kMaxBlockHeight4x4),
+ DivideBy16(frame_header.columns4x4 + kMaxBlockWidth4x4),
+ /*zero_initialize=*/false)) {
LIBGAV1_DLOG(ERROR, "Failed to allocate memory for cdef index.");
- return kLibgav1StatusOutOfMemory;
+ return kStatusOutOfMemory;
}
}
- if (!inter_transform_sizes_.Reset(
- obu->frame_header().rows4x4 + kMaxBlockHeight4x4,
- obu->frame_header().columns4x4 + kMaxBlockWidth4x4,
+ if (!frame_scratch_buffer->inter_transform_sizes.Reset(
+ frame_header.rows4x4 + kMaxBlockHeight4x4,
+ frame_header.columns4x4 + kMaxBlockWidth4x4,
/*zero_initialize=*/false)) {
LIBGAV1_DLOG(ERROR, "Failed to allocate memory for inter_transform_sizes.");
- return kLibgav1StatusOutOfMemory;
+ return kStatusOutOfMemory;
}
- if (obu->frame_header().use_ref_frame_mvs &&
- !state_.motion_field_mv.Reset(DivideBy2(obu->frame_header().rows4x4),
- DivideBy2(obu->frame_header().columns4x4),
- /*zero_initialize=*/false)) {
- LIBGAV1_DLOG(ERROR,
- "Failed to allocate memory for temporal motion vectors.");
- return kLibgav1StatusOutOfMemory;
+ if (frame_header.use_ref_frame_mvs) {
+ if (!frame_scratch_buffer->motion_field.mv.Reset(
+ DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
+ /*zero_initialize=*/false) ||
+ !frame_scratch_buffer->motion_field.reference_offset.Reset(
+ DivideBy2(frame_header.rows4x4), DivideBy2(frame_header.columns4x4),
+ /*zero_initialize=*/false)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to allocate memory for temporal motion vectors.");
+ return kStatusOutOfMemory;
+ }
+
+ // For each motion vector, only mv[0] needs to be initialized to
+ // kInvalidMvValue, mv[1] is not necessary to be initialized and can be
+ // set to an arbitrary value. For simplicity, mv[1] is set to 0.
+ // The following memory initialization of contiguous memory is very fast. It
+ // is not recommended to make the initialization multi-threaded, unless the
+ // memory which needs to be initialized in each thread is still contiguous.
+ MotionVector invalid_mv;
+ invalid_mv.mv[0] = kInvalidMvValue;
+ invalid_mv.mv[1] = 0;
+ MotionVector* const motion_field_mv =
+ &frame_scratch_buffer->motion_field.mv[0][0];
+ std::fill(motion_field_mv,
+ motion_field_mv + frame_scratch_buffer->motion_field.mv.size(),
+ invalid_mv);
}
// The addition of kMaxBlockHeight4x4 and kMaxBlockWidth4x4 is necessary so
// that the block parameters cache can be filled in for the last row/column
// without having to check for boundary conditions.
- BlockParametersHolder block_parameters_holder(
- obu->frame_header().rows4x4 + kMaxBlockHeight4x4,
- obu->frame_header().columns4x4 + kMaxBlockWidth4x4,
- obu->sequence_header().use_128x128_superblock);
- if (!block_parameters_holder.Init()) {
- return kLibgav1StatusOutOfMemory;
+ if (!frame_scratch_buffer->block_parameters_holder.Reset(
+ frame_header.rows4x4 + kMaxBlockHeight4x4,
+ frame_header.columns4x4 + kMaxBlockWidth4x4,
+ sequence_header.use_128x128_superblock)) {
+ return kStatusOutOfMemory;
}
const dsp::Dsp* const dsp =
- dsp::GetDspTable(obu->sequence_header().color_config.bitdepth);
+ dsp::GetDspTable(sequence_header.color_config.bitdepth);
if (dsp == nullptr) {
LIBGAV1_DLOG(ERROR, "Failed to get the dsp table for bitdepth %d.",
- obu->sequence_header().color_config.bitdepth);
- return kLibgav1StatusInternalError;
- }
- // If prev_segment_ids is a null pointer, it is treated as if it pointed to
- // a segmentation map containing all 0s.
- const SegmentationMap* prev_segment_ids = nullptr;
- if (obu->frame_header().primary_reference_frame == kPrimaryReferenceNone) {
- symbol_decoder_context_.Initialize(
- obu->frame_header().quantizer.base_index);
- } else {
- const int index =
- obu->frame_header()
- .reference_frame_index[obu->frame_header().primary_reference_frame];
- const RefCountedBuffer* prev_frame = state_.reference_frame[index].get();
- symbol_decoder_context_ = prev_frame->FrameContext();
- if (obu->frame_header().segmentation.enabled &&
- prev_frame->columns4x4() == obu->frame_header().columns4x4 &&
- prev_frame->rows4x4() == obu->frame_header().rows4x4) {
- prev_segment_ids = prev_frame->segmentation_map();
- }
+ sequence_header.color_config.bitdepth);
+ return kStatusInternalError;
}
- const uint8_t tile_size_bytes = obu->frame_header().tile_info.tile_size_bytes;
- const int tile_count = obu->tile_groups().back().end + 1;
+ const int tile_count = frame_header.tile_info.tile_count;
assert(tile_count >= 1);
Vector<std::unique_ptr<Tile>> tiles;
if (!tiles.reserve(tile_count)) {
LIBGAV1_DLOG(ERROR, "tiles.reserve(%d) failed.\n", tile_count);
- return kLibgav1StatusOutOfMemory;
+ return kStatusOutOfMemory;
}
- if (!threading_strategy_.Reset(obu->frame_header(), settings_.threads)) {
- return kLibgav1StatusOutOfMemory;
+ ThreadingStrategy& threading_strategy =
+ frame_scratch_buffer->threading_strategy;
+ if (!is_frame_parallel_ &&
+ !threading_strategy.Reset(frame_header, settings_.threads)) {
+ return kStatusOutOfMemory;
}
- if (threading_strategy_.row_thread_pool(0) != nullptr) {
- if (residual_buffer_pool_ == nullptr) {
- residual_buffer_pool_.reset(new (std::nothrow) ResidualBufferPool(
- obu->sequence_header().use_128x128_superblock,
- obu->sequence_header().color_config.subsampling_x,
- obu->sequence_header().color_config.subsampling_y,
- obu->sequence_header().color_config.bitdepth == 8 ? sizeof(int16_t)
- : sizeof(int32_t)));
- if (residual_buffer_pool_ == nullptr) {
+ if (threading_strategy.row_thread_pool(0) != nullptr || is_frame_parallel_) {
+ if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
+ frame_scratch_buffer->residual_buffer_pool.reset(
+ new (std::nothrow) ResidualBufferPool(
+ sequence_header.use_128x128_superblock,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
+ : sizeof(int32_t)));
+ if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
LIBGAV1_DLOG(ERROR, "Failed to allocate residual buffer.\n");
- return kLibgav1StatusOutOfMemory;
+ return kStatusOutOfMemory;
}
} else {
- residual_buffer_pool_->Reset(
- obu->sequence_header().use_128x128_superblock,
- obu->sequence_header().color_config.subsampling_x,
- obu->sequence_header().color_config.subsampling_y,
- obu->sequence_header().color_config.bitdepth == 8 ? sizeof(int16_t)
- : sizeof(int32_t));
+ frame_scratch_buffer->residual_buffer_pool->Reset(
+ sequence_header.use_128x128_superblock,
+ sequence_header.color_config.subsampling_x,
+ sequence_header.color_config.subsampling_y,
+ sequence_header.color_config.bitdepth == 8 ? sizeof(int16_t)
+ : sizeof(int32_t));
}
}
- const bool do_cdef =
- PostFilter::DoCdef(obu->frame_header(), settings_.post_filter_mask);
- const int num_planes = obu->sequence_header().color_config.is_monochrome
- ? kMaxPlanesMonochrome
- : kMaxPlanes;
- const bool do_restoration =
- PostFilter::DoRestoration(obu->frame_header().loop_restoration,
- settings_.post_filter_mask, num_planes);
- if (threading_strategy_.post_filter_thread_pool() != nullptr &&
+ if (threading_strategy.post_filter_thread_pool() != nullptr &&
(do_cdef || do_restoration)) {
const int window_buffer_width = PostFilter::GetWindowBufferWidth(
- threading_strategy_.post_filter_thread_pool(), obu->frame_header());
+ threading_strategy.post_filter_thread_pool(), frame_header);
size_t threaded_window_buffer_size =
window_buffer_width *
PostFilter::GetWindowBufferHeight(
- threading_strategy_.post_filter_thread_pool(),
- obu->frame_header()) *
- (obu->sequence_header().color_config.bitdepth == 8 ? sizeof(uint8_t)
- : sizeof(uint16_t));
- if (do_cdef && !do_restoration) {
+ threading_strategy.post_filter_thread_pool(), frame_header) *
+ (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
+ : sizeof(uint16_t));
+ if (do_cdef) {
// TODO(chengchen): for cdef U, V planes, if there's subsampling, we can
// use smaller buffer.
threaded_window_buffer_size *= num_planes;
}
- if (threaded_window_buffer_size_ < threaded_window_buffer_size) {
- // threaded_window_buffer_ will be subdivided by PostFilter into windows
- // of width 512 pixels. Each row in the window is filtered by a worker
- // thread. To avoid false sharing, each 512-pixel row processed by one
- // thread should not share a cache line with a row processed by another
- // thread. So we align threaded_window_buffer_ to the cache line size.
- // In addition, it is faster to memcpy from an aligned buffer.
- //
- // On Linux, the cache line size can be looked up with the command:
- // getconf LEVEL1_DCACHE_LINESIZE
- //
- // The cache line size should ideally be queried at run time. 64 is a
- // common cache line size of x86 CPUs. Web searches showed the cache line
- // size of ARM CPUs is 32 or 64 bytes. So aligning to 64-byte boundary
- // will work for all CPUs that we care about, even though it is excessive
- // for some ARM CPUs.
- constexpr size_t kCacheLineSize = 64;
- // To avoid false sharing, PostFilter's window width in bytes should also
- // be a multiple of the cache line size. For simplicity, we check the
- // window width in pixels.
- assert(window_buffer_width % kCacheLineSize == 0);
- threaded_window_buffer_ = MakeAlignedUniquePtr<uint8_t>(
- kCacheLineSize, threaded_window_buffer_size);
- if (threaded_window_buffer_ == nullptr) {
- LIBGAV1_DLOG(ERROR,
- "Failed to allocate threaded loop restoration buffer.\n");
- threaded_window_buffer_size_ = 0;
- return kLibgav1StatusOutOfMemory;
- }
- threaded_window_buffer_size_ = threaded_window_buffer_size;
+ // To avoid false sharing, PostFilter's window width in bytes should be a
+ // multiple of the cache line size. For simplicity, we check the window
+ // width in pixels.
+ assert(window_buffer_width % kCacheLineSize == 0);
+ if (!frame_scratch_buffer->threaded_window_buffer.Resize(
+ threaded_window_buffer_size)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to resize threaded loop restoration buffer.\n");
+ return kStatusOutOfMemory;
}
}
- PostFilter post_filter(
- obu->frame_header(), obu->sequence_header(), &loop_filter_mask_,
- cdef_index, &loop_restoration_info, &block_parameters_holder,
- state_.current_frame->buffer(), dsp,
- threading_strategy_.post_filter_thread_pool(),
- threaded_window_buffer_.get(), settings_.post_filter_mask);
- SymbolDecoderContext saved_symbol_decoder_context;
- int tile_index = 0;
- BlockingCounterWithStatus pending_tiles(tile_count);
- for (const auto& tile_group : obu->tile_groups()) {
- size_t bytes_left = tile_group.data_size;
- size_t byte_offset = 0;
- // The for loop in 5.11.1.
- for (int tile_number = tile_group.start; tile_number <= tile_group.end;
- ++tile_number) {
- size_t tile_size = 0;
- if (tile_number != tile_group.end) {
- RawBitReader bit_reader(tile_group.data + byte_offset, bytes_left);
- if (!bit_reader.ReadLittleEndian(tile_size_bytes, &tile_size)) {
- LIBGAV1_DLOG(ERROR, "Could not read tile size for tile #%d",
- tile_number);
- return kLibgav1StatusBitstreamError;
- }
- ++tile_size;
- byte_offset += tile_size_bytes;
- bytes_left -= tile_size_bytes;
- if (tile_size > bytes_left) {
- LIBGAV1_DLOG(ERROR, "Invalid tile size %zu for tile #%d", tile_size,
- tile_number);
- return kLibgav1StatusBitstreamError;
- }
- } else {
- tile_size = bytes_left;
- }
-
- std::unique_ptr<Tile> tile(new (std::nothrow) Tile(
- tile_number, tile_group.data + byte_offset, tile_size,
- obu->sequence_header(), obu->frame_header(),
- state_.current_frame.get(), state_.reference_frame_sign_bias,
- state_.reference_frame, &state_.motion_field_mv,
- state_.reference_order_hint, state_.wedge_masks,
- symbol_decoder_context_, &saved_symbol_decoder_context,
- prev_segment_ids, &post_filter, &block_parameters_holder, &cdef_index,
- &inter_transform_sizes_, dsp,
- threading_strategy_.row_thread_pool(tile_index++),
- residual_buffer_pool_.get(), &decoder_scratch_buffer_pool_,
- &pending_tiles));
- if (tile == nullptr) {
- LIBGAV1_DLOG(ERROR, "Failed to allocate tile.");
- return kLibgav1StatusOutOfMemory;
- }
- tiles.push_back_unchecked(std::move(tile));
-
- byte_offset += tile_size;
- bytes_left -= tile_size;
+ if (do_cdef && do_restoration) {
+ // We need to store 4 rows per 64x64 unit.
+ const int num_deblock_units = MultiplyBy4(Ceil(frame_header.rows4x4, 16));
+ // subsampling_y is set to zero irrespective of the actual frame's
+ // subsampling since we need to store exactly |num_deblock_units| rows of
+ // the deblocked pixels.
+ if (!frame_scratch_buffer->deblock_buffer.Realloc(
+ sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ frame_header.upscaled_width, num_deblock_units,
+ sequence_header.color_config.subsampling_x,
+ /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+ kBorderPixels, nullptr, nullptr, nullptr)) {
+ return kStatusOutOfMemory;
}
}
+
+ if (do_superres) {
+ const int num_threads =
+ 1 + ((threading_strategy.post_filter_thread_pool() == nullptr)
+ ? 0
+ : threading_strategy.post_filter_thread_pool()->num_threads());
+ const size_t superres_line_buffer_size =
+ num_threads *
+ (MultiplyBy4(frame_header.columns4x4) +
+ MultiplyBy2(kSuperResHorizontalBorder) + kSuperResHorizontalPadding) *
+ (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
+ : sizeof(uint16_t));
+ if (!frame_scratch_buffer->superres_line_buffer.Resize(
+ superres_line_buffer_size)) {
+ LIBGAV1_DLOG(ERROR, "Failed to resize superres line buffer.\n");
+ return kStatusOutOfMemory;
+ }
+ }
+
+ PostFilter post_filter(frame_header, sequence_header, frame_scratch_buffer,
+ current_frame->buffer(), dsp,
+ settings_.post_filter_mask);
+
+ if (is_frame_parallel_) {
+ // We can parse the current frame if all the reference frames have been
+ // parsed.
+ for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
+ if (!state.reference_valid[i] || state.reference_frame[i] == nullptr) {
+ continue;
+ }
+ if (!state.reference_frame[i]->WaitUntilParsed()) {
+ return kStatusUnknownError;
+ }
+ }
+ }
+
+ // If prev_segment_ids is a null pointer, it is treated as if it pointed to
+ // a segmentation map containing all 0s.
+ const SegmentationMap* prev_segment_ids = nullptr;
+ if (frame_header.primary_reference_frame == kPrimaryReferenceNone) {
+ frame_scratch_buffer->symbol_decoder_context.Initialize(
+ frame_header.quantizer.base_index);
+ } else {
+ const int index =
+ frame_header
+ .reference_frame_index[frame_header.primary_reference_frame];
+ assert(index != -1);
+ const RefCountedBuffer* prev_frame = state.reference_frame[index].get();
+ frame_scratch_buffer->symbol_decoder_context = prev_frame->FrameContext();
+ if (frame_header.segmentation.enabled &&
+ prev_frame->columns4x4() == frame_header.columns4x4 &&
+ prev_frame->rows4x4() == frame_header.rows4x4) {
+ prev_segment_ids = prev_frame->segmentation_map();
+ }
+ }
+
+ // The Tile class must make use of a separate buffer to store the unfiltered
+ // pixels for the intra prediction of the next superblock row. This is done
+ // only when one of the following conditions are true:
+ // * is_frame_parallel_ is true.
+ // * settings_.threads == 1.
+ // In the non-frame-parallel multi-threaded case, we do not run the post
+ // filters in the decode loop. So this buffer need not be used.
+ const bool use_intra_prediction_buffer =
+ is_frame_parallel_ || settings_.threads == 1;
+ if (use_intra_prediction_buffer) {
+ if (!frame_scratch_buffer->intra_prediction_buffers.Resize(
+ frame_header.tile_info.tile_rows)) {
+ LIBGAV1_DLOG(ERROR, "Failed to Resize intra_prediction_buffers.");
+ return kStatusOutOfMemory;
+ }
+ IntraPredictionBuffer* const intra_prediction_buffers =
+ frame_scratch_buffer->intra_prediction_buffers.get();
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int subsampling =
+ (plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x;
+ const size_t intra_prediction_buffer_size =
+ ((MultiplyBy4(frame_header.columns4x4) >> subsampling) *
+ (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
+ : sizeof(uint16_t)));
+ for (int tile_row = 0; tile_row < frame_header.tile_info.tile_rows;
+ ++tile_row) {
+ if (!intra_prediction_buffers[tile_row][plane].Resize(
+ intra_prediction_buffer_size)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to allocate intra prediction buffer for tile "
+ "row %d plane %d.\n",
+ tile_row, plane);
+ return kStatusOutOfMemory;
+ }
+ }
+ }
+ }
+
+ SymbolDecoderContext saved_symbol_decoder_context;
+ BlockingCounterWithStatus pending_tiles(tile_count);
+ for (int tile_number = 0; tile_number < tile_count; ++tile_number) {
+ std::unique_ptr<Tile> tile = Tile::Create(
+ tile_number, tile_buffers[tile_number].data,
+ tile_buffers[tile_number].size, sequence_header, frame_header,
+ current_frame, state, frame_scratch_buffer, wedge_masks_,
+ &saved_symbol_decoder_context, prev_segment_ids, &post_filter, dsp,
+ threading_strategy.row_thread_pool(tile_number), &pending_tiles,
+ is_frame_parallel_, use_intra_prediction_buffer);
+ if (tile == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to create tile.");
+ return kStatusOutOfMemory;
+ }
+ tiles.push_back_unchecked(std::move(tile));
+ }
assert(tiles.size() == static_cast<size_t>(tile_count));
- bool tile_decoding_failed = false;
- if (threading_strategy_.tile_thread_pool() == nullptr) {
- for (const auto& tile_ptr : tiles) {
- if (!tile_decoding_failed) {
- if (!tile_ptr->Decode(/*is_main_thread=*/true)) {
- LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
- tile_decoding_failed = true;
- }
- } else {
- pending_tiles.Decrement(false);
- }
+ if (is_frame_parallel_) {
+ if (frame_scratch_buffer->threading_strategy.thread_pool() == nullptr) {
+ return DecodeTilesFrameParallel(
+ sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+ prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
}
+ return DecodeTilesThreadedFrameParallel(
+ sequence_header, frame_header, tiles, saved_symbol_decoder_context,
+ prev_segment_ids, frame_scratch_buffer, &post_filter, current_frame);
+ }
+ StatusCode status;
+ if (settings_.threads == 1) {
+ status = DecodeTilesNonFrameParallel(sequence_header, frame_header, tiles,
+ frame_scratch_buffer, &post_filter);
} else {
- const int num_workers = threading_strategy_.tile_thread_count();
- BlockingCounterWithStatus pending_workers(num_workers);
- std::atomic<int> tile_counter(0);
- // Submit tile decoding jobs to the thread pool.
- for (int i = 0; i < num_workers; ++i) {
- threading_strategy_.tile_thread_pool()->Schedule(
- [&tiles, tile_count, &tile_counter, &pending_workers,
- &pending_tiles]() {
- bool failed = false;
- int index;
- while ((index = tile_counter.fetch_add(
- 1, std::memory_order_relaxed)) < tile_count) {
- if (!failed) {
- const auto& tile_ptr = tiles[index];
- if (!tile_ptr->Decode(/*is_main_thread=*/false)) {
- LIBGAV1_DLOG(ERROR, "Error decoding tile #%d",
- tile_ptr->number());
- failed = true;
- }
- } else {
- pending_tiles.Decrement(false);
- }
- }
- pending_workers.Decrement(!failed);
- });
- }
- // Have the current thread partake in tile decoding.
- int index;
- while ((index = tile_counter.fetch_add(1, std::memory_order_relaxed)) <
- tile_count) {
- if (!tile_decoding_failed) {
- const auto& tile_ptr = tiles[index];
- if (!tile_ptr->Decode(/*is_main_thread=*/true)) {
- LIBGAV1_DLOG(ERROR, "Error decoding tile #%d", tile_ptr->number());
- tile_decoding_failed = true;
- }
- } else {
- pending_tiles.Decrement(false);
- }
- }
- // Wait until all the workers are done. This ensures that all the tiles have
- // been parsed.
- tile_decoding_failed |= !pending_workers.Wait();
+ status = DecodeTilesThreadedNonFrameParallel(tiles, frame_scratch_buffer,
+ &post_filter, &pending_tiles);
}
- // Wait until all the tiles have been decoded.
- tile_decoding_failed |= !pending_tiles.Wait();
-
- // At this point, all the tiles have been parsed and decoded and the
- // threadpool will be empty.
- if (tile_decoding_failed) return kLibgav1StatusUnknownError;
-
- if (obu->frame_header().enable_frame_end_update_cdf) {
- symbol_decoder_context_ = saved_symbol_decoder_context;
+ if (status != kStatusOk) return status;
+ if (frame_header.enable_frame_end_update_cdf) {
+ frame_scratch_buffer->symbol_decoder_context = saved_symbol_decoder_context;
}
- state_.current_frame->SetFrameContext(symbol_decoder_context_);
- if (post_filter.DoDeblock()) {
- loop_filter_mask_.Build(obu->sequence_header(), obu->frame_header(),
- obu->tile_groups().front().start,
- obu->tile_groups().back().end,
- block_parameters_holder, inter_transform_sizes_);
- }
- if (!post_filter.ApplyFiltering()) {
- LIBGAV1_DLOG(ERROR, "Error applying in-loop filtering.");
- return kLibgav1StatusUnknownError;
- }
- SetCurrentFrameSegmentationMap(obu->frame_header(), prev_segment_ids);
- return kLibgav1StatusOk;
+ current_frame->SetFrameContext(frame_scratch_buffer->symbol_decoder_context);
+ SetSegmentationMap(frame_header, prev_segment_ids, current_frame);
+ return kStatusOk;
}
-void DecoderImpl::SetCurrentFrameSegmentationMap(
+StatusCode DecoderImpl::ApplyFilmGrain(
+ const ObuSequenceHeader& sequence_header,
const ObuFrameHeader& frame_header,
- const SegmentationMap* prev_segment_ids) {
- if (!frame_header.segmentation.enabled) {
- // All segment_id's are 0.
- state_.current_frame->segmentation_map()->Clear();
- } else if (!frame_header.segmentation.update_map) {
- // Copy from prev_segment_ids.
- if (prev_segment_ids == nullptr) {
- // Treat a null prev_segment_ids pointer as if it pointed to a
- // segmentation map containing all 0s.
- state_.current_frame->segmentation_map()->Clear();
- } else {
- state_.current_frame->segmentation_map()->CopyFrom(*prev_segment_ids);
- }
+ const RefCountedBufferPtr& displayable_frame,
+ RefCountedBufferPtr* film_grain_frame, ThreadPool* thread_pool) {
+ if (!sequence_header.film_grain_params_present ||
+ !displayable_frame->film_grain_params().apply_grain ||
+ (settings_.post_filter_mask & 0x10) == 0) {
+ *film_grain_frame = displayable_frame;
+ return kStatusOk;
}
+ if (!frame_header.show_existing_frame &&
+ frame_header.refresh_frame_flags == 0) {
+ // If show_existing_frame is true, then the current frame is a previously
+ // saved reference frame. If refresh_frame_flags is nonzero, then the
+ // state_.UpdateReferenceFrames() call above has saved the current frame as
+ // a reference frame. Therefore, if both of these conditions are false, then
+ // the current frame is not saved as a reference frame. displayable_frame
+ // should hold the only reference to the current frame.
+ assert(displayable_frame.use_count() == 1);
+ // Add film grain noise in place.
+ *film_grain_frame = displayable_frame;
+ } else {
+ *film_grain_frame = buffer_pool_.GetFreeBuffer();
+ if (*film_grain_frame == nullptr) {
+ LIBGAV1_DLOG(ERROR,
+ "Could not get film_grain_frame from the buffer pool.");
+ return kStatusResourceExhausted;
+ }
+ if (!(*film_grain_frame)
+ ->Realloc(displayable_frame->buffer()->bitdepth(),
+ displayable_frame->buffer()->is_monochrome(),
+ displayable_frame->upscaled_width(),
+ displayable_frame->frame_height(),
+ displayable_frame->buffer()->subsampling_x(),
+ displayable_frame->buffer()->subsampling_y(),
+ kBorderPixelsFilmGrain, kBorderPixelsFilmGrain,
+ kBorderPixelsFilmGrain, kBorderPixelsFilmGrain)) {
+ LIBGAV1_DLOG(ERROR, "film_grain_frame->Realloc() failed.");
+ return kStatusOutOfMemory;
+ }
+ (*film_grain_frame)
+ ->set_chroma_sample_position(
+ displayable_frame->chroma_sample_position());
+ (*film_grain_frame)->set_spatial_id(displayable_frame->spatial_id());
+ (*film_grain_frame)->set_temporal_id(displayable_frame->temporal_id());
+ }
+ const bool color_matrix_is_identity =
+ sequence_header.color_config.matrix_coefficients ==
+ kMatrixCoefficientsIdentity;
+ assert(displayable_frame->buffer()->stride(kPlaneU) ==
+ displayable_frame->buffer()->stride(kPlaneV));
+ const int input_stride_uv = displayable_frame->buffer()->stride(kPlaneU);
+ assert((*film_grain_frame)->buffer()->stride(kPlaneU) ==
+ (*film_grain_frame)->buffer()->stride(kPlaneV));
+ const int output_stride_uv = (*film_grain_frame)->buffer()->stride(kPlaneU);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (displayable_frame->buffer()->bitdepth() > 8) {
+ FilmGrain<10> film_grain(displayable_frame->film_grain_params(),
+ displayable_frame->buffer()->is_monochrome(),
+ color_matrix_is_identity,
+ displayable_frame->buffer()->subsampling_x(),
+ displayable_frame->buffer()->subsampling_y(),
+ displayable_frame->upscaled_width(),
+ displayable_frame->frame_height(), thread_pool);
+ if (!film_grain.AddNoise(
+ displayable_frame->buffer()->data(kPlaneY),
+ displayable_frame->buffer()->stride(kPlaneY),
+ displayable_frame->buffer()->data(kPlaneU),
+ displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+ (*film_grain_frame)->buffer()->data(kPlaneY),
+ (*film_grain_frame)->buffer()->stride(kPlaneY),
+ (*film_grain_frame)->buffer()->data(kPlaneU),
+ (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+ LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+ }
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ FilmGrain<8> film_grain(displayable_frame->film_grain_params(),
+ displayable_frame->buffer()->is_monochrome(),
+ color_matrix_is_identity,
+ displayable_frame->buffer()->subsampling_x(),
+ displayable_frame->buffer()->subsampling_y(),
+ displayable_frame->upscaled_width(),
+ displayable_frame->frame_height(), thread_pool);
+ if (!film_grain.AddNoise(
+ displayable_frame->buffer()->data(kPlaneY),
+ displayable_frame->buffer()->stride(kPlaneY),
+ displayable_frame->buffer()->data(kPlaneU),
+ displayable_frame->buffer()->data(kPlaneV), input_stride_uv,
+ (*film_grain_frame)->buffer()->data(kPlaneY),
+ (*film_grain_frame)->buffer()->stride(kPlaneY),
+ (*film_grain_frame)->buffer()->data(kPlaneU),
+ (*film_grain_frame)->buffer()->data(kPlaneV), output_stride_uv)) {
+ LIBGAV1_DLOG(ERROR, "film_grain.AddNoise() failed.");
+ return kStatusOutOfMemory;
+ }
+ return kStatusOk;
+}
+
+bool DecoderImpl::IsNewSequenceHeader(const ObuParser& obu) {
+ if (std::find_if(obu.obu_headers().begin(), obu.obu_headers().end(),
+ [](const ObuHeader& obu_header) {
+ return obu_header.type == kObuSequenceHeader;
+ }) == obu.obu_headers().end()) {
+ return false;
+ }
+ const ObuSequenceHeader sequence_header = obu.sequence_header();
+ const bool sequence_header_changed =
+ !has_sequence_header_ ||
+ sequence_header_.color_config.bitdepth !=
+ sequence_header.color_config.bitdepth ||
+ sequence_header_.color_config.is_monochrome !=
+ sequence_header.color_config.is_monochrome ||
+ sequence_header_.color_config.subsampling_x !=
+ sequence_header.color_config.subsampling_x ||
+ sequence_header_.color_config.subsampling_y !=
+ sequence_header.color_config.subsampling_y ||
+ sequence_header_.max_frame_width != sequence_header.max_frame_width ||
+ sequence_header_.max_frame_height != sequence_header.max_frame_height;
+ sequence_header_ = sequence_header;
+ has_sequence_header_ = true;
+ return sequence_header_changed;
}
} // namespace libgav1
diff --git a/libgav1/src/decoder_impl.h b/libgav1/src/decoder_impl.h
index 18026f7..df1b091 100644
--- a/libgav1/src/decoder_impl.h
+++ b/libgav1/src/decoder_impl.h
@@ -18,23 +18,26 @@
#define LIBGAV1_SRC_DECODER_IMPL_H_
#include <array>
+#include <condition_variable> // NOLINT (unapproved c++11 header)
#include <cstddef>
#include <cstdint>
#include <memory>
+#include <mutex> // NOLINT (unapproved c++11 header)
#include "src/buffer_pool.h"
-#include "src/decoder_buffer.h"
-#include "src/decoder_settings.h"
+#include "src/decoder_state.h"
#include "src/dsp/constants.h"
-#include "src/loop_filter_mask.h"
+#include "src/frame_scratch_buffer.h"
+#include "src/gav1/decoder_buffer.h"
+#include "src/gav1/decoder_settings.h"
+#include "src/gav1/status_code.h"
#include "src/obu_parser.h"
#include "src/residual_buffer_pool.h"
-#include "src/status_code.h"
#include "src/symbol_decoder_context.h"
-#include "src/threading_strategy.h"
#include "src/tile.h"
#include "src/utils/array_2d.h"
#include "src/utils/block_parameters_holder.h"
+#include "src/utils/compiler_attributes.h"
#include "src/utils/constants.h"
#include "src/utils/memory.h"
#include "src/utils/queue.h"
@@ -43,69 +46,85 @@
namespace libgav1 {
-struct EncodedFrame : public Allocable {
- // The default constructor is invoked by the Queue<EncodedFrame>::Init()
+struct TemporalUnit;
+
+struct EncodedFrame {
+ EncodedFrame(ObuParser* const obu, const DecoderState& state,
+ const RefCountedBufferPtr& frame, int position_in_temporal_unit)
+ : sequence_header(obu->sequence_header()),
+ frame_header(obu->frame_header()),
+ state(state),
+ temporal_unit(nullptr),
+ frame(frame),
+ position_in_temporal_unit(position_in_temporal_unit) {
+ obu->MoveTileBuffer(&tile_buffers);
+ frame->MarkFrameAsStarted();
+ }
+
+ const ObuSequenceHeader sequence_header;
+ const ObuFrameHeader frame_header;
+ Vector<TileBuffer> tile_buffers;
+ DecoderState state;
+ TemporalUnit* temporal_unit;
+ RefCountedBufferPtr frame;
+ const int position_in_temporal_unit;
+};
+
+struct TemporalUnit : public Allocable {
+ // The default constructor is invoked by the Queue<TemporalUnit>::Init()
// method. Queue<> does not use the default-constructed elements, so it is
// safe for the default constructor to not initialize the members.
- EncodedFrame() = default;
- EncodedFrame(const uint8_t* data, size_t size, int64_t user_private_data)
- : data(data), size(size), user_private_data(user_private_data) {}
+ TemporalUnit() = default;
+ TemporalUnit(const uint8_t* data, size_t size, int64_t user_private_data,
+ void* buffer_private_data)
+ : data(data),
+ size(size),
+ user_private_data(user_private_data),
+ buffer_private_data(buffer_private_data),
+ decoded(false),
+ status(kStatusOk),
+ has_displayable_frame(false),
+ output_frame_position(-1),
+ decoded_count(0),
+ output_layer_count(0),
+ released_input_buffer(false) {}
const uint8_t* data;
size_t size;
int64_t user_private_data;
-};
+ void* buffer_private_data;
-struct DecoderState {
- // Section 7.20. Updates frames in the reference_frame array with
- // current_frame, based on the refresh_frame_flags bitmask.
- void UpdateReferenceFrames(int refresh_frame_flags);
+ // The following members are used only in frame parallel mode.
+ bool decoded;
+ StatusCode status;
+ bool has_displayable_frame;
+ int output_frame_position;
- // Clears all the reference frames.
- void ClearReferenceFrames();
+ Vector<EncodedFrame> frames;
+ size_t decoded_count;
- ObuSequenceHeader sequence_header = {};
- // If true, sequence_header is valid.
- bool has_sequence_header = false;
- // reference_valid and reference_frame_id are used only if
- // sequence_header_.frame_id_numbers_present is true.
- // The reference_valid array is indexed by a reference picture slot number.
- // A value (boolean) in the array signifies whether the corresponding
- // reference picture slot is valid for use as a reference picture.
- std::array<bool, kNumReferenceFrameTypes> reference_valid = {};
- std::array<uint16_t, kNumReferenceFrameTypes> reference_frame_id = {};
- // A valid value of current_frame_id is an unsigned integer of at most 16
- // bits. -1 indicates current_frame_id is not initialized.
- int current_frame_id = -1;
- // The RefOrderHint array variable in the spec.
- std::array<uint8_t, kNumReferenceFrameTypes> reference_order_hint = {};
- // The OrderHint variable in the spec. Its value comes from either the
- // order_hint syntax element in the uncompressed header (if
- // show_existing_frame is false) or RefOrderHint[ frame_to_show_map_idx ]
- // (if show_existing_frame is true and frame_type is KEY_FRAME). See Section
- // 5.9.2 and Section 7.4.
- //
- // NOTE: When show_existing_frame is false, it is often more convenient to
- // just use the order_hint field of the frame header as OrderHint. So this
- // field is mainly used to update the reference_order_hint array in
- // UpdateReferenceFrames().
- uint8_t order_hint = 0;
- // reference_frame_sign_bias[i] (a boolean) specifies the intended direction
- // of the motion vector in time for each reference frame.
- // * |false| indicates that the reference frame is a forwards reference (i.e.
- // the reference frame is expected to be output before the current frame);
- // * |true| indicates that the reference frame is a backwards reference.
- // Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not used.
- std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {};
- std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame;
- RefCountedBufferPtr current_frame;
- // wedge_master_mask has to be initialized to zero.
- std::array<uint8_t, 6 * kWedgeMaskMasterSize* kWedgeMaskMasterSize>
- wedge_master_mask = {};
- // TODO(chengchen): It is possible to reduce the buffer size. Because wedge
- // mask sizes are 8x8, 8x16, ..., 32x32. This buffer size can fit 32x32.
- std::array<uint8_t, kWedgeMaskSize> wedge_masks = {};
- Array2D<TemporalMotionVector> motion_field_mv;
+ // The struct (and the counter) is used to support output of multiple layers
+ // within a single temporal unit. The decoding process will store the output
+ // frames in |output_layers| in the order they are finished decoding. At the
+ // end of the decoding process, this array will be sorted in reverse order of
+ // |position_in_temporal_unit|. DequeueFrame() will then return the frames in
+ // reverse order (so that the entire process can run with a single counter
+ // variable).
+ struct OutputLayer {
+ // Used by std::sort to sort |output_layers| in reverse order of
+ // |position_in_temporal_unit|.
+ bool operator<(const OutputLayer& rhs) const {
+ return position_in_temporal_unit > rhs.position_in_temporal_unit;
+ }
+
+ RefCountedBufferPtr frame;
+ int position_in_temporal_unit = 0;
+ } output_layers[kMaxLayers];
+ // Number of entries in |output_layers|.
+ int output_layer_count;
+ // Flag to ensure that we release the input buffer only once if there are
+ // multiple output layers.
+ bool released_input_buffer;
};
class DecoderImpl : public Allocable {
@@ -118,51 +137,121 @@
std::unique_ptr<DecoderImpl>* output);
~DecoderImpl();
StatusCode EnqueueFrame(const uint8_t* data, size_t size,
- int64_t user_private_data);
+ int64_t user_private_data, void* buffer_private_data);
StatusCode DequeueFrame(const DecoderBuffer** out_ptr);
static constexpr int GetMaxBitdepth() {
-#if LIBGAV1_MAX_BITDEPTH >= 10
- return 10;
-#else
- return 8;
-#endif
+ static_assert(LIBGAV1_MAX_BITDEPTH == 8 || LIBGAV1_MAX_BITDEPTH == 10,
+ "LIBGAV1_MAX_BITDEPTH must be 8 or 10.");
+ return LIBGAV1_MAX_BITDEPTH;
}
private:
explicit DecoderImpl(const DecoderSettings* settings);
StatusCode Init();
- bool AllocateCurrentFrame(const ObuFrameHeader& frame_header);
+ // Called when the first frame is enqueued. It does the OBU parsing for one
+ // temporal unit to retrieve the tile configuration and sets up the frame
+ // threading if frame parallel mode is allowed. It also initializes the
+ // |temporal_units_| queue based on the number of frame threads.
+ //
+ // The following are the limitations of the current implementation:
+ // * It assumes that all frames in the video have the same tile
+ // configuration. The frame parallel threading model will not be updated
+ // based on tile configuration changes mid-stream.
+ // * The above assumption holds true even when there is a new coded video
+ // sequence (i.e.) a new sequence header.
+ StatusCode InitializeFrameThreadPoolAndTemporalUnitQueue(const uint8_t* data,
+ size_t size);
+ // Used only in frame parallel mode. Signals failure and waits until the
+ // worker threads are aborted if |status| is a failure status. If |status| is
+ // equal to kStatusOk or kStatusTryAgain, this function does not do anything.
+ // Always returns the input parameter |status| as the return value.
+ //
+ // This function is called only from the application thread (from
+ // EnqueueFrame() and DequeueFrame()).
+ StatusCode SignalFailure(StatusCode status);
+
void ReleaseOutputFrame();
- // Populates buffer_ with values from |frame|. Adds a reference to |frame|
- // in output_frame_.
+
+ // Decodes all the frames contained in the given temporal unit. Used only in
+ // non frame parallel mode.
+ StatusCode DecodeTemporalUnit(const TemporalUnit& temporal_unit,
+ const DecoderBuffer** out_ptr);
+ // Used only in frame parallel mode. Does the OBU parsing for |data| and
+ // schedules the individual frames for decoding in the |frame_thread_pool_|.
+ StatusCode ParseAndSchedule(const uint8_t* data, size_t size,
+ int64_t user_private_data,
+ void* buffer_private_data);
+ // Decodes the |encoded_frame| and updates the
+ // |encoded_frame->temporal_unit|'s parameters if the decoded frame is a
+ // displayable frame. Used only in frame parallel mode.
+ StatusCode DecodeFrame(EncodedFrame* encoded_frame);
+
+ // Populates |buffer_| with values from |frame|. Adds a reference to |frame|
+ // in |output_frame_|.
StatusCode CopyFrameToOutputBuffer(const RefCountedBufferPtr& frame);
- StatusCode DecodeTiles(const ObuParser* obu);
- // Sets the current frame's segmentation map for two cases. The third case
- // is handled in Tile::DecodeBlock().
- void SetCurrentFrameSegmentationMap(const ObuFrameHeader& frame_header,
- const SegmentationMap* prev_segment_ids);
+ StatusCode DecodeTiles(const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const Vector<TileBuffer>& tile_buffers,
+ const DecoderState& state,
+ FrameScratchBuffer* frame_scratch_buffer,
+ RefCountedBuffer* current_frame);
+ // Applies film grain synthesis to the |displayable_frame| and stores the film
+ // grain applied frame into |film_grain_frame|. Returns kStatusOk on success.
+ StatusCode ApplyFilmGrain(const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header,
+ const RefCountedBufferPtr& displayable_frame,
+ RefCountedBufferPtr* film_grain_frame,
+ ThreadPool* thread_pool);
- Queue<EncodedFrame> encoded_frames_;
+ bool IsNewSequenceHeader(const ObuParser& obu);
+
+ bool HasFailure() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ return failure_status_ != kStatusOk;
+ }
+
+ // Elements in this queue cannot be moved with std::move since the
+ // |EncodedFrame.temporal_unit| stores a pointer to elements in this queue.
+ Queue<TemporalUnit> temporal_units_;
DecoderState state_;
- ThreadingStrategy threading_strategy_;
- SymbolDecoderContext symbol_decoder_context_;
- // TODO(vigneshv): Only support one buffer for now. Eventually this has to be
- // a vector or an array.
DecoderBuffer buffer_ = {};
- // output_frame_ holds a reference to the output frame on behalf of buffer_.
+ // |output_frame_| holds a reference to the output frame on behalf of
+ // |buffer_|.
RefCountedBufferPtr output_frame_;
- BufferPool buffer_pool_;
- std::unique_ptr<ResidualBufferPool> residual_buffer_pool_;
- AlignedUniquePtr<uint8_t> threaded_window_buffer_;
- size_t threaded_window_buffer_size_ = 0;
- Array2D<TransformSize> inter_transform_sizes_;
- DecoderScratchBufferPool decoder_scratch_buffer_pool_;
+ // Queue of output frames that are to be returned in the DequeueFrame() calls.
+ // If |settings_.output_all_layers| is false, this queue will never contain
+ // more than 1 element. This queue is used only when |is_frame_parallel_| is
+ // false.
+ Queue<RefCountedBufferPtr> output_frame_queue_;
- LoopFilterMask loop_filter_mask_;
+ BufferPool buffer_pool_;
+ WedgeMaskArray wedge_masks_;
+ FrameScratchBufferPool frame_scratch_buffer_pool_;
+
+ // Used to synchronize the accesses into |temporal_units_| in order to update
+ // the "decoded" state of an temporal unit.
+ std::mutex mutex_;
+ std::condition_variable decoded_condvar_;
+ bool is_frame_parallel_;
+ std::unique_ptr<ThreadPool> frame_thread_pool_;
+
+ // In frame parallel mode, there are two primary points of failure:
+ // 1) ParseAndSchedule()
+ // 2) DecodeTiles()
+ // Both of these functions have to respond to the other one failing by
+ // aborting whatever they are doing. This variable is used to accomplish that.
+ // If |failure_status_| is not kStatusOk, then the two functions will try to
+ // abort as early as they can.
+ StatusCode failure_status_ = kStatusOk LIBGAV1_GUARDED_BY(mutex_);
+
+ ObuSequenceHeader sequence_header_ = {};
+ // If true, sequence_header is valid.
+ bool has_sequence_header_ = false;
const DecoderSettings& settings_;
+ bool seen_first_frame_ = false;
};
} // namespace libgav1
diff --git a/libgav1/src/decoder_scratch_buffer.h b/libgav1/src/decoder_scratch_buffer.h
deleted file mode 100644
index 54ee1b7..0000000
--- a/libgav1/src/decoder_scratch_buffer.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright 2019 The libgav1 Authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBGAV1_SRC_DECODER_SCRATCH_BUFFER_H_
-#define LIBGAV1_SRC_DECODER_SCRATCH_BUFFER_H_
-
-#include <cstdint>
-#include <mutex> // NOLINT (unapproved c++11 header)
-
-#include "src/dsp/constants.h"
-#include "src/utils/compiler_attributes.h"
-#include "src/utils/constants.h"
-#include "src/utils/memory.h"
-#include "src/utils/stack.h"
-
-namespace libgav1 {
-
-// Buffer to facilitate decoding a superblock.
-struct DecoderScratchBuffer : public Allocable {
- static constexpr int kBlockDecodedStride = 34;
-
- private:
-#if LIBGAV1_MAX_BITDEPTH >= 10
- static constexpr int kPixelSize = 2;
-#else
- static constexpr int kPixelSize = 1;
-#endif
-
- public:
- // The following prediction modes need a prediction mask:
- // kCompoundPredictionTypeDiffWeighted, kCompoundPredictionTypeWedge,
- // kCompoundPredictionTypeIntra. They are mutually exclusive. This buffer is
- // used to store the prediction mask during the inter prediction process. The
- // mask only needs to be created for the Y plane and is used for the U & V
- // planes.
- alignas(kMaxAlignment) uint8_t
- prediction_mask[kMaxSuperBlockSizeSquareInPixels];
-
- // For each instance of the DecoderScratchBuffer, only one of the following
- // buffers will be used at any given time, so it is ok to share them in a
- // union.
- union {
- // Union usage note: This is used only by functions in the "inter"
- // prediction path.
- //
- // Buffers used for inter prediction process.
- alignas(kMaxAlignment) uint16_t
- prediction_buffer[2][kMaxSuperBlockSizeSquareInPixels];
-
- struct {
- // Union usage note: This is used only by functions in the "intra"
- // prediction path.
- //
- // Buffer used for storing subsampled luma samples needed for CFL
- // prediction. This buffer is used to avoid repetition of the subsampling
- // for the V plane when it is already done for the U plane.
- int16_t cfl_luma_buffer[kCflLumaBufferStride][kCflLumaBufferStride];
-
- // Union usage note: This is used only by the
- // Tile::ReadTransformCoefficients() function (and the helper functions
- // that it calls). This cannot be shared with |cfl_luma_buffer| since
- // |cfl_luma_buffer| has to live across the 3 plane loop in
- // Tile::TransformBlock.
- //
- // Buffer used by Tile::ReadTransformCoefficients() to store the quantized
- // coefficients until the dequantization process is performed.
- int32_t quantized_buffer[kQuantizedCoefficientBufferSize];
- };
- };
-
- // Buffer used for convolve. The maximum size required for this buffer is:
- // maximum block height (with scaling) = 2 * 128 = 256.
- // maximum block stride (with scaling and border aligned to 16) =
- // (2 * 128 + 7 + 9) * pixel_size = 272 * pixel_size.
- alignas(kMaxAlignment) uint8_t
- convolve_block_buffer[256 * 272 * DecoderScratchBuffer::kPixelSize];
-
- // Flag indicating whether the data in |cfl_luma_buffer| is valid.
- bool cfl_luma_buffer_valid;
-
- // Equivalent to BlockDecoded array in the spec. This stores the decoded
- // state of every 4x4 block in a superblock. It has 1 row/column border on
- // all 4 sides (hence the 34x34 dimension instead of 32x32). Note that the
- // spec uses "-1" as an index to access the left and top borders. In the
- // code, we treat the index (1, 1) as equivalent to the spec's (0, 0). So
- // all accesses into this array will be offset by +1 when compared with the
- // spec.
- bool block_decoded[kMaxPlanes][kBlockDecodedStride][kBlockDecodedStride];
-};
-
-class DecoderScratchBufferPool {
- public:
- std::unique_ptr<DecoderScratchBuffer> Get() {
- std::lock_guard<std::mutex> lock(mutex_);
- if (buffers_.Empty()) {
- std::unique_ptr<DecoderScratchBuffer> scratch_buffer(
- new (std::nothrow) DecoderScratchBuffer);
- return scratch_buffer;
- }
- return buffers_.Pop();
- }
-
- void Release(std::unique_ptr<DecoderScratchBuffer> scratch_buffer) {
- std::lock_guard<std::mutex> lock(mutex_);
- buffers_.Push(std::move(scratch_buffer));
- }
-
- private:
- std::mutex mutex_;
- // We will never need more than kMaxThreads scratch buffers since that is the
- // maximum amount of work that will be done at any given time.
- Stack<std::unique_ptr<DecoderScratchBuffer>, kMaxThreads> buffers_
- LIBGAV1_GUARDED_BY(mutex_);
-};
-
-} // namespace libgav1
-
-#endif // LIBGAV1_SRC_DECODER_SCRATCH_BUFFER_H_
diff --git a/libgav1/src/decoder_settings.cc b/libgav1/src/decoder_settings.cc
new file mode 100644
index 0000000..9399073
--- /dev/null
+++ b/libgav1/src/decoder_settings.cc
@@ -0,0 +1,33 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/gav1/decoder_settings.h"
+
+extern "C" {
+
+void Libgav1DecoderSettingsInitDefault(Libgav1DecoderSettings* settings) {
+ settings->threads = 1;
+ settings->frame_parallel = 0; // false
+ settings->blocking_dequeue = 0; // false
+ settings->on_frame_buffer_size_changed = nullptr;
+ settings->get_frame_buffer = nullptr;
+ settings->release_frame_buffer = nullptr;
+ settings->release_input_buffer = nullptr;
+ settings->callback_private_data = nullptr;
+ settings->output_all_layers = 0; // false
+ settings->operating_point = 0;
+ settings->post_filter_mask = 0x1f;
+}
+
+} // extern "C"
diff --git a/libgav1/src/decoder_settings.h b/libgav1/src/decoder_settings.h
deleted file mode 100644
index 6c6f21d..0000000
--- a/libgav1/src/decoder_settings.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright 2019 The libgav1 Authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBGAV1_SRC_DECODER_SETTINGS_H_
-#define LIBGAV1_SRC_DECODER_SETTINGS_H_
-
-#include <cstdint>
-
-#include "src/frame_buffer.h"
-
-// All the declarations in this file are part of the public ABI.
-
-namespace libgav1 {
-
-// Applications must populate this structure before creating a decoder instance.
-struct DecoderSettings {
- // Number of threads to use when decoding. Must be greater than 0. The
- // library will create at most |threads|-1 new threads, the calling thread is
- // considered part of the library's thread count. Defaults to 1 (no new
- // threads will be created).
- int threads = 1;
- // Do frame parallel decoding.
- bool frame_parallel = false;
- // Get frame buffer callback.
- GetFrameBufferCallback get = nullptr;
- // Release frame buffer callback.
- ReleaseFrameBufferCallback release = nullptr;
- // Passed as the private_data argument to the callbacks.
- void* callback_private_data = nullptr;
- // Mask indicating the post processing filters that need to be applied to the
- // reconstructed frame. From LSB:
- // Bit 0: Loop filter (deblocking filter).
- // Bit 1: Cdef.
- // Bit 2: Superres.
- // Bit 3: Loop restoration.
- // Bit 4: Film grain synthesis.
- // All the bits other than the last 5 are ignored.
- uint8_t post_filter_mask = 0x1f;
-};
-
-} // namespace libgav1
-#endif // LIBGAV1_SRC_DECODER_SETTINGS_H_
diff --git a/libgav1/src/decoder_state.h b/libgav1/src/decoder_state.h
new file mode 100644
index 0000000..897c99f
--- /dev/null
+++ b/libgav1/src/decoder_state.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DECODER_STATE_H_
+#define LIBGAV1_SRC_DECODER_STATE_H_
+
+#include <array>
+#include <cstdint>
+
+#include "src/buffer_pool.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+
+struct DecoderState {
+ // Section 7.20. Updates frames in the reference_frame array with
+ // |current_frame|, based on the |refresh_frame_flags| bitmask.
+ void UpdateReferenceFrames(const RefCountedBufferPtr& current_frame,
+ int refresh_frame_flags) {
+ for (int ref_index = 0, mask = refresh_frame_flags; mask != 0;
+ ++ref_index, mask >>= 1) {
+ if ((mask & 1) != 0) {
+ reference_valid[ref_index] = true;
+ reference_frame_id[ref_index] = current_frame_id;
+ reference_frame[ref_index] = current_frame;
+ reference_order_hint[ref_index] = order_hint;
+ }
+ }
+ }
+
+ // Clears all the reference frames.
+ void ClearReferenceFrames() {
+ reference_valid = {};
+ reference_frame_id = {};
+ reference_order_hint = {};
+ for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) {
+ reference_frame[ref_index] = nullptr;
+ }
+ }
+
+ // reference_valid and reference_frame_id are used only if
+ // sequence_header_.frame_id_numbers_present is true.
+ // The reference_valid array is indexed by a reference picture slot number.
+ // A value (boolean) in the array signifies whether the corresponding
+ // reference picture slot is valid for use as a reference picture.
+ std::array<bool, kNumReferenceFrameTypes> reference_valid = {};
+ std::array<uint16_t, kNumReferenceFrameTypes> reference_frame_id = {};
+ // A valid value of current_frame_id is an unsigned integer of at most 16
+ // bits. -1 indicates current_frame_id is not initialized.
+ int current_frame_id = -1;
+ // The RefOrderHint array variable in the spec.
+ std::array<uint8_t, kNumReferenceFrameTypes> reference_order_hint = {};
+ // The OrderHint variable in the spec. Its value comes from either the
+ // order_hint syntax element in the uncompressed header (if
+ // show_existing_frame is false) or RefOrderHint[ frame_to_show_map_idx ]
+ // (if show_existing_frame is true and frame_type is KEY_FRAME). See Section
+ // 5.9.2 and Section 7.4.
+ //
+ // NOTE: When show_existing_frame is false, it is often more convenient to
+ // just use the order_hint field of the frame header as OrderHint. So this
+ // field is mainly used to update the reference_order_hint array in
+ // UpdateReferenceFrames().
+ uint8_t order_hint = 0;
+ // reference_frame_sign_bias[i] (a boolean) specifies the intended direction
+ // of the motion vector in time for each reference frame.
+ // * |false| indicates that the reference frame is a forwards reference (i.e.
+ // the reference frame is expected to be output before the current frame);
+ // * |true| indicates that the reference frame is a backwards reference.
+ // Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not used.
+ std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {};
+ std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame;
+};
+
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DECODER_STATE_H_
diff --git a/libgav1/src/dsp/arm/average_blend_neon.cc b/libgav1/src/dsp/arm/average_blend_neon.cc
index 94fad54..d946d70 100644
--- a/libgav1/src/dsp/arm/average_blend_neon.cc
+++ b/libgav1/src/dsp/arm/average_blend_neon.cc
@@ -13,7 +13,7 @@
// limitations under the License.
#include "src/dsp/average_blend.h"
-#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -24,83 +24,61 @@
#include <cstdint>
#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
#include "src/utils/common.h"
namespace libgav1 {
namespace dsp {
namespace {
-constexpr int kBitdepth8 = 8;
-constexpr int kInterPostRoundBit = 4;
-// An offset to cancel offsets used in compound predictor generation that
-// make intermediate computations non negative.
-const int16x8_t kCompoundRoundOffset =
- vdupq_n_s16((2 << (kBitdepth8 + 4)) + (2 << (kBitdepth8 + 3)));
+constexpr int kInterPostRoundBit =
+ kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
-inline void AverageBlend4Row(const uint16_t* prediction_0,
- const uint16_t* prediction_1, uint8_t* dest) {
- const int16x4_t pred0 = vreinterpret_s16_u16(vld1_u16(prediction_0));
- const int16x4_t pred1 = vreinterpret_s16_u16(vld1_u16(prediction_1));
- int16x4_t res = vadd_s16(pred0, pred1);
- res = vsub_s16(res, vget_low_s16(kCompoundRoundOffset));
- StoreLo4(dest,
- vqrshrun_n_s16(vcombine_s16(res, res), kInterPostRoundBit + 1));
+inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0,
+ const int16_t* prediction_1) {
+ const int16x8_t pred0 = vld1q_s16(prediction_0);
+ const int16x8_t pred1 = vld1q_s16(prediction_1);
+ const int16x8_t res = vaddq_s16(pred0, pred1);
+ return vqrshrun_n_s16(res, kInterPostRoundBit + 1);
}
-inline void AverageBlend8Row(const uint16_t* prediction_0,
- const uint16_t* prediction_1, uint8_t* dest) {
- const int16x8_t pred0 = vreinterpretq_s16_u16(vld1q_u16(prediction_0));
- const int16x8_t pred1 = vreinterpretq_s16_u16(vld1q_u16(prediction_1));
- int16x8_t res = vaddq_s16(pred0, pred1);
- res = vsubq_s16(res, kCompoundRoundOffset);
- vst1_u8(dest, vqrshrun_n_s16(res, kInterPostRoundBit + 1));
-}
-
-inline void AverageBlendLargeRow(const uint16_t* prediction_0,
- const uint16_t* prediction_1, const int width,
+inline void AverageBlendLargeRow(const int16_t* prediction_0,
+ const int16_t* prediction_1, const int width,
uint8_t* dest) {
int x = 0;
do {
- const int16x8_t pred_00 =
- vreinterpretq_s16_u16(vld1q_u16(&prediction_0[x]));
- const int16x8_t pred_01 =
- vreinterpretq_s16_u16(vld1q_u16(&prediction_1[x]));
- int16x8_t res0 = vaddq_s16(pred_00, pred_01);
- res0 = vsubq_s16(res0, kCompoundRoundOffset);
+ const int16x8_t pred_00 = vld1q_s16(&prediction_0[x]);
+ const int16x8_t pred_01 = vld1q_s16(&prediction_1[x]);
+ const int16x8_t res0 = vaddq_s16(pred_00, pred_01);
const uint8x8_t res_out0 = vqrshrun_n_s16(res0, kInterPostRoundBit + 1);
- const int16x8_t pred_10 =
- vreinterpretq_s16_u16(vld1q_u16(&prediction_0[x + 8]));
- const int16x8_t pred_11 =
- vreinterpretq_s16_u16(vld1q_u16(&prediction_1[x + 8]));
- int16x8_t res1 = vaddq_s16(pred_10, pred_11);
- res1 = vsubq_s16(res1, kCompoundRoundOffset);
+ const int16x8_t pred_10 = vld1q_s16(&prediction_0[x + 8]);
+ const int16x8_t pred_11 = vld1q_s16(&prediction_1[x + 8]);
+ const int16x8_t res1 = vaddq_s16(pred_10, pred_11);
const uint8x8_t res_out1 = vqrshrun_n_s16(res1, kInterPostRoundBit + 1);
vst1q_u8(dest + x, vcombine_u8(res_out0, res_out1));
x += 16;
} while (x < width);
}
-void AverageBlend_NEON(const uint16_t* prediction_0,
- const ptrdiff_t prediction_stride_0,
- const uint16_t* prediction_1,
- const ptrdiff_t prediction_stride_1, const int width,
- const int height, void* const dest,
+void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
+ const int width, const int height, void* const dest,
const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
int y = height;
if (width == 4) {
do {
- AverageBlend4Row(prediction_0, prediction_1, dst);
- dst += dest_stride;
- prediction_0 += prediction_stride_0;
- prediction_1 += prediction_stride_1;
+ const uint8x8_t result = AverageBlend8Row(pred_0, pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
- AverageBlend4Row(prediction_0, prediction_1, dst);
+ StoreLo4(dst, result);
dst += dest_stride;
- prediction_0 += prediction_stride_0;
- prediction_1 += prediction_stride_1;
-
+ StoreHi4(dst, result);
+ dst += dest_stride;
y -= 2;
} while (y != 0);
return;
@@ -108,15 +86,15 @@
if (width == 8) {
do {
- AverageBlend8Row(prediction_0, prediction_1, dst);
+ vst1_u8(dst, AverageBlend8Row(pred_0, pred_1));
dst += dest_stride;
- prediction_0 += prediction_stride_0;
- prediction_1 += prediction_stride_1;
+ pred_0 += 8;
+ pred_1 += 8;
- AverageBlend8Row(prediction_0, prediction_1, dst);
+ vst1_u8(dst, AverageBlend8Row(pred_0, pred_1));
dst += dest_stride;
- prediction_0 += prediction_stride_0;
- prediction_1 += prediction_stride_1;
+ pred_0 += 8;
+ pred_1 += 8;
y -= 2;
} while (y != 0);
@@ -124,22 +102,22 @@
}
do {
- AverageBlendLargeRow(prediction_0, prediction_1, width, dst);
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
dst += dest_stride;
- prediction_0 += prediction_stride_0;
- prediction_1 += prediction_stride_1;
+ pred_0 += width;
+ pred_1 += width;
- AverageBlendLargeRow(prediction_0, prediction_1, width, dst);
+ AverageBlendLargeRow(pred_0, pred_1, width, dst);
dst += dest_stride;
- prediction_0 += prediction_stride_0;
- prediction_1 += prediction_stride_1;
+ pred_0 += width;
+ pred_1 += width;
y -= 2;
} while (y != 0);
}
void Init8bpp() {
- Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
dsp->average_blend = AverageBlend_NEON;
}
@@ -151,7 +129,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/average_blend_neon.h b/libgav1/src/dsp/arm/average_blend_neon.h
index 569da64..d13bcd6 100644
--- a/libgav1/src/dsp/arm/average_blend_neon.h
+++ b/libgav1/src/dsp/arm/average_blend_neon.h
@@ -17,8 +17,8 @@
#ifndef LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
#define LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
-#include "src/dsp/cpu.h"
#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
namespace libgav1 {
namespace dsp {
@@ -30,7 +30,7 @@
} // namespace libgav1
#if LIBGAV1_ENABLE_NEON
-#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_NEON
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_AVERAGE_BLEND_NEON_H_
diff --git a/libgav1/src/dsp/arm/cdef_neon.cc b/libgav1/src/dsp/arm/cdef_neon.cc
new file mode 100644
index 0000000..968b0ff
--- /dev/null
+++ b/libgav1/src/dsp/arm/cdef_neon.cc
@@ -0,0 +1,697 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+// for (int j = 0; j < 8; ++j) {
+// const int x = 1;
+// partial[0][i + j] += x;
+// partial[1][i + j / 2] += x;
+// partial[2][i] += x;
+// partial[3][3 + i - j / 2] += x;
+// partial[4][7 + i - j] += x;
+// partial[5][3 - i / 2 + j] += x;
+// partial[6][j] += x;
+// partial[7][i / 2 + j] += x;
+// }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(uint8x8_t* v_src,
+ uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ const uint8x8_t v_zero = vdup_n_u8(0);
+ // 00 01 02 03 04 05 06 07
+ // 00 10 11 12 13 14 15 16
+ *partial_lo = vaddl_u8(v_src[0], vext_u8(v_zero, v_src[1], 7));
+
+ // 00 00 20 21 22 23 24 25
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[2], 6));
+ // 17 00 00 00 00 00 00 00
+ // 26 27 00 00 00 00 00 00
+ *partial_hi =
+ vaddl_u8(vext_u8(v_src[1], v_zero, 7), vext_u8(v_src[2], v_zero, 6));
+
+ // 00 00 00 30 31 32 33 34
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[3], 5));
+ // 35 36 37 00 00 00 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[3], v_zero, 5));
+
+ // 00 00 00 00 40 41 42 43
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[4], 4));
+ // 44 45 46 47 00 00 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[4], v_zero, 4));
+
+ // 00 00 00 00 00 50 51 52
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[5], 3));
+ // 53 54 55 56 57 00 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[5], v_zero, 3));
+
+ // 00 00 00 00 00 00 60 61
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[6], 2));
+ // 62 63 64 65 66 67 00 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[6], v_zero, 2));
+
+ // 00 00 00 00 00 00 00 70
+ *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[7], 1));
+ // 71 72 73 74 75 76 77 00
+ *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[7], v_zero, 1));
+}
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+// Accumulates the direction 1 partial sums for an 8x8 block into
+// |partial_lo|/|partial_hi|: each row is pair-summed (vpadalq_u8) after being
+// shifted into the staircase position shown in the diagram above. Passing a
+// lane-reversed |v_src| yields the direction 3 partials instead.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(uint8x8_t* v_src,
+ uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ uint8x16_t v_d1_temp[8];
+ const uint8x8_t v_zero = vdup_n_u8(0);
+ const uint8x16_t v_zero_16 = vdupq_n_u8(0);
+
+ // Widen each 8-byte row to 16 bytes so it can be shifted with vextq_u8().
+ for (int i = 0; i < 8; ++i) {
+ v_d1_temp[i] = vcombine_u8(v_src[i], v_zero);
+ }
+
+ *partial_lo = *partial_hi = vdupq_n_u16(0);
+ // A0 A1 A2 A3 00 00 00 00
+ *partial_lo = vpadalq_u8(*partial_lo, v_d1_temp[0]);
+
+ // 00 B0 B1 B2 B3 00 00 00
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[1], 14));
+
+ // 00 00 C0 C1 C2 C3 00 00
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[2], 12));
+ // 00 00 00 D0 D1 D2 D3 00
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[3], 10));
+ // 00 00 00 00 E0 E1 E2 E3
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[4], 8));
+
+ // Rows F, G and H straddle the boundary; the spilled elements accumulate
+ // into |partial_hi|.
+ // 00 00 00 00 00 F0 F1 F2
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[5], 6));
+ // F3 00 00 00 00 00 00 00
+ *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[5], v_zero_16, 6));
+
+ // 00 00 00 00 00 00 G0 G1
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[6], 4));
+ // G2 G3 00 00 00 00 00 00
+ *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[6], v_zero_16, 4));
+
+ // 00 00 00 00 00 00 00 H0
+ *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[7], 2));
+ // H1 H2 H3 00 00 00 00 00
+ *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[7], v_zero_16, 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+// Accumulates the direction 7 partial sums: vertical pairs of rows are summed
+// with vaddl_u8() and the pair sums are shifted into the staircase positions
+// shown in the diagram above. Passing a lane-reversed |v_src| yields the
+// direction 5 partials instead.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(uint8x8_t* v_src,
+ uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ const uint16x8_t v_zero = vdupq_n_u16(0);
+ uint16x8_t v_pair_add[4];
+ // Add vertical source pairs.
+ v_pair_add[0] = vaddl_u8(v_src[0], v_src[1]);
+ v_pair_add[1] = vaddl_u8(v_src[2], v_src[3]);
+ v_pair_add[2] = vaddl_u8(v_src[4], v_src[5]);
+ v_pair_add[3] = vaddl_u8(v_src[6], v_src[7]);
+
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ *partial_lo = v_pair_add[0];
+ // 00 00 00 00 00 00 00 00
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = vdupq_n_u16(0);
+
+ // Each later pair shifts right by one more lane; the elements that fall off
+ // the end accumulate into |partial_hi|.
+ // 00 20 21 22 23 24 25 26
+ // 00 30 31 32 33 34 35 36
+ *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[1], 7));
+ // 27 00 00 00 00 00 00 00
+ // 37 00 00 00 00 00 00 00
+ *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[1], v_zero, 7));
+
+ // 00 00 40 41 42 43 44 45
+ // 00 00 50 51 52 53 54 55
+ *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[2], 6));
+ // 46 47 00 00 00 00 00 00
+ // 56 57 00 00 00 00 00 00
+ *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[2], v_zero, 6));
+
+ // 00 00 00 60 61 62 63 64
+ // 00 00 00 70 71 72 73 74
+ *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[3], 5));
+ // 65 66 67 00 00 00 00 00
+ // 75 76 77 00 00 00 00 00
+ *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[3], v_zero, 5));
+}
+
+// Loads an 8x8 block from |source| and computes the partial sums for all
+// eight CDEF directions into |partial_lo|/|partial_hi| (arrays of 8 vectors
+// indexed by direction). Directions 4, 3 and 5 reuse the direction 0, 1 and 7
+// helpers with the source rows lane-reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial(const void* const source,
+ ptrdiff_t stride, uint16x8_t* partial_lo,
+ uint16x8_t* partial_hi) {
+ const auto* src = static_cast<const uint8_t*>(source);
+
+ // 8x8 input
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ uint8x8_t v_src[8];
+ for (int i = 0; i < 8; ++i) {
+ v_src[i] = vld1_u8(src);
+ src += stride;
+ }
+
+ // partial for direction 2
+ // --------------------------------------------------------------------------
+ // partial[2][i] += x;
+ // 00 10 20 30 40 50 60 70 00 00 00 00 00 00 00 00
+ // 01 11 21 31 41 51 61 71 00 00 00 00 00 00 00 00
+ // 02 12 22 32 42 52 62 72 00 00 00 00 00 00 00 00
+ // 03 13 23 33 43 53 63 73 00 00 00 00 00 00 00 00
+ // 04 14 24 34 44 54 64 74 00 00 00 00 00 00 00 00
+ // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
+ // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
+ // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
+ // Each lane of partial_lo[2] holds one full row sum; all 8 lanes are
+ // written, so the initial (uninitialized) contents are never observed.
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), partial_lo[2], 0);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[4]), partial_lo[2], 4);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[5]), partial_lo[2], 5);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[6]), partial_lo[2], 6);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[7]), partial_lo[2], 7);
+
+ // partial for direction 6
+ // --------------------------------------------------------------------------
+ // partial[6][j] += x;
+ // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 00
+ // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 00
+ // 20 21 22 23 24 25 26 27 00 00 00 00 00 00 00 00
+ // 30 31 32 33 34 35 36 37 00 00 00 00 00 00 00 00
+ // 40 41 42 43 44 45 46 47 00 00 00 00 00 00 00 00
+ // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
+ // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
+ // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
+ // Lane j of partial_lo[6] holds the sum of column j.
+ const uint8x8_t v_zero = vdup_n_u8(0);
+ partial_lo[6] = vaddl_u8(v_zero, v_src[0]);
+ for (int i = 1; i < 8; ++i) {
+ partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]);
+ }
+
+ // partial for direction 0
+ AddPartial_D0_D4(v_src, &partial_lo[0], &partial_hi[0]);
+
+ // partial for direction 1
+ AddPartial_D1_D3(v_src, &partial_lo[1], &partial_hi[1]);
+
+ // partial for direction 7
+ AddPartial_D5_D7(v_src, &partial_lo[7], &partial_hi[7]);
+
+ // Reverse each row to reuse the helpers for the mirrored directions.
+ uint8x8_t v_src_reverse[8];
+ for (int i = 0; i < 8; ++i) {
+ v_src_reverse[i] = vrev64_u8(v_src[i]);
+ }
+
+ // partial for direction 4
+ AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
+
+ // partial for direction 3
+ AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
+
+ // partial for direction 5
+ AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
+}
+
+// Widening elementwise square: returns each uint16 lane of |a| multiplied by
+// itself as a uint32 lane.
+uint32x4_t Square(uint16x4_t a) { return vmull_u16(a, a); }
+
+// Returns a + b * b (widening multiply-accumulate of the square of |b|).
+uint32x4_t SquareAccumulate(uint32x4_t a, uint16x4_t b) {
+ return vmlal_u16(a, b, b);
+}
+
+// |cost[0]| and |cost[4]| square the input and sum with the corresponding
+// element from the other end of the vector:
+// |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+// kCdefDivisionTable[i + 1];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
+// Because everything is being summed into a single value the distributive
+// property allows us to mirror the division table and accumulate once.
+uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b,
+ const uint32x4_t division_table[4]) {
+ // |division_table| holds 16 entries of kCdefDivisionTable (mirrored by the
+ // caller's table layout), so one multiply-accumulate pass over |a| and |b|
+ // covers both halves of the sum described above.
+ uint32x4_t c = vmulq_u32(Square(vget_low_u16(a)), division_table[0]);
+ c = vmlaq_u32(c, Square(vget_high_u16(a)), division_table[1]);
+ c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[2]);
+ c = vmlaq_u32(c, Square(vget_high_u16(b)), division_table[3]);
+ return SumVector(c);
+}
+
+// |cost[2]| and |cost[6]| square the input and accumulate:
+// cost[2] += Square(partial[2][i])
+uint32_t SquareAccumulate(const uint16x8_t a) {
+ uint32x4_t c = Square(vget_low_u16(a));
+ c = SquareAccumulate(c, vget_high_u16(a));
+ // Directions 2 and 6 weight every element by the same table entry.
+ c = vmulq_n_u32(c, kCdefDivisionTable[7]);
+ return SumVector(c);
+}
+
+// Computes the cost for the odd directions (1, 3, 5, 7). |a| and |b| are the
+// low and high partial-sum vectors. |mask| keeps only lane 3 of the low half
+// for the constant-weight term; lanes 0-2 are weighted via |division_table|
+// (loaded from kCdefDivisionTableOdd) instead.
+uint32_t CostOdd(const uint16x8_t a, const uint16x8_t b, const uint32x4_t mask,
+ const uint32x4_t division_table[2]) {
+ // Remove elements 0-2.
+ uint32x4_t c = vandq_u32(mask, Square(vget_low_u16(a)));
+ c = vaddq_u32(c, Square(vget_high_u16(a)));
+ c = vmulq_n_u32(c, kCdefDivisionTable[7]);
+
+ c = vmlaq_u32(c, Square(vget_low_u16(a)), division_table[0]);
+ c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[1]);
+ return SumVector(c);
+}
+
+// Computes the dominant edge |direction| (0-7) of the 8x8 block at |source|
+// by comparing the per-direction costs, and a |variance| that contrasts the
+// best direction with the perpendicular one.
+void CdefDirection_NEON(const void* const source, ptrdiff_t stride,
+ int* const direction, int* const variance) {
+ assert(direction != nullptr);
+ assert(variance != nullptr);
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t cost[8];
+ uint16x8_t partial_lo[8], partial_hi[8];
+
+ AddPartial(src, stride, partial_lo, partial_hi);
+
+ cost[2] = SquareAccumulate(partial_lo[2]);
+ cost[6] = SquareAccumulate(partial_lo[6]);
+
+ // 16 entries of the division table; see Cost0Or4() for the mirroring trick.
+ const uint32x4_t division_table[4] = {
+ vld1q_u32(kCdefDivisionTable), vld1q_u32(kCdefDivisionTable + 4),
+ vld1q_u32(kCdefDivisionTable + 8), vld1q_u32(kCdefDivisionTable + 12)};
+
+ cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
+ cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
+
+ const uint32x4_t division_table_odd[2] = {
+ vld1q_u32(kCdefDivisionTableOdd), vld1q_u32(kCdefDivisionTableOdd + 4)};
+
+ // Keeps only lane 3; CostOdd() uses it to drop elements 0-2.
+ const uint32x4_t element_3_mask = {0, 0, 0, static_cast<uint32_t>(-1)};
+
+ cost[1] =
+ CostOdd(partial_lo[1], partial_hi[1], element_3_mask, division_table_odd);
+ cost[3] =
+ CostOdd(partial_lo[3], partial_hi[3], element_3_mask, division_table_odd);
+ cost[5] =
+ CostOdd(partial_lo[5], partial_hi[5], element_3_mask, division_table_odd);
+ cost[7] =
+ CostOdd(partial_lo[7], partial_hi[7], element_3_mask, division_table_odd);
+
+ // The direction with the largest cost wins.
+ uint32_t best_cost = 0;
+ *direction = 0;
+ for (int i = 0; i < 8; ++i) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ *direction = i;
+ }
+ }
+ // (*direction + 4) & 7 is the direction perpendicular to the winner.
+ *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+// Load 4 vectors based on the given |direction|: the two tap offsets for
+// |direction| and their negations (the mirrored taps on the other side of
+// the center pixel).
+void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
+ uint16x8_t* output, const int direction) {
+ // Each |direction| describes a different set of source values. Expand this
+ // set by negating each set. For |direction| == 0 this gives a diagonal line
+ // from top right to bottom left. The first value is y, the second x. Negative
+ // y values move up.
+ // a b c d
+ // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+ // c
+ // a
+ // 0
+ // b
+ // d
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = vld1q_u16(src + y_0 * stride + x_0);
+ output[1] = vld1q_u16(src - y_0 * stride - x_0);
+ output[2] = vld1q_u16(src + y_1 * stride + x_1);
+ output[3] = vld1q_u16(src - y_1 * stride - x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time: each output vector packs the 4-wide tap values for the
+// current row (low half) and the next row (high half).
+void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
+ uint16x8_t* output, const int direction) {
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = vcombine_u16(vld1_u16(src + y_0 * stride + x_0),
+ vld1_u16(src + y_0 * stride + stride + x_0));
+ output[1] = vcombine_u16(vld1_u16(src - y_0 * stride - x_0),
+ vld1_u16(src - y_0 * stride + stride - x_0));
+ output[2] = vcombine_u16(vld1_u16(src + y_1 * stride + x_1),
+ vld1_u16(src + y_1 * stride + stride + x_1));
+ output[3] = vcombine_u16(vld1_u16(src - y_1 * stride - x_1),
+ vld1_u16(src - y_1 * stride + stride - x_1));
+}
+
+// Returns the signed, constrained difference (|pixel| - |reference|) used to
+// weight a CDEF tap. |damping| holds negated shift amounts (see the callers),
+// so vshlq_u16() below performs a right shift.
+int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
+ const uint16x8_t threshold, const int16x8_t damping) {
+ // If reference > pixel, the difference will be negative, so convert to 0 or
+ // -1.
+ const uint16x8_t sign = vcgtq_u16(reference, pixel);
+ const uint16x8_t abs_diff = vabdq_u16(pixel, reference);
+ const uint16x8_t shifted_diff = vshlq_u16(abs_diff, damping);
+ // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+ // [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always be
+ // larger than threshold. Subtract using saturation will return 0 when pixel
+ // == kCdefLargeValue.
+ static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+ const uint16x8_t thresh_minus_shifted_diff =
+ vqsubq_u16(threshold, shifted_diff);
+ const uint16x8_t clamp_abs_diff =
+ vminq_u16(thresh_minus_shifted_diff, abs_diff);
+ // Restore the sign.
+ return vreinterpretq_s16_u16(
+ vsubq_u16(veorq_u16(clamp_abs_diff, sign), sign));
+}
+
+// Applies the CDEF filter to a |width| x |height| block. |src| points into a
+// 16-bit intermediate buffer whose out-of-frame pixels carry kCdefLargeValue
+// (see Constrain()); the filtered 8-bit result is written to |dest|. When
+// both primary and secondary taps are enabled the result is clipped to the
+// min/max of the sampled taps. |width| == 4 processes two rows per loop
+// iteration.
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_NEON(const uint16_t* src, const ptrdiff_t src_stride,
+ const int height, const int primary_strength,
+ const int secondary_strength, const int damping,
+ const int direction, void* dest,
+ const ptrdiff_t dst_stride) {
+ static_assert(width == 8 || width == 4, "");
+ static_assert(enable_primary || enable_secondary, "");
+ constexpr bool clipping_required = enable_primary && enable_secondary;
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint16x8_t cdef_large_value_mask =
+ vdupq_n_u16(static_cast<uint16_t>(~kCdefLargeValue));
+ const uint16x8_t primary_threshold = vdupq_n_u16(primary_strength);
+ const uint16x8_t secondary_threshold = vdupq_n_u16(secondary_strength);
+
+ // Negated shift amounts: Constrain()'s vshlq_u16() shifts right by these.
+ int16x8_t primary_damping_shift, secondary_damping_shift;
+
+ // FloorLog2() requires input to be > 0.
+ // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+ if (enable_primary) {
+ // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+ // for UV filtering.
+ primary_damping_shift =
+ vdupq_n_s16(-std::max(0, damping - FloorLog2(primary_strength)));
+ }
+ if (enable_secondary) {
+ // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+ // necessary.
+ assert(damping - FloorLog2(secondary_strength) >= 0);
+ secondary_damping_shift =
+ vdupq_n_s16(-(damping - FloorLog2(secondary_strength)));
+ }
+
+ // Primary tap weights depend on the parity of |primary_strength|.
+ const int primary_tap_0 = kCdefPrimaryTaps[primary_strength & 1][0];
+ const int primary_tap_1 = kCdefPrimaryTaps[primary_strength & 1][1];
+
+ int y = height;
+ do {
+ uint16x8_t pixel;
+ if (width == 8) {
+ pixel = vld1q_u16(src);
+ } else {
+ // Two 4-wide rows per vector.
+ pixel = vcombine_u16(vld1_u16(src), vld1_u16(src + src_stride));
+ }
+
+ uint16x8_t min = pixel;
+ uint16x8_t max = pixel;
+ int16x8_t sum;
+
+ if (enable_primary) {
+ // Primary |direction|.
+ uint16x8_t primary_val[4];
+ if (width == 8) {
+ LoadDirection(src, src_stride, primary_val, direction);
+ } else {
+ LoadDirection4(src, src_stride, primary_val, direction);
+ }
+
+ if (clipping_required) {
+ min = vminq_u16(min, primary_val[0]);
+ min = vminq_u16(min, primary_val[1]);
+ min = vminq_u16(min, primary_val[2]);
+ min = vminq_u16(min, primary_val[3]);
+
+ // The source is 16 bits, however, we only really care about the lower
+ // 8 bits. The upper 8 bits contain the "large" flag. After the final
+ // primary max has been calculated, zero out the upper 8 bits. Use this
+ // to find the "16 bit" max.
+ const uint8x16_t max_p01 =
+ vmaxq_u8(vreinterpretq_u8_u16(primary_val[0]),
+ vreinterpretq_u8_u16(primary_val[1]));
+ const uint8x16_t max_p23 =
+ vmaxq_u8(vreinterpretq_u8_u16(primary_val[2]),
+ vreinterpretq_u8_u16(primary_val[3]));
+ const uint16x8_t max_p =
+ vreinterpretq_u16_u8(vmaxq_u8(max_p01, max_p23));
+ max = vmaxq_u16(max, vandq_u16(max_p, cdef_large_value_mask));
+ }
+
+ // Accumulate the weighted, constrained primary taps.
+ sum = Constrain(primary_val[0], pixel, primary_threshold,
+ primary_damping_shift);
+ sum = vmulq_n_s16(sum, primary_tap_0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[1], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[2], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(primary_val[3], pixel, primary_threshold,
+ primary_damping_shift),
+ primary_tap_1);
+ } else {
+ sum = vdupq_n_s16(0);
+ }
+
+ if (enable_secondary) {
+ // Secondary |direction| values (+/- 2). Clamp |direction|.
+ uint16x8_t secondary_val[8];
+ if (width == 8) {
+ LoadDirection(src, src_stride, secondary_val, direction + 2);
+ LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
+ } else {
+ LoadDirection4(src, src_stride, secondary_val, direction + 2);
+ LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
+ }
+
+ if (clipping_required) {
+ min = vminq_u16(min, secondary_val[0]);
+ min = vminq_u16(min, secondary_val[1]);
+ min = vminq_u16(min, secondary_val[2]);
+ min = vminq_u16(min, secondary_val[3]);
+ min = vminq_u16(min, secondary_val[4]);
+ min = vminq_u16(min, secondary_val[5]);
+ min = vminq_u16(min, secondary_val[6]);
+ min = vminq_u16(min, secondary_val[7]);
+
+ // Same "large" flag handling as the primary taps above.
+ const uint8x16_t max_s01 =
+ vmaxq_u8(vreinterpretq_u8_u16(secondary_val[0]),
+ vreinterpretq_u8_u16(secondary_val[1]));
+ const uint8x16_t max_s23 =
+ vmaxq_u8(vreinterpretq_u8_u16(secondary_val[2]),
+ vreinterpretq_u8_u16(secondary_val[3]));
+ const uint8x16_t max_s45 =
+ vmaxq_u8(vreinterpretq_u8_u16(secondary_val[4]),
+ vreinterpretq_u8_u16(secondary_val[5]));
+ const uint8x16_t max_s67 =
+ vmaxq_u8(vreinterpretq_u8_u16(secondary_val[6]),
+ vreinterpretq_u8_u16(secondary_val[7]));
+ const uint16x8_t max_s = vreinterpretq_u16_u8(
+ vmaxq_u8(vmaxq_u8(max_s01, max_s23), vmaxq_u8(max_s45, max_s67)));
+ max = vmaxq_u16(max, vandq_u16(max_s, cdef_large_value_mask));
+ }
+
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[0], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[1], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[2], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[3], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[4], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[5], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap0);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[6], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ sum = vmlaq_n_s16(sum,
+ Constrain(secondary_val[7], pixel, secondary_threshold,
+ secondary_damping_shift),
+ kCdefSecondaryTap1);
+ }
+ // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max))
+ const int16x8_t sum_lt_0 = vshrq_n_s16(sum, 15);
+ sum = vaddq_s16(sum, sum_lt_0);
+ // vrsraq_n_s16() supplies the rounding (+8) and the >> 4.
+ int16x8_t result = vrsraq_n_s16(vreinterpretq_s16_u16(pixel), sum, 4);
+ if (clipping_required) {
+ result = vminq_s16(result, vreinterpretq_s16_u16(max));
+ result = vmaxq_s16(result, vreinterpretq_s16_u16(min));
+ }
+
+ const uint8x8_t dst_pixel = vqmovun_s16(result);
+ if (width == 8) {
+ src += src_stride;
+ vst1_u8(dst, dst_pixel);
+ dst += dst_stride;
+ --y;
+ } else {
+ // Store the two packed rows.
+ src += src_stride << 1;
+ StoreLo4(dst, dst_pixel);
+ dst += dst_stride;
+ StoreHi4(dst, dst_pixel);
+ dst += dst_stride;
+ y -= 2;
+ }
+ } while (y != 0);
+}
+
+// Registers the NEON CDEF functions in the 8bpp dispatch table.
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ dsp->cdef_direction = CdefDirection_NEON;
+ // cdef_filters[0] covers 4-wide blocks, cdef_filters[1] 8-wide blocks.
+ // Second index: [0] primary + secondary, [1] primary only, [2] secondary
+ // only.
+ dsp->cdef_filters[0][0] = CdefFilter_NEON<4>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_NEON<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] = CdefFilter_NEON<4, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_NEON<8>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_NEON<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] = CdefFilter_NEON<8, /*enable_primary=*/false>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void CdefInit_NEON() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON
diff --git a/libgav1/src/dsp/arm/cdef_neon.h b/libgav1/src/dsp/arm/cdef_neon.h
new file mode 100644
index 0000000..53d5f86
--- /dev/null
+++ b/libgav1/src/dsp/arm/cdef_neon.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2019 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_CDEF_NEON_H_
diff --git a/libgav1/src/dsp/arm/common_neon.h b/libgav1/src/dsp/arm/common_neon.h
index e0667f9..e8367ab 100644
--- a/libgav1/src/dsp/arm/common_neon.h
+++ b/libgav1/src/dsp/arm/common_neon.h
@@ -17,7 +17,7 @@
#ifndef LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
#define LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
-#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -29,6 +29,8 @@
#if 0
#include <cstdio>
+#include "absl/strings/str_cat.h"
+
constexpr bool kEnablePrintRegs = true;
union DebugRegister {
@@ -82,6 +84,16 @@
}
}
+inline void PrintReg(const int32x4x2_t val, const std::string& name) {
+ DebugRegisterQ r;
+ vst1q_u32(r.u32, val.val[0]);
+ const std::string name0 = absl::StrCat(name, ".val[0]").c_str();
+ PrintVectQ(r, name0.c_str(), 32);
+ vst1q_u32(r.u32, val.val[1]);
+ const std::string name1 = absl::StrCat(name, ".val[1]").c_str();
+ PrintVectQ(r, name1.c_str(), 32);
+}
+
inline void PrintReg(const uint32x4_t val, const char* name) {
DebugRegisterQ r;
vst1q_u32(r.u32, val);
@@ -180,49 +192,89 @@
//------------------------------------------------------------------------------
// Load functions.
-// Load 4 uint8_t values into the low half of a uint8x8_t register.
-inline uint8x8_t LoadLo4(const uint8_t* const buf, uint8x8_t val) {
- uint32_t temp;
- memcpy(&temp, buf, 4);
- return vreinterpret_u8_u32(vld1_lane_u32(&temp, vreinterpret_u32_u8(val), 0));
+// Load 2 uint8_t values into lanes 0 and 1. Zeros the register before loading
+// the values. Use caution when using this in loops because it will re-zero the
+// register before loading on every iteration.
+inline uint8x8_t Load2(const void* const buf) {
+ const uint16x4_t zero = vdup_n_u16(0);
+ uint16_t temp;
+ memcpy(&temp, buf, 2);
+ return vreinterpret_u8_u16(vld1_lane_u16(&temp, zero, 0));
}
-// Load 4 uint8_t values into the high half of a uint8x8_t register.
-inline uint8x8_t LoadHi4(const uint8_t* const buf, uint8x8_t val) {
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline uint8x8_t Load2(const void* const buf, uint8x8_t val) {
+ uint16_t temp;
+ memcpy(&temp, buf, 2);
+ return vreinterpret_u8_u16(
+ vld1_lane_u16(&temp, vreinterpret_u16_u8(val), lane));
+}
+
+// Load 4 uint8_t values into the low half of a uint8x8_t register. Zeros the
+// register before loading the values. Use caution when using this in loops
+// because it will re-zero the register before loading on every iteration.
+inline uint8x8_t Load4(const void* const buf) {
+ const uint32x2_t zero = vdup_n_u32(0);
uint32_t temp;
memcpy(&temp, buf, 4);
- return vreinterpret_u8_u32(vld1_lane_u32(&temp, vreinterpret_u32_u8(val), 1));
+ return vreinterpret_u8_u32(vld1_lane_u32(&temp, zero, 0));
+}
+
+// Load 4 uint8_t values into 4 lanes starting with |lane| * 4.
+template <int lane>
+inline uint8x8_t Load4(const void* const buf, uint8x8_t val) {
+ uint32_t temp;
+ memcpy(&temp, buf, 4);
+ return vreinterpret_u8_u32(
+ vld1_lane_u32(&temp, vreinterpret_u32_u8(val), lane));
}
//------------------------------------------------------------------------------
// Store functions.
// Propagate type information to the compiler. Without this the compiler may
-// assume the required alignment of uint32_t (4 bytes) and add alignment hints
-// to the memory access.
-inline void Uint32ToMem(uint8_t* const buf, uint32_t val) {
- memcpy(buf, &val, 4);
+// assume the required alignment of the type (4 bytes in the case of uint32_t)
+// and add alignment hints to the memory access.
+template <typename T>
+inline void ValueToMem(void* const buf, T val) {
+ memcpy(buf, &val, sizeof(val));
}
-inline void Uint32ToMem(uint16_t* const buf, uint32_t val) {
- memcpy(buf, &val, 4);
+// Store 4 int8_t values from the low half of an int8x8_t register.
+inline void StoreLo4(void* const buf, const int8x8_t val) {
+ ValueToMem<int32_t>(buf, vget_lane_s32(vreinterpret_s32_s8(val), 0));
}
// Store 4 uint8_t values from the low half of a uint8x8_t register.
-inline void StoreLo4(uint8_t* const buf, const uint8x8_t val) {
- Uint32ToMem(buf, vget_lane_u32(vreinterpret_u32_u8(val), 0));
+inline void StoreLo4(void* const buf, const uint8x8_t val) {
+ ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 0));
}
// Store 4 uint8_t values from the high half of a uint8x8_t register.
-inline void StoreHi4(uint8_t* const buf, const uint8x8_t val) {
- Uint32ToMem(buf, vget_lane_u32(vreinterpret_u32_u8(val), 1));
+inline void StoreHi4(void* const buf, const uint8x8_t val) {
+ ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 1));
+}
+
+// Store 2 uint8_t values from |lane| * 2 and |lane| * 2 + 1 of a uint8x8_t
+// register.
+template <int lane>
+inline void Store2(void* const buf, const uint8x8_t val) {
+ ValueToMem<uint16_t>(buf, vget_lane_u16(vreinterpret_u16_u8(val), lane));
}
// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x8_t
// register.
template <int lane>
-inline void Store2(uint16_t* const buf, const uint16x8_t val) {
- Uint32ToMem(buf, vgetq_lane_u32(vreinterpretq_u32_u16(val), lane));
+inline void Store2(void* const buf, const uint16x8_t val) {
+ ValueToMem<uint32_t>(buf, vgetq_lane_u32(vreinterpretq_u32_u16(val), lane));
+}
+
+// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x4_t
+// register.
+template <int lane>
+inline void Store2(uint16_t* const buf, const uint16x4_t val) {
+ ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
}
//------------------------------------------------------------------------------
@@ -230,6 +282,11 @@
// vshXX_n_XX() requires an immediate.
template <int shift>
+inline uint8x8_t LeftShift(const uint8x8_t vector) {
+ return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
+}
+
+template <int shift>
inline uint8x8_t RightShift(const uint8x8_t vector) {
return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
}
@@ -249,6 +306,16 @@
#endif
}
+// Shim vqtbl1_s8 for armv7.
+inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
+#if defined(__aarch64__)
+ return vqtbl1_s8(a, index);
+#else
+ const int8x8x2_t b = {vget_low_s8(a), vget_high_s8(a)};
+ return vtbl2_s8(b, vreinterpret_s8_u8(index));
+#endif
+}
+
//------------------------------------------------------------------------------
// Interleave.
@@ -307,6 +374,30 @@
}
//------------------------------------------------------------------------------
+// Sum.
+
+inline uint16_t SumVector(const uint8x8_t a) {
+#if defined(__aarch64__)
+ return vaddlv_u8(a);
+#else
+ const uint16x4_t c = vpaddl_u8(a);
+ const uint32x2_t d = vpaddl_u16(c);
+ const uint64x1_t e = vpaddl_u32(d);
+ return static_cast<uint16_t>(vget_lane_u64(e, 0));
+#endif // defined(__aarch64__)
+}
+
+inline uint32_t SumVector(const uint32x4_t a) {
+#if defined(__aarch64__)
+ return vaddvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
+ return static_cast<uint32_t>(vget_lane_u64(c, 0));
+#endif
+}
+
+//------------------------------------------------------------------------------
// Transpose.
// Transpose 32 bit elements such that:
@@ -497,76 +588,24 @@
}
// Input:
-// a0: 00 01 02 03 04 05 06 07
-// a1: 10 11 12 13 14 15 16 17
-// a2: 20 21 22 23 24 25 26 27
-// a3: 30 31 32 33 34 35 36 37
-// a4: 40 41 42 43 44 45 46 47
-// a5: 50 51 52 53 54 55 56 57
-// a6: 60 61 62 63 64 65 66 67
-// a7: 70 71 72 73 74 75 76 77
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// a[4]: 40 41 42 43 44 45 46 47
+// a[5]: 50 51 52 53 54 55 56 57
+// a[6]: 60 61 62 63 64 65 66 67
+// a[7]: 70 71 72 73 74 75 76 77
// Output:
-// a0: 00 10 20 30 40 50 60 70
-// a1: 01 11 21 31 41 51 61 71
-// a2: 02 12 22 32 42 52 62 72
-// a3: 03 13 23 33 43 53 63 73
-// a4: 04 14 24 34 44 54 64 74
-// a5: 05 15 25 35 45 55 65 75
-// a6: 06 16 26 36 46 56 66 76
-// a7: 07 17 27 37 47 57 67 77
-inline void Transpose8x8(int16x8_t* a0, int16x8_t* a1, int16x8_t* a2,
- int16x8_t* a3, int16x8_t* a4, int16x8_t* a5,
- int16x8_t* a6, int16x8_t* a7) {
- const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
- const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
- const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
- const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
-
- const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
- vreinterpretq_s32_s16(b1.val[0]));
- const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
- vreinterpretq_s32_s16(b1.val[1]));
- const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
- vreinterpretq_s32_s16(b3.val[0]));
- const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
- vreinterpretq_s32_s16(b3.val[1]));
-
- const int16x8x2_t d0 = VtrnqS64(c0.val[0], c2.val[0]);
- const int16x8x2_t d1 = VtrnqS64(c1.val[0], c3.val[0]);
- const int16x8x2_t d2 = VtrnqS64(c0.val[1], c2.val[1]);
- const int16x8x2_t d3 = VtrnqS64(c1.val[1], c3.val[1]);
-
- *a0 = d0.val[0];
- *a1 = d1.val[0];
- *a2 = d2.val[0];
- *a3 = d3.val[0];
- *a4 = d0.val[1];
- *a5 = d1.val[1];
- *a6 = d2.val[1];
- *a7 = d3.val[1];
-}
-
-// Input:
-// a0: 00 01 02 03 04 05 06 07
-// a1: 10 11 12 13 14 15 16 17
-// a2: 20 21 22 23 24 25 26 27
-// a3: 30 31 32 33 34 35 36 37
-// a4: 40 41 42 43 44 45 46 47
-// a5: 50 51 52 53 54 55 56 57
-// a6: 60 61 62 63 64 65 66 67
-// a7: 70 71 72 73 74 75 76 77
-
-// Output:
-// a0: 00 10 20 30 40 50 60 70
-// a1: 01 11 21 31 41 51 61 71
-// a2: 02 12 22 32 42 52 62 72
-// a3: 03 13 23 33 43 53 63 73
-// a4: 04 14 24 34 44 54 64 74
-// a5: 05 15 25 35 45 55 65 75
-// a6: 06 16 26 36 46 56 66 76
-// a7: 07 17 27 37 47 57 67 77
-// TODO(johannkoenig): Switch users of the above transpose to this one.
+// a[0]: 00 10 20 30 40 50 60 70
+// a[1]: 01 11 21 31 41 51 61 71
+// a[2]: 02 12 22 32 42 52 62 72
+// a[3]: 03 13 23 33 43 53 63 73
+// a[4]: 04 14 24 34 44 54 64 74
+// a[5]: 05 15 25 35 45 55 65 75
+// a[6]: 06 16 26 36 46 56 66 76
+// a[7]: 07 17 27 37 47 57 67 77
inline void Transpose8x8(int16x8_t a[8]) {
const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
@@ -628,125 +667,8 @@
a[7] = d3.val[1];
}
-// Input:
-// i0: 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
-// i1: 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f
-// i2: 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
-// i3: 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f
-// i4: 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f
-// i5: 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f
-// i6: 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f
-// i7: 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f
-
-// Output:
-// o00: 00 10 20 30 40 50 60 70
-// o01: 01 11 21 31 41 51 61 71
-// o02: 02 12 22 32 42 52 62 72
-// o03: 03 13 23 33 43 53 63 73
-// o04: 04 14 24 34 44 54 64 74
-// o05: 05 15 25 35 45 55 65 75
-// o06: 06 16 26 36 46 56 66 76
-// o07: 07 17 27 37 47 57 67 77
-// o08: 08 18 28 38 48 58 68 78
-// o09: 09 19 29 39 49 59 69 79
-// o0a: 0a 1a 2a 3a 4a 5a 6a 7a
-// o0b: 0b 1b 2b 3b 4b 5b 6b 7b
-// o0c: 0c 1c 2c 3c 4c 5c 6c 7c
-// o0d: 0d 1d 2d 3d 4d 5d 6d 7d
-// o0e: 0e 1e 2e 3e 4e 5e 6e 7e
-// o0f: 0f 1f 2f 3f 4f 5f 6f 7f
-inline void Transpose16x8(const uint8x16_t i0, const uint8x16_t i1,
- const uint8x16_t i2, const uint8x16_t i3,
- const uint8x16_t i4, const uint8x16_t i5,
- const uint8x16_t i6, const uint8x16_t i7,
- uint8x8_t* o00, uint8x8_t* o01, uint8x8_t* o02,
- uint8x8_t* o03, uint8x8_t* o04, uint8x8_t* o05,
- uint8x8_t* o06, uint8x8_t* o07, uint8x8_t* o08,
- uint8x8_t* o09, uint8x8_t* o10, uint8x8_t* o11,
- uint8x8_t* o12, uint8x8_t* o13, uint8x8_t* o14,
- uint8x8_t* o15) {
- const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
- const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
- const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
- const uint8x16x2_t b3 = vtrnq_u8(i6, i7);
-
- const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
- vreinterpretq_u16_u8(b1.val[0]));
- const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
- vreinterpretq_u16_u8(b1.val[1]));
- const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
- vreinterpretq_u16_u8(b3.val[0]));
- const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
- vreinterpretq_u16_u8(b3.val[1]));
-
- const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
- vreinterpretq_u32_u16(c2.val[0]));
- const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
- vreinterpretq_u32_u16(c2.val[1]));
- const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
- vreinterpretq_u32_u16(c3.val[0]));
- const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
- vreinterpretq_u32_u16(c3.val[1]));
-
- *o00 = vget_low_u8(vreinterpretq_u8_u32(d0.val[0]));
- *o01 = vget_low_u8(vreinterpretq_u8_u32(d2.val[0]));
- *o02 = vget_low_u8(vreinterpretq_u8_u32(d1.val[0]));
- *o03 = vget_low_u8(vreinterpretq_u8_u32(d3.val[0]));
- *o04 = vget_low_u8(vreinterpretq_u8_u32(d0.val[1]));
- *o05 = vget_low_u8(vreinterpretq_u8_u32(d2.val[1]));
- *o06 = vget_low_u8(vreinterpretq_u8_u32(d1.val[1]));
- *o07 = vget_low_u8(vreinterpretq_u8_u32(d3.val[1]));
- *o08 = vget_high_u8(vreinterpretq_u8_u32(d0.val[0]));
- *o09 = vget_high_u8(vreinterpretq_u8_u32(d2.val[0]));
- *o10 = vget_high_u8(vreinterpretq_u8_u32(d1.val[0]));
- *o11 = vget_high_u8(vreinterpretq_u8_u32(d3.val[0]));
- *o12 = vget_high_u8(vreinterpretq_u8_u32(d0.val[1]));
- *o13 = vget_high_u8(vreinterpretq_u8_u32(d2.val[1]));
- *o14 = vget_high_u8(vreinterpretq_u8_u32(d1.val[1]));
- *o15 = vget_high_u8(vreinterpretq_u8_u32(d3.val[1]));
-}
-
-// TODO(johannkoenig): Replace usage of the above transpose with this one.
-inline void Transpose16x8(const uint8x16_t input[8], uint8x8_t output[16]) {
- const uint8x16x2_t b0 = vtrnq_u8(input[0], input[1]);
- const uint8x16x2_t b1 = vtrnq_u8(input[2], input[3]);
- const uint8x16x2_t b2 = vtrnq_u8(input[4], input[5]);
- const uint8x16x2_t b3 = vtrnq_u8(input[6], input[7]);
-
- const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
- vreinterpretq_u16_u8(b1.val[0]));
- const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
- vreinterpretq_u16_u8(b1.val[1]));
- const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
- vreinterpretq_u16_u8(b3.val[0]));
- const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
- vreinterpretq_u16_u8(b3.val[1]));
-
- const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
- vreinterpretq_u32_u16(c2.val[0]));
- const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
- vreinterpretq_u32_u16(c2.val[1]));
- const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
- vreinterpretq_u32_u16(c3.val[0]));
- const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
- vreinterpretq_u32_u16(c3.val[1]));
-
- output[0] = vget_low_u8(vreinterpretq_u8_u32(d0.val[0]));
- output[1] = vget_low_u8(vreinterpretq_u8_u32(d2.val[0]));
- output[2] = vget_low_u8(vreinterpretq_u8_u32(d1.val[0]));
- output[3] = vget_low_u8(vreinterpretq_u8_u32(d3.val[0]));
- output[4] = vget_low_u8(vreinterpretq_u8_u32(d0.val[1]));
- output[5] = vget_low_u8(vreinterpretq_u8_u32(d2.val[1]));
- output[6] = vget_low_u8(vreinterpretq_u8_u32(d1.val[1]));
- output[7] = vget_low_u8(vreinterpretq_u8_u32(d3.val[1]));
- output[8] = vget_high_u8(vreinterpretq_u8_u32(d0.val[0]));
- output[9] = vget_high_u8(vreinterpretq_u8_u32(d2.val[0]));
- output[10] = vget_high_u8(vreinterpretq_u8_u32(d1.val[0]));
- output[11] = vget_high_u8(vreinterpretq_u8_u32(d3.val[0]));
- output[12] = vget_high_u8(vreinterpretq_u8_u32(d0.val[1]));
- output[13] = vget_high_u8(vreinterpretq_u8_u32(d2.val[1]));
- output[14] = vget_high_u8(vreinterpretq_u8_u32(d1.val[1]));
- output[15] = vget_high_u8(vreinterpretq_u8_u32(d3.val[1]));
+inline int16x8_t ZeroExtend(const uint8x8_t in) {
+ return vreinterpretq_s16_u16(vmovl_u8(in));
}
} // namespace dsp
diff --git a/libgav1/src/dsp/arm/convolve_neon.cc b/libgav1/src/dsp/arm/convolve_neon.cc
index 5f7eef7..2c2557f 100644
--- a/libgav1/src/dsp/arm/convolve_neon.cc
+++ b/libgav1/src/dsp/arm/convolve_neon.cc
@@ -13,7 +13,7 @@
// limitations under the License.
#include "src/dsp/convolve.h"
-#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -25,325 +25,231 @@
#include <cstdint>
#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {
-constexpr int kBitdepth8 = 8;
constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
-constexpr int kSubPixelMask = (1 << kSubPixelBits) - 1;
constexpr int kHorizontalOffset = 3;
-constexpr int kVerticalOffset = 3;
-constexpr int kInterRoundBitsVertical = 11;
+constexpr int kFilterIndexShift = 6;
-int GetFilterIndex(const int filter_index, const int length) {
- if (length <= 4) {
- if (filter_index == kInterpolationFilterEightTap ||
- filter_index == kInterpolationFilterEightTapSharp) {
- return 4;
- }
- if (filter_index == kInterpolationFilterEightTapSmooth) {
- return 5;
- }
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from outranging int16_t.
+template <int filter_index, bool negative_outside_taps = false>
+int16x8_t SumOnePassTaps(const uint8x8_t* const src,
+ const uint8x8_t* const taps) {
+ uint16x8_t sum;
+ if (filter_index == 0) {
+ // 6 taps. + - + + - +
+ sum = vmull_u8(src[0], taps[0]);
+ // Unsigned overflow will result in a valid int16_t value.
+ sum = vmlsl_u8(sum, src[1], taps[1]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlsl_u8(sum, src[4], taps[4]);
+ sum = vmlal_u8(sum, src[5], taps[5]);
+ } else if (filter_index == 1 && negative_outside_taps) {
+ // 6 taps. - + + + + -
+ // Set a base we can subtract from.
+ sum = vmull_u8(src[1], taps[1]);
+ sum = vmlsl_u8(sum, src[0], taps[0]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlal_u8(sum, src[4], taps[4]);
+ sum = vmlsl_u8(sum, src[5], taps[5]);
+ } else if (filter_index == 1) {
+ // 6 taps. All are positive.
+ sum = vmull_u8(src[0], taps[0]);
+ sum = vmlal_u8(sum, src[1], taps[1]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlal_u8(sum, src[4], taps[4]);
+ sum = vmlal_u8(sum, src[5], taps[5]);
+ } else if (filter_index == 2) {
+ // 8 taps. - + - + + - + -
+ sum = vmull_u8(src[1], taps[1]);
+ sum = vmlsl_u8(sum, src[0], taps[0]);
+ sum = vmlsl_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
+ sum = vmlal_u8(sum, src[4], taps[4]);
+ sum = vmlsl_u8(sum, src[5], taps[5]);
+ sum = vmlal_u8(sum, src[6], taps[6]);
+ sum = vmlsl_u8(sum, src[7], taps[7]);
+ } else if (filter_index == 3) {
+ // 2 taps. All are positive.
+ sum = vmull_u8(src[0], taps[0]);
+ sum = vmlal_u8(sum, src[1], taps[1]);
+ } else if (filter_index == 4) {
+ // 4 taps. - + + -
+ sum = vmull_u8(src[1], taps[1]);
+ sum = vmlsl_u8(sum, src[0], taps[0]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlsl_u8(sum, src[3], taps[3]);
+ } else if (filter_index == 5) {
+ // 4 taps. All are positive.
+ sum = vmull_u8(src[0], taps[0]);
+ sum = vmlal_u8(sum, src[1], taps[1]);
+ sum = vmlal_u8(sum, src[2], taps[2]);
+ sum = vmlal_u8(sum, src[3], taps[3]);
}
- return filter_index;
+ return vreinterpretq_s16_u16(sum);
}
-inline int16x8_t ZeroExtend(const uint8x8_t in) {
- return vreinterpretq_s16_u16(vmovl_u8(in));
-}
-
-inline void Load8x8(const uint8_t* s, const ptrdiff_t p, int16x8_t* dst) {
- dst[0] = ZeroExtend(vld1_u8(s));
- s += p;
- dst[1] = ZeroExtend(vld1_u8(s));
- s += p;
- dst[2] = ZeroExtend(vld1_u8(s));
- s += p;
- dst[3] = ZeroExtend(vld1_u8(s));
- s += p;
- dst[4] = ZeroExtend(vld1_u8(s));
- s += p;
- dst[5] = ZeroExtend(vld1_u8(s));
- s += p;
- dst[6] = ZeroExtend(vld1_u8(s));
- s += p;
- dst[7] = ZeroExtend(vld1_u8(s));
-}
-
-// Multiply every entry in |src[]| by the corresponding lane in |taps| and sum.
-// The sum of the entries in |taps| is always 128. In some situations negative
-// values are used. This creates a situation where the positive taps sum to more
-// than 128. An example is:
-// {-4, 10, -24, 100, 60, -20, 8, -2}
-// The negative taps never sum to < -128
-// The center taps are always positive. The remaining positive taps never sum
-// to > 128.
-// Summing these naively can overflow int16_t. This can be avoided by adding the
-// center taps last and saturating the result.
-// We do not need to expand to int32_t because later in the function the value
-// is shifted by |kFilterBits| (7) and saturated to uint8_t. This means any
-// value over 255 << 7 (32576 because of rounding) is clamped.
-template <int num_taps>
-int16x8_t SumTaps(const int16x8_t* const src, const int16x8_t taps) {
+template <int filter_index, bool negative_outside_taps>
+int16x8_t SumHorizontalTaps(const uint8_t* const src,
+ const uint8x8_t* const v_tap) {
+ uint8x8_t v_src[8];
+ const uint8x16_t src_long = vld1q_u8(src);
int16x8_t sum;
- if (num_taps == 8) {
- const int16x4_t taps_lo = vget_low_s16(taps);
- const int16x4_t taps_hi = vget_high_s16(taps);
- sum = vmulq_lane_s16(src[0], taps_lo, 0);
- sum = vmlaq_lane_s16(sum, src[1], taps_lo, 1);
- sum = vmlaq_lane_s16(sum, src[2], taps_lo, 2);
- sum = vmlaq_lane_s16(sum, src[5], taps_hi, 1);
- sum = vmlaq_lane_s16(sum, src[6], taps_hi, 2);
- sum = vmlaq_lane_s16(sum, src[7], taps_hi, 3);
- // Center taps.
- sum = vqaddq_s16(sum, vmulq_lane_s16(src[3], taps_lo, 3));
- sum = vqaddq_s16(sum, vmulq_lane_s16(src[4], taps_hi, 0));
- } else if (num_taps == 6) {
- const int16x4_t taps_lo = vget_low_s16(taps);
- const int16x4_t taps_hi = vget_high_s16(taps);
- sum = vmulq_lane_s16(src[0], taps_lo, 1);
- sum = vmlaq_lane_s16(sum, src[1], taps_lo, 2);
- sum = vmlaq_lane_s16(sum, src[4], taps_hi, 1);
- sum = vmlaq_lane_s16(sum, src[5], taps_hi, 2);
-
- // Center taps.
- sum = vqaddq_s16(sum, vmulq_lane_s16(src[2], taps_lo, 3));
- sum = vqaddq_s16(sum, vmulq_lane_s16(src[3], taps_hi, 0));
- } else if (num_taps == 4) {
- const int16x4_t taps_lo = vget_low_s16(taps);
- sum = vmulq_lane_s16(src[0], taps_lo, 0);
- sum = vmlaq_lane_s16(sum, src[3], taps_lo, 3);
-
- // Center taps.
- sum = vqaddq_s16(sum, vmulq_lane_s16(src[1], taps_lo, 1));
- sum = vqaddq_s16(sum, vmulq_lane_s16(src[2], taps_lo, 2));
- } else {
- assert(num_taps == 2);
- // All the taps are positive so there is no concern regarding saturation.
- const int16x4_t taps_lo = vget_low_s16(taps);
- sum = vmulq_lane_s16(src[0], taps_lo, 1);
- sum = vmlaq_lane_s16(sum, src[1], taps_lo, 2);
+ if (filter_index < 2) {
+ v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1);
+ } else if (filter_index == 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
+ } else if (filter_index == 3) {
+ v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3);
+ } else if (filter_index > 3) {
+ v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2);
}
-
return sum;
}
-// Add an offset to ensure the sum is positive and it fits within uint16_t.
-template <int num_taps>
-uint16x8_t SumTaps8To16(const int16x8_t* const src, const int16x8_t taps) {
- // The worst case sum of negative taps is -56. The worst case sum of positive
- // taps is 184. With the single pass versions of the Convolve we could safely
- // saturate to int16_t because it outranged the final shift and narrow to
- // uint8_t. For the 2D Convolve the intermediate values are 16 bits so we
- // don't have that option.
- // 184 * 255 = 46920 which is greater than int16_t can hold, but not uint16_t.
- // The minimum value we need to handle is -56 * 255 = -14280.
- // By offsetting the sum with 1 << 14 = 16384 we ensure that the sum is never
- // negative and that 46920 + 16384 = 63304 fits comfortably in uint16_t. This
- // allows us to use 16 bit registers instead of 32 bit registers.
- // When considering the bit operations it is safe to ignore signedness. Due to
- // the magic of 2's complement and well defined rollover rules the bit
- // representations are equivalent.
- const int16x4_t taps_lo = vget_low_s16(taps);
- const int16x4_t taps_hi = vget_high_s16(taps);
- // |offset| == 1 << (bitdepth + kFilterBits - 1);
- int16x8_t sum = vdupq_n_s16(1 << 14);
- if (num_taps == 8) {
- sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
- sum = vmlaq_lane_s16(sum, src[1], taps_lo, 1);
- sum = vmlaq_lane_s16(sum, src[2], taps_lo, 2);
- sum = vmlaq_lane_s16(sum, src[3], taps_lo, 3);
- sum = vmlaq_lane_s16(sum, src[4], taps_hi, 0);
- sum = vmlaq_lane_s16(sum, src[5], taps_hi, 1);
- sum = vmlaq_lane_s16(sum, src[6], taps_hi, 2);
- sum = vmlaq_lane_s16(sum, src[7], taps_hi, 3);
- } else if (num_taps == 6) {
- sum = vmlaq_lane_s16(sum, src[0], taps_lo, 1);
- sum = vmlaq_lane_s16(sum, src[1], taps_lo, 2);
- sum = vmlaq_lane_s16(sum, src[2], taps_lo, 3);
- sum = vmlaq_lane_s16(sum, src[3], taps_hi, 0);
- sum = vmlaq_lane_s16(sum, src[4], taps_hi, 1);
- sum = vmlaq_lane_s16(sum, src[5], taps_hi, 2);
- } else if (num_taps == 4) {
- sum = vmlaq_lane_s16(sum, src[0], taps_lo, 2);
- sum = vmlaq_lane_s16(sum, src[1], taps_lo, 3);
- sum = vmlaq_lane_s16(sum, src[2], taps_hi, 0);
- sum = vmlaq_lane_s16(sum, src[3], taps_hi, 1);
- } else if (num_taps == 2) {
- sum = vmlaq_lane_s16(sum, src[0], taps_lo, 3);
- sum = vmlaq_lane_s16(sum, src[1], taps_hi, 0);
- }
+template <int filter_index, bool negative_outside_taps>
+uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
+ const uint8x8_t* const v_tap) {
+ int16x8_t sum =
+ SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
- // This is guaranteed to be positive. Convert it for the final shift.
- return vreinterpretq_u16_s16(sum);
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ return vqrshrun_n_s16(sum, kFilterBits - 1);
}
-template <int num_taps, int filter_index, bool negative_outside_taps = true>
-uint16x8_t SumCompoundHorizontalTaps(const uint8_t* const src,
- const uint8x8_t* const v_tap) {
- // Start with an offset to guarantee the sum is non negative.
- uint16x8_t v_sum = vdupq_n_u16(1 << 14);
- uint8x16_t v_src[8];
- v_src[0] = vld1q_u8(&src[0]);
- if (num_taps == 8) {
- v_src[1] = vextq_u8(v_src[0], v_src[0], 1);
- v_src[2] = vextq_u8(v_src[0], v_src[0], 2);
- v_src[3] = vextq_u8(v_src[0], v_src[0], 3);
- v_src[4] = vextq_u8(v_src[0], v_src[0], 4);
- v_src[5] = vextq_u8(v_src[0], v_src[0], 5);
- v_src[6] = vextq_u8(v_src[0], v_src[0], 6);
- v_src[7] = vextq_u8(v_src[0], v_src[0], 7);
+template <int filter_index, bool negative_outside_taps>
+uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
+ const uint8x8_t* const v_tap) {
+ const int16x8_t sum =
+ SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
- // tap signs : - + - + + - + -
- v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[0]), v_tap[0]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[1]), v_tap[1]);
- v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
- v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[6]), v_tap[6]);
- v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[7]), v_tap[7]);
- } else if (num_taps == 6) {
- v_src[1] = vextq_u8(v_src[0], v_src[0], 1);
- v_src[2] = vextq_u8(v_src[0], v_src[0], 2);
- v_src[3] = vextq_u8(v_src[0], v_src[0], 3);
- v_src[4] = vextq_u8(v_src[0], v_src[0], 4);
- v_src[5] = vextq_u8(v_src[0], v_src[0], 5);
- v_src[6] = vextq_u8(v_src[0], v_src[0], 6);
- if (filter_index == 0) {
- // tap signs : + - + + - +
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[1]), v_tap[1]);
- v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
- v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[6]), v_tap[6]);
- } else {
- if (negative_outside_taps) {
- // tap signs : - + + + + -
- v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[1]), v_tap[1]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
- v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[6]), v_tap[6]);
- } else {
- // tap signs : + + + + + +
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[1]), v_tap[1]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[6]), v_tap[6]);
- }
- }
- } else if (num_taps == 4) {
- v_src[2] = vextq_u8(v_src[0], v_src[0], 2);
- v_src[3] = vextq_u8(v_src[0], v_src[0], 3);
- v_src[4] = vextq_u8(v_src[0], v_src[0], 4);
- v_src[5] = vextq_u8(v_src[0], v_src[0], 5);
- if (filter_index == 4) {
- // tap signs : - + + -
- v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
- v_sum = vmlsl_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
- } else {
- // tap signs : + + + +
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[2]), v_tap[2]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[5]), v_tap[5]);
- }
- } else {
- assert(num_taps == 2);
- v_src[3] = vextq_u8(v_src[0], v_src[0], 3);
- v_src[4] = vextq_u8(v_src[0], v_src[0], 4);
- // tap signs : + +
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[3]), v_tap[3]);
- v_sum = vmlal_u8(v_sum, vget_low_u8(v_src[4]), v_tap[4]);
- }
-
- return v_sum;
+ return vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
}
-template <int num_taps, int filter_index>
-uint16x8_t SumHorizontalTaps2xH(const uint8_t* src, const ptrdiff_t src_stride,
- const uint8x8_t* const v_tap) {
- constexpr int positive_offset_bits = kBitdepth8 + kFilterBits - 1;
- uint16x8_t sum = vdupq_n_u16(1 << positive_offset_bits);
- uint8x8_t input0 = vld1_u8(src);
+template <int filter_index>
+int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const uint8x8_t* const v_tap) {
+ uint16x8_t sum;
+ const uint8x8_t input0 = vld1_u8(src);
src += src_stride;
- uint8x8_t input1 = vld1_u8(src);
+ const uint8x8_t input1 = vld1_u8(src);
uint8x8x2_t input = vzip_u8(input0, input1);
- if (num_taps == 2) {
+ if (filter_index == 3) {
// tap signs : + +
- sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
+ sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
sum = vmlal_u8(sum, input.val[1], v_tap[4]);
} else if (filter_index == 4) {
// tap signs : - + + -
+ sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
- sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
sum = vmlal_u8(sum, input.val[1], v_tap[4]);
sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
} else {
// tap signs : + + + +
- sum = vmlal_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
+ sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
sum = vmlal_u8(sum, input.val[1], v_tap[4]);
sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
}
- return vrshrq_n_u16(sum, kInterRoundBitsHorizontal);
+ return vreinterpretq_s16_u16(sum);
}
-// TODO(johannkoenig): Rename this function. It works for more than just
-// compound convolutions.
+template <int filter_index>
+uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const uint8x8_t* const v_tap) {
+ int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ return vqrshrun_n_s16(sum, kFilterBits - 1);
+}
+
+template <int filter_index>
+uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const uint8x8_t* const v_tap) {
+ const int16x8_t sum =
+ SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ return vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
+
template <int num_taps, int step, int filter_index,
bool negative_outside_taps = true, bool is_2d = false,
- bool is_8bit = false>
-void ConvolveCompoundHorizontalBlock(const uint8_t* src,
- const ptrdiff_t src_stride,
- void* const dest,
- const ptrdiff_t pred_stride,
- const int width, const int height,
- const uint8x8_t* const v_tap) {
- const uint16x8_t v_compound_round_offset = vdupq_n_u16(1 << (kBitdepth8 + 4));
- const int16x8_t v_inter_round_bits_0 =
- vdupq_n_s16(-kInterRoundBitsHorizontal);
-
+ bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const uint8x8_t* const v_tap) {
auto* dest8 = static_cast<uint8_t*>(dest);
auto* dest16 = static_cast<uint16_t*>(dest);
- if (width > 4) {
+ // 4 tap filters are never used when width > 4.
+ if (num_taps != 4 && width > 4) {
int y = 0;
do {
int x = 0;
do {
- uint16x8_t v_sum =
- SumCompoundHorizontalTaps<num_taps, filter_index,
- negative_outside_taps>(&src[x], v_tap);
- if (is_8bit) {
- // Split shifts the way they are in C. They can be combined but that
- // makes removing the 1 << 14 offset much more difficult.
- v_sum = vrshrq_n_u16(v_sum, kInterRoundBitsHorizontal);
- int16x8_t v_sum_signed = vreinterpretq_s16_u16(vsubq_u16(
- v_sum, vdupq_n_u16(1 << (14 - kInterRoundBitsHorizontal))));
- uint8x8_t result = vqrshrun_n_s16(
- v_sum_signed, kFilterBits - kInterRoundBitsHorizontal);
- vst1_u8(&dest8[x], result);
- } else {
- v_sum = vrshlq_u16(v_sum, v_inter_round_bits_0);
- if (!is_2d) {
- v_sum = vaddq_u16(v_sum, v_compound_round_offset);
- }
+ if (is_2d || is_compound) {
+ const uint16x8_t v_sum =
+ HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x],
+ v_tap);
vst1q_u16(&dest16[x], v_sum);
+ } else {
+ const uint8x8_t result =
+ SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x],
+ v_tap);
+ vst1_u8(&dest8[x], result);
}
x += step;
} while (x < width);
@@ -352,135 +258,142 @@
dest16 += pred_stride;
} while (++y < height);
return;
- } else if (width == 4) {
- int y = 0;
- do {
- uint16x8_t v_sum =
- SumCompoundHorizontalTaps<num_taps, filter_index,
- negative_outside_taps>(&src[0], v_tap);
- if (is_8bit) {
- v_sum = vrshrq_n_u16(v_sum, kInterRoundBitsHorizontal);
- int16x8_t v_sum_signed = vreinterpretq_s16_u16(vsubq_u16(
- v_sum, vdupq_n_u16(1 << (14 - kInterRoundBitsHorizontal))));
- uint8x8_t result = vqrshrun_n_s16(
- v_sum_signed, kFilterBits - kInterRoundBitsHorizontal);
- StoreLo4(&dest8[0], result);
- } else {
- v_sum = vrshlq_u16(v_sum, v_inter_round_bits_0);
- if (!is_2d) {
- v_sum = vaddq_u16(v_sum, v_compound_round_offset);
- }
- vst1_u16(&dest16[0], vget_low_u16(v_sum));
- }
- src += src_stride;
- dest8 += pred_stride;
- dest16 += pred_stride;
- } while (++y < height);
- return;
}
// Horizontal passes only needs to account for |num_taps| 2 and 4 when
- // |width| == 2.
- assert(width == 2);
+ // |width| <= 4.
+ assert(width <= 4);
assert(num_taps <= 4);
-
- constexpr int positive_offset_bits = kBitdepth8 + kFilterBits - 1;
- // Leave off + 1 << (kBitdepth8 + 3).
- constexpr int compound_round_offset = 1 << (kBitdepth8 + 4);
-
if (num_taps <= 4) {
- int y = 0;
- do {
- // TODO(johannkoenig): Re-order the values for storing.
- uint16x8_t sum =
- SumHorizontalTaps2xH<num_taps, filter_index>(src, src_stride, v_tap);
+ if (width == 4) {
+ int y = 0;
+ do {
+ if (is_2d || is_compound) {
+ const uint16x8_t v_sum =
+ HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
+ v_tap);
+ vst1_u16(dest16, vget_low_u16(v_sum));
+ } else {
+ const uint8x8_t result =
+ SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
+ v_tap);
+ StoreLo4(&dest8[0], result);
+ }
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (++y < height);
+ return;
+ }
+ if (!is_compound) {
+ int y = 0;
+ do {
+ if (is_2d) {
+ const uint16x8_t sum =
+ HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ dest16[0] = vgetq_lane_u16(sum, 0);
+ dest16[1] = vgetq_lane_u16(sum, 2);
+ dest16 += pred_stride;
+ dest16[0] = vgetq_lane_u16(sum, 1);
+ dest16[1] = vgetq_lane_u16(sum, 3);
+ dest16 += pred_stride;
+ } else {
+ const uint8x8_t sum =
+ SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ dest8[0] = vget_lane_u8(sum, 0);
+ dest8[1] = vget_lane_u8(sum, 2);
+ dest8 += pred_stride;
+
+ dest8[0] = vget_lane_u8(sum, 1);
+ dest8[1] = vget_lane_u8(sum, 3);
+ dest8 += pred_stride;
+ }
+
+ src += src_stride << 1;
+ y += 2;
+ } while (y < height - 1);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
if (is_2d) {
- dest16[0] = vgetq_lane_u16(sum, 0);
- dest16[1] = vgetq_lane_u16(sum, 2);
- dest16 += pred_stride;
- dest16[0] = vgetq_lane_u16(sum, 1);
- dest16[1] = vgetq_lane_u16(sum, 3);
- dest16 += pred_stride;
- } else if (!is_8bit) {
- // None of the test vectors hit this path but the unit tests do.
- sum = vaddq_u16(sum, vdupq_n_u16(compound_round_offset));
-
- dest16[0] = vgetq_lane_u16(sum, 0);
- dest16[1] = vgetq_lane_u16(sum, 2);
- dest16 += pred_stride;
- dest16[0] = vgetq_lane_u16(sum, 1);
- dest16[1] = vgetq_lane_u16(sum, 3);
- dest16 += pred_stride;
- } else {
- // Split shifts the way they are in C. They can be combined but that
- // makes removing the 1 << 14 offset much more difficult.
- int16x8_t sum_signed = vreinterpretq_s16_u16(vsubq_u16(
- sum, vdupq_n_u16(
- 1 << (positive_offset_bits - kInterRoundBitsHorizontal))));
- uint8x8_t result =
- vqrshrun_n_s16(sum_signed, kFilterBits - kInterRoundBitsHorizontal);
-
- // Could de-interleave and vst1_lane_u16().
- dest8[0] = vget_lane_u8(result, 0);
- dest8[1] = vget_lane_u8(result, 2);
- dest8 += pred_stride;
-
- dest8[0] = vget_lane_u8(result, 1);
- dest8[1] = vget_lane_u8(result, 3);
- dest8 += pred_stride;
+ assert(height % 2 == 1);
+ uint16x8_t sum;
+ const uint8x8_t input = vld1_u8(src);
+ if (filter_index == 3) { // |num_taps| == 2
+ sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
+ sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+ } else if (filter_index == 4) {
+ sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
+ sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
+ sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+ sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
+ } else {
+ assert(filter_index == 5);
+ sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
+ sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
+ sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
+ sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
+ }
+ // |sum| contains an int16_t value.
+ sum = vreinterpretq_u16_s16(vrshrq_n_s16(
+ vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
+ Store2<0>(dest16, sum);
}
-
- src += src_stride << 1;
- y += 2;
- } while (y < height - 1);
-
- // The 2d filters have an odd |height| because the horizontal pass generates
- // context for the vertical pass.
- if (is_2d) {
- assert(height % 2 == 1);
- uint16x8_t sum = vdupq_n_u16(1 << positive_offset_bits);
- uint8x8_t input = vld1_u8(src);
- if (filter_index == 3) { // |num_taps| == 2
- sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- } else if (filter_index == 4) {
- sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
- sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
- } else {
- assert(filter_index == 5);
- sum = vmlal_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
- sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
- sum = vrshrq_n_u16(sum, kInterRoundBitsHorizontal);
- }
- Store2<0>(dest16, sum);
}
}
}
// Process 16 bit inputs and output 32 bits.
-template <int num_taps>
-uint32x4x2_t Sum2DVerticalTaps(const int16x8_t* const src,
- const int16x8_t taps) {
- // In order to get the rollover correct with the lengthening instruction we
- // need to treat these as signed so that they sign extend properly.
+template <int num_taps, bool is_compound>
+inline int16x4_t Sum2DVerticalTaps4(const int16x4_t* const src,
+ const int16x8_t taps) {
const int16x4_t taps_lo = vget_low_s16(taps);
const int16x4_t taps_hi = vget_high_s16(taps);
- // An offset to guarantee the sum is non negative. Captures 56 * -4590 =
- // 257040 (worst case negative value from horizontal pass). It should be
- // possible to use 1 << 18 (262144) instead of 1 << 19 but there probably
- // isn't any benefit.
- // |offset_bits| = bitdepth + 2 * kFilterBits - kInterRoundBitsHorizontal
- // == 19.
- int32x4_t sum_lo = vdupq_n_s32(1 << 19);
- int32x4_t sum_hi = sum_lo;
+ int32x4_t sum;
if (num_taps == 8) {
- sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[0]), taps_lo, 0);
- sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[0]), taps_lo, 0);
+ sum = vmull_lane_s16(src[0], taps_lo, 0);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
+ sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
+ sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
+ sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
+ } else if (num_taps == 6) {
+ sum = vmull_lane_s16(src[0], taps_lo, 1);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
+ sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
+ } else if (num_taps == 4) {
+ sum = vmull_lane_s16(src[0], taps_lo, 2);
+ sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
+ sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
+ } else if (num_taps == 2) {
+ sum = vmull_lane_s16(src[0], taps_lo, 3);
+ sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
+ }
+
+ if (is_compound) {
+ return vqrshrn_n_s32(sum, kInterRoundBitsCompoundVertical - 1);
+ }
+
+ return vqrshrn_n_s32(sum, kInterRoundBitsVertical - 1);
+}
+
+template <int num_taps, bool is_compound>
+int16x8_t SimpleSum2DVerticalTaps(const int16x8_t* const src,
+ const int16x8_t taps) {
+ const int16x4_t taps_lo = vget_low_s16(taps);
+ const int16x4_t taps_hi = vget_high_s16(taps);
+ int32x4_t sum_lo, sum_hi;
+ if (num_taps == 8) {
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 0);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 0);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 1);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 1);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 2);
@@ -497,8 +410,8 @@
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[7]), taps_hi, 3);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[7]), taps_hi, 3);
} else if (num_taps == 6) {
- sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[0]), taps_lo, 1);
- sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[0]), taps_lo, 1);
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 1);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 1);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 2);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 2);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[2]), taps_lo, 3);
@@ -511,8 +424,8 @@
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[5]), taps_hi, 2);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[5]), taps_hi, 2);
} else if (num_taps == 4) {
- sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[0]), taps_lo, 2);
- sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[0]), taps_lo, 2);
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 2);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 2);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_lo, 3);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_lo, 3);
@@ -521,384 +434,273 @@
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[3]), taps_hi, 1);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[3]), taps_hi, 1);
} else if (num_taps == 2) {
- sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[0]), taps_lo, 3);
- sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[0]), taps_lo, 3);
+ sum_lo = vmull_lane_s16(vget_low_s16(src[0]), taps_lo, 3);
+ sum_hi = vmull_lane_s16(vget_high_s16(src[0]), taps_lo, 3);
sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(src[1]), taps_hi, 0);
sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(src[1]), taps_hi, 0);
}
- // This is guaranteed to be positive. Convert it for the final shift.
- const uint32x4x2_t return_val = {vreinterpretq_u32_s32(sum_lo),
- vreinterpretq_u32_s32(sum_hi)};
- return return_val;
-}
-
-// Process 16 bit inputs and output 32 bits.
-template <int num_taps>
-uint32x4_t Sum2DVerticalTaps(const int16x4_t* const src, const int16x8_t taps) {
- // In order to get the rollover correct with the lengthening instruction we
- // need to treat these as signed so that they sign extend properly.
- const int16x4_t taps_lo = vget_low_s16(taps);
- const int16x4_t taps_hi = vget_high_s16(taps);
- // An offset to guarantee the sum is non negative. Captures 56 * -4590 =
- // 257040 (worst case negative value from horizontal pass). It should be
- // possible to use 1 << 18 (262144) instead of 1 << 19 but there probably
- // isn't any benefit.
- // |offset_bits| = bitdepth + 2 * kFilterBits - kInterRoundBitsHorizontal
- // == 19.
- int32x4_t sum = vdupq_n_s32(1 << 19);
- if (num_taps == 8) {
- sum = vmlal_lane_s16(sum, src[0], taps_lo, 0);
- sum = vmlal_lane_s16(sum, src[1], taps_lo, 1);
- sum = vmlal_lane_s16(sum, src[2], taps_lo, 2);
- sum = vmlal_lane_s16(sum, src[3], taps_lo, 3);
-
- sum = vmlal_lane_s16(sum, src[4], taps_hi, 0);
- sum = vmlal_lane_s16(sum, src[5], taps_hi, 1);
- sum = vmlal_lane_s16(sum, src[6], taps_hi, 2);
- sum = vmlal_lane_s16(sum, src[7], taps_hi, 3);
- } else if (num_taps == 6) {
- sum = vmlal_lane_s16(sum, src[0], taps_lo, 1);
- sum = vmlal_lane_s16(sum, src[1], taps_lo, 2);
- sum = vmlal_lane_s16(sum, src[2], taps_lo, 3);
-
- sum = vmlal_lane_s16(sum, src[3], taps_hi, 0);
- sum = vmlal_lane_s16(sum, src[4], taps_hi, 1);
- sum = vmlal_lane_s16(sum, src[5], taps_hi, 2);
- } else if (num_taps == 4) {
- sum = vmlal_lane_s16(sum, src[0], taps_lo, 2);
- sum = vmlal_lane_s16(sum, src[1], taps_lo, 3);
-
- sum = vmlal_lane_s16(sum, src[2], taps_hi, 0);
- sum = vmlal_lane_s16(sum, src[3], taps_hi, 1);
- } else if (num_taps == 2) {
- sum = vmlal_lane_s16(sum, src[0], taps_lo, 3);
-
- sum = vmlal_lane_s16(sum, src[1], taps_hi, 0);
+ if (is_compound) {
+ return vcombine_s16(
+ vqrshrn_n_s32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ vqrshrn_n_s32(sum_hi, kInterRoundBitsCompoundVertical - 1));
}
- // This is guaranteed to be positive. Convert it for the final shift.
- return vreinterpretq_u32_s32(sum);
+ return vcombine_s16(vqrshrn_n_s32(sum_lo, kInterRoundBitsVertical - 1),
+ vqrshrn_n_s32(sum_hi, kInterRoundBitsVertical - 1));
}
template <int num_taps, bool is_compound = false>
-void Filter2DVertical(const uint16_t* src, const ptrdiff_t src_stride,
- void* const dst, const ptrdiff_t dst_stride,
- const int width, const int height, const int16x8_t taps,
- const int inter_round_bits_vertical) {
+void Filter2DVertical(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const int16x8_t taps) {
+ assert(width >= 8);
constexpr int next_row = num_taps - 1;
- const int32x4_t v_inter_round_bits_vertical =
- vdupq_n_s32(-inter_round_bits_vertical);
+ // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+ const ptrdiff_t src_stride = width;
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
- if (width > 4) {
- int x = 0;
+ int x = 0;
+ do {
+ int16x8_t srcs[8];
+ const uint16_t* src_x = src + x;
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = 0;
do {
- int16x8_t srcs[8];
- srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src + x));
+ srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
+ src_x += src_stride;
+
+ const int16x8_t sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
+ } else {
+ vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
+ }
+
+ srcs[0] = srcs[1];
if (num_taps >= 4) {
- srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src + x + src_stride));
- srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src + x + 2 * src_stride));
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
if (num_taps >= 6) {
- srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src + x + 3 * src_stride));
- srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src + x + 4 * src_stride));
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
if (num_taps == 8) {
- srcs[5] =
- vreinterpretq_s16_u16(vld1q_u16(src + x + 5 * src_stride));
- srcs[6] =
- vreinterpretq_s16_u16(vld1q_u16(src + x + 6 * src_stride));
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
}
}
}
+ } while (++y < height);
+ x += 8;
+ } while (x < width);
+}
- int y = 0;
- do {
- srcs[next_row] = vreinterpretq_s16_u16(
- vld1q_u16(src + x + (y + next_row) * src_stride));
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
- const uint32x4x2_t sums = Sum2DVerticalTaps<num_taps>(srcs, taps);
- if (is_compound) {
- const uint16x8_t results = vcombine_u16(
- vmovn_u32(vqrshlq_u32(sums.val[0], v_inter_round_bits_vertical)),
- vmovn_u32(vqrshlq_u32(sums.val[1], v_inter_round_bits_vertical)));
- vst1q_u16(dst16 + x + y * dst_stride, results);
- } else {
- const uint16x8_t first_shift =
- vcombine_u16(vqrshrn_n_u32(sums.val[0], kInterRoundBitsVertical),
- vqrshrn_n_u32(sums.val[1], kInterRoundBitsVertical));
- // |single_round_offset| == (1 << bitdepth) + (1 << (bitdepth - 1)) ==
- // 384
- const uint8x8_t results =
- vqmovn_u16(vqsubq_u16(first_shift, vdupq_n_u16(384)));
-
- vst1_u8(dst8 + x + y * dst_stride, results);
- }
-
- srcs[0] = srcs[1];
- if (num_taps >= 4) {
- srcs[1] = srcs[2];
- srcs[2] = srcs[3];
- if (num_taps >= 6) {
- srcs[3] = srcs[4];
- srcs[4] = srcs[5];
- if (num_taps == 8) {
- srcs[5] = srcs[6];
- srcs[6] = srcs[7];
- }
- }
- }
- } while (++y < height);
- x += 8;
- } while (x < width);
- return;
- }
-
- assert(width == 4);
- int16x4_t srcs[8];
- srcs[0] = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
+ int16x8_t srcs[9];
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
if (num_taps >= 4) {
- srcs[1] = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- srcs[2] = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
+ srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[1] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[2]));
if (num_taps >= 6) {
- srcs[3] = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- srcs[4] = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[3] = vcombine_s16(vget_high_s16(srcs[2]), vget_low_s16(srcs[4]));
if (num_taps == 8) {
- srcs[5] = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
- srcs[6] = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
+ srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[5] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[6]));
}
}
}
int y = 0;
do {
- srcs[next_row] = vreinterpret_s16_u16(vld1_u16(src));
- src += src_stride;
+ srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[num_taps - 1] = vcombine_s16(vget_high_s16(srcs[num_taps - 2]),
+ vget_low_s16(srcs[num_taps]));
- const uint32x4_t sums = Sum2DVerticalTaps<num_taps>(srcs, taps);
+ const int16x8_t sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
if (is_compound) {
- const uint16x4_t results =
- vmovn_u32(vqrshlq_u32(sums, v_inter_round_bits_vertical));
- vst1_u16(dst16, results);
- dst16 += dst_stride;
+ const uint16x8_t results = vreinterpretq_u16_s16(sum);
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
} else {
- const uint16x4_t first_shift =
- vqrshrn_n_u32(sums, kInterRoundBitsVertical);
- // |single_round_offset| == (1 << bitdepth) + (1 << (bitdepth - 1)) ==
- // 384
- const uint8x8_t results = vqmovn_u16(
- vcombine_u16(vqsub_u16(first_shift, vdup_n_u16(384)), vdup_n_u16(0)));
+ const uint8x8_t results = vqmovun_s16(sum);
StoreLo4(dst8, results);
dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
}
- srcs[0] = srcs[1];
+ srcs[0] = srcs[2];
if (num_taps >= 4) {
- srcs[1] = srcs[2];
- srcs[2] = srcs[3];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
if (num_taps >= 6) {
- srcs[3] = srcs[4];
- srcs[4] = srcs[5];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
if (num_taps == 8) {
- srcs[5] = srcs[6];
- srcs[6] = srcs[7];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
}
}
}
- } while (++y < height);
+ y += 2;
+ } while (y < height);
}
-template <bool is_2d = false, bool is_8bit = false>
-void HorizontalPass(const uint8_t* const src, const ptrdiff_t src_stride,
- void* const dst, const ptrdiff_t dst_stride,
- const int width, const int height, const int subpixel,
- const int filter_index) {
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
+ constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ int16x8_t srcs[9];
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ if (num_taps >= 6) {
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ if (num_taps == 8) {
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ }
+ }
+
+ int y = 0;
+ do {
+ srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ if (num_taps == 2) {
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ } else if (num_taps == 4) {
+ srcs[1] = vextq_s16(srcs[0], srcs[4], 2);
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ } else if (num_taps == 6) {
+ srcs[2] = vcombine_s16(vget_high_s16(srcs[0]), vget_low_s16(srcs[4]));
+ srcs[3] = vextq_s16(srcs[0], srcs[4], 6);
+ srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+ } else if (num_taps == 8) {
+ srcs[5] = vextq_s16(srcs[4], srcs[8], 2);
+ srcs[6] = vcombine_s16(vget_high_s16(srcs[4]), vget_low_s16(srcs[8]));
+ srcs[7] = vextq_s16(srcs[4], srcs[8], 6);
+ }
+
+ const int16x8_t sum =
+ SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+ const uint8x8_t results = vqmovun_s16(sum);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
+ // Therefore we don't need to check this condition when |height| > 4.
+ if (num_taps <= 4 && height == 2) return;
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ if (num_taps == 6) {
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ } else if (num_taps == 8) {
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ }
+
+ y += 4;
+ } while (y < height);
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+ const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+ const ptrdiff_t dst_stride, const int width, const int height,
+ const int subpixel, const int filter_index) {
// Duplicate the absolute value for each tap. Negative taps are corrected
// by using the vmlsl_u8 instruction. Positive taps use vmlal_u8.
uint8x8_t v_tap[kSubPixelTaps];
const int filter_id = (subpixel >> 6) & kSubPixelMask;
+ assert(filter_id != 0);
+
for (int k = 0; k < kSubPixelTaps; ++k) {
- v_tap[k] = vreinterpret_u8_s8(
- vabs_s8(vdup_n_s8(kSubPixelFilters[filter_index][filter_id][k])));
+ v_tap[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
}
if (filter_index == 2) { // 8 tap.
- ConvolveCompoundHorizontalBlock<8, 8, 2, true, is_2d, is_8bit>(
+ FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
src, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
// Check if outside taps are positive.
if ((filter_id == 1) | (filter_id == 15)) {
- ConvolveCompoundHorizontalBlock<6, 8, 1, false, is_2d, is_8bit>(
+ FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
src, src_stride, dst, dst_stride, width, height, v_tap);
} else {
- ConvolveCompoundHorizontalBlock<6, 8, 1, true, is_2d, is_8bit>(
+ FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
src, src_stride, dst, dst_stride, width, height, v_tap);
}
} else if (filter_index == 0) { // 6 tap.
- ConvolveCompoundHorizontalBlock<6, 8, 0, true, is_2d, is_8bit>(
+ FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
src, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 4) { // 4 tap.
- ConvolveCompoundHorizontalBlock<4, 8, 4, true, is_2d, is_8bit>(
+ FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
src, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 5) { // 4 tap.
- ConvolveCompoundHorizontalBlock<4, 8, 5, true, is_2d, is_8bit>(
+ FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
src, src_stride, dst, dst_stride, width, height, v_tap);
} else { // 2 tap.
- ConvolveCompoundHorizontalBlock<2, 8, 3, true, is_2d, is_8bit>(
+ FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
src, src_stride, dst, dst_stride, width, height, v_tap);
}
}
-// There are three forms of this function:
-// 2D: input 8bit, output 16bit. |is_compound| has no effect.
-// 1D Horizontal: input 8bit, output 8bit.
-// 1D Compound Horizontal: input 8bit, output 16bit. Different rounding from 2D.
-// |width| is guaranteed to be 2 because all other cases are handled in neon.
-template <bool is_2d = true, bool is_compound = false>
-void HorizontalPass2xH(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dst, const ptrdiff_t dst_stride,
- const int height, const int filter_index, const int taps,
- const int subpixel) {
- // Even though |is_compound| has no effect when |is_2d| is true we block this
- // combination in case the compiler gets confused.
- static_assert(!is_2d || !is_compound, "|is_compound| is ignored.");
- // Since this only handles |width| == 2, we only need to be concerned with
- // 2 or 4 tap filters.
- assert(taps == 2 || taps == 4);
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
-
- const int compound_round_offset =
- (1 << (kBitdepth8 + 4)) + (1 << (kBitdepth8 + 3));
-
- const int filter_id = (subpixel >> 6) & kSubPixelMask;
- const int taps_start = (kSubPixelTaps - taps) / 2;
- int y = 0;
- do {
- int x = 0;
- do {
- int sum;
- if (is_2d) {
- // An offset to guarantee the sum is non negative.
- sum = 1 << (kBitdepth8 + kFilterBits - 1);
- } else if (is_compound) {
- sum = 0;
- } else {
- // 1D non-Compound. The C uses a two stage shift with rounding. Here the
- // shifts are combined and the rounding bit from the first stage is
- // added in.
- // (sum + 4 >> 3) + 8) >> 4 == (sum + 64 + 4) >> 7
- sum = 4;
- }
- for (int k = 0; k < taps; ++k) {
- const int tap = k + taps_start;
- sum += kSubPixelFilters[filter_index][filter_id][tap] * src[x + k];
- }
- if (is_2d) {
- dst16[x] = static_cast<int16_t>(
- RightShiftWithRounding(sum, kInterRoundBitsHorizontal));
- } else if (is_compound) {
- sum = RightShiftWithRounding(sum, kInterRoundBitsHorizontal);
- dst16[x] = sum + compound_round_offset;
- } else {
- // 1D non-Compound.
- dst8[x] = static_cast<uint8_t>(
- Clip3(RightShiftWithRounding(sum, kFilterBits), 0, 255));
- }
- } while (++x < 2);
-
- src += src_stride;
- dst8 += dst_stride;
- dst16 += dst_stride;
- } while (++y < height);
-}
-
-// This will always need to handle all |filter_index| values. Even with |width|
-// restricted to 2 the value of |height| can go up to at least 16.
-template <bool is_2d = true, bool is_compound = false>
-void VerticalPass2xH(const void* const src, const ptrdiff_t src_stride,
- void* const dst, const ptrdiff_t dst_stride,
- const int height, const int inter_round_bits_vertical,
- const int filter_index, const int taps,
- const int subpixel) {
- const auto* src8 = static_cast<const uint8_t*>(src);
- const auto* src16 = static_cast<const uint16_t*>(src);
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
- const int filter_id = (subpixel >> 6) & kSubPixelMask;
- const int taps_start = (kSubPixelTaps - taps) / 2;
- constexpr int max_pixel_value = (1 << kBitdepth8) - 1;
-
- int y = 0;
- do {
- int x = 0;
- do {
- int sum;
- if (is_2d) {
- sum = 1 << (kBitdepth8 + 2 * kFilterBits - kInterRoundBitsHorizontal);
- } else if (is_compound) {
- // TODO(johannkoenig): Keeping the sum positive is valuable for neon but
- // may not actually help the C implementation. Investigate removing
- // this.
- // Use this offset to cancel out 1 << (kBitdepth8 + 3) >> 3 from
- // |compound_round_offset|.
- sum = (1 << (kBitdepth8 + 3)) << 3;
- } else {
- sum = 0;
- }
-
- for (int k = 0; k < taps; ++k) {
- const int tap = k + taps_start;
- if (is_2d) {
- sum += kSubPixelFilters[filter_index][filter_id][tap] *
- src16[x + k * src_stride];
- } else {
- sum += kSubPixelFilters[filter_index][filter_id][tap] *
- src8[x + k * src_stride];
- }
- }
-
- if (is_2d) {
- if (is_compound) {
- dst16[x] = static_cast<uint16_t>(
- RightShiftWithRounding(sum, inter_round_bits_vertical));
- } else {
- constexpr int single_round_offset =
- (1 << kBitdepth8) + (1 << (kBitdepth8 - 1));
- dst8[x] = static_cast<uint8_t>(
- Clip3(RightShiftWithRounding(sum, kInterRoundBitsVertical) -
- single_round_offset,
- 0, max_pixel_value));
- }
- } else if (is_compound) {
- // Leave off + 1 << (kBitdepth8 + 3).
- constexpr int compound_round_offset = 1 << (kBitdepth8 + 4);
- dst16[x] = RightShiftWithRounding(sum, 3) + compound_round_offset;
- } else {
- // 1D non-compound.
- dst8[x] = static_cast<uint8_t>(Clip3(
- RightShiftWithRounding(sum, kFilterBits), 0, max_pixel_value));
- }
- } while (++x < 2);
-
- src8 += src_stride;
- src16 += src_stride;
- dst8 += dst_stride;
- dst16 += dst_stride;
- } while (++y < height);
-}
-
-int NumTapsInFilter(const int filter_index) {
+int GetNumTapsInFilter(const int filter_index) {
if (filter_index < 2) {
// Despite the names these only use 6 taps.
// kInterpolationFilterEightTap
@@ -930,255 +732,135 @@
void Convolve2D_NEON(const void* const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
- const int vertical_filter_index,
- const int /*inter_round_bits_vertical*/,
- const int subpixel_x, const int subpixel_y,
- const int /*step_x*/, const int /*step_y*/,
- const int width, const int height, void* prediction,
- const ptrdiff_t pred_stride) {
+ const int vertical_filter_index, const int subpixel_x,
+ const int subpixel_y, const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int horizontal_taps = NumTapsInFilter(horiz_filter_index);
- const int vertical_taps = NumTapsInFilter(vert_filter_index);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
// The output of the horizontal filter is guaranteed to fit in 16 bits.
uint16_t
intermediate_result[kMaxSuperBlockSizeInPixels *
(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
- const int intermediate_stride = width;
const int intermediate_height = height + vertical_taps - 1;
- if (width >= 4) {
- const ptrdiff_t src_stride = reference_stride;
- const auto* src = static_cast<const uint8_t*>(reference) -
- (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
- HorizontalPass<true>(src, src_stride, intermediate_result,
- intermediate_stride, width, intermediate_height,
- subpixel_x, horiz_filter_index);
+ DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
+ width, intermediate_height, subpixel_x,
+ horiz_filter_index);
- // Vertical filter.
- auto* dest = static_cast<uint8_t*>(prediction);
- const ptrdiff_t dest_stride = pred_stride;
- const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
- const int16x8_t taps =
- vld1q_s16(kSubPixelFilters[vert_filter_index][filter_id]);
+ // Vertical filter.
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
+ assert(filter_id != 0);
- if (vertical_taps == 8) {
- Filter2DVertical<8>(intermediate_result, intermediate_stride, dest,
- dest_stride, width, height, taps, 0);
- } else if (vertical_taps == 6) {
- Filter2DVertical<6>(intermediate_result, intermediate_stride, dest,
- dest_stride, width, height, taps, 0);
- } else if (vertical_taps == 4) {
- Filter2DVertical<4>(intermediate_result, intermediate_stride, dest,
- dest_stride, width, height, taps, 0);
- } else { // |vertical_taps| == 2
- Filter2DVertical<2>(intermediate_result, intermediate_stride, dest,
- dest_stride, width, height, taps, 0);
+ const int16x8_t taps =
+ vmovl_s8(vld1_s8(kHalfSubPixelFilters[vert_filter_index][filter_id]));
+
+ if (vertical_taps == 8) {
+ if (width == 2) {
+ Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
+ taps);
}
- } else {
- assert(width == 2);
- // Horizontal filter.
- const auto* const src = static_cast<const uint8_t*>(reference) -
- ((vertical_taps / 2) - 1) * reference_stride -
- ((horizontal_taps / 2) - 1);
-
- HorizontalPass2xH(src, reference_stride, intermediate_result,
- intermediate_stride, intermediate_height,
- horiz_filter_index, horizontal_taps, subpixel_x);
-
- // Vertical filter.
- auto* dest = static_cast<uint8_t*>(prediction);
- const ptrdiff_t dest_stride = pred_stride;
-
- VerticalPass2xH(intermediate_result, intermediate_stride, dest, dest_stride,
- height, 0, vert_filter_index, vertical_taps, subpixel_y);
+ } else if (vertical_taps == 6) {
+ if (width == 2) {
+ Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (vertical_taps == 4) {
+ if (width == 2) {
+ Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
+ } else { // |vertical_taps| == 2
+ if (width == 2) {
+ Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
+ taps);
+ }
}
}
-template <int tap_lane0, int tap_lane1>
-inline int16x8_t CombineFilterTapsLong(const int16x8_t sum,
- const int16x8_t src0, int16x8_t src1,
- int16x4_t taps0, int16x4_t taps1) {
- int32x4_t sum_lo = vmovl_s16(vget_low_s16(sum));
- int32x4_t sum_hi = vmovl_s16(vget_high_s16(sum));
- const int16x8_t product0 = vmulq_lane_s16(src0, taps0, tap_lane0);
- const int16x8_t product1 = vmulq_lane_s16(src1, taps1, tap_lane1);
- const int32x4_t center_vals_lo =
- vaddl_s16(vget_low_s16(product0), vget_low_s16(product1));
- const int32x4_t center_vals_hi =
- vaddl_s16(vget_high_s16(product0), vget_high_s16(product1));
-
- sum_lo = vaddq_s32(sum_lo, center_vals_lo);
- sum_hi = vaddq_s32(sum_hi, center_vals_hi);
- return vcombine_s16(vrshrn_n_s32(sum_lo, 3), vrshrn_n_s32(sum_hi, 3));
-}
-
-// TODO(b/133525024): Replace usage of this function with version that uses
-// unsigned trick, once cl/263050071 is submitted.
-template <int num_taps>
-inline int16x8_t SumTapsCompound(const int16x8_t* const src,
- const int16x8_t taps) {
- int16x8_t sum = vdupq_n_s16(1 << (kBitdepth8 + kFilterBits - 1));
- if (num_taps == 8) {
- const int16x4_t taps_lo = vget_low_s16(taps);
- const int16x4_t taps_hi = vget_high_s16(taps);
- sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
- sum = vmlaq_lane_s16(sum, src[1], taps_lo, 1);
- sum = vmlaq_lane_s16(sum, src[2], taps_lo, 2);
- sum = vmlaq_lane_s16(sum, src[5], taps_hi, 1);
- sum = vmlaq_lane_s16(sum, src[6], taps_hi, 2);
- sum = vmlaq_lane_s16(sum, src[7], taps_hi, 3);
-
- // Center taps may sum to as much as 160, which pollutes the sign bit in
- // int16 types.
- sum = CombineFilterTapsLong<3, 0>(sum, src[3], src[4], taps_lo, taps_hi);
- } else if (num_taps == 6) {
- const int16x4_t taps_lo = vget_low_s16(taps);
- const int16x4_t taps_hi = vget_high_s16(taps);
- sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
- sum = vmlaq_lane_s16(sum, src[1], taps_lo, 1);
- sum = vmlaq_lane_s16(sum, src[4], taps_hi, 0);
- sum = vmlaq_lane_s16(sum, src[5], taps_hi, 1);
-
- // Center taps in filter 0 may sum to as much as 148, which pollutes the
- // sign bit in int16 types. This is not true of filter 1.
- sum = CombineFilterTapsLong<2, 3>(sum, src[2], src[3], taps_lo, taps_lo);
- } else if (num_taps == 4) {
- const int16x4_t taps_lo = vget_low_s16(taps);
- sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
- sum = vmlaq_lane_s16(sum, src[3], taps_lo, 3);
-
- // Center taps.
- sum = vqaddq_s16(sum, vmulq_lane_s16(src[1], taps_lo, 1));
- sum = vrshrq_n_s16(vqaddq_s16(sum, vmulq_lane_s16(src[2], taps_lo, 2)),
- kInterRoundBitsHorizontal);
- } else {
- assert(num_taps == 2);
- // All the taps are positive so there is no concern regarding saturation.
- const int16x4_t taps_lo = vget_low_s16(taps);
- sum = vmlaq_lane_s16(sum, src[0], taps_lo, 0);
- sum = vrshrq_n_s16(vmlaq_lane_s16(sum, src[1], taps_lo, 1),
- kInterRoundBitsHorizontal);
+// There are many opportunities for overreading in scaled convolve, because the
+// range of starting points for filter windows is anywhere from 0 to 16 for 8
+// destination pixels, and the window sizes range from 2 to 8. To accommodate
+// this range concisely, we use |grade_x| to mean the most steps in src that can
+// be traversed in a single |step_x| increment, i.e. 1 or 2. When grade_x is 2,
+// we are guaranteed to exceed 8 whole steps in src for every 8 |step_x|
+// increments. The first load covers the initial elements of src_x, while the
+// final load covers the taps.
+template <int grade_x>
+inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
+ uint8x8x3_t ret;
+ const uint8x16_t src_val = vld1q_u8(src_x);
+ ret.val[0] = vget_low_u8(src_val);
+ ret.val[1] = vget_high_u8(src_val);
+ if (grade_x > 1) {
+ ret.val[2] = vld1_u8(src_x + 16);
}
- return sum;
+ return ret;
}
-// |grade_x| determines an upper limit on how many whole-pixel steps will be
-// realized with 8 |step_x| increments.
-template <int filter_index, int num_taps, int grade_x>
-inline void ConvolveHorizontalScaled_NEON(const uint8_t* src,
- const ptrdiff_t src_stride,
- const int width, const int subpixel_x,
- const int step_x,
- const int intermediate_height,
- int16_t* dst) {
- const int dst_stride = kMaxSuperBlockSizeInPixels;
- const int kernel_offset = (8 - num_taps) / 2;
- const int ref_x = subpixel_x >> kScaleSubPixelBits;
- int y = intermediate_height;
- do { // y > 0
- int p = subpixel_x;
- int prev_p = p;
- int x = 0;
- int16x8_t s[(grade_x + 1) * 8];
- const uint8_t* src_x =
- &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
- // TODO(petersonab,b/139707209): Fix source buffer overreads.
- // For example, when |height| == 2 and |num_taps| == 8 then
- // |intermediate_height| == 9. On the second pass this will load and
- // transpose 7 rows past where |src| may end.
- Load8x8(src_x, src_stride, s);
- Transpose8x8(s);
- if (grade_x > 1) {
- Load8x8(src_x + 8, src_stride, &s[8]);
- Transpose8x8(&s[8]);
- }
-
- do { // x < width
- int16x8_t result[8];
- src_x = &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
- // process 8 src_x steps
- Load8x8(src_x + 8, src_stride, &s[8]);
- Transpose8x8(&s[8]);
- if (grade_x > 1) {
- Load8x8(src_x + 16, src_stride, &s[16]);
- Transpose8x8(&s[16]);
- }
- // Remainder after whole index increments.
- int pixel_offset = p & ((1 << kScaleSubPixelBits) - 1);
- for (int z = 0; z < 8; ++z) {
- const int16x8_t filter = vld1q_s16(
- &kSubPixelFilters[filter_index][(p >> 6) & 0xF][kernel_offset]);
- result[z] = SumTapsCompound<num_taps>(
- &s[pixel_offset >> kScaleSubPixelBits], filter);
- pixel_offset += step_x;
- p += step_x;
- }
-
- // Transpose the 8x8 filtered values back to dst.
- Transpose8x8(result);
-
- vst1q_s16(&dst[x + 0 * dst_stride], result[0]);
- vst1q_s16(&dst[x + 1 * dst_stride], result[1]);
- vst1q_s16(&dst[x + 2 * dst_stride], result[2]);
- vst1q_s16(&dst[x + 3 * dst_stride], result[3]);
- vst1q_s16(&dst[x + 4 * dst_stride], result[4]);
- vst1q_s16(&dst[x + 5 * dst_stride], result[5]);
- vst1q_s16(&dst[x + 6 * dst_stride], result[6]);
- vst1q_s16(&dst[x + 7 * dst_stride], result[7]);
-
- for (int i = 0; i < 8; ++i) {
- s[i] =
- s[(p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits) + i];
- if (grade_x > 1) {
- s[i + 8] = s[(p >> kScaleSubPixelBits) -
- (prev_p >> kScaleSubPixelBits) + i + 8];
- }
- }
-
- prev_p = p;
- x += 8;
- } while (x < width);
-
- src += src_stride * 8;
- dst += dst_stride * 8;
- y -= 8;
- } while (y > 0);
-}
-
+// Pre-transpose the 2 tap filters in |kAbsHalfSubPixelFilters|[3]
inline uint8x16_t GetPositive2TapFilter(const int tap_index) {
assert(tap_index < 2);
- constexpr uint8_t kSubPixel2TapFilterColumns[2][16] = {
- {128, 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8},
- {0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120}};
+ alignas(
+ 16) static constexpr uint8_t kAbsHalfSubPixel2TapFilterColumns[2][16] = {
+ {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
+ {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
- return vld1q_u8(kSubPixel2TapFilterColumns[tap_index]);
+ return vld1q_u8(kAbsHalfSubPixel2TapFilterColumns[tap_index]);
}
+template <int grade_x>
inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
const ptrdiff_t src_stride,
const int width, const int subpixel_x,
const int step_x,
const int intermediate_height,
int16_t* intermediate) {
- const int kIntermediateStride = kMaxSuperBlockSizeInPixels;
// Account for the 0-taps that precede the 2 nonzero taps.
const int kernel_offset = 3;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
const int step_x8 = step_x << 3;
const uint8x16_t filter_taps0 = GetPositive2TapFilter(0);
const uint8x16_t filter_taps1 = GetPositive2TapFilter(1);
- const uint16x8_t sum = vdupq_n_u16(1 << (kBitdepth8 + kFilterBits - 1));
- uint16x8_t index_steps = vmulq_n_u16(vmovl_u8(vcreate_u8(0x0706050403020100)),
- static_cast<uint16_t>(step_x));
-
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
- for (int x = 0, p = subpixel_x; x < width; x += 8, p += step_x8) {
+
+ int p = subpixel_x;
+ if (width <= 4) {
const uint8_t* src_x =
&src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
- int16_t* intermediate_x = intermediate + x;
// Only add steps to the 10-bit truncated p to avoid overflow.
const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
@@ -1189,45 +871,86 @@
// For each x, a lane of tapsK has
// kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
// on x.
- const uint8x8_t taps0 = VQTbl1U8(filter_taps0, filter_indices);
- const uint8x8_t taps1 = VQTbl1U8(filter_taps1, filter_indices);
- for (int y = 0; y < intermediate_height; ++y) {
+ const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices)};
+ int y = 0;
+ do {
// Load a pool of samples to select from using stepped indices.
- uint8x16_t src_vals = vld1q_u8(src_x);
+ const uint8x16_t src_vals = vld1q_u8(src_x);
const uint8x8_t src_indices =
vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
// For each x, a lane of srcK contains src_x[k].
- const uint8x8_t src0 = VQTbl1U8(src_vals, src_indices);
- const uint8x8_t src1 =
- VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)));
+ const uint8x8_t src[2] = {
+ VQTbl1U8(src_vals, src_indices),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
- const uint16x8_t product0 = vmlal_u8(sum, taps0, src0);
- // product0 + product1
- const uint16x8_t result = vmlal_u8(product0, taps1, src1);
+ vst1q_s16(intermediate,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate += kIntermediateStride;
+ } while (++y < intermediate_height);
+ return;
+ }
- vst1q_s16(intermediate_x, vreinterpretq_s16_u16(vrshrq_n_u16(result, 3)));
+ // |width| >= 8
+ int x = 0;
+ do {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // This is a special case. The 2-tap filter has no negative taps, so we
+ // can use unsigned values.
+ // For each x, a lane of tapsK has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices)};
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+
+ // For each x, a lane of srcK contains src_x[k].
+ const uint8x8_t src[2] = {
+ vtbl3_u8(src_vals, src_indices),
+ vtbl3_u8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)))};
+
+ vst1q_s16(intermediate_x,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/3>(src, taps),
+ kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- }
- }
+ } while (++y < intermediate_height);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
}
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[5].
inline uint8x16_t GetPositive4TapFilter(const int tap_index) {
assert(tap_index < 4);
- constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
- {0, 30, 26, 22, 20, 18, 16, 14, 12, 12, 10, 8, 6, 4, 4, 2},
- {128, 62, 62, 62, 60, 58, 56, 54, 52, 48, 46, 44, 42, 40, 36, 34},
- {0, 34, 36, 40, 42, 44, 46, 48, 52, 54, 56, 58, 60, 62, 62, 62},
- {0, 2, 4, 4, 6, 8, 10, 12, 12, 14, 16, 18, 20, 22, 26, 30}};
+ alignas(
+ 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
+ {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
- uint8x16_t filter_taps =
- vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
- return filter_taps;
+ return vld1q_u8(kSubPixel4TapPositiveFilterColumns[tap_index]);
}
// This filter is only possible when width <= 4.
-inline void ConvolveKernelHorizontalPositive4Tap(
+void ConvolveKernelHorizontalPositive4Tap(
const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
const int step_x, const int intermediate_height, int16_t* intermediate) {
const int kernel_offset = 2;
@@ -1237,69 +960,60 @@
const uint8x16_t filter_taps1 = GetPositive4TapFilter(1);
const uint8x16_t filter_taps2 = GetPositive4TapFilter(2);
const uint8x16_t filter_taps3 = GetPositive4TapFilter(3);
- uint16x8_t index_steps = vmulq_n_u16(vmovl_u8(vcreate_u8(0x0706050403020100)),
- static_cast<uint16_t>(step_x));
- int p = subpixel_x;
- const uint16x8_t base = vdupq_n_u16(1 << (kBitdepth8 + kFilterBits - 1));
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+ const int p = subpixel_x;
// First filter is special, just a 128 tap on the center.
const uint8_t* src_x =
&src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
// Only add steps to the 10-bit truncated p to avoid overflow.
const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
- const uint8x8_t filter_indices =
- vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
+ const uint8x8_t filter_indices = vand_u8(
+ vshrn_n_u16(subpel_index_offsets, kFilterIndexShift), filter_index_mask);
// Note that filter_id depends on x.
// For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
- const uint8x8_t taps0 = VQTbl1U8(filter_taps0, filter_indices);
- const uint8x8_t taps1 = VQTbl1U8(filter_taps1, filter_indices);
- const uint8x8_t taps2 = VQTbl1U8(filter_taps2, filter_indices);
- const uint8x8_t taps3 = VQTbl1U8(filter_taps3, filter_indices);
+ const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices),
+ VQTbl1U8(filter_taps2, filter_indices),
+ VQTbl1U8(filter_taps3, filter_indices)};
const uint8x8_t src_indices =
vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
- for (int y = 0; y < intermediate_height; ++y) {
+ int y = 0;
+ do {
// Load a pool of samples to select from using stepped index vectors.
- uint8x16_t src_vals = vld1q_u8(src_x);
+ const uint8x16_t src_vals = vld1q_u8(src_x);
// For each x, srcK contains src_x[k] where k=1.
// Whereas taps come from different arrays, src pixels are drawn from the
// same contiguous line.
- const uint8x8_t src0 = VQTbl1U8(src_vals, src_indices);
- const uint8x8_t src1 =
- VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)));
- const uint8x8_t src2 =
- VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2)));
- const uint8x8_t src3 =
- VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)));
+ const uint8x8_t src[4] = {
+ VQTbl1U8(src_vals, src_indices),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1))),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2))),
+ VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)))};
- uint16x8_t sum = vmlal_u8(base, taps0, src0);
- sum = vmlal_u8(sum, taps1, src1);
- sum = vmlal_u8(sum, taps2, src2);
- sum = vmlal_u8(sum, taps3, src3);
-
- vst1_s16(intermediate,
- vreinterpret_s16_u16(vrshr_n_u16(vget_low_u16(sum), 3)));
+ vst1q_s16(intermediate,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/5>(src, taps),
+ kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate += kIntermediateStride;
- }
+ } while (++y < intermediate_height);
}
+// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
inline uint8x16_t GetSigned4TapFilter(const int tap_index) {
assert(tap_index < 4);
- // The first and fourth taps of each filter are negative. However
- // 128 does not fit in an 8-bit signed integer. Thus we use subtraction to
- // keep everything unsigned.
- constexpr uint8_t kSubPixel4TapSignedFilterColumns[4][16] = {
- {0, 4, 8, 10, 12, 12, 14, 12, 12, 10, 10, 10, 8, 6, 4, 2},
- {128, 126, 122, 116, 110, 102, 94, 84, 76, 66, 58, 48, 38, 28, 18, 8},
- {0, 8, 18, 28, 38, 48, 58, 66, 76, 84, 94, 102, 110, 116, 122, 126},
- {0, 2, 4, 6, 8, 10, 10, 10, 12, 12, 14, 12, 12, 10, 8, 4}};
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel4TapSignedFilterColumns[4][16] = {
+ {0, 2, 4, 5, 6, 6, 7, 6, 6, 5, 5, 5, 4, 3, 2, 1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 6, 6, 5, 4, 2}};
- uint8x16_t filter_taps =
- vld1q_u8(kSubPixel4TapSignedFilterColumns[tap_index]);
- return filter_taps;
+ return vld1q_u8(kAbsHalfSubPixel4TapSignedFilterColumns[tap_index]);
}
// This filter is only possible when width <= 4.
@@ -1313,66 +1027,480 @@
const uint8x16_t filter_taps1 = GetSigned4TapFilter(1);
const uint8x16_t filter_taps2 = GetSigned4TapFilter(2);
const uint8x16_t filter_taps3 = GetSigned4TapFilter(3);
- const uint16x8_t index_steps = vmulq_n_u16(vmovl_u8(vcreate_u8(0x03020100)),
- static_cast<uint16_t>(step_x));
+ const uint16x4_t index_steps = vmul_n_u16(vcreate_u16(0x0003000200010000),
+ static_cast<uint16_t>(step_x));
- const uint16x8_t base = vdupq_n_u16(1 << (kBitdepth8 + kFilterBits - 1));
- int p = subpixel_x;
+ const int p = subpixel_x;
const uint8_t* src_x =
&src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
// Only add steps to the 10-bit truncated p to avoid overflow.
- const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
- const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint16x4_t p_fraction = vdup_n_u16(p & 1023);
+ const uint16x4_t subpel_index_offsets = vadd_u16(index_steps, p_fraction);
+ const uint8x8_t filter_index_offsets = vshrn_n_u16(
+ vcombine_u16(subpel_index_offsets, vdup_n_u16(0)), kFilterIndexShift);
const uint8x8_t filter_indices =
- vand_u8(vshrn_n_u16(subpel_index_offsets, 6), filter_index_mask);
+ vand_u8(filter_index_offsets, filter_index_mask);
// Note that filter_id depends on x.
// For each x, tapsK has kSubPixelFilters[filter_index][filter_id][k].
- const uint8x8_t taps0 = VQTbl1U8(filter_taps0, filter_indices);
- const uint8x8_t taps1 = VQTbl1U8(filter_taps1, filter_indices);
- const uint8x8_t taps2 = VQTbl1U8(filter_taps2, filter_indices);
- const uint8x8_t taps3 = VQTbl1U8(filter_taps3, filter_indices);
- for (int y = 0; y < intermediate_height; ++y) {
+ const uint8x8_t taps[4] = {VQTbl1U8(filter_taps0, filter_indices),
+ VQTbl1U8(filter_taps1, filter_indices),
+ VQTbl1U8(filter_taps2, filter_indices),
+ VQTbl1U8(filter_taps3, filter_indices)};
+
+ const uint8x8_t src_indices_base =
+ vshr_n_u8(filter_index_offsets, kScaleSubPixelBits - kFilterIndexShift);
+
+ const uint8x8_t src_indices[4] = {src_indices_base,
+ vadd_u8(src_indices_base, vdup_n_u8(1)),
+ vadd_u8(src_indices_base, vdup_n_u8(2)),
+ vadd_u8(src_indices_base, vdup_n_u8(3))};
+
+ int y = 0;
+ do {
// Load a pool of samples to select from using stepped indices.
- uint8x16_t src_vals = vld1q_u8(src_x);
- const uint8x8_t src_indices =
- vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ const uint8x16_t src_vals = vld1q_u8(src_x);
// For each x, srcK contains src_x[k] where k=1.
// Whereas taps come from different arrays, src pixels are drawn from the
// same contiguous line.
- const uint8x8_t src0 = VQTbl1U8(src_vals, src_indices);
- const uint8x8_t src1 =
- VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(1)));
- const uint8x8_t src2 =
- VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(2)));
- const uint8x8_t src3 =
- VQTbl1U8(src_vals, vadd_u8(src_indices, vdup_n_u8(3)));
+ const uint8x8_t src[4] = {
+ VQTbl1U8(src_vals, src_indices[0]), VQTbl1U8(src_vals, src_indices[1]),
+ VQTbl1U8(src_vals, src_indices[2]), VQTbl1U8(src_vals, src_indices[3])};
- // Offsetting by base permits a guaranteed positive.
- uint16x8_t sum = vmlsl_u8(base, taps0, src0);
- sum = vmlal_u8(sum, taps1, src1);
- sum = vmlal_u8(sum, taps2, src2);
- sum = vmlsl_u8(sum, taps3, src3);
-
- vst1_s16(intermediate,
- vreinterpret_s16_u16(vrshr_n_u16(vget_low_u16(sum), 3)));
+ vst1q_s16(intermediate,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/4>(src, taps),
+ kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate += kIntermediateStride;
- }
+ } while (++y < intermediate_height);
}
-void ConvolveCompoundScale2D_NEON(
- const void* const reference, const ptrdiff_t reference_stride,
- const int horizontal_filter_index, const int vertical_filter_index,
- const int inter_round_bits_vertical, const int subpixel_x,
- const int subpixel_y, const int step_x, const int step_y, const int width,
- const int height, void* prediction, const ptrdiff_t pred_stride) {
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
+inline uint8x16_t GetSigned6TapFilter(const int tap_index) {
+ assert(tap_index < 6);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel6TapSignedFilterColumns[6][16] = {
+ {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
+ {0, 3, 5, 6, 7, 7, 8, 7, 7, 6, 6, 6, 5, 4, 2, 1},
+ {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
+ {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
+ {0, 1, 2, 4, 5, 6, 6, 6, 7, 7, 8, 7, 7, 6, 5, 3},
+ {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+ return vld1q_u8(kAbsHalfSubPixel6TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned6Tap(
+ const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const int subpixel_x, const int step_x, const int intermediate_height,
+ int16_t* intermediate) {
+ const int kernel_offset = 1;
+ const uint8x8_t one = vdup_n_u8(1);
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ uint8x16_t filter_taps[6];
+ for (int i = 0; i < 6; ++i) {
+ filter_taps[i] = GetSigned6TapFilter(i);
+ }
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ // Avoid overloading outside the reference boundaries. This means
+ // |trailing_width| can be up to 24.
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ uint8x8_t src_lookup[6];
+ src_lookup[0] = src_indices;
+ for (int i = 1; i < 6; ++i) {
+ src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+ }
+
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // For each x, a lane of taps[k] has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ uint8x8_t taps[6];
+ for (int i = 0; i < 6; ++i) {
+ taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+ }
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+ const uint8x8_t src[6] = {
+ vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+ vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+ vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5])};
+
+ vst1q_s16(intermediate_x,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/0>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (++y < intermediate_height);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[1]. This filter
+// has mixed positive and negative outer taps which are handled in
+// GetMixed6TapFilter().
+inline uint8x16_t GetPositive6TapFilter(const int tap_index) {
+ assert(tap_index < 6);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel6TapPositiveFilterColumns[4][16] = {
+ {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
+ {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
+ {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
+ {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14}};
+
+ return vld1q_u8(kAbsHalfSubPixel6TapPositiveFilterColumns[tap_index]);
+}
+
+inline int8x16_t GetMixed6TapFilter(const int tap_index) {
+ assert(tap_index < 2);
+ alignas(
+ 16) static constexpr int8_t kHalfSubPixel6TapMixedFilterColumns[2][16] = {
+ {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
+ {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
+
+ return vld1q_s8(kHalfSubPixel6TapMixedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalMixed6Tap(
+ const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const int subpixel_x, const int step_x, const int intermediate_height,
+ int16_t* intermediate) {
+ const int kernel_offset = 1;
+ const uint8x8_t one = vdup_n_u8(1);
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ uint8x8_t taps[4];
+ int16x8_t mixed_taps[2];
+ uint8x16_t positive_filter_taps[4];
+ for (int i = 0; i < 4; ++i) {
+ positive_filter_taps[i] = GetPositive6TapFilter(i);
+ }
+ int8x16_t mixed_filter_taps[2];
+ mixed_filter_taps[0] = GetMixed6TapFilter(0);
+ mixed_filter_taps[1] = GetMixed6TapFilter(1);
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ const uint8_t* src_x =
+ &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ uint8x8_t src_lookup[6];
+ src_lookup[0] = src_indices;
+ for (int i = 1; i < 6; ++i) {
+ src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+ }
+
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // For each x, a lane of taps[k] has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ for (int i = 0; i < 4; ++i) {
+ taps[i] = VQTbl1U8(positive_filter_taps[i], filter_indices);
+ }
+ mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
+ mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
+
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+ int16x8_t sum_mixed = vmulq_s16(
+ mixed_taps[0], ZeroExtend(vtbl3_u8(src_vals, src_lookup[0])));
+ sum_mixed = vmlaq_s16(sum_mixed, mixed_taps[1],
+ ZeroExtend(vtbl3_u8(src_vals, src_lookup[5])));
+ uint16x8_t sum = vreinterpretq_u16_s16(sum_mixed);
+ sum = vmlal_u8(sum, taps[0], vtbl3_u8(src_vals, src_lookup[1]));
+ sum = vmlal_u8(sum, taps[1], vtbl3_u8(src_vals, src_lookup[2]));
+ sum = vmlal_u8(sum, taps[2], vtbl3_u8(src_vals, src_lookup[3]));
+ sum = vmlal_u8(sum, taps[3], vtbl3_u8(src_vals, src_lookup[4]));
+
+ vst1q_s16(intermediate_x, vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (++y < intermediate_height);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// Pre-transpose the 8 tap filters in |kAbsHalfSubPixelFilters|[2].
+inline uint8x16_t GetSigned8TapFilter(const int tap_index) {
+ assert(tap_index < 8);
+ alignas(16) static constexpr uint8_t
+ kAbsHalfSubPixel8TapSignedFilterColumns[8][16] = {
+ {0, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0},
+ {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
+ {0, 3, 6, 9, 11, 11, 12, 12, 12, 11, 10, 9, 7, 5, 3, 1},
+ {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
+ {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
+ {0, 1, 3, 5, 7, 9, 10, 11, 12, 12, 12, 11, 11, 9, 6, 3},
+ {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
+ {0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1}};
+
+ return vld1q_u8(kAbsHalfSubPixel8TapSignedFilterColumns[tap_index]);
+}
+
+// This filter is only possible when width >= 8.
+template <int grade_x>
+inline void ConvolveKernelHorizontalSigned8Tap(
+ const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const int subpixel_x, const int step_x, const int intermediate_height,
+ int16_t* intermediate) {
+ const uint8x8_t one = vdup_n_u8(1);
+ const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
+ const int ref_x = subpixel_x >> kScaleSubPixelBits;
+ const int step_x8 = step_x << 3;
+ uint8x8_t taps[8];
+ uint8x16_t filter_taps[8];
+ for (int i = 0; i < 8; ++i) {
+ filter_taps[i] = GetSigned8TapFilter(i);
+ }
+ const uint16x8_t index_steps = vmulq_n_u16(
+ vmovl_u8(vcreate_u8(0x0706050403020100)), static_cast<uint16_t>(step_x));
+ int x = 0;
+ int p = subpixel_x;
+ do {
+ const uint8_t* src_x = &src[(p >> kScaleSubPixelBits) - ref_x];
+ int16_t* intermediate_x = intermediate + x;
+ // Only add steps to the 10-bit truncated p to avoid overflow.
+ const uint16x8_t p_fraction = vdupq_n_u16(p & 1023);
+ const uint16x8_t subpel_index_offsets = vaddq_u16(index_steps, p_fraction);
+ const uint8x8_t src_indices =
+ vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
+ uint8x8_t src_lookup[8];
+ src_lookup[0] = src_indices;
+ for (int i = 1; i < 8; ++i) {
+ src_lookup[i] = vadd_u8(src_lookup[i - 1], one);
+ }
+
+ const uint8x8_t filter_indices =
+ vand_u8(vshrn_n_u16(subpel_index_offsets, kFilterIndexShift),
+ filter_index_mask);
+ // For each x, a lane of taps[k] has
+ // kSubPixelFilters[filter_index][filter_id][k], where filter_id depends
+ // on x.
+ for (int i = 0; i < 8; ++i) {
+ taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
+ }
+
+ int y = 0;
+ do {
+ // Load a pool of samples to select from using stepped indices.
+ const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
+
+ const uint8x8_t src[8] = {
+ vtbl3_u8(src_vals, src_lookup[0]), vtbl3_u8(src_vals, src_lookup[1]),
+ vtbl3_u8(src_vals, src_lookup[2]), vtbl3_u8(src_vals, src_lookup[3]),
+ vtbl3_u8(src_vals, src_lookup[4]), vtbl3_u8(src_vals, src_lookup[5]),
+ vtbl3_u8(src_vals, src_lookup[6]), vtbl3_u8(src_vals, src_lookup[7])};
+
+ vst1q_s16(intermediate_x,
+ vrshrq_n_s16(SumOnePassTaps</*filter_index=*/2>(src, taps),
+ kInterRoundBitsHorizontal - 1));
+ src_x += src_stride;
+ intermediate_x += kIntermediateStride;
+ } while (++y < intermediate_height);
+ x += 8;
+ p += step_x8;
+ } while (x < width);
+}
+
+// This function handles blocks of width 2 or 4.
+template <int num_taps, int grade_y, int width, bool is_compound>
+void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
+ const int filter_index, const int step_y,
+ const int height, void* dest,
+ const ptrdiff_t dest_stride) {
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+ const int16_t* src_y = src;
+ // |dest| is 16-bit in compound mode, Pixel otherwise.
+ uint16_t* dest16_y = static_cast<uint16_t*>(dest);
+ uint8_t* dest_y = static_cast<uint8_t*>(dest);
+ int16x4_t s[num_taps + grade_y];
+
+ int p = subpixel_y & 1023;
+ int prev_p = p;
+ int y = 0;
+ do { // y < height
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = vld1_s16(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ int16x8_t filter =
+ vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ int16x4_t sums = Sum2DVerticalTaps4<num_taps, is_compound>(s, filter);
+ if (is_compound) {
+ assert(width != 2);
+ const uint16x4_t result = vreinterpret_u16_s16(sums);
+ vst1_u16(dest16_y, result);
+ } else {
+ const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+ if (width == 2) {
+ Store2<0>(dest_y, result);
+ } else {
+ StoreLo4(dest_y, result);
+ }
+ }
+ p += step_y;
+ const int p_diff =
+ (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+ prev_p = p;
+ // Here we load extra source in case it is needed. If |p_diff| == 0, these
+ // values will be unused, but it's faster to load than to branch.
+ s[num_taps] = vld1_s16(src_y + num_taps * src_stride);
+ if (grade_y > 1) {
+ s[num_taps + 1] = vld1_s16(src_y + (num_taps + 1) * src_stride);
+ }
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+
+ filter_id = (p >> 6) & kSubPixelMask;
+ filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ sums = Sum2DVerticalTaps4<num_taps, is_compound>(&s[p_diff], filter);
+ if (is_compound) {
+ assert(width != 2);
+ const uint16x4_t result = vreinterpret_u16_s16(sums);
+ vst1_u16(dest16_y, result);
+ } else {
+ const uint8x8_t result = vqmovun_s16(vcombine_s16(sums, sums));
+ if (width == 2) {
+ Store2<0>(dest_y, result);
+ } else {
+ StoreLo4(dest_y, result);
+ }
+ }
+ p += step_y;
+ src_y = src + (p >> kScaleSubPixelBits) * src_stride;
+ prev_p = p;
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+
+ y += 2;
+ } while (y < height);
+}
+
+template <int num_taps, int grade_y, bool is_compound>
+inline void ConvolveVerticalScale(const int16_t* src, const int width,
+ const int subpixel_y, const int filter_index,
+ const int step_y, const int height,
+ void* dest, const ptrdiff_t dest_stride) {
+ constexpr ptrdiff_t src_stride = kIntermediateStride;
+ // A possible improvement is to use arithmetic to decide how many times to
+ // apply filters to same source before checking whether to load new srcs.
+ // However, this will only improve performance with very small step sizes.
+ int16x8_t s[num_taps + grade_y];
+ // |dest| is 16-bit in compound mode, Pixel otherwise.
+ uint16_t* dest16_y;
+ uint8_t* dest_y;
+
+ int x = 0;
+ do { // x < width
+ const int16_t* src_x = src + x;
+ const int16_t* src_y = src_x;
+ dest16_y = static_cast<uint16_t*>(dest) + x;
+ dest_y = static_cast<uint8_t*>(dest) + x;
+ int p = subpixel_y & 1023;
+ int prev_p = p;
+ int y = 0;
+ do { // y < height
+ for (int i = 0; i < num_taps; ++i) {
+ s[i] = vld1q_s16(src_y + i * src_stride);
+ }
+ int filter_id = (p >> 6) & kSubPixelMask;
+ int16x8_t filter =
+ vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ int16x8_t sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(s, filter);
+ if (is_compound) {
+ vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+ } else {
+ vst1_u8(dest_y, vqmovun_s16(sum));
+ }
+ p += step_y;
+ const int p_diff =
+ (p >> kScaleSubPixelBits) - (prev_p >> kScaleSubPixelBits);
+ // |grade_y| > 1 always means p_diff > 0, so load vectors that may be
+ // needed. Otherwise, we only need to load one vector because |p_diff|
+ // can't exceed 1.
+ s[num_taps] = vld1q_s16(src_y + num_taps * src_stride);
+ if (grade_y > 1) {
+ s[num_taps + 1] = vld1q_s16(src_y + (num_taps + 1) * src_stride);
+ }
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+
+ filter_id = (p >> 6) & kSubPixelMask;
+ filter = vmovl_s8(vld1_s8(kHalfSubPixelFilters[filter_index][filter_id]));
+ sum = SimpleSum2DVerticalTaps<num_taps, is_compound>(&s[p_diff], filter);
+ if (is_compound) {
+ vst1q_u16(dest16_y, vreinterpretq_u16_s16(sum));
+ } else {
+ vst1_u8(dest_y, vqmovun_s16(sum));
+ }
+ p += step_y;
+ src_y = src_x + (p >> kScaleSubPixelBits) * src_stride;
+ prev_p = p;
+ dest16_y += dest_stride;
+ dest_y += dest_stride;
+
+ y += 2;
+ } while (y < height);
+ x += 8;
+ } while (x < width);
+}
+
+template <bool is_compound>
+void ConvolveScale2D_NEON(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index, const int subpixel_x,
+ const int subpixel_y, const int step_x,
+ const int step_y, const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ assert(step_x <= 2048);
+ const int num_vert_taps = GetNumTapsInFilter(vert_filter_index);
const int intermediate_height =
(((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
kScaleSubPixelBits) +
- kSubPixelTaps;
- // TODO(b/133525024): Decide whether it's worth branching to a special case
- // when step_x or step_y is 1024.
+ num_vert_taps;
assert(step_x <= 2048);
// The output of the horizontal filter, i.e. the intermediate_result, is
// guaranteed to fit in int16_t.
@@ -1384,49 +1512,71 @@
// When width > 4, the valid filter index range is always [0, 3].
// When width <= 4, the valid filter index range is always [3, 5].
// Similarly for height.
- const int kIntermediateStride = kMaxSuperBlockSizeInPixels;
int filter_index = GetFilterIndex(horizontal_filter_index, width);
int16_t* intermediate = intermediate_result;
- const auto* src = static_cast<const uint8_t*>(reference);
const ptrdiff_t src_stride = reference_stride;
- auto* dest = static_cast<uint16_t*>(prediction);
+ const auto* src = static_cast<const uint8_t*>(reference);
+ const int vert_kernel_offset = (8 - num_vert_taps) / 2;
+ src += vert_kernel_offset * src_stride;
+
+ // Derive the maximum value of |step_x| at which all source values fit in one
+ // 16-byte load. Final index is src_x + |num_taps| - 1 < 16
+ // step_x*7 is the final base subpel index for the shuffle mask for filter
+ // inputs in each iteration on large blocks. When step_x is large, we need a
+ // larger structure and use a larger table lookup in order to gather all
+ // filter inputs.
+ // |num_taps| - 1 is the shuffle index of the final filter input.
+ const int num_horiz_taps = GetNumTapsInFilter(horiz_filter_index);
+ const int kernel_start_ceiling = 16 - num_horiz_taps;
+ // This truncated quotient |grade_x_threshold| selects |step_x| such that:
+ // (step_x * 7) >> kScaleSubPixelBits < single load limit
+ const int grade_x_threshold =
+ (kernel_start_ceiling << kScaleSubPixelBits) / 7;
switch (filter_index) {
case 0:
- if (step_x < 1024) {
- ConvolveHorizontalScaled_NEON<0, 6, 1>(
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalSigned6Tap<2>(
src, src_stride, width, subpixel_x, step_x, intermediate_height,
intermediate);
} else {
- ConvolveHorizontalScaled_NEON<0, 6, 2>(
+ ConvolveKernelHorizontalSigned6Tap<1>(
src, src_stride, width, subpixel_x, step_x, intermediate_height,
intermediate);
}
break;
case 1:
- if (step_x < 1024) {
- ConvolveHorizontalScaled_NEON<1, 6, 1>(
- src, src_stride, width, subpixel_x, step_x, intermediate_height,
- intermediate);
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalMixed6Tap<2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+
} else {
- ConvolveHorizontalScaled_NEON<1, 6, 2>(
- src, src_stride, width, subpixel_x, step_x, intermediate_height,
- intermediate);
+ ConvolveKernelHorizontalMixed6Tap<1>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
}
break;
case 2:
- if (step_x <= 1024) {
- ConvolveHorizontalScaled_NEON<2, 8, 1>(
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontalSigned8Tap<2>(
src, src_stride, width, subpixel_x, step_x, intermediate_height,
intermediate);
} else {
- ConvolveHorizontalScaled_NEON<2, 8, 2>(
+ ConvolveKernelHorizontalSigned8Tap<1>(
src, src_stride, width, subpixel_x, step_x, intermediate_height,
intermediate);
}
break;
case 3:
- ConvolveKernelHorizontal2Tap(src, src_stride, width, subpixel_x, step_x,
- intermediate_height, intermediate);
+ if (step_x > grade_x_threshold) {
+ ConvolveKernelHorizontal2Tap<2>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ } else {
+ ConvolveKernelHorizontal2Tap<1>(src, src_stride, width, subpixel_x,
+ step_x, intermediate_height,
+ intermediate);
+ }
break;
case 4:
assert(width <= 4);
@@ -1441,23 +1591,135 @@
// Vertical filter.
filter_index = GetFilterIndex(vertical_filter_index, height);
intermediate = intermediate_result;
- const int offset_bits = kBitdepth8 + 2 * kFilterBits - 3;
- for (int y = 0, p = subpixel_y & 1023; y < height; ++y, p += step_y) {
- const int filter_id = (p >> 6) & kSubPixelMask;
- for (int x = 0; x < width; ++x) {
- // An offset to guarantee the sum is non negative.
- int sum = 1 << offset_bits;
- for (int k = 0; k < kSubPixelTaps; ++k) {
- sum +=
- kSubPixelFilters[filter_index][filter_id][k] *
- intermediate[((p >> kScaleSubPixelBits) + k) * kIntermediateStride +
- x];
+
+ switch (filter_index) {
+ case 0:
+ case 1:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<6, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<6, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 1, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<6, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<6, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<6, 2, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
}
- assert(sum >= 0 && sum < (1 << (offset_bits + 2)));
- dest[x] = static_cast<uint16_t>(
- RightShiftWithRounding(sum, inter_round_bits_vertical));
- }
- dest += pred_stride;
+ break;
+ case 2:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<8, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<8, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 1, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<8, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<8, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<8, 2, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
+ break;
+ case 3:
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<2, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<2, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 1, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<2, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<2, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<2, 2, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
+ break;
+ case 4:
+ default:
+ assert(filter_index == 4 || filter_index == 5);
+ assert(height <= 4);
+ if (step_y <= 1024) {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<4, 1, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<4, 1, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 1, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ } else {
+ if (!is_compound && width == 2) {
+ ConvolveVerticalScale4xH<4, 2, 2, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else if (width == 4) {
+ ConvolveVerticalScale4xH<4, 2, 4, is_compound>(
+ intermediate, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ } else {
+ ConvolveVerticalScale<4, 2, is_compound>(
+ intermediate, width, subpixel_y, filter_index, step_y, height,
+ prediction, pred_stride);
+ }
+ }
}
}
@@ -1465,65 +1727,75 @@
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int /*vertical_filter_index*/,
- const int /*inter_round_bits_vertical*/,
const int subpixel_x, const int /*subpixel_y*/,
- const int /*step_x*/, const int /*step_y*/,
const int width, const int height,
void* prediction, const ptrdiff_t pred_stride) {
- // For 8 (and 10) bit calculations |inter_round_bits_horizontal| is 3.
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
// Set |src| to the outermost tap.
const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
auto* dest = static_cast<uint8_t*>(prediction);
- HorizontalPass<false, true>(src, reference_stride, dest, pred_stride, width,
- height, subpixel_x, filter_index);
+ DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+ subpixel_x, filter_index);
}
-template <int min_width, int num_taps>
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+uint16x8_t Compound1DShift(const int16x8_t sum) {
+ return vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+}
+
+template <int filter_index, bool is_compound = false,
+ bool negative_outside_taps = false>
void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
- uint8_t* dst, const ptrdiff_t dst_stride, const int width,
- const int height, const int16x8_t taps) {
- constexpr int next_row = num_taps - 1;
- // |src| points to the outermost tap of the first value. When doing fewer than
- // 8 taps it needs to be adjusted.
- if (num_taps == 6) {
- src += src_stride;
- } else if (num_taps == 4) {
- src += 2 * src_stride;
- } else if (num_taps == 2) {
- src += 3 * src_stride;
- }
+ void* const dst, const ptrdiff_t dst_stride,
+ const int width, const int height,
+ const uint8x8_t* const taps) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps - 1;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+ assert(width >= 8);
int x = 0;
do {
- int16x8_t srcs[8];
- srcs[0] = ZeroExtend(vld1_u8(src + x));
+ const uint8_t* src_x = src + x;
+ uint8x8_t srcs[8];
+ srcs[0] = vld1_u8(src_x);
+ src_x += src_stride;
if (num_taps >= 4) {
- srcs[1] = ZeroExtend(vld1_u8(src + x + src_stride));
- srcs[2] = ZeroExtend(vld1_u8(src + x + 2 * src_stride));
+ srcs[1] = vld1_u8(src_x);
+ src_x += src_stride;
+ srcs[2] = vld1_u8(src_x);
+ src_x += src_stride;
if (num_taps >= 6) {
- srcs[3] = ZeroExtend(vld1_u8(src + x + 3 * src_stride));
- srcs[4] = ZeroExtend(vld1_u8(src + x + 4 * src_stride));
+ srcs[3] = vld1_u8(src_x);
+ src_x += src_stride;
+ srcs[4] = vld1_u8(src_x);
+ src_x += src_stride;
if (num_taps == 8) {
- srcs[5] = ZeroExtend(vld1_u8(src + x + 5 * src_stride));
- srcs[6] = ZeroExtend(vld1_u8(src + x + 6 * src_stride));
+ srcs[5] = vld1_u8(src_x);
+ src_x += src_stride;
+ srcs[6] = vld1_u8(src_x);
+ src_x += src_stride;
}
}
}
int y = 0;
do {
- srcs[next_row] =
- ZeroExtend(vld1_u8(src + x + (y + next_row) * src_stride));
+ srcs[next_row] = vld1_u8(src_x);
+ src_x += src_stride;
- const int16x8_t sums = SumTaps<num_taps>(srcs, taps);
- const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits);
-
- if (min_width == 4) {
- StoreLo4(dst + x + y * dst_stride, results);
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+ vst1q_u16(dst16 + x + y * dst_stride, results);
} else {
- vst1_u8(dst + x + y * dst_stride, results);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+ vst1_u8(dst8 + x + y * dst_stride, results);
}
srcs[0] = srcs[1];
@@ -1544,6 +1816,394 @@
} while (x < width);
}
+template <int filter_index, bool is_compound = false,
+ bool negative_outside_taps = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const uint8x8_t* const taps) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ uint8x8_t srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4<0>(src, srcs[2]);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+
+ int y = 0;
+ do {
+ srcs[2] = Load4<1>(src, srcs[2]);
+ src += src_stride;
+ srcs[4] = Load4<0>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ srcs[6] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+ srcs[2] = Load4<1>(src, srcs[2]);
+ src += src_stride;
+ srcs[4] = Load4(src);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+
+ int y = 0;
+ do {
+ srcs[4] = Load4<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[6] = Load4<0>(src, srcs[6]);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ y += 2;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ srcs[8] = vdup_n_u8(0);
+
+ srcs[0] = Load4(src);
+ src += src_stride;
+ srcs[0] = Load4<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load4(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 4);
+ srcs[2] = Load4<1>(src, srcs[2]);
+ src += src_stride;
+ srcs[4] = Load4(src);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[2], srcs[4], 4);
+ srcs[4] = Load4<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[6] = Load4(src);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[6], 4);
+
+ int y = 0;
+ do {
+ srcs[6] = Load4<1>(src, srcs[6]);
+ src += src_stride;
+ srcs[8] = Load4<0>(src, srcs[8]);
+ src += src_stride;
+ srcs[7] = vext_u8(srcs[6], srcs[8], 4);
+
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ if (is_compound) {
+ const uint16x8_t results = Compound1DShift(sums);
+
+ vst1q_u16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ StoreLo4(dst8, results);
+ dst8 += dst_stride;
+ StoreHi4(dst8, results);
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ y += 2;
+ } while (y < height);
+ }
+}
+
+template <int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const uint8x8_t* const taps) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ uint8x8_t srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[2] = Load2<0>(src, srcs[2]);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[2], 2);
+
+ // This uses srcs[0]..srcs[1].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[2];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 4) {
+ srcs[4] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+
+ int y = 0;
+ do {
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[4] = Load2<0>(src, srcs[4]);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+ // This uses srcs[0]..srcs[3].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 6) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[4] = Load2(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+
+ int y = 0;
+ do {
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+
+ // This uses srcs[0]..srcs[5].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ } else if (num_taps == 8) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = vdup_n_u8(0);
+
+ srcs[0] = Load2(src);
+ src += src_stride;
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ srcs[4] = Load2(src);
+ src += src_stride;
+ srcs[1] = vext_u8(srcs[0], srcs[4], 2);
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ srcs[2] = vext_u8(srcs[0], srcs[4], 4);
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ srcs[3] = vext_u8(srcs[0], srcs[4], 6);
+
+ int y = 0;
+ do {
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ srcs[5] = vext_u8(srcs[4], srcs[8], 2);
+ srcs[8] = Load2<1>(src, srcs[8]);
+ src += src_stride;
+ srcs[6] = vext_u8(srcs[4], srcs[8], 4);
+ srcs[8] = Load2<2>(src, srcs[8]);
+ src += src_stride;
+ srcs[7] = vext_u8(srcs[4], srcs[8], 6);
+
+ // This uses srcs[0]..srcs[7].
+ const int16x8_t sums =
+ SumOnePassTaps<filter_index, negative_outside_taps>(srcs, taps);
+ const uint8x8_t results = vqrshrun_n_s16(sums, kFilterBits - 1);
+
+ Store2<0>(dst8, results);
+ dst8 += dst_stride;
+ Store2<1>(dst8, results);
+ dst8 += dst_stride;
+ Store2<2>(dst8, results);
+ dst8 += dst_stride;
+ Store2<3>(dst8, results);
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ y += 4;
+ } while (y < height);
+ }
+}
+
// This function is a simplified version of Convolve2D_C.
// It is called when it is single prediction mode, where only vertical
// filtering is required.
@@ -1553,107 +2213,129 @@
const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/,
const int vertical_filter_index,
- const int /*inter_round_bits_vertical*/,
const int /*subpixel_x*/, const int subpixel_y,
- const int /*step_x*/, const int /*step_y*/,
const int width, const int height, void* prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
- const auto* src =
- static_cast<const uint8_t*>(reference) - kVerticalOffset * src_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
auto* dest = static_cast<uint8_t*>(prediction);
const ptrdiff_t dest_stride = pred_stride;
const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
- // First filter is always a copy.
- if (filter_id == 0) {
- // Move |src| down the actual values and not the start of the context.
- src = static_cast<const uint8_t*>(reference);
- int y = 0;
- do {
- memcpy(dest, src, width * sizeof(src[0]));
- src += src_stride;
- dest += dest_stride;
- } while (++y < height);
- return;
+ assert(filter_id != 0);
+
+ uint8x8_t taps[8];
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ taps[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
}
- // Break up by # of taps
- // |filter_index| taps enum InterpolationFilter
- // 0 6 kInterpolationFilterEightTap
- // 1 6 kInterpolationFilterEightTapSmooth
- // 2 8 kInterpolationFilterEightTapSharp
- // 3 2 kInterpolationFilterBilinear
- // 4 4 kInterpolationFilterSwitchable
- // 5 4 !!! SECRET FILTER !!! only for Wx4.
- if (width >= 4) {
- if (filter_index == 2) { // 8 tap.
- const int16x8_t taps =
- vld1q_s16(kSubPixelFilters[filter_index][filter_id]);
- if (width == 4) {
- FilterVertical<4, 8>(src, src_stride, dest, dest_stride, width, height,
- taps);
- } else {
- FilterVertical<8, 8>(src, src_stride, dest, dest_stride, width, height,
- taps);
- }
- } else if (filter_index < 2) { // 6 tap.
- const int16x8_t taps =
- vld1q_s16(kSubPixelFilters[filter_index][filter_id]);
- if (width == 4) {
- FilterVertical<4, 6>(src, src_stride, dest, dest_stride, width, height,
- taps);
- } else {
- FilterVertical<8, 6>(src, src_stride, dest, dest_stride, width, height,
- taps);
- }
- } else if (filter_index > 3) { // 4 tap.
- // Store taps in vget_low_s16(taps).
- const int16x8_t taps =
- vld1q_s16(kSubPixelFilters[filter_index][filter_id] + 2);
- if (width == 4) {
- FilterVertical<4, 4>(src, src_stride, dest, dest_stride, width, height,
- taps);
- } else {
- FilterVertical<8, 4>(src, src_stride, dest, dest_stride, width, height,
- taps);
- }
- } else { // 2 tap.
- // Store taps in vget_low_s16(taps).
- const int16x8_t taps =
- vld1q_s16(kSubPixelFilters[filter_index][filter_id] + 2);
- if (width == 4) {
- FilterVertical<4, 2>(src, src_stride, dest, dest_stride, width, height,
- taps);
- } else {
- FilterVertical<8, 2>(src, src_stride, dest, dest_stride, width, height,
- taps);
- }
+ if (filter_index == 0) { // 6 tap.
+ if (width == 2) {
+ FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else if (width == 4) {
+ FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else {
+ FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
+ taps + 1);
+ }
+ } else if ((filter_index == 1) &
+ ((filter_id == 1) | (filter_id == 15))) { // 5 tap.
+ if (width == 2) {
+ FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else if (width == 4) {
+ FilterVertical4xH<1>(src, src_stride, dest, dest_stride, height,
+ taps + 1);
+ } else {
+ FilterVertical<1>(src, src_stride, dest, dest_stride, width, height,
+ taps + 1);
+ }
+ } else if ((filter_index == 1) &
+ ((filter_id == 7) | (filter_id == 8) |
+ (filter_id == 9))) { // 6 tap with weird negative taps.
+ if (width == 2) {
+ FilterVertical2xH<1,
+ /*negative_outside_taps=*/true>(
+ src, src_stride, dest, dest_stride, height, taps + 1);
+ } else if (width == 4) {
+ FilterVertical4xH<1, /*is_compound=*/false,
+ /*negative_outside_taps=*/true>(
+ src, src_stride, dest, dest_stride, height, taps + 1);
+ } else {
+ FilterVertical<1, /*is_compound=*/false, /*negative_outside_taps=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps + 1);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ if (width == 2) {
+ FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else if (width == 4) {
+ FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ } else {
+ FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
+ taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ if (width == 2) {
+ FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height,
+ taps + 3);
+ } else if (width == 4) {
+ FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height,
+ taps + 3);
+ } else {
+ FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
+ taps + 3);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ // Outside taps are negative.
+ if (width == 2) {
+ FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else if (width == 4) {
+ FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else {
+ FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
+ taps + 2);
}
} else {
- assert(width == 2);
- const int taps = NumTapsInFilter(filter_index);
- src =
- static_cast<const uint8_t*>(reference) - ((taps / 2) - 1) * src_stride;
- VerticalPass2xH</*is_2d=*/false>(src, src_stride, dest, pred_stride, height,
- 0, filter_index, taps, subpixel_y);
+ // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
+ // to 4 tap filters.
+ assert(filter_index == 5 ||
+ (filter_index == 1 &&
+ (filter_id == 2 || filter_id == 3 || filter_id == 4 ||
+ filter_id == 5 || filter_id == 6 || filter_id == 10 ||
+ filter_id == 11 || filter_id == 12 || filter_id == 13 ||
+ filter_id == 14)));
+ // According to GetNumTapsInFilter() this has 6 taps but here we are
+ // treating it as though it has 4.
+ if (filter_index == 1) src += src_stride;
+ if (width == 2) {
+ FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else if (width == 4) {
+ FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height,
+ taps + 2);
+ } else {
+ FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
+ taps + 2);
+ }
}
}
void ConvolveCompoundCopy_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
- const int /*inter_round_bits_vertical*/, const int /*subpixel_x*/,
- const int /*subpixel_y*/, const int /*step_x*/, const int /*step_y*/,
- const int width, const int height, void* prediction,
- const ptrdiff_t pred_stride) {
+ const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
+ const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
const auto* src = static_cast<const uint8_t*>(reference);
const ptrdiff_t src_stride = reference_stride;
auto* dest = static_cast<uint16_t*>(prediction);
- const int bitdepth = 8;
- const int compound_round_offset =
- (1 << (bitdepth + 4)) + (1 << (bitdepth + 3));
- const uint16x8_t v_compound_round_offset = vdupq_n_u16(compound_round_offset);
+ constexpr int final_shift =
+ kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
if (width >= 16) {
int y = 0;
@@ -1661,226 +2343,161 @@
int x = 0;
do {
const uint8x16_t v_src = vld1q_u8(&src[x]);
- const uint16x8_t v_src_x16_lo = vshll_n_u8(vget_low_u8(v_src), 4);
- const uint16x8_t v_src_x16_hi = vshll_n_u8(vget_high_u8(v_src), 4);
const uint16x8_t v_dest_lo =
- vaddq_u16(v_src_x16_lo, v_compound_round_offset);
+ vshll_n_u8(vget_low_u8(v_src), final_shift);
const uint16x8_t v_dest_hi =
- vaddq_u16(v_src_x16_hi, v_compound_round_offset);
+ vshll_n_u8(vget_high_u8(v_src), final_shift);
vst1q_u16(&dest[x], v_dest_lo);
x += 8;
vst1q_u16(&dest[x], v_dest_hi);
x += 8;
} while (x < width);
src += src_stride;
- dest += pred_stride;
+ dest += width;
} while (++y < height);
} else if (width == 8) {
int y = 0;
do {
const uint8x8_t v_src = vld1_u8(&src[0]);
- const uint16x8_t v_src_x16 = vshll_n_u8(v_src, 4);
- vst1q_u16(&dest[0], vaddq_u16(v_src_x16, v_compound_round_offset));
+ const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+ vst1q_u16(&dest[0], v_dest);
src += src_stride;
- dest += pred_stride;
+ dest += width;
} while (++y < height);
- } else if (width == 4) {
- const uint8x8_t zero = vdup_n_u8(0);
+ } else { /* width == 4 */
+ uint8x8_t v_src = vdup_n_u8(0);
+
int y = 0;
do {
- const uint8x8_t v_src = LoadLo4(&src[0], zero);
- const uint16x8_t v_src_x16 = vshll_n_u8(v_src, 4);
- const uint16x8_t v_dest = vaddq_u16(v_src_x16, v_compound_round_offset);
- vst1_u16(&dest[0], vget_low_u16(v_dest));
+ v_src = Load4<0>(&src[0], v_src);
src += src_stride;
- dest += pred_stride;
- } while (++y < height);
- } else { // width == 2
- assert(width == 2);
- int y = 0;
- do {
- dest[0] = (src[0] << 4) + compound_round_offset;
- dest[1] = (src[1] << 4) + compound_round_offset;
+ v_src = Load4<1>(&src[0], v_src);
src += src_stride;
- dest += pred_stride;
- } while (++y < height);
+ const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
+ vst1q_u16(&dest[0], v_dest);
+ dest += 4 << 1;
+ y += 2;
+ } while (y < height);
}
}
-// Input 8 bits and output 16 bits.
-template <int min_width, int num_taps>
-void FilterCompoundVertical(const uint8_t* src, const ptrdiff_t src_stride,
- uint16_t* dst, const ptrdiff_t dst_stride,
- const int width, const int height,
- const int16x8_t taps) {
- constexpr int next_row = num_taps - 1;
- // |src| points to the outermost tap of the first value. When doing fewer than
- // 8 taps it needs to be adjusted.
- if (num_taps == 6) {
- src += src_stride;
- } else if (num_taps == 4) {
- src += 2 * src_stride;
- } else if (num_taps == 2) {
- src += 3 * src_stride;
- }
-
- const uint16x8_t compound_round_offset = vdupq_n_u16(1 << 12);
-
- int x = 0;
- do {
- int16x8_t srcs[8];
- srcs[0] = ZeroExtend(vld1_u8(src + x));
- if (num_taps >= 4) {
- srcs[1] = ZeroExtend(vld1_u8(src + x + src_stride));
- srcs[2] = ZeroExtend(vld1_u8(src + x + 2 * src_stride));
- if (num_taps >= 6) {
- srcs[3] = ZeroExtend(vld1_u8(src + x + 3 * src_stride));
- srcs[4] = ZeroExtend(vld1_u8(src + x + 4 * src_stride));
- if (num_taps == 8) {
- srcs[5] = ZeroExtend(vld1_u8(src + x + 5 * src_stride));
- srcs[6] = ZeroExtend(vld1_u8(src + x + 6 * src_stride));
- }
- }
- }
-
- int y = 0;
- do {
- srcs[next_row] =
- ZeroExtend(vld1_u8(src + x + (y + next_row) * src_stride));
-
- const uint16x8_t sums = SumTaps8To16<num_taps>(srcs, taps);
- const uint16x8_t shifted = vrshrq_n_u16(sums, 3);
- // In order to keep the sum in 16 bits we add an offset to the sum
- // (1 << (bitdepth + kFilterBits - 1) == 1 << 14). This ensures that the
- // results will never be negative.
- // Normally ConvolveCompoundVertical would add |compound_round_offset| at
- // the end. Instead we use that to compensate for the initial offset.
- // (1 << (bitdepth + 4)) + (1 << (bitdepth + 3)) == (1 << 12) + (1 << 11)
- // After taking into account the shift above:
- // RightShiftWithRounding(LeftShift(sum, bits_shift),
- // inter_round_bits_vertical)
- // where bits_shift == kFilterBits - kInterRoundBitsHorizontal == 4
- // and inter_round_bits_vertical == 7
- // and simplifying it to RightShiftWithRounding(sum, 3)
- // we see that the initial offset of 1 << 14 >> 3 == 1 << 11 and
- // |compound_round_offset| can be simplified to 1 << 12.
- const uint16x8_t offset = vaddq_u16(shifted, compound_round_offset);
-
- if (min_width == 4) {
- vst1_u16(dst + x + y * dst_stride, vget_low_u16(offset));
- } else {
- vst1q_u16(dst + x + y * dst_stride, offset);
- }
-
- srcs[0] = srcs[1];
- if (num_taps >= 4) {
- srcs[1] = srcs[2];
- srcs[2] = srcs[3];
- if (num_taps >= 6) {
- srcs[3] = srcs[4];
- srcs[4] = srcs[5];
- if (num_taps == 8) {
- srcs[5] = srcs[6];
- srcs[6] = srcs[7];
- }
- }
- }
- } while (++y < height);
- x += 8;
- } while (x < width);
-}
-
void ConvolveCompoundVertical_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int vertical_filter_index,
- const int /*inter_round_bits_vertical*/, const int /*subpixel_x*/,
- const int subpixel_y, const int /*step_x*/, const int /*step_y*/,
- const int width, const int height, void* prediction,
- const ptrdiff_t pred_stride) {
+ const int /*subpixel_x*/, const int subpixel_y, const int width,
+ const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
- const auto* src =
- static_cast<const uint8_t*>(reference) - kVerticalOffset * src_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
auto* dest = static_cast<uint16_t*>(prediction);
const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
+ assert(filter_id != 0);
- if (width >= 4) {
- const int16x8_t taps = vld1q_s16(kSubPixelFilters[filter_index][filter_id]);
+ uint8x8_t taps[8];
+ for (int k = 0; k < kSubPixelTaps; ++k) {
+ taps[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
+ }
- if (filter_index == 2) { // 8 tap.
- if (width == 4) {
- FilterCompoundVertical<4, 8>(src, src_stride, dest, pred_stride, width,
- height, taps);
- } else {
- FilterCompoundVertical<8, 8>(src, src_stride, dest, pred_stride, width,
- height, taps);
- }
- } else if (filter_index < 2) { // 6 tap.
- if (width == 4) {
- FilterCompoundVertical<4, 6>(src, src_stride, dest, pred_stride, width,
- height, taps);
- } else {
- FilterCompoundVertical<8, 6>(src, src_stride, dest, pred_stride, width,
- height, taps);
- }
- } else if (filter_index == 3) { // 2 tap.
- if (width == 4) {
- FilterCompoundVertical<4, 2>(src, src_stride, dest, pred_stride, width,
- height, taps);
- } else {
- FilterCompoundVertical<8, 2>(src, src_stride, dest, pred_stride, width,
- height, taps);
- }
- } else if (filter_index > 3) { // 4 tap.
- if (width == 4) {
- FilterCompoundVertical<4, 4>(src, src_stride, dest, pred_stride, width,
- height, taps);
- } else {
- FilterCompoundVertical<8, 4>(src, src_stride, dest, pred_stride, width,
- height, taps);
- }
+ if (filter_index == 0) { // 6 tap.
+ if (width == 4) {
+ FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 1);
+ } else {
+ FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 1);
+ }
+ } else if ((filter_index == 1) &
+ ((filter_id == 1) | (filter_id == 15))) { // 5 tap.
+ if (width == 4) {
+ FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 1);
+ } else {
+ FilterVertical<1, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 1);
+ }
+ } else if ((filter_index == 1) &
+ ((filter_id == 7) | (filter_id == 8) |
+ (filter_id == 9))) { // 6 tap with weird negative taps.
+ if (width == 4) {
+ FilterVertical4xH<1, /*is_compound=*/true,
+ /*negative_outside_taps=*/true>(src, src_stride, dest,
+ 4, height, taps + 1);
+ } else {
+ FilterVertical<1, /*is_compound=*/true, /*negative_outside_taps=*/true>(
+ src, src_stride, dest, width, width, height, taps + 1);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ if (width == 4) {
+ FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
+ } else {
+ FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ if (width == 4) {
+ FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 3);
+ } else {
+ FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 3);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ if (width == 4) {
+ FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 2);
+ } else {
+ FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 2);
}
} else {
- assert(width == 2);
- const int taps = NumTapsInFilter(filter_index);
- src =
- static_cast<const uint8_t*>(reference) - ((taps / 2) - 1) * src_stride;
- VerticalPass2xH</*is_2d=*/false, /*is_compound=*/true>(
- src, src_stride, dest, pred_stride, height, 0, filter_index, taps,
- subpixel_y);
+ // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
+ // to 4 tap filters.
+ assert(filter_index == 5 ||
+ (filter_index == 1 &&
+ (filter_id == 2 || filter_id == 3 || filter_id == 4 ||
+ filter_id == 5 || filter_id == 6 || filter_id == 10 ||
+ filter_id == 11 || filter_id == 12 || filter_id == 13 ||
+ filter_id == 14)));
+ // According to GetNumTapsInFilter() this has 6 taps but here we are
+ // treating it as though it has 4.
+ if (filter_index == 1) src += src_stride;
+ if (width == 4) {
+ FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps + 2);
+ } else {
+ FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
+ width, height, taps + 2);
+ }
}
}
void ConvolveCompoundHorizontal_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int horizontal_filter_index, const int /*vertical_filter_index*/,
- const int /*inter_round_bits_vertical*/, const int subpixel_x,
- const int /*subpixel_y*/, const int /*step_x*/, const int /*step_y*/,
- const int width, const int height, void* prediction,
- const ptrdiff_t pred_stride) {
+ const int subpixel_x, const int /*subpixel_y*/, const int width,
+ const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
auto* dest = static_cast<uint16_t*>(prediction);
- HorizontalPass(src, reference_stride, dest, pred_stride, width, height,
- subpixel_x, filter_index);
+ DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+ src, reference_stride, dest, width, width, height, subpixel_x,
+ filter_index);
}
-void ConvolveCompound2D_NEON(const void* const reference,
- const ptrdiff_t reference_stride,
- const int horizontal_filter_index,
- const int vertical_filter_index,
- const int inter_round_bits_vertical,
- const int subpixel_x, const int subpixel_y,
- const int /*step_x*/, const int /*step_y*/,
- const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveCompound2D_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int horizontal_filter_index, const int vertical_filter_index,
+ const int subpixel_x, const int subpixel_y, const int width,
+ const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
// The output of the horizontal filter, i.e. the intermediate_result, is
// guaranteed to fit in int16_t.
uint16_t
intermediate_result[kMaxSuperBlockSizeInPixels *
(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
- const int intermediate_stride = kMaxSuperBlockSizeInPixels;
// Horizontal filter.
// Filter types used for width <= 4 are different from those for width > 4.
@@ -1889,66 +2506,586 @@
// Similarly for height.
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
- const int horizontal_taps = NumTapsInFilter(horiz_filter_index);
- const int vertical_taps = NumTapsInFilter(vert_filter_index);
- uint16_t* intermediate = intermediate_result;
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
const int intermediate_height = height + vertical_taps - 1;
const ptrdiff_t src_stride = reference_stride;
- const auto* src = static_cast<const uint8_t*>(reference) -
- kVerticalOffset * src_stride - kHorizontalOffset;
+ const auto* const src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
+
+ DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ subpixel_x, horiz_filter_index);
+
+ // Vertical filter.
auto* dest = static_cast<uint16_t*>(prediction);
- int filter_id = (subpixel_x >> 6) & kSubPixelMask;
+ const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
+ assert(filter_id != 0);
- if (width >= 4) {
- // TODO(johannkoenig): Use |width| for |intermediate_stride|.
- src = static_cast<const uint8_t*>(reference) -
- (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
- HorizontalPass<true>(src, src_stride, intermediate_result,
- intermediate_stride, width, intermediate_height,
- subpixel_x, horiz_filter_index);
+ const ptrdiff_t dest_stride = width;
+ const int16x8_t taps =
+ vmovl_s8(vld1_s8(kHalfSubPixelFilters[vert_filter_index][filter_id]));
- // Vertical filter.
- intermediate = intermediate_result;
- filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
-
- const ptrdiff_t dest_stride = pred_stride;
- const int16x8_t taps =
- vld1q_s16(kSubPixelFilters[vert_filter_index][filter_id]);
-
- if (vertical_taps == 8) {
+ if (vertical_taps == 8) {
+ if (width == 4) {
+ Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
Filter2DVertical<8, /*is_compound=*/true>(
- intermediate, intermediate_stride, dest, dest_stride, width, height,
- taps, inter_round_bits_vertical);
- } else if (vertical_taps == 6) {
- Filter2DVertical<6, /*is_compound=*/true>(
- intermediate, intermediate_stride, dest, dest_stride, width, height,
- taps, inter_round_bits_vertical);
- } else if (vertical_taps == 4) {
- Filter2DVertical<4, /*is_compound=*/true>(
- intermediate, intermediate_stride, dest, dest_stride, width, height,
- taps, inter_round_bits_vertical);
- } else { // |vertical_taps| == 2
- Filter2DVertical<2, /*is_compound=*/true>(
- intermediate, intermediate_stride, dest, dest_stride, width, height,
- taps, inter_round_bits_vertical);
+ intermediate_result, dest, dest_stride, width, height, taps);
}
+ } else if (vertical_taps == 6) {
+ if (width == 4) {
+ Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<6, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 4) {
+ if (width == 4) {
+ Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<4, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else { // |vertical_taps| == 2
+ if (width == 4) {
+ Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<2, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ }
+}
+
+inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+ const uint8x16_t left = vld1q_u8(src);
+ const uint8x16_t right = vld1q_u8(src + 1);
+ vst1q_u8(dst, vrhaddq_u8(left, right));
+}
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const int height, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+
+ int y = 0;
+ do {
+ HalfAddHorizontal(src, dst);
+ if (width >= 32) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width >= 64) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width == 128) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (++y < height);
+}
+
+void ConvolveIntraBlockCopyHorizontal_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
+ const int height, void* const prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ int y = 0;
+ do {
+ const uint8x8_t left = vld1_u8(src);
+ const uint8x8_t right = vld1_u8(src + 1);
+ vst1_u8(dest, vrhadd_u8(left, right));
+
+ src += reference_stride;
+ dest += pred_stride;
+ } while (++y < height);
+ } else if (width == 4) {
+ uint8x8_t left = vdup_n_u8(0);
+ uint8x8_t right = vdup_n_u8(0);
+ int y = 0;
+ do {
+ left = Load4<0>(src, left);
+ right = Load4<0>(src + 1, right);
+ src += reference_stride;
+ left = Load4<1>(src, left);
+ right = Load4<1>(src + 1, right);
+ src += reference_stride;
+
+ const uint8x8_t result = vrhadd_u8(left, right);
+
+ StoreLo4(dest, result);
+ dest += pred_stride;
+ StoreHi4(dest, result);
+ dest += pred_stride;
+ y += 2;
+ } while (y < height);
} else {
- src = static_cast<const uint8_t*>(reference) -
- ((vertical_taps / 2) - 1) * src_stride - ((horizontal_taps / 2) - 1);
+ assert(width == 2);
+ uint8x8_t left = vdup_n_u8(0);
+ uint8x8_t right = vdup_n_u8(0);
+ int y = 0;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<1>(src, left);
+ right = Load2<1>(src + 1, right);
+ src += reference_stride;
- HorizontalPass2xH(src, src_stride, intermediate_result, intermediate_stride,
- intermediate_height, horiz_filter_index, horizontal_taps,
- subpixel_x);
+ const uint8x8_t result = vrhadd_u8(left, right);
- VerticalPass2xH</*is_2d=*/true, /*is_compound=*/true>(
- intermediate_result, intermediate_stride, dest, pred_stride, height,
- inter_round_bits_vertical, vert_filter_index, vertical_taps,
- subpixel_y);
+ Store2<0>(dest, result);
+ dest += pred_stride;
+ Store2<1>(dest, result);
+ dest += pred_stride;
+ y += 2;
+ } while (y < height);
+ }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* src,
+ const ptrdiff_t src_stride, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+ uint8x16_t row[8], below[8];
+
+ row[0] = vld1q_u8(src);
+ if (width >= 32) {
+ src += 16;
+ row[1] = vld1q_u8(src);
+ if (width >= 64) {
+ src += 16;
+ row[2] = vld1q_u8(src);
+ src += 16;
+ row[3] = vld1q_u8(src);
+ if (width == 128) {
+ src += 16;
+ row[4] = vld1q_u8(src);
+ src += 16;
+ row[5] = vld1q_u8(src);
+ src += 16;
+ row[6] = vld1q_u8(src);
+ src += 16;
+ row[7] = vld1q_u8(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = 0;
+ do {
+ below[0] = vld1q_u8(src);
+ if (width >= 32) {
+ src += 16;
+ below[1] = vld1q_u8(src);
+ if (width >= 64) {
+ src += 16;
+ below[2] = vld1q_u8(src);
+ src += 16;
+ below[3] = vld1q_u8(src);
+ if (width == 128) {
+ src += 16;
+ below[4] = vld1q_u8(src);
+ src += 16;
+ below[5] = vld1q_u8(src);
+ src += 16;
+ below[6] = vld1q_u8(src);
+ src += 16;
+ below[7] = vld1q_u8(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ vst1q_u8(dst, vrhaddq_u8(row[0], below[0]));
+ row[0] = below[0];
+ if (width >= 32) {
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[1], below[1]));
+ row[1] = below[1];
+ if (width >= 64) {
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[2], below[2]));
+ row[2] = below[2];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[3], below[3]));
+ row[3] = below[3];
+ if (width >= 128) {
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[4], below[4]));
+ row[4] = below[4];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[5], below[5]));
+ row[5] = below[5];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[6], below[6]));
+ row[6] = below[6];
+ dst += 16;
+ vst1q_u8(dst, vrhaddq_u8(row[7], below[7]));
+ row[7] = below[7];
+ }
+ }
+ }
+ dst += dst_remainder_stride;
+ } while (++y < height);
+}
+
+void ConvolveIntraBlockCopyVertical_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
+ const int height, void* const prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ uint8x8_t row, below;
+ row = vld1_u8(src);
+ src += reference_stride;
+
+ int y = 0;
+ do {
+ below = vld1_u8(src);
+ src += reference_stride;
+
+ vst1_u8(dest, vrhadd_u8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (++y < height);
+ } else if (width == 4) {
+ uint8x8_t row = Load4(src);
+ uint8x8_t below = vdup_n_u8(0);
+ src += reference_stride;
+
+ int y = 0;
+ do {
+ below = Load4<0>(src, below);
+ src += reference_stride;
+
+ StoreLo4(dest, vrhadd_u8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (++y < height);
+ } else {
+ assert(width == 2);
+ uint8x8_t row = Load2(src);
+ uint8x8_t below = vdup_n_u8(0);
+ src += reference_stride;
+
+ int y = 0;
+ do {
+ below = Load2<0>(src, below);
+ src += reference_stride;
+
+ Store2<0>(dest, vrhadd_u8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (++y < height);
+ }
+}
+
+template <int width>
+inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
+ const int height, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
+ uint16x8_t row[16];
+ row[0] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width >= 16) {
+ src += 8;
+ row[1] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width >= 32) {
+ src += 8;
+ row[2] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[3] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width >= 64) {
+ src += 8;
+ row[4] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[5] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[6] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[7] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ if (width == 128) {
+ src += 8;
+ row[8] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[9] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[10] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[11] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[12] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[13] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[14] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ src += 8;
+ row[15] = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = 0;
+ do {
+ const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
+ row[0] = below_0;
+ if (width >= 16) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_1 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[1], below_1), 2));
+ row[1] = below_1;
+ if (width >= 32) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_2 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[2], below_2), 2));
+ row[2] = below_2;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_3 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[3], below_3), 2));
+ row[3] = below_3;
+ if (width >= 64) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_4 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[4], below_4), 2));
+ row[4] = below_4;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_5 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[5], below_5), 2));
+ row[5] = below_5;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_6 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[6], below_6), 2));
+ row[6] = below_6;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_7 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[7], below_7), 2));
+ row[7] = below_7;
+ if (width == 128) {
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_8 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[8], below_8), 2));
+ row[8] = below_8;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_9 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[9], below_9), 2));
+ row[9] = below_9;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_10 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[10], below_10), 2));
+ row[10] = below_10;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_11 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[11], below_11), 2));
+ row[11] = below_11;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_12 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[12], below_12), 2));
+ row[12] = below_12;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_13 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[13], below_13), 2));
+ row[13] = below_13;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_14 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[14], below_14), 2));
+ row[14] = below_14;
+ src += 8;
+ dst += 8;
+
+ const uint16x8_t below_15 =
+ vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
+ vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[15], below_15), 2));
+ row[15] = below_15;
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (++y < height);
+}
+
+void ConvolveIntraBlockCopy2D_NEON(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
+ const int height, void* const prediction, const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+ // Note: allow vertical access to height + 1. Because this function is only
+ // for u/v plane of intra block copy, such access is guaranteed to be within
+ // the prediction block.
+
+ if (width == 128) {
+ IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 8) {
+ IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 4) {
+ uint8x8_t left = Load4(src);
+ uint8x8_t right = Load4(src + 1);
+ src += reference_stride;
+
+ uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
+
+ int y = 0;
+ do {
+ left = Load4<0>(src, left);
+ right = Load4<0>(src + 1, right);
+ src += reference_stride;
+ left = Load4<1>(src, left);
+ right = Load4<1>(src + 1, right);
+ src += reference_stride;
+
+ const uint16x8_t below = vaddl_u8(left, right);
+
+ const uint8x8_t result = vrshrn_n_u16(
+ vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
+ StoreLo4(dest, result);
+ dest += pred_stride;
+ StoreHi4(dest, result);
+ dest += pred_stride;
+
+ row = vget_high_u16(below);
+ y += 2;
+ } while (y < height);
+ } else {
+ uint8x8_t left = Load2(src);
+ uint8x8_t right = Load2(src + 1);
+ src += reference_stride;
+
+ uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
+
+ int y = 0;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<2>(src, left);
+ right = Load2<2>(src + 1, right);
+ src += reference_stride;
+
+ const uint16x8_t below = vaddl_u8(left, right);
+
+ const uint8x8_t result = vrshrn_n_u16(
+ vaddq_u16(vcombine_u16(row, vget_low_u16(below)), below), 2);
+ Store2<0>(dest, result);
+ dest += pred_stride;
+ Store2<2>(dest, result);
+ dest += pred_stride;
+
+ row = vget_high_u16(below);
+ y += 2;
+ } while (y < height);
}
}
void Init8bpp() {
- Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
dsp->convolve[0][0][0][1] = ConvolveHorizontal_NEON;
dsp->convolve[0][0][1][0] = ConvolveVertical_NEON;
@@ -1959,9 +3096,12 @@
dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_NEON;
dsp->convolve[0][1][1][1] = ConvolveCompound2D_NEON;
- // TODO(petersonab,b/139707209): Fix source buffer overreads.
- // dsp->convolve_scale[1] = ConvolveCompoundScale2D_NEON;
- static_cast<void>(ConvolveCompoundScale2D_NEON);
+ dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_NEON;
+ dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_NEON;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_NEON;
+
+ dsp->convolve_scale[0] = ConvolveScale2D_NEON<false>;
+ dsp->convolve_scale[1] = ConvolveScale2D_NEON<true>;
}
} // namespace
@@ -1972,7 +3112,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/convolve_neon.h b/libgav1/src/dsp/arm/convolve_neon.h
index a537650..948ef4d 100644
--- a/libgav1/src/dsp/arm/convolve_neon.h
+++ b/libgav1/src/dsp/arm/convolve_neon.h
@@ -17,8 +17,8 @@
#ifndef LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
#define LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
-#include "src/dsp/cpu.h"
#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
namespace libgav1 {
namespace dsp {
@@ -30,17 +30,21 @@
} // namespace libgav1
#if LIBGAV1_ENABLE_NEON
-#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_DSP_NEON
-#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_DSP_NEON
-#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_DSP_NEON
-#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_DSP_NEON
-#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_DSP_NEON
-#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundCopy LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompound2D LIBGAV1_CPU_NEON
-// TODO(petersonab,b/139707209): Fix source buffer overreads.
-// #define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyHorizontal LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopyVertical LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveIntraBlockCopy2D LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_ConvolveScale2D LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_NEON
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_CONVOLVE_NEON_H_
diff --git a/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc b/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
index 39b34a9..04952ab 100644
--- a/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
+++ b/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
@@ -13,7 +13,7 @@
// limitations under the License.
#include "src/dsp/distance_weighted_blend.h"
-#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -24,138 +24,93 @@
#include <cstdint>
#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
#include "src/utils/common.h"
namespace libgav1 {
namespace dsp {
namespace {
-constexpr int kBitdepth8 = 8;
constexpr int kInterPostRoundBit = 4;
-const int16x8_t kCompoundRoundOffset =
- vdupq_n_s16((1 << kBitdepth8) + (1 << (kBitdepth8 - 1)));
+inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0,
+ const int16x8_t pred1,
+ const int16x4_t weights[2]) {
+ // TODO(https://issuetracker.google.com/issues/150325685): Investigate range.
+ const int32x4_t wpred0_lo = vmull_s16(weights[0], vget_low_s16(pred0));
+ const int32x4_t wpred0_hi = vmull_s16(weights[0], vget_high_s16(pred0));
+ const int32x4_t blended_lo =
+ vmlal_s16(wpred0_lo, weights[1], vget_low_s16(pred1));
+ const int32x4_t blended_hi =
+ vmlal_s16(wpred0_hi, weights[1], vget_high_s16(pred1));
-inline int16x8_t ComputeWeightedAverage8(const uint16x8_t pred0,
- const uint16x8_t pred1,
- const uint16x4_t weights[2]) {
- const uint32x4_t wpred0_lo = vmull_u16(weights[0], vget_low_u16(pred0));
- const uint32x4_t wpred0_hi = vmull_u16(weights[0], vget_high_u16(pred0));
- const uint32x4_t blended_lo =
- vmlal_u16(wpred0_lo, weights[1], vget_low_u16(pred1));
- const uint32x4_t blended_hi =
- vmlal_u16(wpred0_hi, weights[1], vget_high_u16(pred1));
-
- const uint16x4_t result_lo =
- vqrshrn_n_u32(blended_lo, kInterPostRoundBit + 4);
- const uint16x4_t result_hi =
- vqrshrn_n_u32(blended_hi, kInterPostRoundBit + 4);
- return vsubq_s16(vreinterpretq_s16_u16(vcombine_u16(result_lo, result_hi)),
- kCompoundRoundOffset);
+ return vcombine_s16(vqrshrn_n_s32(blended_lo, kInterPostRoundBit + 4),
+ vqrshrn_n_s32(blended_hi, kInterPostRoundBit + 4));
}
-template <int height>
-inline void DistanceWeightedBlend4xH_NEON(const uint16_t* prediction_0,
- const ptrdiff_t prediction_stride_0,
- const uint16_t* prediction_1,
- const ptrdiff_t prediction_stride_1,
- const uint16x4_t weights[2],
- void* const dest,
- const ptrdiff_t dest_stride) {
+template <int width, int height>
+inline void DistanceWeightedBlendSmall_NEON(const int16_t* prediction_0,
+ const int16_t* prediction_1,
+ const int16x4_t weights[2],
+ void* const dest,
+ const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
- const uint16_t* pred_0 = prediction_0;
- const uint16_t* pred_1 = prediction_1;
+ constexpr int step = 16 / width;
- for (int y = 0; y < height; y += 4) {
- const uint16x4_t src_00 = vld1_u16(pred_0);
- const uint16x4_t src_10 = vld1_u16(pred_1);
- pred_0 += prediction_stride_0;
- pred_1 += prediction_stride_1;
- const uint16x4_t src_01 = vld1_u16(pred_0);
- const uint16x4_t src_11 = vld1_u16(pred_1);
- pred_0 += prediction_stride_0;
- pred_1 += prediction_stride_1;
- const int16x8_t res01 = ComputeWeightedAverage8(
- vcombine_u16(src_00, src_01), vcombine_u16(src_10, src_11), weights);
-
- const uint16x4_t src_02 = vld1_u16(pred_0);
- const uint16x4_t src_12 = vld1_u16(pred_1);
- pred_0 += prediction_stride_0;
- pred_1 += prediction_stride_1;
- const uint16x4_t src_03 = vld1_u16(pred_0);
- const uint16x4_t src_13 = vld1_u16(pred_1);
- pred_0 += prediction_stride_0;
- pred_1 += prediction_stride_1;
- const int16x8_t res23 = ComputeWeightedAverage8(
- vcombine_u16(src_02, src_03), vcombine_u16(src_12, src_13), weights);
-
- const uint8x8_t result_01 = vqmovun_s16(res01);
- const uint8x8_t result_23 = vqmovun_s16(res23);
- StoreLo4(dst, result_01);
- dst += dest_stride;
- StoreHi4(dst, result_01);
- dst += dest_stride;
- StoreLo4(dst, result_23);
- dst += dest_stride;
- StoreHi4(dst, result_23);
- dst += dest_stride;
- }
-}
-
-template <int height>
-inline void DistanceWeightedBlend8xH_NEON(const uint16_t* prediction_0,
- const ptrdiff_t prediction_stride_0,
- const uint16_t* prediction_1,
- const ptrdiff_t prediction_stride_1,
- const uint16x4_t weights[2],
- void* const dest,
- const ptrdiff_t dest_stride) {
- auto* dst = static_cast<uint8_t*>(dest);
- const uint16_t* pred_0 = prediction_0;
- const uint16_t* pred_1 = prediction_1;
-
- for (int y = 0; y < height; y += 2) {
- const uint16x8_t src_00 = vld1q_u16(pred_0);
- const uint16x8_t src_10 = vld1q_u16(pred_1);
- pred_0 += prediction_stride_0;
- pred_1 += prediction_stride_1;
+ for (int y = 0; y < height; y += step) {
+ const int16x8_t src_00 = vld1q_s16(prediction_0);
+ const int16x8_t src_10 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
const int16x8_t res0 = ComputeWeightedAverage8(src_00, src_10, weights);
- const uint16x8_t src_01 = vld1q_u16(pred_0);
- const uint16x8_t src_11 = vld1q_u16(pred_1);
- pred_0 += prediction_stride_0;
- pred_1 += prediction_stride_1;
+ const int16x8_t src_01 = vld1q_s16(prediction_0);
+ const int16x8_t src_11 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
const int16x8_t res1 = ComputeWeightedAverage8(src_01, src_11, weights);
const uint8x8_t result0 = vqmovun_s16(res0);
const uint8x8_t result1 = vqmovun_s16(res1);
- vst1_u8(dst, result0);
- dst += dest_stride;
- vst1_u8(dst, result1);
- dst += dest_stride;
+ if (width == 4) {
+ StoreLo4(dst, result0);
+ dst += dest_stride;
+ StoreHi4(dst, result0);
+ dst += dest_stride;
+ StoreLo4(dst, result1);
+ dst += dest_stride;
+ StoreHi4(dst, result1);
+ dst += dest_stride;
+ } else {
+ assert(width == 8);
+ vst1_u8(dst, result0);
+ dst += dest_stride;
+ vst1_u8(dst, result1);
+ dst += dest_stride;
+ }
}
}
-inline void DistanceWeightedBlendLarge_NEON(
- const uint16_t* prediction_0, const ptrdiff_t prediction_stride_0,
- const uint16_t* prediction_1, const ptrdiff_t prediction_stride_1,
- const uint16x4_t weights[2], const int width, const int height,
- void* const dest, const ptrdiff_t dest_stride) {
+inline void DistanceWeightedBlendLarge_NEON(const int16_t* prediction_0,
+ const int16_t* prediction_1,
+ const int16x4_t weights[2],
+ const int width, const int height,
+ void* const dest,
+ const ptrdiff_t dest_stride) {
auto* dst = static_cast<uint8_t*>(dest);
- const uint16_t* pred_0 = prediction_0;
- const uint16_t* pred_1 = prediction_1;
int y = height;
do {
int x = 0;
do {
- const uint16x8_t src0_lo = vld1q_u16(pred_0 + x);
- const uint16x8_t src1_lo = vld1q_u16(pred_1 + x);
+ const int16x8_t src0_lo = vld1q_s16(prediction_0 + x);
+ const int16x8_t src1_lo = vld1q_s16(prediction_1 + x);
const int16x8_t res_lo =
ComputeWeightedAverage8(src0_lo, src1_lo, weights);
- const uint16x8_t src0_hi = vld1q_u16(pred_0 + x + 8);
- const uint16x8_t src1_hi = vld1q_u16(pred_1 + x + 8);
+ const int16x8_t src0_hi = vld1q_s16(prediction_0 + x + 8);
+ const int16x8_t src1_hi = vld1q_s16(prediction_1 + x + 8);
const int16x8_t res_hi =
ComputeWeightedAverage8(src0_hi, src1_hi, weights);
@@ -165,31 +120,33 @@
x += 16;
} while (x < width);
dst += dest_stride;
- pred_0 += prediction_stride_0;
- pred_1 += prediction_stride_1;
+ prediction_0 += width;
+ prediction_1 += width;
} while (--y != 0);
}
-inline void DistanceWeightedBlend_NEON(
- const uint16_t* prediction_0, const ptrdiff_t prediction_stride_0,
- const uint16_t* prediction_1, const ptrdiff_t prediction_stride_1,
- const uint8_t weight_0, const uint8_t weight_1, const int width,
- const int height, void* const dest, const ptrdiff_t dest_stride) {
- uint16x4_t weights[2] = {vdup_n_u16(weight_0), vdup_n_u16(weight_1)};
+inline void DistanceWeightedBlend_NEON(const void* prediction_0,
+ const void* prediction_1,
+ const uint8_t weight_0,
+ const uint8_t weight_1, const int width,
+ const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
+ int16x4_t weights[2] = {vdup_n_s16(weight_0), vdup_n_s16(weight_1)};
+ // TODO(johannkoenig): Investigate the branching. May be fine to call with a
+ // variable height.
if (width == 4) {
if (height == 4) {
- DistanceWeightedBlend4xH_NEON<4>(prediction_0, prediction_stride_0,
- prediction_1, prediction_stride_1,
- weights, dest, dest_stride);
+ DistanceWeightedBlendSmall_NEON<4, 4>(pred_0, pred_1, weights, dest,
+ dest_stride);
} else if (height == 8) {
- DistanceWeightedBlend4xH_NEON<8>(prediction_0, prediction_stride_0,
- prediction_1, prediction_stride_1,
- weights, dest, dest_stride);
+ DistanceWeightedBlendSmall_NEON<4, 8>(pred_0, pred_1, weights, dest,
+ dest_stride);
} else {
assert(height == 16);
- DistanceWeightedBlend4xH_NEON<16>(prediction_0, prediction_stride_0,
- prediction_1, prediction_stride_1,
- weights, dest, dest_stride);
+ DistanceWeightedBlendSmall_NEON<4, 16>(pred_0, pred_1, weights, dest,
+ dest_stride);
}
return;
}
@@ -197,37 +154,32 @@
if (width == 8) {
switch (height) {
case 4:
- DistanceWeightedBlend8xH_NEON<4>(prediction_0, prediction_stride_0,
- prediction_1, prediction_stride_1,
- weights, dest, dest_stride);
+ DistanceWeightedBlendSmall_NEON<8, 4>(pred_0, pred_1, weights, dest,
+ dest_stride);
return;
case 8:
- DistanceWeightedBlend8xH_NEON<8>(prediction_0, prediction_stride_0,
- prediction_1, prediction_stride_1,
- weights, dest, dest_stride);
+ DistanceWeightedBlendSmall_NEON<8, 8>(pred_0, pred_1, weights, dest,
+ dest_stride);
return;
case 16:
- DistanceWeightedBlend8xH_NEON<16>(prediction_0, prediction_stride_0,
- prediction_1, prediction_stride_1,
- weights, dest, dest_stride);
+ DistanceWeightedBlendSmall_NEON<8, 16>(pred_0, pred_1, weights, dest,
+ dest_stride);
return;
default:
assert(height == 32);
- DistanceWeightedBlend8xH_NEON<32>(prediction_0, prediction_stride_0,
- prediction_1, prediction_stride_1,
- weights, dest, dest_stride);
+ DistanceWeightedBlendSmall_NEON<8, 32>(pred_0, pred_1, weights, dest,
+ dest_stride);
return;
}
}
- DistanceWeightedBlendLarge_NEON(prediction_0, prediction_stride_0,
- prediction_1, prediction_stride_1, weights,
- width, height, dest, dest_stride);
+ DistanceWeightedBlendLarge_NEON(pred_0, pred_1, weights, width, height, dest,
+ dest_stride);
}
void Init8bpp() {
- Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
dsp->distance_weighted_blend = DistanceWeightedBlend_NEON;
}
@@ -239,7 +191,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/distance_weighted_blend_neon.h b/libgav1/src/dsp/arm/distance_weighted_blend_neon.h
index 6d35956..4d8824c 100644
--- a/libgav1/src/dsp/arm/distance_weighted_blend_neon.h
+++ b/libgav1/src/dsp/arm/distance_weighted_blend_neon.h
@@ -17,8 +17,8 @@
#ifndef LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
#define LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
-#include "src/dsp/cpu.h"
#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
namespace libgav1 {
namespace dsp {
@@ -32,7 +32,7 @@
// If NEON is enabled signal the NEON implementation should be used instead of
// normal C.
#if LIBGAV1_ENABLE_NEON
-#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_DSP_NEON
+#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
#endif // LIBGAV1_ENABLE_NEON
diff --git a/libgav1/src/dsp/arm/film_grain_neon.cc b/libgav1/src/dsp/arm/film_grain_neon.cc
new file mode 100644
index 0000000..2612466
--- /dev/null
+++ b/libgav1/src/dsp/arm/film_grain_neon.cc
@@ -0,0 +1,1188 @@
+// Copyright 2019 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <new>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/arm/film_grain_neon.h"
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// These functions are overloaded for both possible sizes in order to simplify
+// loading and storing to and from intermediate value types from within a
+// template function.
+inline int16x8_t GetSignedSource8(const int8_t* src) {
+ return vmovl_s8(vld1_s8(src));
+}
+
+inline int16x8_t GetSignedSource8(const uint8_t* src) {
+ return ZeroExtend(vld1_u8(src));
+}
+
+inline void StoreUnsigned8(uint8_t* dest, const uint16x8_t data) {
+ vst1_u8(dest, vmovn_u16(data));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+inline int16x8_t GetSignedSource8(const int16_t* src) { return vld1q_s16(src); }
+
+inline int16x8_t GetSignedSource8(const uint16_t* src) {
+ return vreinterpretq_s16_u16(vld1q_u16(src));
+}
+
+inline void StoreUnsigned8(uint16_t* dest, const uint16x8_t data) {
+ vst1q_u16(dest, data);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Each element in |sum| represents one destination value's running
+// autoregression formula. The fixed source values in |grain_lo| and |grain_hi|
+// allow for a sliding window in successive calls to this function.
+template <int position_offset>
+inline int32x4x2_t AccumulateWeightedGrain(const int16x8_t grain_lo,
+ const int16x8_t grain_hi,
+ int16_t coeff, int32x4x2_t sum) {
+ const int16x8_t grain = vextq_s16(grain_lo, grain_hi, position_offset);
+ sum.val[0] = vmlal_n_s16(sum.val[0], vget_low_s16(grain), coeff);
+ sum.val[1] = vmlal_n_s16(sum.val[1], vget_high_s16(grain), coeff);
+ return sum;
+}
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int8_t* grain_cursor, int32x4x2_t sum,
+ const int8_t* coeffs, int pos, int shift) {
+ int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+ for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+ result += grain_cursor[lane + delta_col] * coeffs[pos];
+ ++pos;
+ }
+ grain_cursor[lane] =
+ Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegression(int16_t* grain_cursor, int32x4x2_t sum,
+ const int8_t* coeffs, int pos, int shift) {
+ int32_t result = vgetq_lane_s32(sum.val[lane >> 2], lane & 3);
+
+ for (int delta_col = -auto_regression_coeff_lag; delta_col < 0; ++delta_col) {
+ result += grain_cursor[lane + delta_col] * coeffs[pos];
+ ++pos;
+ }
+ grain_cursor[lane] =
+ Clip3(grain_cursor[lane] + RightShiftWithRounding(result, shift),
+ GetGrainMin<bitdepth>(), GetGrainMax<bitdepth>());
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+// Because the autoregressive filter requires the output of each pixel to
+// compute pixels that come after in the row, we have to finish the calculations
+// one at a time.
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(int8_t* u_grain_cursor,
+ int8_t* v_grain_cursor,
+ int32x4x2_t sum_u, int32x4x2_t sum_v,
+ const int8_t* coeffs_u,
+ const int8_t* coeffs_v, int pos,
+ int shift) {
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ u_grain_cursor, sum_u, coeffs_u, pos, shift);
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+template <int bitdepth, int auto_regression_coeff_lag, int lane>
+inline void WriteFinalAutoRegressionChroma(int16_t* u_grain_cursor,
+ int16_t* v_grain_cursor,
+ int32x4x2_t sum_u, int32x4x2_t sum_v,
+ const int8_t* coeffs_u,
+ const int8_t* coeffs_v, int pos,
+ int shift) {
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ u_grain_cursor, sum_u, coeffs_u, pos, shift);
+ WriteFinalAutoRegression<bitdepth, auto_regression_coeff_lag, lane>(
+ v_grain_cursor, sum_v, coeffs_v, pos, shift);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+inline void SetZero(int32x4x2_t* v) {
+ v->val[0] = vdupq_n_s32(0);
+ v->val[1] = vdupq_n_s32(0);
+}
+
+// Computes subsampled luma for use with chroma, by averaging in the x direction
+// or y direction when applicable.
+int16x8_t GetSubsampledLuma(const int8_t* const luma, int subsampling_x,
+ int subsampling_y, ptrdiff_t stride) {
+ if (subsampling_y != 0) {
+ assert(subsampling_x != 0);
+ const int8x16_t src0 = vld1q_s8(luma);
+ const int8x16_t src1 = vld1q_s8(luma + stride);
+ const int16x8_t ret0 = vcombine_s16(vpaddl_s8(vget_low_s8(src0)),
+ vpaddl_s8(vget_high_s8(src0)));
+ const int16x8_t ret1 = vcombine_s16(vpaddl_s8(vget_low_s8(src1)),
+ vpaddl_s8(vget_high_s8(src1)));
+ return vrshrq_n_s16(vaddq_s16(ret0, ret1), 2);
+ }
+ if (subsampling_x != 0) {
+ const int8x16_t src = vld1q_s8(luma);
+ return vrshrq_n_s16(
+ vcombine_s16(vpaddl_s8(vget_low_s8(src)), vpaddl_s8(vget_high_s8(src))),
+ 1);
+ }
+ return vmovl_s8(vld1_s8(luma));
+}
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline uint16x8_t GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
+ if (subsampling_x != 0) {
+ const uint8x16_t src = vld1q_u8(luma);
+ return vrshrq_n_u16(vpaddlq_u8(src), 1);
+ }
+ return vmovl_u8(vld1_u8(luma));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// Computes subsampled luma for use with chroma, by averaging in the x direction
+// or y direction when applicable.
+int16x8_t GetSubsampledLuma(const int16_t* const luma, int subsampling_x,
+ int subsampling_y, ptrdiff_t stride) {
+ if (subsampling_y != 0) {
+ assert(subsampling_x != 0);
+ int16x8_t src0_lo = vld1q_s16(luma);
+ int16x8_t src0_hi = vld1q_s16(luma + 8);
+ const int16x8_t src1_lo = vld1q_s16(luma + stride);
+ const int16x8_t src1_hi = vld1q_s16(luma + stride + 8);
+ const int16x8_t src0 =
+ vcombine_s16(vpadd_s16(vget_low_s16(src0_lo), vget_high_s16(src0_lo)),
+ vpadd_s16(vget_low_s16(src0_hi), vget_high_s16(src0_hi)));
+ const int16x8_t src1 =
+ vcombine_s16(vpadd_s16(vget_low_s16(src1_lo), vget_high_s16(src1_lo)),
+ vpadd_s16(vget_low_s16(src1_hi), vget_high_s16(src1_hi)));
+ return vrshrq_n_s16(vaddq_s16(src0, src1), 2);
+ }
+ if (subsampling_x != 0) {
+ const int16x8_t src_lo = vld1q_s16(luma);
+ const int16x8_t src_hi = vld1q_s16(luma + 8);
+ const int16x8_t ret =
+ vcombine_s16(vpadd_s16(vget_low_s16(src_lo), vget_high_s16(src_lo)),
+ vpadd_s16(vget_low_s16(src_hi), vget_high_s16(src_hi)));
+ return vrshrq_n_s16(ret, 1);
+ }
+ return vld1q_s16(luma);
+}
+